## Meeting 03.06. Data Preprocessing

In [1]:
import os
# this directory for the original data file
os.makedirs('./data', exist_ok=True)
# this directory to later save the prepared data
os.makedirs('./data/prepped', exist_ok=True)

In [2]:
# Standard libraries - run pip install if necessary
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime


### 1.1 Read and display datafile

In [3]:
# Data file not included in the project, needs to be downloaded individually. This step can take a few minutes due to size of the original file
df = pd.read_csv("data/taxidata.csv")

In [5]:
# Convert time types to check if entries are from correct range
df["trip_start_timestamp"] = pd.to_datetime(df["trip_start_timestamp"])
df["trip_end_timestamp"] = pd.to_datetime(df["trip_end_timestamp"])

# Convert other 
df["trip_seconds"] =  pd.to_numeric(df['trip_seconds'])

# In range of 2022:
print(f"Min date: {df['trip_start_timestamp'].min()}")
print(f"Max date: {df['trip_start_timestamp'].max()}")

Min date: 2022-01-01 00:00:00
Max date: 2022-12-31 23:45:00


In [6]:
# Convert trip duration to hours/km for easier visualization later
df['trip_km'] = df['trip_miles'] * 1.609344
df['trip_hours'] = df['trip_seconds'] / 3600

In [7]:
# Columns to sum
columns_to_sum = ['fare', 'tips', 'tolls', 'extras']

# Check if the sum of columns 'A', 'B', and 'C' equals the 'Sum' column
df['Check'] = df[columns_to_sum].sum(axis=1) == df['trip_total']

# Select rows where the condition is not met
df_not_matching = df[~df['Check']]
df_matching = df[df['Check']]

print(df_matching[['fare', 'tips', 'tolls', 'extras','trip_total']])
print(df_not_matching[['fare', 'tips', 'tolls', 'extras','trip_total']])

          fare  tips  tolls  extras  trip_total
0        20.50  0.00    0.0     0.0       20.50
1        13.84  2.73    0.0     0.0       16.57
2         7.00  2.00    0.0     3.0       12.00
3         6.50  0.00    0.0     0.0        6.50
4         6.25  0.00    0.0     0.0        6.25
...        ...   ...    ...     ...         ...
6382419  30.00  0.00    0.0     0.0       30.00
6382420  14.75  0.00    0.0     0.0       14.75
6382421   9.75  0.00    0.0     1.5       11.25
6382423  63.27  0.00    0.0     0.0       63.27
6382424   6.25  0.00    0.0     2.0        8.25

[4533086 rows x 5 columns]
          fare  tips  tolls  extras  trip_total
9        24.50  6.25    0.0     0.0       31.25
12       20.00  4.50    0.0     2.0       27.00
16        3.25  0.00    0.0    11.0       14.75
19        6.00  3.00    0.0     1.0       10.50
20       18.75  4.81    0.0     0.0       24.06
...        ...   ...    ...     ...         ...
6382396  23.00  5.20    0.0     2.5       31.20
6382409  12.

##### For 1.849.339 the sum of ['fare', 'tips', 'tolls', 'extras'] does not match 'trip_total', but it should according to the description of the column on the website. For 'tips' they say: 'The tip for the trip. Cash tips generally will not be recorded.'. Maybe they are not recorded in the 'tips' column but somehow in the 'total_trip' - however.

In [8]:
df_not_matching['diff'] = df_not_matching[columns_to_sum].sum(axis=1) - df_not_matching['trip_total']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_not_matching['diff'] = df_not_matching[columns_to_sum].sum(axis=1) - df_not_matching['trip_total']


In [9]:
print(df_not_matching['diff'].describe())

count    1.845803e+06
mean    -5.422637e-01
std      4.182485e-01
min     -2.500000e+01
25%     -5.000000e-01
50%     -5.000000e-01
75%     -5.000000e-01
max      1.818989e-12
Name: diff, dtype: float64


In [11]:
std = df_not_matching['diff'].describe(include='all').loc['std']

In [12]:
df_not_matching[np.abs(df_not_matching['diff']) > 3*std]

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,...,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,trip_km,trip_hours,Check,diff
55857,a7610ff8f0dd7d7f88f8d04d7b1026d769996638,4ab868a2a19b95f50f355eb8d0d7638f1912b37f49ebd6...,2022-12-28 19:00:00,2022-12-28 19:15:00,814.0,7.49,,,3.0,32.0,...,41.965812,-87.655879,POINT (-87.6558787862 41.96581197),41.878866,-87.625192,POINT (-87.6251921424 41.8788655841),12.053987,0.226111,False,-1.95
74894,61e40e17db1cfab7e1499a2abfacfad1bdc26c35,13016372e777da1289d557edbe4ce2be8a68e77bc64768...,2022-12-27 13:45:00,2022-12-27 14:30:00,2527.0,16.40,,,7.0,76.0,...,41.922686,-87.649489,POINT (-87.6494887289 41.9226862843),41.980264,-87.913625,POINT (-87.913624596 41.9802643146),26.393242,0.701944,False,-1.95
89047,3bdafedf93fc7dc344e4ee57413a09ac46257fec,1a248eb87ae578f15af2a14c2eb7b4b264f6babab9ad9c...,2022-12-26 09:15:00,2022-12-26 09:15:00,156.0,0.47,,,8.0,8.0,...,41.899602,-87.633308,POINT (-87.6333080367 41.899602111),41.899602,-87.633308,POINT (-87.6333080367 41.899602111),0.756392,0.043333,False,-1.95
97501,41f8e8ea4a530edf22a8bf18094140efc5e38b78,d41ab2be597b82c3e6b0b0ecccf98883a84db0d9aed4f6...,2022-12-24 19:30:00,2022-12-24 21:00:00,4750.0,87.29,,,56.0,,...,41.792592,-87.769615,POINT (-87.7696154528 41.7925923603),,,,140.479638,1.319444,False,-1.50
104424,eba93b8e1a89ef53b988122cd9e4fce4b70b4627,734a9df762115dd5478c4d03400c8712f440f72a6b606d...,2022-12-24 06:45:00,2022-12-24 07:00:00,910.0,4.51,,,8.0,28.0,...,41.899602,-87.633308,POINT (-87.6333080367 41.899602111),41.874005,-87.663518,POINT (-87.6635175498 41.874005383),7.258141,0.252778,False,-1.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6375868,e981977319a4b637d81182d9cb69e41a33e754cf,ab31561a5548b1d49b0352bb8d00d18c0e136d5f674c1c...,2022-01-01 18:30:00,2022-01-01 18:45:00,695.0,2.36,,,7.0,6.0,...,41.922686,-87.649489,POINT (-87.6494887289 41.9226862843),41.944227,-87.655998,POINT (-87.6559981815 41.9442266014),3.798052,0.193056,False,-1.95
6376620,3a9de10af5724bdb7df57d5d7843768046fb82d2,398e9ade4dcb28cfd9fc422739cf04efedce1853f93cc7...,2022-01-01 16:15:00,2022-01-01 16:15:00,834.0,5.12,,,24.0,33.0,...,41.901207,-87.676356,POINT (-87.6763559892 41.9012069941),41.857184,-87.620335,POINT (-87.6203346241 41.8571838585),8.239841,0.231667,False,-1.95
6378124,794525be3585b82b7569b7664f14ec9c499865c7,f6858e3bb618a392ec3c5f9674bdc03598943806e369eb...,2022-01-01 12:45:00,2022-01-01 12:45:00,439.0,0.00,,,24.0,8.0,...,41.901207,-87.676356,POINT (-87.6763559892 41.9012069941),41.899602,-87.633308,POINT (-87.6333080367 41.899602111),0.000000,0.121944,False,-1.95
6380498,02610ad16abe0f0518b07bf4f043c07033482ff4,b50eb90cadd2a392b75b725c90d6784b429ca6854e0947...,2022-01-01 03:30:00,2022-01-01 03:30:00,345.0,1.46,,,24.0,7.0,...,41.901207,-87.676356,POINT (-87.6763559892 41.9012069941),41.922686,-87.649489,POINT (-87.6494887289 41.9226862843),2.349642,0.095833,False,-1.95


##### Maximum difference between ['fare', 'tips', 'tolls', 'extras'] and 'trip_total' is ca. 7 dollars, ca. 2 dollars in the mean. For most of the cases 'trip_total' is greater than the sum of the other variables. 

#### Obviously too man rowas are affected. What to do? just trust in 'trip_total' and drop ['fare', 'tips', 'tolls', 'extras']? For which tasks could ['fare', 'tips', 'tolls', 'extras'] be relevant? I think maximum for task 3 - cluster analysis (customer types). The other tasks refer more to spatial and temporal aspects. Another idea could be that if we use one of these variables, we emphasize that there are some differences which we cant explain and which might limit the meaningfulness of our results. 

In [73]:
column_of_interest = 'company'

# Get the number of unique values
num_unique_values = df[column_of_interest].nunique()

# Get the array of unique values
unique_values = df[column_of_interest].unique()

print(f"Number of unique values in '{column_of_interest}': {num_unique_values}")
print(f"Unique values in '{column_of_interest}': {unique_values}")

Number of unique values in 'company': 38
Unique values in 'company': ['Flash Cab' 'Taxi Affiliation Services' 'Taxicab Insurance Agency Llc'
 'City Service' 'Taxicab Insurance Agency, LLC' 'Sun Taxi'
 'Choice Taxi Association' 'Globe Taxi' 'Chicago Independents'
 'Star North Taxi Management Llc' 'Blue Ribbon Taxi Association'
 '5 Star Taxi' 'Medallion Leasin' 'Patriot Taxi Dba Peace Taxi Associat'
 'Koam Taxi Association' 'Top Cab Affiliation' 'U Taxicab'
 '312 Medallion Management Corp' '4053 - 40193 Adwar H. Nikola'
 'Chicago Taxicab' 'Top Cab' 'Setare Inc' '3556 - 36214 RC Andrews Cab'
 '6574 - Babylon Express Inc.' 'Metro Jet Taxi A.' 'Leonard Cab Co'
 '5062 - 34841 Sam Mestas' 'Petani Cab Corp' '4787 - 56058 Reny Cab Co'
 '4623 - 27290 Jay Kim' '2733 - 74600 Benny Jona' '24 Seven Taxi'
 'KOAM Taxi Association' 'Blue Ribbon Taxi Association Inc.'
 'Chicago Medallion Management' '3620 - 52292 David K. Cab Corp.'
 'Nova Taxi Affiliation Llc' 'Chicago Carriage Cab Corp']


In [10]:
print(df_not_matching[df_not_matching['payment_type']=='Credit Card'].shape)
print(df_not_matching[df_not_matching['payment_type']=='Mobile'].shape)
print(df_not_matching[df_not_matching['payment_type']=='Cash'].shape)
print(df_not_matching[df_not_matching['payment_type']=='Prcard'].shape)
print(df_not_matching[df_not_matching['payment_type']=='Unknown'].shape)
print(df_not_matching[df_not_matching['payment_type']=='No Charge'].shape)
print(df_not_matching[df_not_matching['payment_type']=='Prepaid'].shape)

In [11]:
check same for df['trip_miles']==0

### 1.2 Checking data logic & removing invalid data

#### 1.2.1 Duplicate entries

In [14]:
# Check duplicates 
print("Number of duplicate entries: ", df.duplicated(subset = ['taxi_id', 'trip_start_timestamp', 'trip_end_timestamp', 'trip_seconds', 'trip_miles', 
                                                              'pickup_census_tract','dropoff_census_tract','pickup_community_area', 'dropoff_community_area',
                                                              'pickup_centroid_latitude','pickup_centroid_longitude','pickup_centroid_location','dropoff_centroid_latitude',
                                                              'dropoff_centroid_longitude','dropoff_centroid_location']).sum())

Number of duplicate entries:  21772


In [15]:
df = df.drop_duplicates(subset = ['taxi_id', 'trip_start_timestamp', 'trip_end_timestamp', 'trip_seconds', 'trip_miles', 
                                                              'pickup_census_tract','dropoff_census_tract','pickup_community_area', 'dropoff_community_area',
                                                              'pickup_centroid_latitude','pickup_centroid_longitude','pickup_centroid_location','dropoff_centroid_latitude',
                                                              'dropoff_centroid_longitude','dropoff_centroid_location'], keep='first')
df.shape

(6360653, 26)

In [16]:
df['kmh'] = np.where(df['trip_hours'] != 0, df['trip_km'] / df['trip_hours'], np.nan)

In [17]:
print(df['kmh'].describe())

count    6.211460e+06
mean     3.382633e+01
std      3.043221e+03
min      0.000000e+00
25%      1.305730e+01
50%      2.134498e+01
75%      3.905000e+01
max      6.047255e+06
Name: kmh, dtype: float64


In [18]:
print(df[df['kmh'] >= 100].shape)

(12901, 27)


In [19]:
print(df[(df['kmh'] <= 100)])

                                          trip_id  \
0        4404c6835b9e74e9f74d70f235200a8ce09db14a   
1        466473fd2a196ebe92fb2983cb7e8af32e39aa1f   
2        3f5cd3f78e5cab455606a31372a95d3204b2fb3f   
3        38292159642750da7b20419330566f9eb0961cde   
4        3e01498f8ff771ad7eb37e4844cef20201b6c339   
...                                           ...   
6382420  54d812a0b88f8f9707825261014b3563a0a60ace   
6382421  7125b9e03a0f16c2dfb5eaf73ed057dc51eb68ef   
6382422  52d1bd00d97eaed338bd98faf80c5709e22fef3d   
6382423  0f0c856e620e6b4dfd2bb1e921d966dd179eeca1   
6382424  50719da0933d6056acca25c91c253012288c70c6   

                                                   taxi_id  \
0        7e179f8ef66ae99ec2d1ec89224e0b7ee5469fe5627f6d...   
1        d1d88b89ceb6d753007b6e795e3c24f4bea905a51e9d47...   
2        847cf962bd6f62040673e6c24c24940aeb2d7fdaa54677...   
3        81092e4881f56106fae845c3ae4492f8b3c3213c33c920...   
4        4ae32e2eb244ce143800e0c40055e537cc50e3358a07

#### Find some generous threshold to drop rows where kmh is not reasonable. maybe 70 or 80 because we are in the city? Or 100, because this is the general speed limit on highways (I believe) so of course in the city it will be less, but then it is no threshold out of the blue?

##### And an alternative to dropping trips that last longer than 4 hours (to e.g. not lose round trips) we could set a minimum kmh that should be reached? 

In [20]:
print(df[(df['kmh'] <= 10)].shape)

(1178540, 27)


But there are many even under 10 kmh. Because there are many with trip_km == 0:

In [21]:
print(df[(df['trip_km'] == 0)]['trip_total'])

1          16.57
15          7.25
16         14.75
22         15.00
29          3.25
           ...  
6382407    17.35
6382408     3.25
6382409    15.70
6382411    29.38
6382417    11.25
Name: trip_total, Length: 784587, dtype: float64


#### But how can it be that there are so many trips with trip_km == 0 (784587)? More than: 

Number of rows with no census and no community (PICKUP): 507118

Number of rows with no census and no community (DROPOFF): 586044


Number of rows with no census and no community (BOTH):  224130

Number of rows with no community DROPOFF and no community PICKUP: 225948

So are they related to each other or not?

In [22]:
df.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,...,pickup_centroid_latitude,pickup_centroid_longitude,pickup_centroid_location,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_centroid_location,trip_km,trip_hours,Check,kmh
0,4404c6835b9e74e9f74d70f235200a8ce09db14a,7e179f8ef66ae99ec2d1ec89224e0b7ee5469fe5627f6d...,2022-12-31 23:45:00,2023-01-01 00:15:00,2081.0,4.42,,,2.0,3.0,...,42.001571,-87.695013,POINT (-87.6950125892 42.001571027),41.965812,-87.655879,POINT (-87.6558787862 41.96581197),7.1133,0.578056,True,12.305565
1,466473fd2a196ebe92fb2983cb7e8af32e39aa1f,d1d88b89ceb6d753007b6e795e3c24f4bea905a51e9d47...,2022-12-31 23:45:00,2023-01-01 00:00:00,812.0,0.0,,,8.0,24.0,...,41.899602,-87.633308,POINT (-87.6333080367 41.899602111),41.901207,-87.676356,POINT (-87.6763559892 41.9012069941),0.0,0.225556,True,0.0
2,3f5cd3f78e5cab455606a31372a95d3204b2fb3f,847cf962bd6f62040673e6c24c24940aeb2d7fdaa54677...,2022-12-31 23:45:00,2023-01-01 00:00:00,600.0,0.9,,,8.0,8.0,...,41.899602,-87.633308,POINT (-87.6333080367 41.899602111),41.899602,-87.633308,POINT (-87.6333080367 41.899602111),1.44841,0.166667,True,8.690458
3,38292159642750da7b20419330566f9eb0961cde,81092e4881f56106fae845c3ae4492f8b3c3213c33c920...,2022-12-31 23:45:00,2023-01-01 00:00:00,546.0,0.85,,,8.0,8.0,...,41.899602,-87.633308,POINT (-87.6333080367 41.899602111),41.899602,-87.633308,POINT (-87.6333080367 41.899602111),1.367942,0.151667,True,9.0194
4,3e01498f8ff771ad7eb37e4844cef20201b6c339,4ae32e2eb244ce143800e0c40055e537cc50e3358a07ce...,2022-12-31 23:45:00,2023-01-01 00:00:00,574.0,0.33,,,8.0,8.0,...,41.899602,-87.633308,POINT (-87.6333080367 41.899602111),41.899602,-87.633308,POINT (-87.6333080367 41.899602111),0.531084,0.159444,True,3.330837


In [82]:
df_test = df.drop(columns=['trip_id','taxi_id'])

In [24]:
print(df_test[(df_test['trip_km']!=0) & (df_test['trip_miles'] == 0)].shape)
print(df_test[(df_test['trip_km']==0) & (df_test['trip_miles']!=0)].shape)

(0, 23)
(0, 23)


In [25]:
print(df_test[(df_test['trip_km']==0) & (df_test['Check'])].shape)

(698024, 23)


In [26]:
print(df_test[(df_test['trip_km']==0) & (~df_test['Check'])].shape)

(86563, 23)


##### Location Data:

- <ins>pickup/dropoff_census_tract:</ins> Can be missing for two reasons - 1. Privacy 2. Outside Chicago: "This column often will be blank for locations outside Chicago" -> not necessarily
- <ins>pickup/dropoff_community_area:</ins> Can be missing for one reason: Outside Chicago: "This column will be blank for locations outside Chicago" -> necessarily
- <ins>"centroid columns"</ins>: Location of respective census_tract or when missing then of community_area: Consequently "These columns often will be blank for locations outside Chicago"

For 666 pickup_centroid_location is NaN despite one of census_tract or community_area is available:

In [27]:
print(df_test[(df_test['pickup_centroid_location'].isna()) & ((~df_test['pickup_census_tract'].isna()) | (~df_test['pickup_community_area'].isna()))].shape)


(666, 23)


6.837 for the same with dropoff:

In [28]:
print(df_test[(df_test['dropoff_centroid_location'].isna()) & ((~df_test['dropoff_census_tract'].isna()) | (~df_test['dropoff_community_area'].isna()))].shape)


(6837, 23)


In [29]:
print(df_test[(df_test['pickup_census_tract'].isna()) & (~df_test['dropoff_census_tract'].isna())].shape)

(149746, 23)


In [30]:
print(df_test[(~df_test['pickup_census_tract'].isna()) & (df_test['dropoff_census_tract'].isna())].shape)

(98119, 23)


In [31]:
print(df_test[(df_test['pickup_census_tract'].isna()) & (df_test['dropoff_census_tract'].isna())].shape)

(3595758, 23)


##### For 784.587 entries trip_miles is 0:

##### Stornierte Fahrten - miles = 0, seconds = 0, pickup=dropoff:

In [83]:
print(df_test[df_test['trip_miles']==0].shape)

(784587, 25)


In [76]:
print(df_test[(df_test['trip_miles']==0) & (df_test['trip_seconds']==0) & (df_test['pickup_centroid_location'] == df_test['dropoff_centroid_location']) & (~df_test['dropoff_centroid_location'].isna()) & (~df_test['pickup_centroid_location'].isna())].shape)


(105621, 25)


In [79]:
df_miles_zero = df_test[~((df_test['trip_miles']==0) & (df_test['trip_seconds']==0) & (df_test['pickup_centroid_location'] == df_test['dropoff_centroid_location']) & (~df_test['dropoff_centroid_location'].isna()) & (~df_test['pickup_centroid_location'].isna()))]


In [87]:
#noch nicht erklärbare miles 0

In [85]:
df_miles_zero = df_miles_zero[df_miles_zero['trip_miles']==0]

In [86]:
print(df_miles_zero.shape)

(678966, 25)


In [64]:
print(df_test[(df_test['trip_miles']==0) & (df_test['payment_type']=='Cash') & (df_test['trip_start_timestamp'] < '2022-12-31') & (df_test['trip_start_timestamp'] > '2022-01-02')][['fare', 'tips', 'tolls', 'extras','trip_total','payment_type', 'trip_seconds', 'trip_start_timestamp']])


          fare  tips  tolls  extras  trip_total payment_type  trip_seconds  \
24478    60.00   0.0    0.0     0.0       60.00         Cash           0.0   
24490     4.75   0.0    0.0     0.0        4.75         Cash         240.0   
24498     3.75   0.0    0.0     0.0        3.75         Cash         120.0   
24514     3.25   0.0    0.0     0.0        3.25         Cash          60.0   
24520    10.50   0.0    0.0     2.5       13.00         Cash         480.0   
...        ...   ...    ...     ...         ...          ...           ...   
6374385   3.25   0.0    0.0     0.0        3.25         Cash           4.0   
6374390   3.25   0.0    0.0     0.0        3.25         Cash          14.0   
6374429   3.25   0.0    0.0     0.0        3.25         Cash          60.0   
6374432   3.25   0.0    0.0     0.0        3.25         Cash           0.0   
6374433   7.75   0.0    0.0     0.0        7.75         Cash         600.0   

        trip_start_timestamp  
24478    2022-12-30 23:45:00  
2

In [54]:
print(df_test[(df_test['trip_miles']==0) & (df_test['trip_seconds']==0)][['fare', 'tips', 'tolls', 'extras','trip_total', 'trip_seconds', 'trip_start_timestamp']])

          fare  tips  tolls  extras  trip_total  trip_seconds  \
29        3.25  0.00    0.0     0.0        3.25           0.0   
43        3.25  0.00    0.0     0.0        3.25           0.0   
58       50.00  0.00    0.0     0.0       50.00           0.0   
101       3.25  0.00    0.0     0.0        3.25           0.0   
179       3.25  0.00    0.0     0.0        3.25           0.0   
...        ...   ...    ...     ...         ...           ...   
6382188  67.25  0.00    0.0     0.0       67.25           0.0   
6382335   3.25  4.15    0.0    17.0       24.40           0.0   
6382386   3.25  0.00    0.0     0.0        3.25           0.0   
6382399   3.25  0.00    0.0     0.0        3.25           0.0   
6382408   3.25  0.00    0.0     0.0        3.25           0.0   

        trip_start_timestamp  
29       2022-12-31 23:45:00  
43       2022-12-31 23:45:00  
58       2022-12-31 23:45:00  
101      2022-12-31 23:45:00  
179      2022-12-31 23:30:00  
...                      ...  
63

In [48]:
print(df_test[(df_test['trip_miles']==0)].shape)

(784587, 25)


In [49]:
print(df_test[(df_test['trip_miles']==0) & (df_test['trip_total']==3.25)].shape)

(147135, 25)


In [50]:
print(df_test[(df_test['trip_total']==3.25)].shape)

(177026, 25)


##### Only for 89.893 of that (12%) pickup_centroid_location or dropoff_centroid_location is NaN:

In [None]:
print(df_test[(df_test['trip_miles']==0) & ((df_test['pickup_centroid_location'].isna()) | (df_test['dropoff_centroid_location'].isna()))].shape)

In [None]:
print(df_test[(df_test['trip_miles']==0) & ((df_test['pickup_census_tract'].isna()) | (df_test['dropoff_census_tract'].isna()))].shape)


In [None]:
print(df_test[(df_test['trip_miles']==0) & ((df_test['pickup_community_area'].isna()) | (df_test['dropoff_community_area'].isna()))].shape)



In [None]:
df_test[(df_test['trip_miles']==0)].isna().sum()

In [None]:
df_test[(df_test['trip_miles']==0)].head()

In [None]:
print(df_test[(df_test['trip_miles']==0) & 
( (df_test['pickup_census_tract'].isna()) 
| (df_test['dropoff_census_tract'].isna()) ) & (df_test['Check'])].shape)



In [None]:
print(df_test[(df_test['trip_miles']==0) & 
((df_test['pickup_community_area'].isna()) & (df_test['pickup_census_tract'].isna()) & (df_test['pickup_centroid_location'].isna())) 
| ((df_test['dropoff_community_area'].isna()) & (df_test['dropoff_census_tract'].isna()) & (df_test['dropoff_centroid_location'].isna()))].shape)


In [None]:
print(df_test[(df_test['trip_miles']==0) & 
((~df_test['pickup_community_area'].isna()) & (~df_test['pickup_census_tract'].isna()) & (~df_test['pickup_centroid_location'].isna())) 
& ((~df_test['dropoff_community_area'].isna()) & (~df_test['dropoff_census_tract'].isna()) & (~df_test['dropoff_centroid_location'].isna()))].shape)


In [None]:
df_test.head()

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree
import matplotlib.pyplot as plt

# Load your dataframe
# df = pd.read_csv('your_dataframe.csv')
df_test = df_test.dropna(subset=["trip_miles"])

# Define target and features
target_column = 'trip_miles'
features = df_test.drop(columns=[target_column,'trip_km','kmh','payment_type','company','pickup_centroid_location','dropoff_centroid_location'])
target = df_test[target_column]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Initialize and train the decision tree
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Visualize the decision tree
plt.figure(figsize=(20,10))
tree.plot_tree(clf, feature_names=features.columns, class_names=['Non-Zero', 'Zero'], filled=True)
plt.show()

# Get feature importances
feature_importances = pd.DataFrame(clf.feature_importances_, index=features.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances)
