In [1]:
import pandas as pd
from pandas import Timedelta
import numpy as np
import re
from shapely.geometry import Point
from shapely import wkt
import geopandas as gpd
import matplotlib.pyplot as plt
import warnings

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        Load the data in chunks
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [2]:
# Import the dataset
# NOTE(review): the whole CSV is read into memory with a single call; no chunked reading happens in this cell.
df = pd.read_csv('taxi_main copy.csv')

<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Formatting
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1. Change the variable names
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [3]:
    
# Map the column headers of the raw file to short snake_case names.
column_renames = {
    'Taxi ID': 'taxi_id',
    'Trip Start Timestamp': 'trip_start',
    'Trip End Timestamp': 'trip_end',
    'Trip Seconds': 'trip_seconds',
    'Trip Miles': 'trip_miles',
    'Pickup Census Tract': 'pickup_census',
    'Dropoff Census Tract': 'dropoff_census',
    'Pickup Community Area': 'pickup_comm_area',
    'Dropoff Community Area': 'dropoff_comm_area',
    'Fare': 'fare',
    'Extras': 'extras',
    'Pickup Centroid Location': 'pickup_location',
    # The double space in the next key is kept exactly as it was in the original mapping.
    'Dropoff Centroid  Location': 'dropoff_location',
}
df.rename(columns=column_renames, inplace=True)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        2. Map the Taxi Id's to numeric values
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [4]:
# Replace the raw taxi identifiers by consecutive integers (0, 1, 2, ...) in order
# of first appearance. unique() is evaluated only once, and the lookup uses the
# vectorised Series.map instead of a Python lambda that is called for every row.
unique_taxi_ids = df['taxi_id'].unique()
taxi_id_map = dict(zip(unique_taxi_ids, range(len(unique_taxi_ids))))
df['taxi_id'] = df['taxi_id'].map(taxi_id_map)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        3. Reduce the precision of the GPS data for consistency
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

We reduce the GPS coordinates to 10 digits for now, as this is the minimum precision all points have. We can reuse the function later to reduce the precision to 6 or 7 digits for the neural networks.

In [5]:
# A regular expression pulls the decimal coordinates out of a point string;
# each coordinate is then cut off after 10 decimal places.

def format_point(point):
    """Truncate both coordinates of a point string to 10 decimal places.

    Parameters
    ----------
    point : object
        Usually a string such as ``"POINT (-87.61 41.89)"``.

    Returns
    -------
    object
        ``"POINT (<first> <second>)"`` with each coordinate truncated (not rounded)
        to at most 10 digits after the decimal point. Non-string input, and strings
        that do not contain exactly two decimal numbers, are returned unchanged.
    """
    if not isinstance(point, str):
        return point

    # Only numbers that contain a '.' are matched, so plain integers are ignored.
    coords = re.findall(r'(-?\d+\.\d+)', point)
    if len(coords) != 2:
        return point

    truncated = []
    for coord in coords:
        whole, _, fraction = coord.partition('.')
        truncated.append(whole + '.' + fraction[:10])

    lon, lat = truncated
    return f"POINT ({lon} {lat})"

In [6]:
# Truncate the coordinates in both location columns.
for location_col in ('pickup_location', 'dropoff_location'):
    df.loc[:, location_col] = df[location_col].apply(format_point)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        4. Change the datatype of the timestamps
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [7]:
# Parse both timestamp columns into pandas datetimes
for timestamp_col in ('trip_start', 'trip_end'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        5. Change the datatype of the Census Tracts
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [8]:
# Coerce the census tract columns to numbers; values that cannot be parsed and
# missing values become 0, and the result is stored with the nullable Int64 dtype.
for census_col in ('pickup_census', 'dropoff_census'):
    numeric_tracts = pd.to_numeric(df[census_col], errors='coerce')
    df[census_col] = numeric_tracts.fillna(0).astype('Int64')

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        6. Split timestamps into 'Day' and  'Hour' variables
        </span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [9]:
# Split each timestamp into a calendar date and a time of day.
start_parts = df['trip_start'].dt
end_parts = df['trip_end'].dt

df['start_day'] = start_parts.date
df['end_day'] = end_parts.date

df['start_time'] = start_parts.time
df['end_time'] = end_parts.time

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        7. Calculate estimate for the trip duration with the timestamps
        </span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [10]:
# Derive the trip duration from the two timestamps, because some of the recorded trip seconds are erroneous
df['trip_duration'] = df['trip_end'].sub(df['trip_start'])

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        8. Convert the locations to shapely Point objects
        </span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [11]:
def string_to_point(point_str):
    """Parse a WKT string with ``wkt.loads``; missing values are returned as ``None``."""
    return None if pd.isna(point_str) else wkt.loads(point_str)

In [12]:
# Turn the WKT strings of both location columns into geometry objects.
for location_col in ('dropoff_location', 'pickup_location'):
    df[location_col] = df[location_col].apply(string_to_point)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        9. Drop unnecessary variables
        </span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [13]:
# Remove the raw columns that this notebook does not use any further.
unused_columns = [
    'Trip ID',
    'Trip Total',
    'Tips',
    'Payment Type',
    'Pickup Centroid Latitude',
    'Pickup Centroid Longitude',
    'Tolls',
    'Company',
    'Dropoff Centroid Latitude',
    'Dropoff Centroid Longitude',
]
df.drop(columns=unused_columns, inplace=True)

<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Dropping outliers by looking at scenarios
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1. Drop rides that ended before they started


In [14]:
# Rides whose end timestamp lies before their start timestamp: collect their index labels, then remove them
ended_before_start = df['trip_end'] < df['trip_start']
inval_time = df.loc[ended_before_start].index
df.drop(index=inval_time, inplace=True)

In [15]:
# Number of rides that were removed because they ended before they started
len(inval_time)

16

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        2. Drop  rides with inconsistent timestamps

In [16]:
# Rides with identical start and end timestamps (same 15 minute interval) whose recorded
# trip seconds exceed 15 minutes (900 seconds) are inconsistent and are removed
same_timestamp = df['trip_end'] == df['trip_start']
over_15_minutes = df['trip_seconds'] > 900
time_errors = df.loc[same_timestamp & over_15_minutes].index
df.drop(index=time_errors, inplace=True)

In [17]:
# Number of rides that were removed because of inconsistent timestamps
len(time_errors)

0

After handling these scenarios, the timestamps are no longer needed and can be dropped.

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
         Drop the timestamps

In [18]:
# Remove the raw timestamp columns; the columns derived from them earlier (day, time, trip_duration) stay in the frame
df.drop(columns=['trip_start', 'trip_end'], inplace=True)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        3. Drop rides without valid fares


In [19]:
# Rides with a fare below 3.25$ (the minimum fare) are removed
below_minimum_fare = df['fare'] < 3.25
inval_fare = df.loc[below_minimum_fare].index
df.drop(index=inval_fare, inplace=True)

# Rides with a fare of exactly 3.25$ but more than one mile traveled are removed as well
# (each additional mile costs 2.25$)
minimum_fare_but_over_a_mile = (df['fare'] == 3.25) & (df['trip_miles'] > 1)
inval_fare2 = df.loc[minimum_fare_but_over_a_mile].index
df.drop(index=inval_fare2, inplace=True)

In [20]:
# Removed rides: fare below the minimum, then minimum fare with more than one mile
print(len(inval_fare),len(inval_fare2))

2122 96


<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        4. Drop extremely long rides


In [21]:
# use pandas timedelta for filtering rides that lasted more than 4 hours and drop them, as these are likely outliers.
# NOTE(review): this comment used to say "8 hours", but the threshold in the code is 4 hours - confirm which one is intended.
long_rides =  df[df['trip_duration']>Timedelta('0 days 04:00:00')].index
df.drop(long_rides, inplace = True)

# let's also drop the rides that lasted 4 hours or longer (14400 seconds) according to the trip seconds.
longer_rides =  df[df['trip_seconds'] >= 14400].index
df.drop(longer_rides, inplace = True)

In [22]:
# Removed rides: too long according to the timestamps, then too long according to the trip seconds
print(len(long_rides),len(longer_rides))

509 14


<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        5. Drop missing fares, trip seconds and trip miles


As there are only 1500 missing values combined for the 3 variables: 'Fare', 'Trip Seconds', 'Trip Miles', we decide to drop them.

In [23]:
# Remove rides without trip miles
miles_missing = df['trip_miles'].isna()
missing_miles = df.loc[miles_missing].index
df.drop(index=missing_miles, inplace=True)

# Remove rides without trip seconds
seconds_missing = df['trip_seconds'].isna()
missing_seconds = df.loc[seconds_missing].index
df.drop(index=missing_seconds, inplace=True)

# Remove rides without a fare
fare_missing = df['fare'].isna()
missing_fares = df.loc[fare_missing].index
df.drop(index=missing_fares, inplace=True)

In [24]:
# Removed rides, printed in this order: missing trip seconds, missing trip miles, missing fares
print(len(missing_seconds), len(missing_miles), len(missing_fares))

7 29 44


<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        6. Drop rides where the taxi was stationary
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

We assume that, in order for a ride to be valid, the taxi has to move for some time. Thus, either the trip seconds or the trip miles should not be 0, or the dropoff location should differ from the pickup location. A fare of more than 0 dollars alone doesn't constitute a valid ride, because there may have been problems with the taximeter.

In [25]:
# A ride counts as stationary when all four conditions hold: 0 trip seconds, identical
# start and end time, identical pickup and dropoff location, and 0 trip miles
no_trip_seconds = df['trip_seconds'] == 0
no_duration = df['trip_duration'] == Timedelta('0 days 00:00:00')
same_location = df['pickup_location'] == df['dropoff_location']
no_trip_miles = df['trip_miles'] == 0

invalid_rides = df.loc[no_trip_seconds & no_duration & same_location & no_trip_miles].index
df.drop(index=invalid_rides, inplace=True)

In [26]:
# Number of stationary rides that were removed
len(invalid_rides)

8635

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        7. Drop rides with illogical prices
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

Official prices for taxis in chicago in 2020 can be found here: https://www.chicago.gov/content/dam/city/depts/bacp/publicvehicleinfo/Chicabs/chicagotaxiplacard20200629.pdf.
The prices for 2019 can no longer be found on the official pages, but we only use them as a loose orientation.
Since we only have the total fare and not all the components contributing to the fare, it is quite difficult to catch all outliers.
Due to this, we look for outliers by using the percentiles of the 'Fare' variable in combination with other variables.
A fare in the 99.9 percentile (92 Dollars or more) should correspond to a very far or long ride or someone had to pay 50 Dollars for throwing up in the cab.
We are not interested in the latter and thus filter out all the rides that paid more than 92 Dollars but were below the 99.5th percentile in both mileage and trip duration. These percentiles may seem arbitrary, but they make sense when considering the costs from the link provided above. The minimum fare is 3,25USD, each mile is 2,25USD and each minute is 0,34USD. The 99.5th percentile for the trip duration is 75 minutes, the 99.5th percentile for the trip miles is 25 miles. This yields an expected price of 3,25+(0,34*75)+(25*2,25)=85USD. This still leaves a margin of 7 dollars for the airport tax and other fees.


In [27]:
# A fare above the 99.9th percentile should belong to a very far or very long ride
fare_cutoff = np.percentile(df['fare'], 99.9)
miles_cutoff = df['trip_miles'].quantile(0.995)
duration_cutoff = df['trip_duration'].quantile(0.995)

very_high_fare = df['fare'] > fare_cutoff
# neither far nor long: below the 99.5th percentile in mileage AND in duration
ordinary_trip = (df['trip_miles'] < miles_cutoff) & (df['trip_duration'] < duration_cutoff)

drop_fares = df.loc[very_high_fare & ordinary_trip].index
# remove the outliers
df.drop(index=drop_fares, inplace=True)

In [28]:
# Number of rides that were removed because of an illogically high fare
len(drop_fares)

322

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        8. Drop rides that took 0 seconds
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [29]:
# Rides with 0 trip seconds whose timestamps also give a duration of zero
zero_seconds = df['trip_seconds'] == 0
zero_duration = df['trip_duration'] == Timedelta('0 days 00:00:00')
zero_trips = df.loc[zero_seconds & zero_duration].index
df.drop(index=zero_trips, inplace=True)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        9. Drop rides that exceeded the extras limit
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

When considering the prices for extras from the website cited above, it is highly unlikely to observe a ride where the customer paid 60 Dollars or more for extras alone. Thus, we flag these rides as outliers and drop them.

In [30]:

# Rides with extras of 60 dollars or more are flagged as outliers and removed
extras_too_high = df['extras'] >= 60
idx2 = df.loc[extras_too_high].index
df.drop(index=idx2, inplace=True)

In [31]:
# Number of rides that were removed because of the extras limit
len(idx2)

290

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        10. Drop rides that have 0 trip miles, 0 trip duration and the same start and end location
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

These rides could be the result of errors in the taximeter. And even if they are not, since they have 0 trip miles and the same start and end location, we argue that they were not actual rides. A valid fare does not constitute a valid ride. The taxi should bring a person from A to B, else we should not consider the ride valid.

In [32]:
# Rides with 0 trip miles, a duration of zero and identical pickup and dropoff location
zero_miles = df['trip_miles'] == 0
zero_duration = df['trip_duration'] == Timedelta('0 days 00:00:00')
same_location = df['pickup_location'] == df['dropoff_location']

short_rides = df.loc[zero_miles & zero_duration & same_location].index
df.drop(index=short_rides, inplace=True)

In [33]:
# Number of removed rides with 0 miles, zero duration and identical start and end location
len(short_rides)

18432

<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Dropping outliers with quantiles
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>

We assume that the scenarios in the previous section did not catch all outliers, because there are probably 100 more scenarios. Since we cannot check each ride, we now remove rides by looking at the percentile values of the 2 KPIs 'Trip Miles' and 'Fare'. We do not remove outliers based on the variable 'Trip Seconds' or 'Trip Duration', because we already filtered out all the rides that were longer than 4 hours (the threshold used in the code above). We decide to drop all rides with 'Fares' or 'Trip Miles' more than 5 standard deviations above the mean.

In [34]:
# Show selected quantiles of the fare, with extra detail in the upper tail
df['fare'].quantile([0.25,0.5,0.75,0.9,0.99,0.995,0.999,0.9995,0.9999, 0.99995, 0.99999])

0.25000      6.000000
0.50000      8.000000
0.75000     13.500000
0.90000     36.000000
0.99000     53.000000
0.99500     63.750000
0.99900     87.500000
0.99950    100.000000
0.99990    148.750000
0.99995    194.984875
0.99999    867.360954
Name: fare, dtype: float64

<div class="alert alert-block alert-warning">
<b>Question:</b> All of the prices seem feasible.... Maybe don't do this? Maybe look at the quantiles in the table and choose one?
</div>

In [35]:
# Fares more than 5 standard deviations above the mean are treated as outliers
fare_upper_limit = df['fare'].mean() + 5 * df['fare'].std()
high_fares = df.loc[df['fare'] > fare_upper_limit].index
df.drop(index=high_fares, inplace=True)

In [36]:
# Number of rides that were removed as fare outliers
len(high_fares)

199

In [37]:
# Show selected quantiles of the trip miles, with extra detail in the upper tail
df['trip_miles'].quantile([0.25,0.5,0.75,0.9,0.99,0.995,0.999,0.9995,0.9999, 0.99995, 0.99999])

0.25000      0.670000
0.50000      1.280000
0.75000      3.100000
0.90000     12.750000
0.99000     20.300000
0.99500     24.830000
0.99900     33.400000
0.99950     36.920390
0.99990     44.858234
0.99995     47.526858
0.99999    116.288624
Name: trip_miles, dtype: float64

In [38]:
# Trip miles more than 5 standard deviations above the mean are treated as outliers
miles_upper_limit = df['trip_miles'].mean() + 5 * df['trip_miles'].std()
high_mileage = df.loc[df['trip_miles'] > miles_upper_limit].index
df.drop(index=high_mileage, inplace=True)

In [39]:
# Number of rides that were removed as mileage outliers
len(high_mileage)

1186

<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Spatial data cleaning
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1.  Drop rides with no spatial information
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [40]:
# Rides without any pickup information: census tract 0 (the fill value for missing
# tracts), no community area and no location
no_census = df['pickup_census'] == 0
no_comm_area = df['pickup_comm_area'].isna()
no_location = df['pickup_location'].isna()

idx = df.loc[no_census & no_comm_area & no_location].index
df.drop(index=idx, inplace=True)

In [41]:
# Number of rides that were removed because they had no pickup information at all
len(idx)

42444

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        2.  Drop rides that started outside the city
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

There are some rides that have no pickup community area and no pickup location but a pickup census tract. When filtering these rides and looking up the location of these census tracts on https://www.chicagocityscape.com/maps/index.php#/?search_term=17031803005&places_type=censustract, we can observe that these rides started outside of the city. Hence, we drop them.

In [42]:
# Rides with neither a pickup location nor a pickup community area
no_pickup_location = df.pickup_location.isna()
no_pickup_comm_area = df.pickup_comm_area.isna()

out_of_chic = df.loc[no_pickup_location & no_pickup_comm_area].index
df.drop(index=out_of_chic, inplace=True)
len(out_of_chic)

46

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        3.  Reassign some rides to census tracts entirely within Chicago
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

Some rides started within Chicago, but their census tracts are only partially inside the city. Take census tract 17031770700, which is barely touching the census tract 17031980000 (Chicago O'Hare airport). The locations for these rides are the centroids of the part of the census tract that is within Chicago. As we only have the areas of the census tracts inside the city and this problem only applies to this one census tract, we decide to manually reassign these rides to the census tract of the airport. We do this instead of completely dropping these rides, because these are quite a few rides and they clearly belong to the census tract with the Chicago O'Hare airport.
https://www.chicagocityscape.com/maps/index.php#/?search_term=17031770700&places_type=censustract&place=communityarea-roseland.
Therefore, we will map the rides from the census tract 17031770700 onto the census tract 17031980000, because this is the only one that overlaps.

In [43]:
### Look up the location stored for census tract 17031980000 (taken from its first ride)

new_cens = 17031980000
new_loc = df.loc[df.pickup_census == new_cens, 'pickup_location'].iloc[0]

In [44]:
# get the location of census tract 17031770700
old_cens = 17031770700
rides_in_old_tract = df.pickup_census == old_cens
old_loc = df[rides_in_old_tract].iloc[0].pickup_location  # kept under its original name

# reassign the rides
# The rows are selected once, BEFORE any column is overwritten, and the same selection
# is used for both columns. The previous code compared `pickup_census` with the Point
# `old_loc`, which never matches, so the pickup locations were never updated.
old_tract_rows = df.loc[rides_in_old_tract].index
df.loc[old_tract_rows, 'pickup_census'] = new_cens
# Assign through an index-aligned object Series so that the geometry is stored as-is.
df.loc[old_tract_rows, 'pickup_location'] = pd.Series(
    [new_loc] * len(old_tract_rows), index=old_tract_rows, dtype=object
)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        4.  Assign census tracts via the community areas for the censored rides
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

Problem Description: There are roughly 4 million rides that have no census tract but only a community area. Referring to the description of the dataset, this is the case because the location of these rides has been censored. This is a problem, because the remaining 12 million rides have locations at census tract resolution. Thus, the majority of the data have more precise location information than the 4 million rides that have the centroids of the community areas as their locations. This is particularly problematic for the clustering, as the 4 million rides have basically been aggregated. That leaves us with 2 options: 1. We could assign the rides with census tract level resolution to the community areas. This would of course mean that we lose data accuracy. 2. We could reallocate the 4 million rides to get census tract level resolution for our entire sample. Of course, this will assign rides to census tracts that might not actually have had any rides in them.
We still choose the latter option, because this should yield a more accurate distribution of rides than the one provided in the dataset.

We randomly assign the rides with no census tract to one census tract that is within the community area where the ride started or ended. The random allocation ensures that we do not introduce any unnecessary bias. 

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        4.1 Load the data for the census tracts and community areas from the city of chicago

In [45]:
# Read the two GeoJSON files: `census` from 'Boundaries.geojson' and `comm_area` from 'comm_area.geojson'
census = gpd.read_file('Boundaries.geojson')
comm_area = gpd.read_file('comm_area.geojson')

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        4.2  Create a dictionary that maps the community areas to all the census tracts that are in them
</span>

Note: There are two scenarios: 1. The census tract is entirely inside a community area. In that case, we simply map the census tract to the community area. 2. Some census tracts are in several community areas. In that case, we calculate the area of overlap and assign the census tract to the community area with the largest overlap.

In [46]:
# Create a dict that contains each comm area as the key
commarea_to_census = {int(comm): [] for comm in census.commarea_n.unique()}

# Column position in `comm_area` that is read as the community area number (unchanged).
# NOTE(review): confirm which column sits at position 4 and refer to it by name.
COMM_AREA_NUMBER_POS = 4

for k in range(len(census)):
    tract_geom = census.geometry[k]

    # Collect [community area number, share of the tract's area inside that area]
    results = []

    for i in range(len(comm_area)):
        area_geom = comm_area.geometry[i]

        # Partial overlap: store the share of the census tract that lies in the comm area
        if tract_geom.overlaps(area_geom):
            intersection_area = tract_geom.intersection(area_geom).area / tract_geom.area
            # iloc[i, pos] is purely positional; the former iloc[i][4] relied on the
            # deprecated positional fallback of Series.__getitem__
            results.append([int(comm_area.iloc[i, COMM_AREA_NUMBER_POS]), intersection_area])

        # The census tract lies entirely within the comm area
        elif tract_geom.within(area_geom):
            results.append([int(comm_area.iloc[i, COMM_AREA_NUMBER_POS]), 1])

    if results:
        # Comm area with the biggest overlap (the first one wins a tie, as with the stable sort used before)
        best_comm = max(results, key=lambda x: x[1])[0]
    else:
        # No comm area overlaps or contains the tract; this used to raise an IndexError.
        # Fall back to the `commarea_n` value of the tract, the column the dict keys come from.
        # NOTE(review): presumably this is the tract's own community area number - confirm.
        best_comm = int(census.commarea_n[k])

    # Append the census tract to the corresponding comm area
    commarea_to_census.setdefault(best_comm, []).append(int(census.geoid10[k]))

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        4.3  Map the pickup census tracts to the pickup locations

There is a 1:1 relationship between the unique values of the pickup census tracts and pickup centroid locations, as the locations are the centroids of the census tracts. However, there are 800 census tracts in the city of chicago, but just 537 in our dataset, thus, we must calculate the centroids of the other census tracts with geopandas. 

In [47]:
# Create a dict to map each census tract to its location
# no 0 in the keys

# First pickup location found in the trip data for every pickup census tract. This is
# built in one pass; the previous loop called unique() and scanned the whole DataFrame
# once for every census tract of the boundary file.
first_pickup_rows = df.drop_duplicates(subset='pickup_census', keep='first')
observed_pickup_locations = {
    int(tract): location
    for tract, location in zip(first_pickup_rows['pickup_census'],
                               first_pickup_rows['pickup_location'])
}

# Populate the dict with the corresponding centroid locations of the census tracts
census_map_pickup = {}
for geoid, tract_geometry in zip(census.geoid10, census.geometry):
    key = int(geoid)

    if key in census_map_pickup:
        # a repeated tract id keeps the entry of its first occurrence
        continue

    if key in observed_pickup_locations:
        # The tract occurs in the trip data: reuse the location stored there
        census_map_pickup[key] = observed_pickup_locations[key]
        continue

    # The tract does not occur in the trip data: use the centroid of its geometry.
    # Taking the centroid of the single geometry does not go through GeoSeries.centroid,
    # so there is no geographic-CRS warning that would have to be suppressed.
    centroid = tract_geometry.centroid

    # Convert coordinates to strings and truncate (not round) after 10 decimal places
    centroid_x = float(str(centroid.x)[:str(centroid.x).find('.') + 11])
    centroid_y = float(str(centroid.y)[:str(centroid.y).find('.') + 11])

    # populate the dictionary with a new Point built from the truncated coordinates
    census_map_pickup[key] = Point(centroid_x, centroid_y)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        4.4  Map the dropoff census tracts to the dropoff locations

Since there are slightly more unique dropoff census tracts in the taxi trip dataset than there are unique pickup census tracts, we create a separate dictionary, using the same methodology, to keep the locations consistent. We do this because we do not know exactly which CRS was used to calculate the centroids in the taxi trip dataset.

In [48]:
# Create a dict to map each census tract to its location
# no 0 in the keys

# First dropoff location found in the trip data for every dropoff census tract. This is
# built in one pass; the previous loop called unique() and scanned the whole DataFrame
# once for every census tract of the boundary file.
first_dropoff_rows = df.drop_duplicates(subset='dropoff_census', keep='first')
observed_dropoff_locations = {
    int(tract): location
    for tract, location in zip(first_dropoff_rows['dropoff_census'],
                               first_dropoff_rows['dropoff_location'])
}

# Populate the dict with the corresponding centroid locations of the census tracts
census_map_dropoff = {}
for geoid, tract_geometry in zip(census.geoid10, census.geometry):
    key = int(geoid)

    if key in census_map_dropoff:
        # a repeated tract id keeps the entry of its first occurrence
        continue

    if key in observed_dropoff_locations:
        # The tract occurs in the trip data: reuse the location stored there
        census_map_dropoff[key] = observed_dropoff_locations[key]
        continue

    # The tract does not occur in the trip data: use the centroid of its geometry.
    # Taking the centroid of the single geometry does not go through GeoSeries.centroid,
    # so there is no geographic-CRS warning that would have to be suppressed.
    centroid = tract_geometry.centroid

    # Convert coordinates to strings and truncate (not round) after 10 decimal places
    centroid_x = float(str(centroid.x)[:str(centroid.x).find('.') + 11])
    centroid_y = float(str(centroid.y)[:str(centroid.y).find('.') + 11])

    # populate the dictionary with a new Point built from the truncated coordinates
    census_map_dropoff[key] = Point(centroid_x, centroid_y)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        4.5  Assign the trips with no pickup census tract to a random census tract in the community area and change the location accordingly

In [49]:
def reassign_pickup_location(comm_area:int):
    """Pick a random census tract of a community area together with its pickup location.

    Parameters
    ----------
    comm_area : int
        Key into ``commarea_to_census``.

    Returns
    -------
    tuple
        ``(census tract, location)``: the tract is drawn uniformly from
        ``commarea_to_census[comm_area]`` and the location is looked up in
        ``census_map_pickup``.
    """
    # np.random.randint(n) returns an integer between 0 and n - 1
    tract_position = np.random.randint(len(commarea_to_census[comm_area]))
    chosen_tract = commarea_to_census[comm_area][tract_position]

    return chosen_tract, census_map_pickup[chosen_tract]

    

In [50]:
def apply_reassign_pickup(row):
    """Give one ride a random pickup census tract of its pickup community area.

    Meant to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One ride; reads 'pickup_comm_area' and overwrites 'pickup_census' and
        'pickup_location'.

    Returns
    -------
    pandas.Series
        A copy of the row with the new census tract and location. The row is
        returned unchanged when it has no pickup community area, which is the same
        behavior as ``apply_reassign_dropoff``.
    """
    if pd.isna(row['pickup_comm_area']):
        # nothing to sample from; previously this ended in a KeyError inside the lookup
        return row

    new_cens, new_loc = reassign_pickup_location(row['pickup_comm_area'])

    # work on a copy: mutating the object handed over by DataFrame.apply is not supported
    row = row.copy()
    row['pickup_census'] = new_cens
    row['pickup_location'] = new_loc
    return row

In [51]:
# Only rides without a pickup census tract (stored as 0) get a new tract and location
pickup_tract_missing = df.pickup_census == 0
indexes_to_update = df.loc[pickup_tract_missing].index
reassigned_pickups = df.loc[indexes_to_update].apply(apply_reassign_pickup, axis=1)
df.loc[indexes_to_update] = reassigned_pickups

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        4.6  Assign the trips with no dropoff census tract to a random census tract in the community area and change the location accordingly

In [52]:
def reassign_dropoff_location(comm_area:int):
    """Pick a random census tract of a community area together with its dropoff location.

    Parameters
    ----------
    comm_area : int
        Key into ``commarea_to_census``.

    Returns
    -------
    tuple
        ``(census tract, location)``: the tract is drawn uniformly from
        ``commarea_to_census[comm_area]`` and the location is looked up in
        ``census_map_dropoff``.
    """
    # np.random.randint(n) returns an integer between 0 and n - 1
    tract_position = np.random.randint(len(commarea_to_census[comm_area]))
    chosen_tract = commarea_to_census[comm_area][tract_position]

    return chosen_tract, census_map_dropoff[chosen_tract]

In [53]:
def apply_reassign_dropoff(row):
    """Give one ride a random dropoff census tract of its dropoff community area.

    Meant to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One ride; reads 'dropoff_comm_area' and overwrites 'dropoff_census' and
        'dropoff_location'.

    Returns
    -------
    pandas.Series
        A copy of the row with the new census tract and location, or the unchanged
        row when it has no dropoff community area.
    """
    if pd.isna(row['dropoff_comm_area']):
        return row

    new_cens, new_loc = reassign_dropoff_location(row['dropoff_comm_area'])

    # work on a copy: mutating the object handed over by DataFrame.apply is not supported
    row = row.copy()
    row['dropoff_census'] = new_cens
    row['dropoff_location'] = new_loc
    return row

In [54]:
# Only rides without a dropoff census tract (stored as 0) are passed to the function
dropoff_tract_missing = df.dropoff_census == 0
indexes_to_update = df.loc[dropoff_tract_missing].index
reassigned_dropoffs = df.loc[indexes_to_update].apply(apply_reassign_dropoff, axis=1)
df.loc[indexes_to_update] = reassigned_dropoffs

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        5.  Check whether any rides started outside of the city
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

Let us check for each unique pickup location whether it started within a community area or a census tract of the city of chicago.
We do not check this for the dropoff locations, as we did not drop rides that had no dropoff community area/ dropoff location/dropoff census tract.

In [55]:
# Outline of Chicago: the union of all community areas joined with the union of all census tracts
comm_area_outline = comm_area.dissolve().unary_union
census_outline = census.dissolve().unary_union
chic_boundary = comm_area_outline.union(census_outline)

# Test every distinct pickup location against the outline
unique_pickup_locations = gpd.GeoSeries(df.pickup_location.unique())
locations_within = unique_pickup_locations.within(chic_boundary)

# Show the pickup locations that are not within the Chicago boundary
unique_pickup_locations[~locations_within]

GeoSeries([], dtype: geometry)

We can observe that there aren't any rides that started outside the city.

<span style ="font-size: 24px; font-weight: bold;color: #43556A;">
         Drop more variables that will not be needed hereafter

The variables below were used/created for the data cleaning process, but they do not contain any information needed in the machine learning models.

In [56]:
# Remove the helper columns that were only needed during the data cleaning
df.drop(columns=['pickup_comm_area', 'dropoff_comm_area', 'extras',  'trip_duration'],inplace=True)

In [57]:
# Preview the first rows of the cleaned data
df.head()

Unnamed: 0,taxi_id,trip_seconds,trip_miles,pickup_census,dropoff_census,fare,pickup_location,dropoff_location,start_day,end_day,start_time,end_time
16,13,600.0,0.0,17031081402,17031839100,9.0,POINT (-87.6129454143 41.8919715078),POINT (-87.6327464887 41.8809944707),2019-01-01,2019-01-01,00:00:00,00:15:00
18,15,1260.0,0.6,17031030604,17031832900,29.5,POINT (-87.6537935286 41.9854724225),POINT (-87.6618611238 41.8741766252),2019-01-01,2019-01-01,00:00:00,00:30:00
19,16,120.0,0.3,17031839100,17031320400,4.0,POINT (-87.6327464887 41.8809944707),POINT (-87.6219716519 41.8774061234),2019-01-01,2019-01-01,00:00:00,00:00:00
20,17,360.0,0.8,17031081300,17031081500,5.75,POINT (-87.6207628651 41.8983317935),POINT (-87.6262149064 41.8925077809),2019-01-01,2019-01-01,00:00:00,00:15:00
22,19,360.0,1.0,17031081403,17031081700,6.25,POINT (-87.6188683546 41.8909220259),POINT (-87.6318639497 41.8920421365),2019-01-01,2019-01-01,00:00:00,00:15:00
