In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import seaborn as sns
from shapely.geometry import Point, MultiPolygon, Polygon
from shapely import wkt
import matplotlib.pyplot as plt

<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Load the data
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>

In [2]:
# geospatial inputs: census tract boundaries, points of interest and land-use parcels
census_gdf = gpd.read_file('Boundaries.geojson')
poi_gdf = gpd.read_file('POI_data.geojson')
landuse_gdf = gpd.read_file('landuse_chicago.geojson')

# taxi trips: only the first 500,000 rows of the cleaned file are read
taxi_df = pd.read_csv('Cleaned_data_final.csv', nrows=500000)

# weather observations, restricted to the subset of variables that is kept
weather_columns = [
    'datetime', 'temp', 'feelslike', 'dew', 'humidity', 'precip',
    'precipprob', 'snow', 'snowdepth',
    'windspeed', 'winddir', 'cloudcover', 'visibility', 'conditions',
]
weather_df = pd.read_csv('Weather_chic.csv')[weather_columns]


<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Formatting
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>


<div class="alert alert-block alert-warning">
<b>Disclaimer:</b> Only perform the following operation (dropping rides without a dropoff location) for the cluster analysis.
</div>

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1. Drop rides with no dropoff location
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [3]:
# the clustering needs complete trips (pickup and dropoff), so keep only the rides
# that have a dropoff location; the original row index is preserved
taxi_df = taxi_df.loc[taxi_df['dropoff_location'].notna()].copy()

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        2. Recover the formatting of some variables
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">


In [4]:
# drop the 'Unnamed: 0' column
taxi_df = taxi_df.drop(columns=['Unnamed: 0'])

# convert the WKT strings to point objects (a missing dropoff location becomes None)
taxi_df['pickup_location'] = taxi_df['pickup_location'].apply(wkt.loads)
taxi_df['dropoff_location'] = taxi_df['dropoff_location'].apply(lambda x: wkt.loads(x) if pd.notnull(x) else None)

# create a trip start timestamp (needed for creating the weather features)
taxi_df['trip_start'] = pd.to_datetime(taxi_df['start_day'] + ' ' + taxi_df['start_time'])

# create a trip end timestamp
taxi_df['trip_end'] = pd.to_datetime(taxi_df['end_day'] + ' ' + taxi_df['end_time'])

# convert the day strings to datetime objects
taxi_df['start_day'] = pd.to_datetime(taxi_df['start_day'])
taxi_df['end_day'] = pd.to_datetime(taxi_df['end_day'])

# replace the time strings by time objects floored to the full hour.
# The already parsed timestamps are reused, so the bare time strings are not parsed a
# second time (which would attach today's date to them). The lower-case 'h' alias is
# used because the upper-case 'H' alias is deprecated in recent pandas versions.
taxi_df['start_time'] = taxi_df['trip_start'].dt.floor('h').dt.time
taxi_df['end_time'] = taxi_df['trip_end'].dt.floor('h').dt.time


### format the weather data
# convert the datetime variable to a datetime dtype, like the taxi timestamps above
weather_df['datetime'] = pd.to_datetime(weather_df['datetime'])

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        3. Create two separate GeoDataFrames
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">


In [5]:
# a GeoDataFrame can only use one geometry, so the pickup side gets its own frame
pickup_columns = ['trip_start', 'pickup_census', 'pickup_location', 'start_day', 'start_time']
taxi_start_gdf = gpd.GeoDataFrame(taxi_df[pickup_columns], geometry='pickup_location', crs='epsg:4326')

# reproject to a crs that is better suited for calculating distances in chicago
taxi_start_gdf = taxi_start_gdf.to_crs(epsg=26971)

In [6]:
# the dropoff side gets its own frame, with the dropoff point as geometry
dropoff_columns = ['trip_end', 'dropoff_census', 'dropoff_location', 'end_day', 'end_time']
taxi_end_gdf = gpd.GeoDataFrame(taxi_df[dropoff_columns], geometry='dropoff_location', crs='epsg:4326')

# reproject to a crs that is better suited for calculating distances in chicago
taxi_end_gdf = taxi_end_gdf.to_crs(epsg=26971)

<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Feature Engineering
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1. Calculate the distance to the closest airport
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">


In [7]:
# select the airport POIs and project them to the crs used for the distance calculations
is_airport = poi_gdf['Category'] == 'Airport'
airports = poi_gdf.loc[is_airport].to_crs(epsg=26971)

In [8]:
# crs of the projected pickup and dropoff frames
projected_crs = "EPSG:26971"

# distinct pickup points and distinct dropoff points, each as its own GeoSeries
unique_loc1 = gpd.GeoSeries(taxi_start_gdf['pickup_location'].unique(), crs=projected_crs)
unique_loc2 = gpd.GeoSeries(taxi_end_gdf['dropoff_location'].unique(), crs=projected_crs)

# stack both series and de-duplicate again, as a point can be a pickup and a dropoff
both_sides = pd.concat([unique_loc1, unique_loc2])
combined_unique_loc = both_sides.unique()

# GeoSeries with the combined unique locations
unique_loc = gpd.GeoSeries(combined_unique_loc, crs=projected_crs)

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Calculate the distance to each airport for each census tract centroid and choose the minimum
</span>

In [9]:
# for every unique location: distance to each airport, reduced to the smallest one
nearest_airport = unique_loc.apply(lambda point: airports.distance(point).min())

# lookup table: location geometry -> distance to the closest airport
airport_distances = dict(zip(unique_loc, nearest_airport))


In [10]:
# fill in the precomputed distance to the closest airport for the pickup and the dropoff side
for rides_gdf, location_col in ((taxi_start_gdf, 'pickup_location'), (taxi_end_gdf, 'dropoff_location')):
    rides_gdf['min_dist_airport'] = rides_gdf[location_col].apply(lambda point: airport_distances[point])

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        2. Count the number of hotels in each census tract
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Create a GeoDataFrame with the locations of the hotels in Chicago
</span>


In [11]:
# keep only the points of interest that are categorised as hotels
is_hotel = poi_gdf['Category'] == 'Hotel'
hotels = poi_gdf.loc[is_hotel]

In [12]:
# spatially join every hotel to the census tract it lies within
hotels_in_census = gpd.sjoin(left_df=hotels, right_df=census_gdf, how='inner', predicate='within')

# the taxi trip dataset stores the census id as integer, so cast the id accordingly
hotels_in_census['geoid10'] = hotels_in_census['geoid10'].astype('Int64')

# lookup table: census id -> number of hotels in that tract
hotels_per_tract = hotels_in_census['geoid10'].value_counts()
hotel_counter = dict(hotels_per_tract)




<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Add the number of hotels in the respective census tract to each ride
</span>

In [13]:
# hotel count of the pickup / dropoff tract of each ride; tracts without hotels get 0
for rides_gdf, census_col in ((taxi_start_gdf, 'pickup_census'), (taxi_end_gdf, 'dropoff_census')):
    rides_gdf['num_hotels'] = rides_gdf[census_col].apply(lambda tract: hotel_counter.get(tract, 0))

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        3. Count the number of bars/clubs in each census tract
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Create a GeoDataFrame with the locations of the bars/clubs in Chicago
</span>


In [14]:
# keep only the points of interest that are categorised as bars or night clubs
is_bar_or_club = poi_gdf['Category'] == 'Bar/ Night Club'
bars_clubs = poi_gdf.loc[is_bar_or_club]

In [15]:
# spatially join every bar/club to the census tract it lies within
bar_club_in_census = gpd.sjoin(left_df=bars_clubs, right_df=census_gdf, how='inner', predicate='within')

# the taxi trip dataset stores the census id as integer, so cast the id accordingly
bar_club_in_census['geoid10'] = bar_club_in_census['geoid10'].astype('Int64')

# lookup table: census id -> number of bars/clubs in that tract
bars_clubs_per_tract = bar_club_in_census['geoid10'].value_counts()
bar_club_counter = dict(bars_clubs_per_tract)



<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Add the number of bars/clubs in the respective census tract to each ride
</span>

In [16]:
# bar/club count of the pickup / dropoff tract of each ride; tracts without bars/clubs get 0
for rides_gdf, census_col in ((taxi_start_gdf, 'pickup_census'), (taxi_end_gdf, 'dropoff_census')):
    rides_gdf['num_bars_clubs'] = rides_gdf[census_col].apply(lambda tract: bar_club_counter.get(tract, 0))

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        4. Count the number of stadiums in each census tract
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

We did not filter out the smaller stadiums, which obviously should explain fewer rides than the stadiums of the Chicago Bears or the Chicago Cubs. However, when looking at the heat map with the POI locations, stadiums seem to be a weak predictor of taxi rides.

In [17]:
# keep only the points of interest that are categorised as stadiums
is_stadium = poi_gdf['Category'] == 'Stadium'
stadiums = poi_gdf.loc[is_stadium]

In [18]:
# spatially join every stadium to the census tract it lies within
stadiums_in_census = gpd.sjoin(left_df=stadiums, right_df=census_gdf, how='inner', predicate='within')

# the taxi trip dataset stores the census id as integer, so cast the id accordingly
stadiums_in_census['geoid10'] = stadiums_in_census['geoid10'].astype('Int64')

# lookup table: census id -> number of stadiums in that tract
stadiums_per_tract = stadiums_in_census['geoid10'].value_counts()
stadium_counter = dict(stadiums_per_tract)



<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Add the number of stadiums in the respective census tract to each ride
</span>

In [19]:
# stadium count of the pickup / dropoff tract of each ride; tracts without stadiums get 0
for rides_gdf, census_col in ((taxi_start_gdf, 'pickup_census'), (taxi_end_gdf, 'dropoff_census')):
    rides_gdf['num_stadiums'] = rides_gdf[census_col].apply(lambda tract: stadium_counter.get(tract, 0))

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        5. Calculate the distance from the city centre
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

The centre (downtown) of Chicago is community area 32, called the Loop. This information was retrieved from: https://www.chicago.gov/content/dam/city/depts/doit/general/GIS/Chicago_Maps/Citywide_Maps/Community_Areas_W_Numbers.pdf and https://de.wikipedia.org/wiki/Chicago_Loop.
We visualized the community areas to verify that community area 32 actually matches the shape of the Loop. Use census_gdf[census_gdf.commarea=='32'].plot() to check this.


In [20]:
# select the census tracts that belong to community area 32
in_community_area_32 = census_gdf['commarea'] == '32'

# project them to a crs that is suited for calculating distances
loop = census_gdf.loc[in_community_area_32].to_crs(epsg=26971)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        Calculate the distance to each census tract in downtown Chicago for each census tract centroid and choose the minimum
</span>

We can reuse the projected unique locations that were used to calculate the distances to the airports. We define the distance to the centre as the minimum distance to one of the census tracts that lie within community area 32, which is downtown Chicago. Note that there are 5 census tracts within community area 32; thus, there should be 5 census tracts with a minimum distance of 0, which is the case here.


In [21]:
# for every unique location: distance to each tract of the loop, reduced to the smallest one
nearest_loop_tract = unique_loc.apply(lambda point: loop.distance(point).min())

# lookup table: location geometry -> minimum distance to chicago downtown
# (built once per unique location, so the distance is not recalculated for every single ride)
centre_distances = dict(zip(unique_loc, nearest_loop_tract))

In [22]:
# fill in the precomputed distance to the centre for the pickup and the dropoff side of each ride
for rides_gdf, location_col in ((taxi_start_gdf, 'pickup_location'), (taxi_end_gdf, 'dropoff_location')):
    rides_gdf['dist_centre'] = rides_gdf[location_col].apply(lambda point: centre_distances[point])

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        6. Add land use data as percentage of the census tract
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

We do not intend to create a feature for each of the 11 landuse categories. Instead, we pick 4 categories that may explain high or low demand census tracts. Large areas of "open space" correspond to a lower population density, unlike census tracts with a high percentage of "residential" buildings. As "commercial" parcels include shopping malls and hotels, they may be a good predictor for high demand census tracts. The same may apply to "transportation/communication/utilities" parcels, which include airports and train stations. However, they also include landfills, which should be poor predictors of taxi trips. Adding features with a higher resolution would yield better outcomes, but is too computationally expensive in this project.

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        6.1 Create a dictionary with landuse percentages per census tract
</span>


We create a dictionary containing the percentage of area usage for each of the categories. We do this so that we need not calculate the area usage 14 million times, which would be computationally expensive.

In [27]:
# reproject the land-use parcels to EPSG:26971 and rebind the name to the projected frame
landuse_gdf = landuse_gdf.to_crs(epsg=26971)

In [28]:
# area of each land-use parcel, measured in the units of the frame's current crs
parcel_area = landuse_gdf['geometry'].area
landuse_gdf = landuse_gdf.assign(area=parcel_area)

In [29]:
# sum the parcel areas per (census tract, land-use category) pair ...
area_per_tract_and_category = landuse_gdf.groupby(['geoid10', 'LANDUSE'])['area'].sum()

# ... and turn the grouping keys back into ordinary columns
landuse_by_category = area_per_tract_and_category.reset_index()

In [35]:
# total land-use area per census tract (sum over all categories), stored as 'total_area'
total_area = (
    landuse_by_category
    .groupby('geoid10')['area']
    .sum()
    .reset_index(name='total_area')
)

In [36]:
# Attach the total land-use area of the tract to each (tract, category) row.
# A keyed lookup is used instead of merging the frame into its own name, so the cell can
# be re-run: a repeated merge would add 'total_area_x'/'total_area_y' columns and the
# percentage calculation below would then fail on the missing 'total_area' column.
tract_totals = total_area.set_index('geoid10')['total_area']
landuse_by_category['total_area'] = landuse_by_category['geoid10'].map(tract_totals)

# share of each category in the total land-use area of its census tract, in percent
landuse_by_category['percentage'] = ((landuse_by_category['area'] / landuse_by_category['total_area'])*100).round(4)

# nested lookup table: {census id (int): {land-use category: percentage}}
landuse_dict = {}

# iterate over the three needed columns directly instead of building a Series per row
for geoid, landuse, percentage in zip(landuse_by_category['geoid10'],
                                      landuse_by_category['LANDUSE'],
                                      landuse_by_category['percentage']):
    landuse_dict.setdefault(int(geoid), {})[landuse] = percentage

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        6.2 Percentage of residential areas in a census tract
</span>


In [40]:
# add the percentage of residential areas in the census tract where the ride started
# (pickup side) and where it ended (dropoff side):
#   - 0.0 if the tract has land-use data but no residential area
#   - NaN if the tract is missing from landuse_dict altogether, so that unknown tracts
#     stay visible instead of raising a bare KeyError inside apply
for rides_gdf, census_col in ((taxi_start_gdf, 'pickup_census'), (taxi_end_gdf, 'dropoff_census')):
    rides_gdf['perc_resid'] = rides_gdf[census_col].apply(
        lambda x: landuse_dict[x].get('RESIDENTIAL', 0.0) if x in landuse_dict else np.nan)
    

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        6.3 Percentage of open space areas in a census tract
</span>


In [43]:
# add the percentage of open space areas in the census tract where the ride started
# (pickup side) and where it ended (dropoff side):
#   - 0.0 if the tract has land-use data but no open space
#   - NaN if the tract is missing from landuse_dict altogether, so that unknown tracts
#     stay visible instead of raising a bare KeyError inside apply
for rides_gdf, census_col in ((taxi_start_gdf, 'pickup_census'), (taxi_end_gdf, 'dropoff_census')):
    rides_gdf['perc_open'] = rides_gdf[census_col].apply(
        lambda x: landuse_dict[x].get('OPEN SPACE', 0.0) if x in landuse_dict else np.nan)
    

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        6.4 Percentage of commercial areas in a census tract
</span>


In [45]:
# add the percentage of commercial areas in the census tract where the ride started
# (pickup side) and where it ended (dropoff side):
#   - 0.0 if the tract has land-use data but no commercial area
#   - NaN if the tract is missing from landuse_dict altogether, so that unknown tracts
#     stay visible instead of raising a bare KeyError inside apply
for rides_gdf, census_col in ((taxi_start_gdf, 'pickup_census'), (taxi_end_gdf, 'dropoff_census')):
    rides_gdf['perc_commerc'] = rides_gdf[census_col].apply(
        lambda x: landuse_dict[x].get('COMMERCIAL', 0.0) if x in landuse_dict else np.nan)
    

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        6.5 Percentage of transportation/communication/utilities areas in a census tract
</span>


In [46]:
# add the percentage of transportation/communication/utilities areas in the census tract
# where the ride started (pickup side) and where it ended (dropoff side):
#   - 0.0 if the tract has land-use data but no area of this category
#   - NaN if the tract is missing from landuse_dict altogether, so that unknown tracts
#     stay visible instead of raising a bare KeyError inside apply
for rides_gdf, census_col in ((taxi_start_gdf, 'pickup_census'), (taxi_end_gdf, 'dropoff_census')):
    rides_gdf['perc_transport'] = rides_gdf[census_col].apply(
        lambda x: landuse_dict[x].get('TRANSPORTATION/COMMUNICATION/UTILITIES', 0.0)
        if x in landuse_dict else np.nan)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        7. Create new temporal features
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [48]:
# display the pickup-side frame with the features added so far
# (the rendered table below is a truncated preview of the first and last rows)
taxi_start_gdf

Unnamed: 0,trip_start,pickup_census,pickup_location,start_day,start_time,min_dist_airport,num_hotels,num_bars_clubs,num_stadiums,dist_centre,perc_resid,perc_open,perc_commerc,perc_transport
0,2019-01-01 00:00:00,17031081402,POINT (359784.209 580354.908),2019-01-01,00:00:00,16440.233888,0,2,0,409.854755,10.1577,6.0507,5.4939,2.4307
1,2019-01-01 00:00:00,17031030703,POINT (355979.792 589976.966),2019-01-01,00:00:00,20812.265848,0,0,0,10326.265967,42.7144,0.7026,20.7995,5.7809
2,2019-01-01 00:00:00,17031839100,POINT (358150.884 579122.089),2019-01-01,00:00:00,14420.628220,18,13,0,0.000000,3.6550,0.3274,38.6992,8.3371
3,2019-01-01 00:00:00,17031081300,POINT (359129.577 581055.923),2019-01-01,00:00:00,16513.005515,7,2,0,1047.752564,27.7549,7.2353,22.6228,1.1982
4,2019-01-01 00:00:00,17031081403,POINT (359293.639 580234.232),2019-01-01,00:00:00,16013.596362,9,7,0,271.306902,13.3707,10.6925,11.4934,4.2183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,2019-01-15 15:00:00,17031320100,POINT (359122.792 579573.589),2019-01-15,15:00:00,15419.239456,16,2,0,0.000000,16.3317,22.6979,21.6553,2.9455
499996,2019-01-15 15:00:00,17031839000,POINT (358271.170 578014.686),2019-01-15,15:00:00,13719.408614,0,3,0,0.000000,33.2197,2.4896,11.9031,7.9407
499997,2019-01-15 15:00:00,17031070300,POINT (356567.540 584446.842),2019-01-15,15:00:00,17885.069257,0,7,0,4803.608272,45.1534,1.0211,12.9949,4.3408
499998,2019-01-15 15:00:00,17031320100,POINT (359122.792 579573.589),2019-01-15,15:00:00,15419.239456,16,2,0,0.000000,16.3317,22.6979,21.6553,2.9455
