In [1]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
import h3
import matplotlib.pyplot as plt

from shapely.geometry import  Polygon

<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Load the data
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>

In [2]:
# NOTE: `crs` is not a parameter of `gpd.read_file`; extra keyword arguments are
# only forwarded to the underlying I/O engine. The coordinate reference system
# is taken from the file itself, so the former `crs='epsg:4326'` argument is dropped.

# load the census tract areas as a geodataframe
census_gdf = gpd.read_file('Boundaries.geojson')

# load the poi data as a geodataframe
poi_gdf = gpd.read_file('POI_data.geojson')

# load the landuse data as a geodataframe
landuse_gdf = gpd.read_file('landuse_chicago.geojson')

# load the taxi trips as a dataframe (only 500k rides are needed for this notebook)
taxi_df = pd.read_csv('clean_taxi_data.csv', nrows=500000)

# load the weather data, but only a subset of weather variables
weather_columns = [
    'datetime', 'temp', 'feelslike', 'dew', 'humidity', 'precip',
    'precipprob', 'snow', 'snowdepth',
    'windspeed', 'winddir', 'cloudcover', 'visibility', 'conditions',
]
weather_df = pd.read_csv('Weather_chic.csv')[weather_columns]


<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Formatting
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>


<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1. Drop rides with no dropoff location
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [3]:
# Clustering needs complete trips (pickup and dropoff), so rides whose dropoff
# location is missing are removed from taxi_df in place.
taxi_df.dropna(subset=['dropoff_location'], inplace=True)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        2. Recover the formatting of some variables
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">


In [4]:
def _wkt_or_none(text):
    """Parse a WKT string into a geometry; missing values are returned as None."""
    if pd.notnull(text):
        return wkt.loads(text)
    return None


# remove the 'Unnamed: 0' column (presumably a leftover index from the CSV export)
taxi_df.drop(columns='Unnamed: 0', inplace=True)

# turn the WKT strings of both location columns into geometry objects
taxi_df['pickup_location'] = taxi_df['pickup_location'].map(wkt.loads)
taxi_df['dropoff_location'] = taxi_df['dropoff_location'].map(_wkt_or_none)

<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Feature Engineering
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Get all the unique pairs of census tract and location

In [5]:
# Distinct (census tract, location) pairs observed as pickups, with generic column names
unique_pickup = (
    taxi_df[['pickup_census', 'pickup_location']]
    .drop_duplicates()
    .rename(columns={'pickup_census': 'census', 'pickup_location': 'location'})
)

# Distinct (census tract, location) pairs observed as dropoffs, with generic column names
unique_dropoff = (
    taxi_df[['dropoff_census', 'dropoff_location']]
    .drop_duplicates()
    .rename(columns={'dropoff_census': 'census', 'dropoff_location': 'location'})
)

# Stack both tables, remove pairs that occur in both, and renumber the rows from 0
unique_loc = (
    pd.concat([unique_pickup, unique_dropoff])
    .drop_duplicates()
    .reset_index(drop=True)
)

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Get the corresponding hexagon (resolution = 7)

In [6]:
def _hex_id_for_row(row):
    """Return the resolution-7 H3 cell id of the row's location, or 0 if the location is falsy."""
    point = row['location']
    if not point:
        return 0
    # h3 expects latitude first, i.e. (y, x) of the point
    return h3.geo_to_h3(point.y, point.x, 7)


unique_loc["h3_hex_id"] = unique_loc.apply(_hex_id_for_row, axis=1)

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Get the unique hexagons (resolution = 7)

In [38]:
# Keep one row per hexagon id (the first one encountered), together with the
# census tract recorded on that row.
df_unique_hex = unique_loc[['census', 'h3_hex_id']].drop_duplicates(subset='h3_hex_id')

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Get the polygon of the hexagons

In [8]:
def _hex_polygon(row):
    """Build a shapely Polygon from the boundary of the row's H3 cell (boundary requested with geo_json=True)."""
    boundary = h3.h3_to_geo_boundary(row["h3_hex_id"], geo_json=True)
    return Polygon(boundary)


df_unique_hex['geometry_hex'] = df_unique_hex.apply(_hex_polygon, axis=1)


<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Get the centroid of each polygon

In [9]:
# Derive the centre point of every hexagon polygon
df_unique_hex['centroid'] = df_unique_hex['geometry_hex'].map(lambda hexagon: hexagon.centroid)


<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Create a feature map with the hexagon information

In [10]:
# Wrap the hexagon table as a GeoDataFrame whose active geometry is the centroid
# column (declared as EPSG:4326), then reproject that geometry to EPSG:26971.
hex_centroids_wgs84 = gpd.GeoDataFrame(df_unique_hex, geometry='centroid', crs='epsg:4326')
feature_map = hex_centroids_wgs84.to_crs(epsg=26971)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1. Create POI features
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">


<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1.1 Distance to closest airport

In [12]:
# Pick the airport POIs and reproject them to EPSG:26971, the CRS used for the
# distance calculations in this notebook.
is_airport = poi_gdf['Category'] == 'Airport'
airports = poi_gdf[is_airport].to_crs(epsg=26971)

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Calculate the distance from each hexagon centroid to every airport and choose the minimum
</span>

In [13]:
# calculate the (rounded) distance from each hexagon centroid to the closest airport,
# in the units of the projected CRS (EPSG:26971)
# The 'centroid' column is selected explicitly: attribute access (`feature_map.centroid`)
# resolves to the GeoDataFrame.centroid property, i.e. the centroids of whichever
# geometry column is currently active, and only coincides with this column while the
# point column is the active geometry.
feature_map['min_dist_airport'] = feature_map['centroid'].apply(
    lambda point: airports.distance(point).round().min()
)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1.2 Count the number of hotels in each hexagon
</span>

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Create a geoSeries with the locations of the hotels in chicago 


In [14]:
# Select the POIs categorised as hotels (left in the CRS of poi_gdf)
is_hotel = poi_gdf['Category'] == 'Hotel'
hotels = poi_gdf[is_hotel]

In [15]:
# we must change the geometry of the feature_map to the polygon for the spatial join
# (done in place: from here on the hexagon polygons are the active geometry)
feature_map.set_geometry('geometry_hex', inplace=True)

# label the hexagon polygons as EPSG:4326; set_crs only assigns a CRS, it does not
# transform coordinates
# NOTE(review): assumes the polygon column has no conflicting CRS at this point - confirm
feature_map.set_crs('EPSG:4326', inplace=True) 

# find all the hexagons that have hotels in it
# (inner join with predicate 'within': one row per hotel that lies inside a hexagon)
hotels_in_hex = gpd.sjoin(hotels, feature_map, how='inner', predicate='within')

In [16]:
# Tally how many joined hotel rows fall into each hexagon id
hotels_per_hex = hotels_in_hex['h3_hex_id'].value_counts()
hotel_counter = dict(zip(hotels_per_hex.index, hotels_per_hex.values))

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Add the number of hotels in the respective hexagon
</span>

In [17]:
# Attach the hotel count to every hexagon; hexagons without any hotel get 0
feature_map['num_hotels'] = feature_map['h3_hex_id'].map(
    lambda hex_id: hotel_counter.get(hex_id, 0)
)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1.3 Count the number of bars/clubs in each hexagon
</span>


<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Create a geoSeries with the locations of the bars/clubs in chicago 


In [18]:
# Select the POIs categorised as bars / night clubs
is_bar_or_club = poi_gdf['Category'] == 'Bar/ Night Club'
bars_clubs = poi_gdf[is_bar_or_club]

In [19]:
# Inner spatial join: one row per bar/club that lies within a hexagon of feature_map
bar_club_in_hex = gpd.sjoin(
    left_df=bars_clubs,
    right_df=feature_map,
    predicate='within',
    how='inner',
)

# Tally the joined bars/clubs per hexagon id
bars_per_hex = bar_club_in_hex['h3_hex_id'].value_counts()
bar_club_counter = dict(zip(bars_per_hex.index, bars_per_hex.values))

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Add the number of bars/clubs in the respective hexagon
</span>

In [20]:
# Attach the bar/club count to every hexagon; hexagons without any get 0
feature_map['num_bars'] = feature_map['h3_hex_id'].map(
    lambda hex_id: bar_club_counter.get(hex_id, 0)
)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1.4 Count the number of stadiums in each hexagon
</span>


We did not filter out the smaller stadiums, which should obviously explain fewer rides than the stadiums of the Chicago Bears or the Chicago Cubs. However, when looking at the heat map with the POI locations, stadiums seem to be a weak predictor of taxi rides.

In [21]:
# Select the POIs categorised as stadiums (no filtering by stadium size)
is_stadium = poi_gdf['Category'] == 'Stadium'
stadiums = poi_gdf[is_stadium]

In [22]:
# Inner spatial join: one row per stadium that lies within a hexagon of feature_map
stadiums_in_hex = gpd.sjoin(
    left_df=stadiums,
    right_df=feature_map,
    predicate='within',
    how='inner',
)

# Tally the joined stadiums per hexagon id
stadiums_per_hex = stadiums_in_hex['h3_hex_id'].value_counts()
stadium_counter = dict(zip(stadiums_per_hex.index, stadiums_per_hex.values))

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Add the number of stadiums in the respective hexagon
</span>

In [23]:
# Attach the stadium count to every hexagon; hexagons without any stadium get 0
feature_map['num_stadiums'] = feature_map['h3_hex_id'].map(
    lambda hex_id: stadium_counter.get(hex_id, 0)
)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1.5 Calculate the distance from the city centre


The centre (downtown) of Chicago is community area 32, called the Loop. This information was retrieved from: https://www.chicago.gov/content/dam/city/depts/doit/general/GIS/Chicago_Maps/Citywide_Maps/Community_Areas_W_Numbers.pdf and https://de.wikipedia.org/wiki/Chicago_Loop.
We visualized the community areas to verify that community area 32 actually matches the shape of the Loop. Use census_gdf[census_gdf.commarea=='32'].plot() to check this.


In [25]:
# Census tracts that belong to community area '32', reprojected to EPSG:26971
# so that distances to them can be computed in that projected CRS
in_loop = census_gdf['commarea'] == '32'
loop = census_gdf[in_loop].to_crs(epsg=26971)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        Calculate the distance from each hexagon centroid to the census tracts in downtown Chicago and choose the minimum
</span>

We can reuse the projected hexagon centroids that were used to calculate the distances to the airports. We define the distance to the centre as the minimum distance to any of the census tracts within community area 32, which is downtown Chicago.


In [26]:
# we must change the geometry of the feature_map to the centroid for the distance calculation
feature_map.set_geometry('centroid', inplace=True)

# reset the crs
feature_map.set_crs('EPSG:26971', inplace=True)

# calculate the (rounded) distance from each hexagon centroid to the closest census
# tract of the city centre, in the units of the projected CRS (EPSG:26971)
# The 'centroid' column is selected explicitly: attribute access (`feature_map.centroid`)
# resolves to the GeoDataFrame.centroid property (centroids of the active geometry)
# rather than to this column.
feature_map['dist_centre'] = feature_map['centroid'].apply(
    lambda point: loop.distance(point).round().min()
)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        2. Create land use  features
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

We do not intend to create a feature for each of the 11 landuse categories. Instead, we pick 4 categories that may explain high or low demand hexagons. Large areas of "open space" correspond to a lower population density, unlike areas with a high percentage of "residential" buildings. As "commercial" parcels include shopping malls and hotels, they may be a good predictor for high demand hexagons. The same may apply to "transportation/communication/utilities" parcels, which include airports and train stations. However, they also include landfills, which should be poor predictors of taxi trips. Adding features with a higher resolution would yield better outcomes, but is too computationally expensive in this project.

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        2.1 Create a dictionary with landuse percentages per hexagon
</span>


We create a dictionary containing the percentage of area usage for each of the categories. We do this so that we need not calculate the area usage 14 million times, which would be computationally expensive.

In [27]:
# Reproject the land-use parcels to EPSG:26971, the projected CRS used for the
# other distance and area calculations in this notebook
landuse_gdf = landuse_gdf.to_crs(epsg=26971)

In [28]:
# create a dictionary that maps the census tract id to the hexagon id
# The mapping is built from unique_loc (all census tract / location pairs) and not
# from df_unique_hex: that table keeps only one row - and therefore only one census
# tract - per hexagon, so all other tracts of a hexagon would be missing from the
# mapping and their parcels would be left out of the land-use shares.
census_hex_dict = (
    unique_loc
    .dropna(subset=['census'])          # rows without a census tract cannot be mapped
    .drop_duplicates(subset='census')   # one hexagon per census tract (first occurrence)
    .set_index('census')['h3_hex_id']
    .to_dict()
)

In [29]:
# Tag every land-use parcel with the hexagon id of its census tract (geoid10);
# parcels whose tract is not in the mapping get None
landuse_gdf['hex_id'] = landuse_gdf['geoid10'].map(
    lambda geoid: census_hex_dict.get(int(geoid))
)

In [30]:
# Area of every parcel, in the units of the parcels' current CRS
parcel_areas = landuse_gdf['geometry'].area
landuse_gdf['area'] = parcel_areas

In [31]:
# Sum the parcel areas per (hexagon id, land-use category) and turn the grouping
# keys back into ordinary columns
area_per_hex_and_category = landuse_gdf.groupby(['hex_id', 'LANDUSE'])['area'].agg('sum')
landuse_by_category = area_per_hex_and_category.reset_index()

In [32]:
# Total mapped parcel area per hexagon id, stored in a column named 'total_area'
total_area = (
    landuse_by_category
    .groupby('hex_id')['area']
    .sum()
    .rename('total_area')
    .reset_index()
)

In [33]:
# Attach each hexagon's summed parcel area to its per-category rows
landuse_by_category = landuse_by_category.merge(total_area, on='hex_id')

# Share of the hexagon's summed parcel area taken up by each category, in percent
category_share = landuse_by_category['area'] / landuse_by_category['total_area']
landuse_by_category['percentage'] = (category_share * 100).round(2)

# Nested lookup: hexagon id -> {land-use category -> percentage}
landuse_dict = {}
for hex_id, category, share in zip(
    landuse_by_category['hex_id'],
    landuse_by_category['LANDUSE'],
    landuse_by_category['percentage'],
):
    landuse_dict.setdefault(hex_id, {})[category] = share

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        2.2 Percentage of residential areas in a hexagon
</span>


In [34]:
# get the percentage of residential area for each hexagon
# 0.0 is used when the hexagon has no residential parcels; a hexagon without any
# land-use entry also gets 0.0 instead of raising a KeyError
feature_map['perc_resid'] = feature_map['h3_hex_id'].map(
    lambda hex_id: landuse_dict.get(hex_id, {}).get('RESIDENTIAL', 0.0)
)
                                                     

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        2.3 Percentage of open space areas in a hexagon
</span>


In [35]:
# get the percentage of open space area for each hexagon
# 0.0 is used when the hexagon has no open-space parcels; a hexagon without any
# land-use entry also gets 0.0 instead of raising a KeyError
feature_map['perc_open'] = feature_map['h3_hex_id'].map(
    lambda hex_id: landuse_dict.get(hex_id, {}).get('OPEN SPACE', 0.0)
)
     

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        2.4 Percentage of commercial areas in a hexagon
</span>


In [36]:
# get the percentage of commercial area for each hexagon
# 0.0 is used when the hexagon has no commercial parcels; a hexagon without any
# land-use entry also gets 0.0 instead of raising a KeyError
feature_map['perc_commerc'] = feature_map['h3_hex_id'].map(
    lambda hex_id: landuse_dict.get(hex_id, {}).get('COMMERCIAL', 0.0)
)
     

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        2.5 Percentage of transportation/communication/utilities areas in a hexagon
</span>


In [37]:
# get the percentage of transportation/communication/utilities area for each hexagon
# 0.0 is used when the hexagon has no such parcels; a hexagon without any
# land-use entry also gets 0.0 instead of raising a KeyError
feature_map['perc_transport'] = feature_map['h3_hex_id'].map(
    lambda hex_id: landuse_dict.get(hex_id, {}).get('TRANSPORTATION/COMMUNICATION/UTILITIES', 0.0)
)
     

In [44]:
# switch the active geometry back to the hexagon polygons (in place), so that the
# polygons are the geometry of the feature map from here on
feature_map.set_geometry('geometry_hex', inplace=True)

# label the active (hexagon polygon) geometry as EPSG:4326; set_crs assigns the
# CRS without transforming coordinates
feature_map.set_crs('EPSG:4326', inplace=True) 

# remove the helper columns 'census' and 'centroid' from the feature map
# NOTE(review): this cell is not re-runnable as is - a second run fails because the
# columns are already gone
feature_map.drop(columns=['census', 'centroid'], inplace=True)

In [45]:
# Export the hexagon features: make sure the geometry is in EPSG:4326, then write
# the result to a GeoJSON file
feature_map_wgs84 = feature_map.to_crs(epsg=4326)
feature_map_wgs84.to_file('spatial_features.geojson')

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        3. Create weather  features
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">