In [1]:
import pandas as pd
import geopandas as gpd
from shapely import wkt

<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Load the data
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>

In [2]:
# All spatial layers are requested with the same geographic CRS string.
GEOJSON_CRS = 'epsg:4326'

# Census tract boundaries.
census_gdf = gpd.read_file('Boundaries.geojson', crs=GEOJSON_CRS)

# Points of interest (filtered by their Category column further below).
poi_gdf = gpd.read_file('POI_data.geojson', crs=GEOJSON_CRS)

# Land-use parcels for Chicago.
landuse_gdf = gpd.read_file('landuse_chicago.geojson', crs=GEOJSON_CRS)

# Taxi trips as a plain dataframe; the first 500k rows are sufficient here.
taxi_df = pd.read_csv('clean_taxi_data.csv', nrows=500_000)


<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Formatting
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>


<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1. Drop rides with no dropoff location
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

In [3]:
# Clustering needs complete trips (pickup AND dropoff), so rides whose
# dropoff location is missing are removed from taxi_df in place.
taxi_df.dropna(subset=['dropoff_location'], inplace=True)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        2. Recover the formatting of some variables
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">


In [4]:
# Remove the 'Unnamed: 0' column from the loaded CSV.
taxi_df.drop(columns='Unnamed: 0', inplace=True)

# The locations are stored as WKT strings in the CSV; parse them into
# shapely geometry objects. Missing dropoff values are mapped to None.
taxi_df['pickup_location'] = taxi_df['pickup_location'].apply(wkt.loads)
taxi_df['dropoff_location'] = taxi_df['dropoff_location'].apply(
    lambda raw: None if pd.isnull(raw) else wkt.loads(raw)
)

<div>
    <span style ="font-size: 40px; font-weight: bold; color: #8EB944">
        Feature Engineering
    </span>
    
<hr style="color: #8EB944; height: 3px;background-color: #8EB944;border: none">
</div>

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Get all the unique pairs of census tract and location

In [5]:
# Distinct (census tract, location) pairs seen as a pickup, with the columns
# renamed to the shared names 'census' / 'location'.
unique_pickup = (
    taxi_df[['pickup_census', 'pickup_location']]
    .drop_duplicates()
    .rename(columns={'pickup_census': 'census', 'pickup_location': 'location'})
)

# Same for the dropoff side.
unique_dropoff = (
    taxi_df[['dropoff_census', 'dropoff_location']]
    .drop_duplicates()
    .rename(columns={'dropoff_census': 'census', 'dropoff_location': 'location'})
)

# Stack both sides, remove pairs that occur as pickup and dropoff,
# and give the result a fresh 0..n-1 index.
unique_loc = (
    pd.concat([unique_pickup, unique_dropoff])
    .drop_duplicates()
    .reset_index(drop=True)
)

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Use the locations to create a feature map

In [6]:
# Wrap the unique locations in a GeoDataFrame (geometry column: 'location',
# declared as epsg:4326) and project it to EPSG:26971 for the distance
# computations that follow.
locations_wgs84 = gpd.GeoDataFrame(unique_loc, geometry='location', crs='epsg:4326')
feature_map = locations_wgs84.to_crs(epsg=26971)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1. Create POI features
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">


<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1.1 Distance to closest airport

In [7]:
# Select the POIs whose Category is 'Airport' and project them to EPSG:26971,
# the same CRS as feature_map, so that distances can be computed between them.
is_airport = poi_gdf['Category'] == 'Airport'
airports = poi_gdf.loc[is_airport].to_crs(epsg=26971)

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Calculate distance to each airport for each census tract centroid and choose the minimum

In [8]:
def _nearest_airport_distance(point):
    """Return the smallest rounded distance from `point` to any row of `airports`."""
    return airports.distance(point).round().min()


# Distance to the closest airport for every location in the feature map
# (units are those of the projected CRS shared by both layers).
feature_map['min_dist_airport'] = feature_map['location'].apply(_nearest_airport_distance)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1.2 Count the number of hotels in each census tract

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Create a geoSeries with the locations of the hotels in chicago 


In [9]:
# Keep only the POIs whose Category is 'Hotel'.
hotels = poi_gdf.loc[poi_gdf['Category'] == 'Hotel']

In [10]:
# Attach to every hotel the census tract it lies within (inner join, so
# hotels outside all tracts are discarded).
hotels_in_census = gpd.sjoin(hotels, census_gdf, predicate='within', how='inner')

# The taxi data identifies tracts by integer id, so cast geoid10 accordingly.
hotels_in_census['geoid10'] = hotels_in_census['geoid10'].astype('Int64')

# Mapping: census tract id -> number of hotels in that tract.
hotels_per_tract = hotels_in_census['geoid10'].value_counts()
hotel_counter = dict(hotels_per_tract)

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Add the number of hotels in the respective census tract 

In [11]:
# Look up the hotel count of each location's census tract; tracts without
# any hotel are absent from hotel_counter and therefore get 0.
feature_map['num_hotels'] = feature_map['census'].apply(
    lambda tract: hotel_counter.get(tract, 0)
)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1.3 Count the number of bars/clubs in each census tract


<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Create a geoSeries with the locations of the bars/clubs in chicago 


In [12]:
# Keep only the POIs whose Category is 'Bar/ Night Club'.
bars_clubs = poi_gdf.loc[poi_gdf['Category'] == 'Bar/ Night Club']

In [13]:
# Attach to every bar/club the census tract it lies within (inner join, so
# venues outside all tracts are discarded).
bar_club_in_census = gpd.sjoin(bars_clubs, census_gdf, predicate='within', how='inner')

# The taxi data identifies tracts by integer id, so cast geoid10 accordingly.
bar_club_in_census['geoid10'] = bar_club_in_census['geoid10'].astype('Int64')

# Mapping: census tract id -> number of bars/clubs in that tract.
bars_per_tract = bar_club_in_census['geoid10'].value_counts()
bar_club_counter = dict(bars_per_tract)

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Add the number of bars/clubs in the respective census tract to each ride 

In [14]:
# Look up the bar/club count of each location's census tract; tracts without
# any bar or club are absent from bar_club_counter and therefore get 0.
feature_map['num_bars'] = feature_map['census'].apply(
    lambda tract: bar_club_counter.get(tract, 0)
)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1.4 Count the number of stadiums in each census tract


We did not filter out the smaller stadiums, which should obviously explain fewer rides than the stadiums of the Chicago Bears or the Chicago Cubs. However, when looking at the heat map with the POI locations, the stadiums seem to be a weak predictor for taxi rides. 

In [15]:
# Keep only the POIs whose Category is 'Stadium'.
stadiums = poi_gdf.loc[poi_gdf['Category'] == 'Stadium']

In [16]:
# Attach to every stadium the census tract it lies within (inner join, so
# stadiums outside all tracts are discarded).
stadiums_in_census = gpd.sjoin(stadiums, census_gdf, predicate='within', how='inner')

# The taxi data identifies tracts by integer id, so cast geoid10 accordingly.
stadiums_in_census['geoid10'] = stadiums_in_census['geoid10'].astype('Int64')

# Mapping: census tract id -> number of stadiums in that tract.
stadiums_per_tract = stadiums_in_census['geoid10'].value_counts()
stadium_counter = dict(stadiums_per_tract)

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        Add the number of stadiums in the respective census tract to each ride 

In [17]:
# Look up the stadium count of each location's census tract; tracts without
# any stadium are absent from stadium_counter and therefore get 0.
feature_map['num_stadiums'] = feature_map['census'].apply(
    lambda tract: stadium_counter.get(tract, 0)
)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1.5 Calculate the distance from the city centre


The centre or downtown chicago is in the community area 32 called loop. This information was retrieved from: https://www.chicago.gov/content/dam/city/depts/doit/general/GIS/Chicago_Maps/Citywide_Maps/Community_Areas_W_Numbers.pdf and https://de.wikipedia.org/wiki/Chicago_Loop.
We visualized the community areas to verify that commarea 32 actually matches the shape of the Loop community area. Use census_gdf[census_gdf.commarea=='32'].plot() to check this.


In [18]:
# Census tracts belonging to community area '32' (the Loop), projected to
# EPSG:26971 so distances to feature_map locations can be computed.
in_loop = census_gdf['commarea'] == '32'
loop = census_gdf.loc[in_loop].to_crs(epsg=26971)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        Calculate distance to census tract in chicago downtown for each census tract centroid and choose the minimum

We can reuse the projected unique locations that were used to calculate the distances to the airports. We define the distance to the centre as the minimum distance to one of the census tracts that are within community area 32, which is downtown Chicago. Note that there are 5 census tracts within community area 32; thus, there should be 5 census tracts with a minimum distance of 0, which is the case here.


In [19]:
def _distance_to_loop(point):
    """Return the smallest rounded distance from `point` to any tract in `loop`."""
    return loop.distance(point).round().min()


# Distance from every unique location to the nearest downtown (Loop) census
# tract. Working on the unique locations keeps this far cheaper than
# evaluating the distance once per taxi ride.
feature_map['dist_centre'] = feature_map['location'].apply(_distance_to_loop)

<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        1.6 Check whether one of the two airports is in the census tract


We already calculated the minimum distance to the airports, but it may be the case that census tracts close to the airports do not get more rides than the ones further away. Therefore, we also create a binary feature that is 1 when there is an airport in the census tract. We reuse the airport dataframe from above.


In [20]:
# Radius drawn around each airport point. The airports are only available as
# single points, so a location counts as "at the airport" when it falls inside
# this buffer. Units are those of the projected CRS (EPSG:26971).
AIRPORT_BUFFER_M = 1100

# Select the airport POIs and project them to the CRS used for distances.
airports = poi_gdf[poi_gdf.Category == 'Airport'].to_crs(epsg=26971)

# Replace each airport point by a circular buffer around it.
airports_buffer = airports.copy()
airports_buffer['geometry'] = airports_buffer.geometry.buffer(AIRPORT_BUFFER_M)

# Make sure the feature map is in the same projected CRS as the buffers.
feature_map_projected = feature_map.to_crs(epsg=26971)

# Every (airport buffer, location) pair whose geometries intersect.
airport_in_hex = gpd.sjoin(airports_buffer, feature_map_projected, how='inner', predicate='intersects')

# NOTE: from here on feature_map is in EPSG:4326 (it was in EPSG:26971 before
# this cell); re-running the distance cells afterwards would mix CRSs.
feature_map = feature_map_projected.to_crs(epsg=4326)

# Binary flag: 1.0 if the location's census tract has at least one location
# inside an airport buffer, else 0.0. The matching tract ids are collected
# once (instead of rebuilding a list for every row) and the result is always
# float, rather than a data-dependent mix of int 1 and float 0.0.
census_near_airport = set(airport_in_hex.census)
feature_map['airport_in_census'] = feature_map.census.isin(census_near_airport).astype(float)




<span style ="font-size: 18px; font-weight: bold;color: #43556A;">
        2. Create land use  features
</span>
<hr style="color: #8EB944; height: 1px;background-color: #43556A;border: none">

We do not intend to create a feature for each of the 11 landuse categories. Instead, we pick 4 categories that may explain high or low demand census tracts. Large areas of "open space" correspond to a lower population density, unlike census tracts with a high percentage of "residential" buildings. As "commercial" parcels include shopping malls and hotels, they may be a good predictor for high demand census tracts. The same may apply to "transportation/communication/utilities" parcels, which include airports and train stations. However, they also include landfills, which should be poor predictors of taxi trips. Adding features with a higher resolution would yield better outcomes, but is too computationally expensive in this project.

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        2.1 Create a dictionary with landuse percentages per census tract


We create a dictionary containing the percentage of area usage for each of the categories. We do this so that we need not calculate the area usage 14 million times, which would be computationally expensive.

In [21]:
# Reproject the land-use parcels in place to EPSG:26971, the projected CRS
# used for the other layers.
landuse_gdf.to_crs(crs='EPSG:26971', inplace=True)

In [22]:
# Area of every parcel, in the squared units of the current (projected) CRS.
parcel_areas = landuse_gdf.geometry.area
landuse_gdf['area'] = parcel_areas

In [23]:
# Total parcel area per (census tract, land-use category) combination,
# flattened back into ordinary columns.
area_per_tract_and_use = landuse_gdf.groupby(['geoid10', 'LANDUSE'])['area'].sum()
landuse_by_category = area_per_tract_and_use.reset_index()

In [24]:
# Summed parcel area of each census tract across all land-use categories,
# stored in a column named 'total_area'.
total_area = (
    landuse_by_category.groupby('geoid10')['area']
    .sum()
    .rename('total_area')
    .reset_index()
)

In [25]:
# Bring each tract's total area next to its per-category areas.
landuse_by_category = landuse_by_category.merge(total_area, on='geoid10')

# Share of the tract's area taken up by each category, in percent
# (rounded to two decimals).
area_share = landuse_by_category['area'] / landuse_by_category['total_area']
landuse_by_category['percentage'] = (area_share * 100).round(2)

# Nested lookup: {census tract id (int): {land-use category: percentage}}.
landuse_dict = {}
for geoid, landuse, percentage in zip(
    landuse_by_category['geoid10'],
    landuse_by_category['LANDUSE'],
    landuse_by_category['percentage'],
):
    landuse_dict.setdefault(int(geoid), {})[landuse] = percentage

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        2.2 Percentage of residential areas in a census tract


In [26]:
# Percentage of 'RESIDENTIAL' land in each location's census tract; a tract
# without that category gets 0.0. A tract missing from landuse_dict raises
# KeyError.
feature_map['perc_resid'] = feature_map['census'].apply(
    lambda tract: landuse_dict[tract].get('RESIDENTIAL', 0.0)
)
                                                     

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        2.3 Percentage of open space areas in a census tract


In [27]:
# Percentage of 'OPEN SPACE' land in each location's census tract; a tract
# without that category gets 0.0. A tract missing from landuse_dict raises
# KeyError.
feature_map['perc_open'] = feature_map['census'].apply(
    lambda tract: landuse_dict[tract].get('OPEN SPACE', 0.0)
)
     

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        2.4 Percentage of commercial areas in a census tract


In [28]:
# Percentage of 'COMMERCIAL' land in each location's census tract; a tract
# without that category gets 0.0. A tract missing from landuse_dict raises
# KeyError.
feature_map['perc_commerc'] = feature_map['census'].apply(
    lambda tract: landuse_dict[tract].get('COMMERCIAL', 0.0)
)
     

<span style ="font-size: 16px; font-weight: bold;color: #43556A;">
        2.5 Percentage of transportation/communication/utilities areas in a census tract


In [29]:
# Percentage of 'TRANSPORTATION' land in each location's census tract; a
# tract without that category gets 0.0. A tract missing from landuse_dict
# raises KeyError.
feature_map['perc_transport'] = feature_map['census'].apply(
    lambda tract: landuse_dict[tract].get('TRANSPORTATION', 0.0)
)
     

In [30]:
# Persist the finished feature map: convert to EPSG:4326 and write it out
# as 'census_spatial_features.geojson'.
feature_map_wgs84 = feature_map.to_crs(epsg=4326)
feature_map_wgs84.to_file('census_spatial_features.geojson')

In [31]:
# Display the final feature map (one row per unique census/location pair)
# as the cell output.
feature_map

Unnamed: 0,census,location,min_dist_airport,num_hotels,num_bars,num_stadiums,dist_centre,airport_in_census,perc_resid,perc_open,perc_commerc,perc_transport
0,17031081402,POINT (-87.61295 41.89197),16440.0,0,2,0,410.0,0.0,10.16,6.05,5.49,2.43
1,17031030800,POINT (-87.66419 41.97991),20288.0,0,1,0,10544.0,0.0,54.15,0.35,7.83,0.36
2,17031839100,POINT (-87.63275 41.88099),14421.0,18,13,0,0.0,0.0,3.66,0.33,38.70,8.09
3,17031081300,POINT (-87.62076 41.89833),16513.0,7,2,0,1048.0,0.0,27.75,7.24,22.62,1.20
4,17031081403,POINT (-87.61887 41.89092),16014.0,9,7,0,271.0,0.0,13.37,10.69,11.49,4.22
...,...,...,...,...,...,...,...,...,...,...,...,...
796,17031670700,POINT (-87.66673 41.78307),7116.0,0,0,0,9732.0,0.0,29.94,0.00,6.77,0.00
797,17031671400,POINT (-87.67623 41.77379),6478.0,0,0,0,10949.0,0.0,33.17,0.00,1.46,17.94
798,17031520200,POINT (-87.53979 41.71380),19437.0,0,1,0,18163.0,0.0,20.54,0.18,9.54,19.15
799,17031671800,POINT (-87.65645 41.76140),8444.0,0,1,0,11906.0,0.0,41.39,0.00,2.16,5.29
