<h2>Data Preparation & Feature Engineering for Predictive Analysis</h2>

<h3>Pre-requisites: Loading the libraries and data set</h3>

In [2]:
pip install papermill

Collecting papermill
  Downloading papermill-2.6.0-py3-none-any.whl (38 kB)
Collecting ansicolors
  Downloading ansicolors-1.1.8-py2.py3-none-any.whl (13 kB)
Installing collected packages: ansicolors, papermill
Successfully installed ansicolors-1.1.8 papermill-2.6.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Importing the required libraries
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
import h3
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVR, LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import papermill as pm

In [3]:
# loading the cleaned data 
BASE_H3_DATASET_ON_CENSUS_DATASET = True
file_path = "./data/"

taxi_df = pd.read_csv(f"{file_path}clean_taxi_data.csv")

In [1]:
taxi_df.head()


NameError: name 'taxi_df' is not defined

<h3>Census tract and h3 spatial resolutions </h3

<h4>Creating separate copies of our data set, one for census tracts and one for h3 resolution</h4>

In [20]:
# Copy to have a separate frames for census tract
taxi_df_census = taxi_df.copy()

#load census tract border data from the city of chicago
census_tracts = gpd.read_file("Boundaries.geojson")

#prepare census tract data
census_tracts = census_tracts.rename({"geoid10": "geo_id", "name10": "census_tract_short"}, axis=1)
census_tracts = census_tracts.filter(['geo_id', 'census_tract_short', 'geometry'])
census_tracts['geometry'] = census_tracts['geometry'].to_crs('EPSG:4326')
census_tracts['centroids'] = census_tracts["geometry"].centroid




  census_tracts['centroids'] = census_tracts["geometry"].centroid


In [24]:
#change census tract feature in the taxi trip data to fit the census tract data from the city of chicago
taxi_df_census['pickup_census'] = taxi_df_census['pickup_census'].apply(lambda x: str(x).split('.')[0][:12])
taxi_df_census['dropoff_census'] = taxi_df_census['dropoff_census'].apply(lambda x: str(x).split('.')[0][:12])

<h3>Creating the h3 polygons</h3>
<p>Data Preparation for h3 hexagons. As this part has not been done in the data preparation it is implemented as part of the predictive feature engineering.</p>

In [25]:
#calculate hexagon for each census tracts based on different hexagon resolutions (4-9)
for i in range(4,10):
    census_tracts[f'h3_{i}'] = census_tracts['centroids'].apply(lambda x: h3.geo_to_h3(x.y, x.x, resolution=i))

In [42]:
# create separate data frame for h3 spatial units
taxi_df_hex = taxi_df_census.copy()

# Ensure the key columns are of the same type
taxi_df_hex['pickup_census'] = taxi_df_hex['pickup_census'].astype(str)
census_tracts['geo_id'] = census_tracts['geo_id'].astype(str)

#Perform the left join to merge the two data frames based on the pickup_census column
taxi_df_hex = pd.merge(taxi_df_hex, census_tracts, left_on='pickup_census', right_on='geo_id', how='left')

# Check the result
taxi_df_hex.head()


Unnamed: 0.1,Unnamed: 0,taxi_id,trip_start,trip_end,trip_seconds,trip_miles,pickup_census,dropoff_census,fare,pickup_location,...,geo_id,census_tract_short,geometry,centroids,h3_4,h3_5,h3_6,h3_7,h3_8,h3_9
0,16,13,2019-01-01 00:00:00,2019-01-01 00:15:00,600.0,0.0,17031081402,17031839100,9.0,POINT (-87.6129454143 41.8919715078),...,17031081402,814.02,"MULTIPOLYGON (((-87.60979 41.89213, -87.60979 ...",POINT (-87.60892 41.89297),842664dffffffff,852664c3fffffff,862664c1fffffff,872664c1cffffff,882664c1c7fffff,892664c1c6bffff
1,18,15,2019-01-01 00:00:00,2019-01-01 00:30:00,1260.0,0.6,17031030800,17031841900,29.5,POINT (-87.6641882421 41.9799124453),...,17031030800,308.0,"MULTIPOLYGON (((-87.66876 41.98349, -87.66853 ...",POINT (-87.66419 41.97991),842664dffffffff,852664dbfffffff,862664d8fffffff,872664d88ffffff,882664d883fffff,892664d8833ffff
2,19,16,2019-01-01 00:00:00,2019-01-01 00:00:00,120.0,0.3,17031839100,17031320400,4.0,POINT (-87.6327464887 41.8809944707),...,17031839100,8391.0,"MULTIPOLYGON (((-87.63581 41.88738, -87.63544 ...",POINT (-87.63275 41.88103),842664dffffffff,852664c3fffffff,862664c1fffffff,872664c1affffff,882664c1a9fffff,892664c1a8bffff
3,20,17,2019-01-01 00:00:00,2019-01-01 00:15:00,360.0,0.8,17031081300,17031081500,5.75,POINT (-87.6207628651 41.8983317935),...,17031081300,813.0,"MULTIPOLYGON (((-87.61665 41.89679, -87.61667 ...",POINT (-87.62075 41.89834),842664dffffffff,852664c3fffffff,862664c1fffffff,872664c1effffff,882664c1e9fffff,892664c1e8fffff
4,22,19,2019-01-01 00:00:00,2019-01-01 00:15:00,360.0,1.0,17031081403,17031081700,6.25,POINT (-87.6188683546 41.8909220259),...,17031081403,814.03,"MULTIPOLYGON (((-87.60953 41.89096, -87.60484 ...",POINT (-87.60997 41.89191),842664dffffffff,852664c3fffffff,862664c1fffffff,872664c1effffff,882664c1ebfffff,892664c1ea7ffff


In [46]:
# Drop the duplicate column geo_id (same as pickup_census) and not needed column census_tract_short
taxi_df_hex = taxi_df_hex.drop(columns=['geo_id', 'census_tract_short'])




In [45]:
# Check the result
taxi_df_hex.head()

Unnamed: 0.1,Unnamed: 0,taxi_id,trip_start,trip_end,trip_seconds,trip_miles,pickup_census,dropoff_census,fare,pickup_location,...,start_time,end_time,geometry,centroids,h3_4,h3_5,h3_6,h3_7,h3_8,h3_9
0,16,13,2019-01-01 00:00:00,2019-01-01 00:15:00,600.0,0.0,17031081402,17031839100,9.0,POINT (-87.6129454143 41.8919715078),...,00:00:00,00:15:00,"MULTIPOLYGON (((-87.60979 41.89213, -87.60979 ...",POINT (-87.60892 41.89297),842664dffffffff,852664c3fffffff,862664c1fffffff,872664c1cffffff,882664c1c7fffff,892664c1c6bffff
1,18,15,2019-01-01 00:00:00,2019-01-01 00:30:00,1260.0,0.6,17031030800,17031841900,29.5,POINT (-87.6641882421 41.9799124453),...,00:00:00,00:30:00,"MULTIPOLYGON (((-87.66876 41.98349, -87.66853 ...",POINT (-87.66419 41.97991),842664dffffffff,852664dbfffffff,862664d8fffffff,872664d88ffffff,882664d883fffff,892664d8833ffff
2,19,16,2019-01-01 00:00:00,2019-01-01 00:00:00,120.0,0.3,17031839100,17031320400,4.0,POINT (-87.6327464887 41.8809944707),...,00:00:00,00:00:00,"MULTIPOLYGON (((-87.63581 41.88738, -87.63544 ...",POINT (-87.63275 41.88103),842664dffffffff,852664c3fffffff,862664c1fffffff,872664c1affffff,882664c1a9fffff,892664c1a8bffff
3,20,17,2019-01-01 00:00:00,2019-01-01 00:15:00,360.0,0.8,17031081300,17031081500,5.75,POINT (-87.6207628651 41.8983317935),...,00:00:00,00:15:00,"MULTIPOLYGON (((-87.61665 41.89679, -87.61667 ...",POINT (-87.62075 41.89834),842664dffffffff,852664c3fffffff,862664c1fffffff,872664c1effffff,882664c1e9fffff,892664c1e8fffff
4,22,19,2019-01-01 00:00:00,2019-01-01 00:15:00,360.0,1.0,17031081403,17031081700,6.25,POINT (-87.6188683546 41.8909220259),...,00:00:00,00:15:00,"MULTIPOLYGON (((-87.60953 41.89096, -87.60484 ...",POINT (-87.60997 41.89191),842664dffffffff,852664c3fffffff,862664c1fffffff,872664c1effffff,882664c1ebfffff,892664c1ea7ffff


<h3>Further feature engineering</h3>

<h4>Distance to city center</h4>

In [47]:
def calculate_distance(lat1, lon1, lat2, lon2):
    # Convert decimal degrees to radians
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    
    # Radius of the Earth in kilometers
    radius = 6371
    # Calculate the distance
    distance = radius * c
    
    # if the first coordinate is south of the second one, negate the distance
    if lat1 < lat2:
        distance *=-1

    return distance

# City center location:
center_lat = 41.879519
center_lon = -87.633026

<h4>Adding demand: New data frames for each spatial resolution</h4>

In [49]:
def calculate_hourly_demand(input_dataframe, census_tract_column):
    input_df = input_dataframe.copy()
    input_df['start_time'] = pd.to_datetime(input_df['start_time'])
    
    result_data = []

    start_time_min = input_df['start_time'].min().replace(minute=0, second=0)
    start_time_max = input_df['start_time'].max().replace(minute=0, second=0) + pd.Timedelta(hours=1)
    
    time_range = pd.date_range(start=start_time_min, end=start_time_max, freq=TIME_RESOLUTION)
    df_time_range = pd.DataFrame({'start_time': time_range})
    
    # Iterate through unique census tract IDs
    unique_census_tracts = input_df[census_tract_column].unique()
    for census_tract in unique_census_tracts:
        temp_df = input_df[input_df[census_tract_column] == census_tract]
        temp_resampled = temp_df.resample(TIME_RESOLUTION, on='start_time').size().reset_index(name='demand')
        
        # Merge the two DataFrames to ensure all time steps are included
        temp_resampled = df_time_range.merge(temp_resampled, on=['start_time'], how='left').fillna(0)
        temp_resampled['spatial_unit_id'] = census_tract

        result_data.append(temp_resampled)
    
    # Concatenate the result dataframes
    result_df = pd.concat(result_data, ignore_index=True)
    
    return result_df

# Defining time resolution (e.g., "H" for hourly)
TIME_RESOLUTION = 'H'

# caluclating demand for census and different h3 resolutions
df_demand_census = calculate_hourly_demand(taxi_df_census, 'pickup_census')
df_demand_h3_4 = calculate_hourly_demand(taxi_df_hex, 'h3_4')
df_demand_h3_5 = calculate_hourly_demand(taxi_df_hex, 'h3_5')
df_demand_h3_6 = calculate_hourly_demand(taxi_df_hex, 'h3_6')
df_demand_h3_7 = calculate_hourly_demand(taxi_df_hex, 'h3_7')
df_demand_h3_8 = calculate_hourly_demand(taxi_df_hex, 'h3_8')
df_demand_h3_9 = calculate_hourly_demand(taxi_df_hex, 'h3_9')

In [52]:
# Define the time resolution (e.g., "H" for hourly)
TIME_RESOLUTION = 'H'

# List of h3 columns to process
h3_columns = ['h3_4', 'h3_5', 'h3_6', 'h3_7', 'h3_8', 'h3_9']

# Dictionary to store the demand DataFrames for different h3 levels
df_demand_h3 = {}

# Loop through each h3 column and calculate the demand
for h3_col in h3_columns:
    df_demand_h3[h3_col] = calculate_hourly_demand(taxi_df_hex, h3_col)




In [54]:
# Example: Access the DataFrame for h3_4
print(df_demand_h3['h3_4'].head())

# If you need to concatenate all h3 demand DataFrames into one:
df_demand_all_h3 = pd.concat(df_demand_h3.values(), ignore_index=True)

           start_time    demand  spatial_unit_id
0 2024-08-12 00:00:00  297246.0  842664dffffffff
1 2024-08-12 01:00:00  241950.0  842664dffffffff
2 2024-08-12 02:00:00  196851.0  842664dffffffff
3 2024-08-12 03:00:00  153392.0  842664dffffffff
4 2024-08-12 04:00:00  116405.0  842664dffffffff


In [55]:
df_demand_all_h3

Unnamed: 0,start_time,demand,spatial_unit_id
0,2024-08-12 00:00:00,297246.0,842664dffffffff
1,2024-08-12 01:00:00,241950.0,842664dffffffff
2,2024-08-12 02:00:00,196851.0,842664dffffffff
3,2024-08-12 03:00:00,153392.0,842664dffffffff
4,2024-08-12 04:00:00,116405.0,842664dffffffff
...,...,...,...
38295,2024-08-12 20:00:00,4.0,8926641948fffff
38296,2024-08-12 21:00:00,4.0,8926641948fffff
38297,2024-08-12 22:00:00,1.0,8926641948fffff
38298,2024-08-12 23:00:00,3.0,8926641948fffff
