In [13]:
from datetime import datetime
import local_helpers as lh
import vaex
import h3
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import shapely
from shapely import wkt
from shapely.geometry import Point, MultiPolygon


# Feature Engineering

### Previous demand as input

As we have given time series data, it is a common approach to use the demand of previous hours (or days etc.) as an input for the prediction. The assumption we hereby make is that the factors that influence the demand have not changed dramatically within the used time frames. We have decided to construct the following features from previous demand:

* 2 hour: The asssumption is that the demand should not change dramatically between three hours.
* 24 hours: The asssumption is that the current demand should be comparable to the demand exactly one day ago, as factors such as season, time of the day are the same.
* Average demand of the past week at the same day time: This feature is the average of all 7 demand observations of the past week at same time of the day. 

In [2]:
# This functions loads the dataset either with hexagons or census tract
def load_dataset_to_pandas(resolution=10, location_type_hexagon = True):
    df = vaex.open('./data/trips_prepared.hdf5')
    df.head()

    df["trip_start_day"] = df.trip_start_timestamp.dt.day
    df["trip_start_month"] = df.trip_start_timestamp.dt.month
    df["trip_start_hour"] = df.trip_start_timestamp.dt.hour
    df["trip_start_minute"] = df.trip_start_timestamp.dt.minute

    if location_type_hexagon:
        RESOLUTION = resolution
        def geo_to_h3(row1, row2):
            return h3.geo_to_h3(row1,row2, RESOLUTION)

        # Step 1: For each pickup and drop-off calculate the correct hexagon in the resolution
        df['pickup_loc'] = df.apply(geo_to_h3, [df['pickup_centroid_latitude'], df['pickup_centroid_longitude']])
    else:
        df.rename('pickup_census_tract', 'pickup_loc')

    ### Group by hour
    df_demand = df.groupby(['trip_start_hour', 'trip_start_month', 'trip_start_day', 'pickup_loc']).agg({'demand': 'count'})
        
    # Add timestamp as preparation for resampling
    df_demand['timestamp'] = pd.to_datetime({'year': 2017, 'month': df_demand['trip_start_month'].to_numpy(), 'day': df_demand['trip_start_day'].to_numpy(), 'hour': df_demand['trip_start_hour'].to_numpy()}).to_numpy()

    # convert to pandas df
    df_demand = df_demand.to_pandas_df()
    
    return df_demand

In [3]:
def resample_to_hourly(df):
    ### Creation of dummy df which contains hourly data dummy data over an entire year per hexagon
    # Create a DateTimeIndex with hourly intervals for the year 2017
    start_date = '2017-01-01 00:00:00'
    end_date = '2017-12-31 23:00:00'
    hourly_range = pd.date_range(start=start_date, end=end_date, freq='H')
    num_entries_per_year = len(hourly_range)
    hour_range = np.tile(hourly_range.hour,len(np.unique(df.pickup_loc)))
    month_range = np.tile(hourly_range.month,len(np.unique(df.pickup_loc)))
    day_range = np.tile(hourly_range.day,len(np.unique(df.pickup_loc)))
    hourly_range = np.tile(hourly_range,len(np.unique(df.pickup_loc)))

    # -1 values will indacte that these rows were artificially generated later on
    data = {
        'trip_start_hour': hour_range,
        'trip_start_month': month_range,
        'trip_start_day': day_range,
        'pickup_loc': np.repeat(np.unique(df.pickup_loc), num_entries_per_year),
        'demand': 0,
    }

    df_hourly = pd.DataFrame(data, index=hourly_range)
    df_hourly= df_hourly.set_index([df_hourly.index, 'pickup_loc'])

    # introduce multiindex for filling up the df with hourly index later on
    df=df.set_index(['timestamp', 'pickup_loc'])

    # insert df
    df_hourly.update(df)

    # clear up multi-index
    df_hourly=df_hourly.reset_index()
    df_hourly.columns = ['timestamp','pickup_loc','trip_start_hour','trip_start_month','trip_start_day','demand']
    return df_hourly
    

In [31]:
# import census tract data
df_census_tracts = pd.read_csv("data/chicago_census_tracts.csv")
df_census_tracts['the_geom'] = df_census_tracts['the_geom'].apply(shapely.wkt.loads)
df_census_tracts = df_census_tracts.rename(columns={'the_geom': 'geometry'})
df_census_tracts = df_census_tracts[["geometry", "GEOID10"]]

gdf = gpd.GeoDataFrame(df_census_tracts, geometry='geometry')
gdf = gdf.set_crs(epsg=4326, allow_override=True)

In [4]:
def get_past_demand(df):
    # insert features 1, 2 and 24 hours previous demand
    df['demand_h-1'] = df.sort_values('timestamp').groupby('pickup_loc')['demand'].shift(1)
    df['demand_h-2'] = df.sort_values('timestamp').groupby('pickup_loc')['demand'].shift(2)
    df['demand_h-24'] = df.sort_values('timestamp').groupby('pickup_loc')['demand'].shift(24)
    return df
    

# Weather features
In the descriptive analysis, particularly the analysis of temporal demand patterns, we found that the temperature and demand curves follow similar directions. Therefore, we construct features based on temperature to enable models that capture this relationship.

### Include weather data
First, we have to include the weather data into the dataframe. For this we just need to merge the two datasets, as both are already in hourly frequency. The weather data propose data for minute 53 of an hour. Therefore, we round up to the nearest hour for each row. We suppose that the weather changes in seven minutes can be disregarded.

In [5]:
def merge_weather(df):
    # read and merge weather data
    df_weather = pd.read_csv('data/weather_data_final.csv')
    df_weather['date_time'] = pd.to_datetime(df_weather['date_time'])
    df_weather['date_time'] = df_weather['date_time'].dt.ceil('H')
    df_weather.rename(columns={'date_time': 'timestamp'}, inplace=True)

    # cast data types
    df_weather['temp'] = df_weather['temp'].str.replace('\xa0°F', '').astype(float).fillna(np.nan)
    df_weather['dew_point'] = df_weather['dew_point'].str.replace('\xa0°F', '').astype(float).fillna(np.nan)
    df_weather['humidity'] = df_weather['humidity'].str.replace('\xa0°%', '').astype(float).fillna(np.nan)
    df_weather['wind_speed'] = df_weather['wind_speed'].str.replace('\xa0°mph', '').astype(float).fillna(np.nan)
    df_weather['wind_gust'] = df_weather['wind_gust'].str.replace('\xa0°mph', '').astype(float).fillna(np.nan)
    df_weather['pressure'] = df_weather['pressure'].str.replace('\xa0°in', '').astype(float).fillna(np.nan)
    df_weather['precip'] = df_weather['precip'].str.replace('\xa0°in', '').astype(float).fillna(np.nan)
    df = df.merge(df_weather, on='timestamp', how='left')
    return df

### Temperature features
In addition to the current temperature, we are add the temperature from 1, 2, and 3 hours prior to the time of taxi demand. We suggest that past temperature conditions could potentially impact the decision to hire a taxi.

In [6]:
def get_past_temperature(df):
    df['temp_h-1'] = df.sort_values('timestamp').groupby('pickup_loc')['temp'].shift(1)
    df['temp_h-2'] = df.sort_values('timestamp').groupby('pickup_loc')['temp'].shift(2)
    df['temp_h-3'] = df.sort_values('timestamp').groupby('pickup_loc')['temp'].shift(3)
    return df

### Precipitation
We hypothesize that precipitation has a significant impact on demand. Therefore, we construct features that describe whether it has rained in the last 1-3 hours.

In [7]:
def get_past_precip(df):
    df['precip_h-1'] = df.sort_values('timestamp').groupby('pickup_loc')['precip'].shift(1)
    df['precip_h-2'] = df.sort_values('timestamp').groupby('pickup_loc')['precip'].shift(2)
    df['precip_h-3'] = df.sort_values('timestamp').groupby('pickup_loc')['precip'].shift(3)
    return df

### Season
We suggest that demand changes over seasons.

In [8]:
# long loading time
def determine_season(row):
    if datetime(2017, 12, 22) <= row.timestamp or row.timestamp < datetime(2017, 3, 20):
        return 'winter'
    elif datetime(2017, 3, 20) <= row.timestamp < datetime(2017, 6, 21):
        return 'spring'
    elif datetime(2017, 6, 21) <= row.timestamp < datetime(2017, 9, 23):
        return 'summer'
    else:
        return 'autumn'

def get_season(df):
    df['season'] = df.apply((lambda x: determine_season(x)), axis=1)
    df = pd.get_dummies(df, columns=['season'])
    return df

### Weekend feature
In the descriptive analysis we have seen that demand changes depending on weekend or not. Hence we engineer a feature "weekend" which is 1 for all rides on saturday & sunday and zero for all other days.

In [9]:
def get_weekend(df):
    df['weekend'] = df.apply((lambda x: 0 if x.timestamp.weekday() < 5 else 1), axis=1)
    return df

### Daytime features
In addition, descriptive analysis has shown that the time of day, i.e., whether it is night, morning, afternoon, or evening, plays an important role in determining demand. Therefore, we developed four characteristics, each indicating whether a trip occurs during the following times.
* Morning: 6 a.m. - 12 p.m.
* Afternoon: 12 noon - 6 p.m.
* Evening: 6 p.m. - 11 p.m.
* Night: 12 a.m. - 6 a.m.

In [10]:
def get_daytime(df):
    df['daytime'] = df["timestamp"].apply(lambda x: lh.get_pnt_day_with_pnt_week(x))
    df = pd.get_dummies(df, columns=['daytime'])
    return df

### Event Features
Events such as public holidays might influence the demand. In order to capture these patterns we introduce the public holiday feature

There were several public holidays in Boston:
* Martin Luther King Day: Monday, January 16, 2017
* Lincoln's Birthday: Monday, February 13, 2017
* Washington's Birthday (President's Day): Monday, February 20, 2017
* Memorial Day: Monday, May 29, 2017
* Independence Day: Tuesday, July 04, 2017
* Labor Day: Monday, September 04, 2017
* Columbus Day: Monday, October 09, 2017
* Veterans' Day: Friday, November 10, 2017
* Thanksgiving Day: Thursday, November 23, 2017
* Thanksgiving Day: Friday, November 24, 2017
* Christmas Day: Monday, December 25, 2017

These events might have influenced the demand

In [11]:
holiday_dates = [
    datetime(2017, 1, 2),   # New Year's Day
    datetime(2017, 1, 16),  # Martin Luther King Day
    datetime(2017, 2, 13),  # Lincoln's Birthday
    datetime(2017, 2, 20),  # Washington's Birthday (President's Day)
    datetime(2017, 5, 29),  # Memorial Day
    datetime(2017, 7, 4),   # Independence Day
    datetime(2017, 9, 4),   # Labor Day
    datetime(2017, 10, 9),  # Columbus Day
    datetime(2017, 11, 10), # Veterans' Day
    datetime(2017, 11, 23), # Thanksgiving Day
    datetime(2017, 11, 24), # Day after Thanksgiving
    datetime(2017, 12, 25), # Christmas Day
]

def get_holiday_dates(df):
    df['public_holiday'] = df.apply((lambda x: 1 if x.timestamp in holiday_dates else 0), axis=1)
    return df

### Neighbor demand feature
We expect a high correlation between the demand of one hexagon and the demand in the surrounding hexagons. With this feature we can observe demand patterns in a greater radius than only in the observed location.

In [16]:
def get_neighbor_demand(df):

    if h3.h3_is_valid(df["pickup_loc"].iloc[0]):
        pickup_locs = df["pickup_loc"].unique()
        neighbors_dict = {}
        for loc in pickup_locs:
            neighbors_dict[loc] = h3.k_ring(loc, k=1)
    else:

        buffer_distance = 500  # buffer distance in meters (500 m here)
        gdf_buffered = gdf.copy()
        gdf_buffered['geometry'] = gdf.buffer(buffer_distance)
        gdf['geometry'] = gdf_buffered.geometry  # Replace original geometry with buffered one
        census_tract_neighbors = gpd.sjoin(gdf, gdf, how='left', predicate='intersects')
        
        # Remove self-matches (where polygons intersect with themselves)
        census_tract_neighbors = census_tract_neighbors[census_tract_neighbors.index != census_tract_neighbors.index_right]
        census_tract_neighbors_grouped = census_tract_neighbors.groupby('GEOID10_left')['GEOID10_right'].apply(list)
        neighbors_dict = census_tract_neighbors_grouped.to_dict()
    
    def get_neighbor_demand_by_ID(df_timestamp, pickup_loc):
        neighbors = neighbors_dict[pickup_loc]
        neighbor_demand = df_timestamp[df_timestamp['pickup_loc'].isin(neighbors)]['demand_h-1'].mean()
        return neighbor_demand

    df['neighbor_demand_h-1'] = df.apply(lambda row: get_neighbor_demand_by_ID(df[df['timestamp'] == row['timestamp']], row['pickup_loc']), axis=1)
    return df

### Point Of Interest Feature
We expect a correlation between the taxi demand and certain points of interest in Chicago, e.g. airports, train stations etc.

In [213]:
df_poi = load_dataset_to_pandas(location_type_hexagon=True, resolution=5)
df_poi = resample_to_hourly(df_poi)
df_poi.head(1)

Unnamed: 0,timestamp,pickup_loc,trip_start_hour,trip_start_month,trip_start_day,demand
0,2017-01-01,85266453fffffff,0.0,1.0,1.0,1.0


In [217]:
import requests

query = """
[out:json];
area["name"="Chicago"]["admin_level"="2"];
node["amenity"="hospital"](area);
out;
"""

url = "https://overpass-api.de/api/interpreter"
response = requests.post(url, data=query)

if response.status_code == 200:
    json_data = response.json()
    # process the JSON data as needed
else:
    print("Error:", response.status_code)


In [218]:
print(json_data)

{'version': 0.6, 'generator': 'Overpass API 0.7.61.4 df4c946a', 'osm3s': {'timestamp_osm_base': '2023-08-04T16:40:19Z', 'timestamp_areas_base': '2023-08-04T11:25:41Z', 'copyright': 'The data included in this document is from www.openstreetmap.org. The data is made available under ODbL.'}, 'elements': []}


In [214]:
airports = [
                (41.97924019065309, -87.90661241904155), # Chicago O’Hare International Airport
                (41.78710495069415, -87.74984855079984) # Chicago Midway Airport
            ]

train_stations = [
                (41.87883345977425, -87.64027881617824), # Chicago Union Station
                (41.88335453890163, -87.64009493575466), # Ogilvie Transportation Center
                (41.8845718845907, -87.62431116459061), # Millennium Station
                (41.87544369615934, -87.63208332701406), # LaSalle Street
                (41.87726330360876, -87.62268573680238), # Van Buren Street
                (41.56218299780981, -87.667482422276), # Homewood Amtrak Station
                (41.52476507667369, -88.078927942358) # Joliet Union Station
            ]

bus_station = [
    (41.87468081872225, -87.64272876273913), # Greyhound Terminal
    (41.872045729513, -87.63996133152241), # Megabus Stop
    (41.87883345977425, -87.64027881617824), # Chicago Union Station
]

hotels = [
    (41.88808447010053, -87.62033102433828), # Hyatt Regency Chicago
    (41.88067758292453, -87.6260833940384), # Palmer House a Hilton Hotel
    (41.87236302899726, -87.62432049342709), # Hotel Hilton Chicago
    (41.88923470157756, -87.61947060453758), # Sheraton Grand Chicago Riverwalk
    (41.892276829520576, -87.62463834871738) # Chicago Marriott Downtown Magnificent Mile
]

hospitals = [
    (41.937145804630454, -87.65093723222746), # Advocate Illinois Masonic Medical Center
    (41.97336360281823, -87.65143250715799), # 
]

def get_distance_to_nearest_airport(df):

    def get_min_distance(pickup_loc):
        
        if h3.h3_is_valid(df["pickup_loc"].iloc[0]):
            distances = []
            airports = [
                (41.97924019065309, -87.90661241904155), # Chicago O’Hare International Airport
                (41.78710495069415, -87.74984855079984) # Chicago Midway Airport
            ]
            for airport in airports:
                distances.append(h3.point_dist(airport, h3.h3_to_geo(pickup_loc), unit='km'))
            return min(distances)
        else:
            airports = [
                Point(-87.90661241904155, 41.97924019065309), # Chicago O’Hare International Airport
                Point(-87.74984855079984, 41.78710495069415)  # Chicago Midway Airport
            ]

            airport_points = gpd.GeoSeries(airports, crs="EPSG:4326")
            airport_points = airport_points.to_crs("EPSG:3857")

            tract_geometry = df_census_tracts[df_census_tracts["GEOID10"]==pickup_loc]["geometry"].iloc[0]
            census_tract = gpd.GeoSeries([tract_geometry], crs="EPSG:4326")
            census_tract = census_tract.to_crs("EPSG:3857")

            distances = airport_points.distance(census_tract.iloc[0])
            return distances.min() / 1000 # convert in km

    unique_pickup_locs = df['pickup_loc'].unique()
    unique_distances = {pickup_loc: get_min_distance(pickup_loc) for pickup_loc in unique_pickup_locs}
    df['distance_to_nearest_airport'] = df['pickup_loc'].map(unique_distances)
    return df

In [215]:
get_distance_to_nearest_airport(df_poi)

Unnamed: 0,timestamp,pickup_loc,trip_start_hour,trip_start_month,trip_start_day,demand,distance_to_nearest_airport
0,2017-01-01 00:00:00,85266453fffffff,0.0,1.0,1.0,1.0,7.436425
1,2017-01-01 01:00:00,85266453fffffff,1.0,1.0,1.0,1.0,7.436425
2,2017-01-01 02:00:00,85266453fffffff,2.0,1.0,1.0,0.0,7.436425
3,2017-01-01 03:00:00,85266453fffffff,3.0,1.0,1.0,0.0,7.436425
4,2017-01-01 04:00:00,85266453fffffff,4.0,1.0,1.0,0.0,7.436425
...,...,...,...,...,...,...,...
52555,2017-12-31 19:00:00,85275937fffffff,19.0,12.0,31.0,55.0,3.008351
52556,2017-12-31 20:00:00,85275937fffffff,20.0,12.0,31.0,23.0,3.008351
52557,2017-12-31 21:00:00,85275937fffffff,21.0,12.0,31.0,15.0,3.008351
52558,2017-12-31 22:00:00,85275937fffffff,22.0,12.0,31.0,13.0,3.008351


In [211]:
def get_distance_to_nearest_train_station(df):

    def get_min_distance(pickup_loc):
        
        if h3.h3_is_valid(df["pickup_loc"].iloc[0]):
            distances = []
            train_stations = [
                (41.87883345977425, -87.64027881617824), # Chicago Union Station
                (41.88335453890163, -87.64009493575466), # Ogilvie Transportation Center
                (41.8845718845907, -87.62431116459061), # Millennium Station
                (41.87544369615934, -87.63208332701406), # LaSalle Street
                (41.87726330360876, -87.62268573680238), # Van Buren Street
                (41.56218299780981, -87.667482422276), # Homewood Amtrak Station
                (41.52476507667369, -88.078927942358) # Joliet Union Station
            ]
            for train_station in train_stations:
                distances.append(h3.point_dist(train_station, h3.h3_to_geo(pickup_loc), unit='km'))
            return min(distances)
        else:
            train_stations = [
                Point(-87.64027881617824, 41.87883345977425), # Chicago Union Station
                Point(-87.64009493575466, 41.88335453890163), # Ogilvie Transportation Center
                Point(-87.62431116459061, 41.8845718845907), # Millennium Station
                Point(-87.63208332701406, 41.87544369615934), # LaSalle Street
                Point(-87.62268573680238, 41.87726330360876), # Van Buren Street
                Point(-87.667482422276, 41.56218299780981), # Homewood Amtrak Station
                Point(-88.078927942358, 41.52476507667369) # Joliet Union Station
            ]

            train_station_points = gpd.GeoSeries(train_stations, crs="EPSG:4326")
            train_station_points = train_station_points.to_crs("EPSG:3857")

            tract_geometry = df_census_tracts[df_census_tracts["GEOID10"]==pickup_loc]["geometry"].iloc[0]
            census_tract = gpd.GeoSeries([tract_geometry], crs="EPSG:4326")
            census_tract = census_tract.to_crs("EPSG:3857")

            distances = train_station_points.distance(census_tract.iloc[0])
            return distances.min() / 1000 # convert in km

    unique_pickup_locs = df['pickup_loc'].unique()
    unique_distances = {pickup_loc: get_min_distance(pickup_loc) for pickup_loc in unique_pickup_locs}
    df['distance_to_nearest_train_station'] = df['pickup_loc'].map(unique_distances)
    return df

In [216]:
get_distance_to_nearest_train_station(df_poi)

Unnamed: 0,timestamp,pickup_loc,trip_start_hour,trip_start_month,trip_start_day,demand,distance_to_nearest_airport,distance_to_nearest_train_station
0,2017-01-01 00:00:00,85266453fffffff,0.0,1.0,1.0,1.0,7.436425,20.909254
1,2017-01-01 01:00:00,85266453fffffff,1.0,1.0,1.0,1.0,7.436425,20.909254
2,2017-01-01 02:00:00,85266453fffffff,2.0,1.0,1.0,0.0,7.436425,20.909254
3,2017-01-01 03:00:00,85266453fffffff,3.0,1.0,1.0,0.0,7.436425,20.909254
4,2017-01-01 04:00:00,85266453fffffff,4.0,1.0,1.0,0.0,7.436425,20.909254
...,...,...,...,...,...,...,...,...
52555,2017-12-31 19:00:00,85275937fffffff,19.0,12.0,31.0,55.0,3.008351,26.337889
52556,2017-12-31 20:00:00,85275937fffffff,20.0,12.0,31.0,23.0,3.008351,26.337889
52557,2017-12-31 21:00:00,85275937fffffff,21.0,12.0,31.0,15.0,3.008351,26.337889
52558,2017-12-31 22:00:00,85275937fffffff,22.0,12.0,31.0,13.0,3.008351,26.337889


In [17]:
def get_prepared_data(location_type_hexagon=True, resolution=10):
    df = load_dataset_to_pandas(location_type_hexagon=location_type_hexagon, resolution=resolution)
    df = resample_to_hourly(df)
    df = get_past_demand(df)
    df = merge_weather(df)
    df = get_past_temperature(df)
    df = get_past_precip(df)
    df = get_season(df)
    df = get_weekend(df)
    df = get_daytime(df)
    df = get_holiday_dates(df)
    df = get_neighbor_demand(df)

    return df

In [18]:
df_demand = get_prepared_data(resolution=4, location_type_hexagon=True)
df_demand.head()

Unnamed: 0,timestamp,pickup_loc,trip_start_hour,trip_start_month,trip_start_day,demand,demand_h-1,demand_h-2,demand_h-24,date,...,daytime_afternoon_week,daytime_afternoon_weekend,daytime_evening_week,daytime_evening_weekend,daytime_morning_week,daytime_morning_weekend,daytime_night_week,daytime_night_weekend,public_holiday,neighbor_demand_h-1
0,2017-01-01 00:00:00,842664dffffffff,0.0,1.0,1.0,1813.0,,,,,...,0,0,0,0,0,0,0,1,0,
1,2017-01-01 01:00:00,842664dffffffff,1.0,1.0,1.0,2738.0,1813.0,,,2017-01-01,...,0,0,0,0,0,0,0,1,0,915.5
2,2017-01-01 02:00:00,842664dffffffff,2.0,1.0,1.0,2758.0,2738.0,1813.0,,2017-01-01,...,0,0,0,0,0,0,0,1,0,1371.5
3,2017-01-01 03:00:00,842664dffffffff,3.0,1.0,1.0,1889.0,2758.0,2738.0,,2017-01-01,...,0,0,0,0,0,0,0,1,0,1379.0
4,2017-01-01 04:00:00,842664dffffffff,4.0,1.0,1.0,1051.0,1889.0,2758.0,,2017-01-01,...,0,0,0,0,0,0,0,1,0,944.5


# Correlation Analysis
We have different measurement scales:

Ordinal (natural order, but no quantifiable difference between values or binary):
- season_x 
- daytime_x
- hour_of_day
Metric (equidistant scale):
- temp
- demand
- precipitation
Nominal:
- public_holiday
- weekend

We do 2 different analysis:
- Metric <-> Metric (Pearson)
- Ordinal & Nominal <-> Metric, Ordinal & Nominal <-> Ordinal & Nominal (Spearman)

In [None]:
metric = df_demand[["demand", "temp", "temp_h-1", "temp_h-2", "temp_h-3", "demand_h-1", "demand_h-2", "demand_h-24"]]
sns.heatmap(metric.corr(method="pearson"), annot=True, cmap="RdYlGn")

In [None]:
ordinal_and_nominal = df_demand[
        ["season_autumn", "season_spring", "season_summer", "season_winter", "daytime_morning_week",
        "daytime_afternoon_week", "daytime_evening_week", "daytime_night_week", "daytime_morning_weekend",
        "daytime_afternoon_weekend", "daytime_evening_weekend", "daytime_night_weekend", "public_holiday",
        "weekend"]] #"hour"
plt.figure(figsize=(20, 20))
sns.heatmap(pd.concat([ordinal_and_nominal, metric], axis=1).corr(method="spearman"), annot=True, cmap="RdYlGn")

# Prediction Models

## Split the dataset

In [None]:
import sklearn
def train_validation_test_split(X, y, train_ratio=0.6, validation_ratio=0.2, test_ratio=0.2, random_state=None):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=random_state)

    # Calculate the remaining ratio after train-test split
    remaining_ratio = 1.0 - test_ratio

    # Calculate the proportional validation ratio based on the remaining ratio
    validation_ratio_prop = validation_ratio / remaining_ratio

    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train, test_size=validation_ratio_prop, random_state=random_state
    )

    return X_train, X_validation, X_test, y_train, y_validation, y_test