In [2]:
import numpy as np
import pandas as pd
import pyarrow as pa
#Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the business licence records from CSV; the trailing expression renders the frame.
business_df = pd.read_csv(
    'model_business_df_2.csv',
    sep=',',
    skipinitialspace=True,
    keep_default_na=True,
)
business_df

Unnamed: 0,taxi_zone,business_type,License Expiration Date,License Creation Date,Counts
0,3,entertainment_and_recreation,2024-03-31,1997-08-01,1
1,3,entertainment_and_recreation,2025-1-1,2012-01-04,1
2,3,parking_and_automotive_services,2022-10-31,2017-02-03,1
3,3,parking_and_automotive_services,2022-10-31,2019-01-08,1
4,3,parking_and_automotive_services,2022-10-31,2019-01-23,1
...,...,...,...,...,...
60753,263,transportation,2023-11-01,2023-06-05,1
60754,263,transportation,2024-04-30,2009-11-18,1
60755,263,transportation,2024-04-30,2009-11-25,1
60756,263,transportation,2024-04-30,2021-11-18,1


In [3]:
# Inspect the dtype of each column of the loaded business data
business_df.dtypes

taxi_zone                   int64
business_type              object
License Expiration Date    object
License Creation Date      object
Counts                      int64
dtype: object

In [4]:
# Load the taxi records from parquet and render them
taxi_df = pd.read_parquet(path="basic_taxi_df_2.2.parquet")
taxi_df

Unnamed: 0,taxi_zone,datetime,passenger_count,year_month,week,hour,borough
0,1,2022-01-01 00:00:00,0.0,2022-01,5,0,EWR
1,1,2022-01-01 01:00:00,0.0,2022-01,5,1,EWR
2,1,2022-01-01 02:00:00,2.0,2022-01,5,2,EWR
3,1,2022-01-01 03:00:00,1.0,2022-01,5,3,EWR
4,1,2022-01-01 04:00:00,8.0,2022-01,5,4,EWR
...,...,...,...,...,...,...,...
3038035,263,2023-04-30 19:00:00,247.0,2023-04,6,19,Manhattan
3038036,263,2023-04-30 20:00:00,242.0,2023-04,6,20,Manhattan
3038037,263,2023-04-30 21:00:00,210.0,2023-04,6,21,Manhattan
3038038,263,2023-04-30 22:00:00,152.0,2023-04,6,22,Manhattan


# connect taxi data and business data, match the time range

In [5]:
# Pivot business_df so that every business_type becomes its own column of summed 'Counts',
# with one row per (taxi_zone, License Creation Date, License Expiration Date).
# Combinations that have no licences of a given type are filled with 0.
pivot_df = pd.pivot_table(
    business_df,
    values='Counts',
    index=['taxi_zone', 'License Creation Date', 'License Expiration Date'],
    columns=['business_type'],
    aggfunc='sum',  # string alias; passing the np.sum callable is deprecated in recent pandas
    fill_value=0,
).reset_index()

# Order the rows by 'License Creation Date'. Reassigning (rather than inplace=True) keeps
# the cell safe to re-run and avoids mutating a frame behind the reader's back.
pivot_df = pivot_df.sort_values('License Creation Date')
pivot_df

business_type,taxi_zone,License Creation Date,License Expiration Date,entertainment_and_recreation,financial_services,food_and_beverage,parking_and_automotive_services,professional_services,real_estate,retail_services,transportation
41162,181,1989-10-25,2023-12-31,0,0,0,0,0,1,0,0
35086,157,1989-10-25,2023-12-31,0,0,0,0,0,1,0,0
19200,82,1993-12-22,2024-10-31,0,0,0,1,0,0,0,0
6729,26,1994-01-05,2024-03-31,0,0,0,0,0,0,1,0
28689,129,1994-01-19,2024-10-31,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5943,22,2023-06-08,2025-02-28,0,0,0,0,0,1,0,0
51506,226,2023-06-08,2024-04-30,0,0,0,0,0,0,0,1
27037,118,2023-06-08,2024-06-30,0,0,0,0,1,0,0,0
11193,45,2023-06-08,2025-02-28,0,0,0,0,0,1,0,0


In [6]:
# Work on a copy of the taxi data and append one zero-initialised counter column per
# business type; the counters are overwritten by the time-range matching further down.
combine_df = taxi_df.copy().assign(**{
    business_col: 0
    for business_col in (
        'entertainment_and_recreation',
        'financial_services',
        'food_and_beverage',
        'parking_and_automotive_services',
        'professional_services',
        'real_estate',
        'retail_services',
        'transportation',
    )
})

combine_df

Unnamed: 0,taxi_zone,datetime,passenger_count,year_month,week,hour,borough,entertainment_and_recreation,financial_services,food_and_beverage,parking_and_automotive_services,professional_services,real_estate,retail_services,transportation
0,1,2022-01-01 00:00:00,0.0,2022-01,5,0,EWR,0,0,0,0,0,0,0,0
1,1,2022-01-01 01:00:00,0.0,2022-01,5,1,EWR,0,0,0,0,0,0,0,0
2,1,2022-01-01 02:00:00,2.0,2022-01,5,2,EWR,0,0,0,0,0,0,0,0
3,1,2022-01-01 03:00:00,1.0,2022-01,5,3,EWR,0,0,0,0,0,0,0,0
4,1,2022-01-01 04:00:00,8.0,2022-01,5,4,EWR,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3038035,263,2023-04-30 19:00:00,247.0,2023-04,6,19,Manhattan,0,0,0,0,0,0,0,0
3038036,263,2023-04-30 20:00:00,242.0,2023-04,6,20,Manhattan,0,0,0,0,0,0,0,0
3038037,263,2023-04-30 21:00:00,210.0,2023-04,6,21,Manhattan,0,0,0,0,0,0,0,0
3038038,263,2023-04-30 22:00:00,152.0,2023-04,6,22,Manhattan,0,0,0,0,0,0,0,0


In [7]:
# Parse the taxi timestamps and both licence date columns into datetimes so they can be
# compared with each other in the matching step.
combine_df['datetime'] = combine_df['datetime'].pipe(pd.to_datetime)
pivot_df[['License Creation Date', 'License Expiration Date']] = (
    pivot_df[['License Creation Date', 'License Expiration Date']].apply(pd.to_datetime)
)

# The code below matches the time range and takes a long time to run. Its result has already been saved and is read back in later.

In [8]:
# For every taxi row, count the licences of each business type that are active in the row's
# taxi_zone at the row's datetime, i.e. those with
#     License Creation Date <= datetime <= License Expiration Date   (both bounds inclusive).
#
# This produces the same counts as scanning pivot_df once per taxi row, but does the work
# per zone with cumulative sums and binary searches instead of a Python loop over all rows.
# NOTE(review): licence dates carry no time of day, so a licence only counts at 00:00 on its
# expiration date and not for the rest of that day; confirm this is the intended rule.

def _count_active_licenses(times, licenses, value_cols):
    """Return the per-column sum of licences active at each timestamp in ``times``.

    Parameters
    ----------
    times : numpy datetime64 array of the timestamps to evaluate.
    licenses : DataFrame holding 'License Creation Date', 'License Expiration Date' and the
        ``value_cols`` count columns. Every row must satisfy creation <= expiration.
    value_cols : list of count column names to total.

    Returns
    -------
    int64 array of shape (len(times), len(value_cols)).
    """
    by_start = licenses.sort_values('License Creation Date')
    by_end = licenses.sort_values('License Expiration Date')

    # Running totals with a leading zero row, so index k means "the first k licences".
    zero_row = np.zeros((1, len(value_cols)), dtype=np.int64)
    started_totals = np.vstack(
        [zero_row, by_start[value_cols].to_numpy(dtype=np.int64).cumsum(axis=0)]
    )
    ended_totals = np.vstack(
        [zero_row, by_end[value_cols].to_numpy(dtype=np.int64).cumsum(axis=0)]
    )

    start_dates = by_start['License Creation Date'].to_numpy(dtype='datetime64[ns]')
    end_dates = by_end['License Expiration Date'].to_numpy(dtype='datetime64[ns]')

    # side='right' counts licences with creation <= t;
    # side='left' counts licences with expiration < t (already expired at t).
    n_started = np.searchsorted(start_dates, times, side='right')
    n_ended = np.searchsorted(end_dates, times, side='left')

    # active = started so far - already expired (valid because creation <= expiration).
    return started_totals[n_started] - ended_totals[n_ended]


business_cols = [
    'entertainment_and_recreation',
    'financial_services',
    'food_and_beverage',
    'parking_and_automotive_services',
    'professional_services',
    'real_estate',
    'retail_services',
    'transportation',
]

# Licences with a missing date or with expiration before creation can never satisfy the
# inclusive range test, so they are dropped up front (they contribute nothing either way).
valid_licenses = pivot_df.dropna(subset=['License Creation Date', 'License Expiration Date'])
valid_licenses = valid_licenses[
    valid_licenses['License Creation Date'] <= valid_licenses['License Expiration Date']
]
licenses_by_zone = {zone: rows for zone, rows in valid_licenses.groupby('taxi_zone')}

event_times = combine_df['datetime'].to_numpy(dtype='datetime64[ns]')
active_counts = np.zeros((len(combine_df), len(business_cols)), dtype=np.int64)

# .indices maps each taxi_zone to the positional row numbers it occupies in combine_df.
for zone, row_positions in combine_df.groupby('taxi_zone').indices.items():
    zone_licenses = licenses_by_zone.get(zone)
    if zone_licenses is None:
        continue  # no licences recorded for this zone: its counts stay 0
    active_counts[row_positions] = _count_active_licenses(
        event_times[row_positions], zone_licenses, business_cols
    )

for col_pos, business_col in enumerate(business_cols):
    combine_df[business_col] = active_counts[:, col_pos]
combine_df

Unnamed: 0,taxi_zone,datetime,passenger_count,year_month,week,hour,borough,entertainment_and_recreation,financial_services,food_and_beverage,parking_and_automotive_services,professional_services,real_estate,retail_services,transportation
0,1,2022-01-01 00:00:00,0.0,2022-01,5,0,EWR,0,0,0,0,0,0,0,0
1,1,2022-01-01 01:00:00,0.0,2022-01,5,1,EWR,0,0,0,0,0,0,0,0
2,1,2022-01-01 02:00:00,2.0,2022-01,5,2,EWR,0,0,0,0,0,0,0,0
3,1,2022-01-01 03:00:00,1.0,2022-01,5,3,EWR,0,0,0,0,0,0,0,0
4,1,2022-01-01 04:00:00,8.0,2022-01,5,4,EWR,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3038035,263,2023-04-30 19:00:00,247.0,2023-04,6,19,Manhattan,69,0,0,37,11,116,108,4
3038036,263,2023-04-30 20:00:00,242.0,2023-04,6,20,Manhattan,69,0,0,37,11,116,108,4
3038037,263,2023-04-30 21:00:00,210.0,2023-04,6,21,Manhattan,69,0,0,37,11,116,108,4
3038038,263,2023-04-30 22:00:00,152.0,2023-04,6,22,Manhattan,69,0,0,37,11,116,108,4


In [9]:
# Print the distinct matched counts of every business-type column, one block per column
for business_col in (
    'entertainment_and_recreation',
    'financial_services',
    'food_and_beverage',
    'parking_and_automotive_services',
    'professional_services',
    'real_estate',
    'retail_services',
    'transportation',
):
    print(business_col + ": ", combine_df[business_col].unique().tolist(), "\n")

entertainment_and_recreation:  [0, 2, 48, 38, 39, 40, 41, 1, 11, 9, 61, 49, 50, 51, 52, 3, 4, 5, 27, 22, 23, 8, 14, 15, 10, 30, 31, 16, 7, 6, 12, 13, 25, 17, 18, 19, 20, 21, 26, 28, 24, 94, 82, 83, 84, 85, 86, 87, 88, 89, 59, 58, 35, 36, 37, 32, 33, 34, 29, 47, 57, 45, 46, 42, 43, 44, 69, 70, 60, 62, 63, 64, 65, 66, 67, 97, 98, 78, 79, 80, 81, 76, 77, 68, 71, 72, 73, 74, 53, 75] 

financial_services:  [0, 3, 2, 1, 4, 9, 8, 10, 11, 12, 5, 6, 7, 25, 26, 27, 28, 29, 30, 31, 32] 

food_and_beverage:  [0, 2, 1, 5, 3, 4] 

parking_and_automotive_services:  [0, 30, 31, 32, 33, 34, 24, 25, 26, 27, 28, 29, 5, 6, 22, 23, 35, 36, 37, 8, 12, 13, 14, 11, 7, 1, 19, 20, 21, 38, 39, 10, 16, 15, 17, 18, 40, 41, 42, 53, 54, 55, 56, 45, 46, 48, 49, 50, 51, 4, 2, 9, 43, 44, 47, 3, 52, 70, 71, 72, 73, 67, 68, 69, 76, 57, 58, 59, 60, 61, 62, 63, 64, 74, 75, 77, 78, 79, 80, 81, 82, 83, 65, 66] 

professional_services:  [0, 19, 20, 21, 22, 23, 24, 17, 18, 2, 8, 7, 9, 10, 11, 31, 32, 33, 34, 35, 27, 28, 29, 30

In [10]:
# Spot-check the matching result: for taxi_zone 86 at 2023-03-18 11:00:00 the number of
# real_estate licences should be 60.
# The bounds are inclusive (<= / >=) so the check applies exactly the same rule as the
# matching step, instead of the strict comparisons that would disagree on boundary dates.
check_time = pd.Timestamp('2023-03-18 11:00:00')
mask = (
    (pivot_df['taxi_zone'] == 86) &
    (pivot_df['License Creation Date'] <= check_time) &
    (pivot_df['License Expiration Date'] >= check_time)
)

selected_rows = pivot_df[mask]
sum_real_estate = selected_rows['real_estate'].sum()

# Compare the independently computed total with what was stored in combine_df for that
# zone and hour (skipped when combine_df holds no such row).
matched_value = combine_df.loc[
    (combine_df['taxi_zone'] == 86) & (combine_df['datetime'] == check_time),
    'real_estate',
]
assert matched_value.empty or (matched_value == sum_real_estate).all(), \
    "combine_df disagrees with the direct count for taxi_zone 86"
sum_real_estate


60

In [11]:

# Persist the matched frame (with a fresh 0..n-1 index, index not written) so the slow
# matching step does not have to be repeated.
combine_df.reset_index(drop=True).to_parquet(
    path="model_taxi_business_match_time_2.2.parquet",
    index=False,
)

# Start running from here to load the saved matching result

In [12]:
# Reload the saved taxi/business matching result and render it
combine_df = pd.read_parquet(path="model_taxi_business_match_time_2.2.parquet")
combine_df

Unnamed: 0,taxi_zone,datetime,passenger_count,year_month,week,hour,borough,entertainment_and_recreation,financial_services,food_and_beverage,parking_and_automotive_services,professional_services,real_estate,retail_services,transportation
0,1,2022-01-01 00:00:00,0.0,2022-01,5,0,EWR,0,0,0,0,0,0,0,0
1,1,2022-01-01 01:00:00,0.0,2022-01,5,1,EWR,0,0,0,0,0,0,0,0
2,1,2022-01-01 02:00:00,2.0,2022-01,5,2,EWR,0,0,0,0,0,0,0,0
3,1,2022-01-01 03:00:00,1.0,2022-01,5,3,EWR,0,0,0,0,0,0,0,0
4,1,2022-01-01 04:00:00,8.0,2022-01,5,4,EWR,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3038035,263,2023-04-30 19:00:00,247.0,2023-04,6,19,Manhattan,69,0,0,37,11,116,108,4
3038036,263,2023-04-30 20:00:00,242.0,2023-04,6,20,Manhattan,69,0,0,37,11,116,108,4
3038037,263,2023-04-30 21:00:00,210.0,2023-04,6,21,Manhattan,69,0,0,37,11,116,108,4
3038038,263,2023-04-30 22:00:00,152.0,2023-04,6,22,Manhattan,69,0,0,37,11,116,108,4


# connect hospital dataset

In [13]:
# Load the hospital data and reshape it for the join: the zone key is renamed to
# 'taxi_zone', the count column to 'hospital', and the 'Industry' column is dropped.
hospital_df = (
    pd.read_csv(
        'basic_HospitalData_version_1.csv',
        sep=',',
        skipinitialspace=True,
        keep_default_na=True,
    )
    .rename(columns={'location_id': 'taxi_zone', 'Counts': 'hospital'})
    .drop(columns='Industry')
)

hospital_df

Unnamed: 0,taxi_zone,hospital
0,10,1
1,112,1
2,118,1
3,121,2
4,127,1
5,130,1
6,137,2
7,144,1
8,156,1
9,166,1


In [14]:
# Left-join the per-zone hospital counts onto combine_df so that every taxi row is kept.
# validate='many_to_one' raises if hospital_df has more than one row for a taxi_zone,
# which would otherwise silently duplicate taxi rows.
combine_df = combine_df.merge(hospital_df, on='taxi_zone', how='left', validate='many_to_one')
# Zones with no hospital record come out of the join as NaN; count them as 0.
# Only the new column is filled, so missing values in other columns are not masked.
combine_df['hospital'] = combine_df['hospital'].fillna(0)

In [15]:
# Render combine_df, which now carries the 'hospital' column
combine_df

Unnamed: 0,taxi_zone,datetime,passenger_count,year_month,week,hour,borough,entertainment_and_recreation,financial_services,food_and_beverage,parking_and_automotive_services,professional_services,real_estate,retail_services,transportation,hospital
0,1,2022-01-01 00:00:00,0.0,2022-01,5,0,EWR,0,0,0,0,0,0,0,0,0.0
1,1,2022-01-01 01:00:00,0.0,2022-01,5,1,EWR,0,0,0,0,0,0,0,0,0.0
2,1,2022-01-01 02:00:00,2.0,2022-01,5,2,EWR,0,0,0,0,0,0,0,0,0.0
3,1,2022-01-01 03:00:00,1.0,2022-01,5,3,EWR,0,0,0,0,0,0,0,0,0.0
4,1,2022-01-01 04:00:00,8.0,2022-01,5,4,EWR,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3038035,263,2023-04-30 19:00:00,247.0,2023-04,6,19,Manhattan,69,0,0,37,11,116,108,4,0.0
3038036,263,2023-04-30 20:00:00,242.0,2023-04,6,20,Manhattan,69,0,0,37,11,116,108,4,0.0
3038037,263,2023-04-30 21:00:00,210.0,2023-04,6,21,Manhattan,69,0,0,37,11,116,108,4,0.0
3038038,263,2023-04-30 22:00:00,152.0,2023-04,6,22,Manhattan,69,0,0,37,11,116,108,4,0.0


# connect hotspot dataset

In [16]:
# Load the wifi hotspot counts from CSV and render them
hotspot_df = pd.read_csv(
    'basic_hotspot_df.csv',
    sep=',',
    skipinitialspace=True,
    keep_default_na=True,
)
hotspot_df

Unnamed: 0,location_id,Wifi Type,Counts
0,10,Free Wifi,1
1,10,Limited Free Wifi,16
2,100,Free Wifi,21
3,101,Free Wifi,1
4,102,Free Wifi,1
...,...,...,...
304,94,Free Wifi,9
305,95,Free Wifi,30
306,97,Free Wifi,46
307,98,Free Wifi,2


In [17]:
# Total the 'Counts' of all wifi types per location, then rename the key to 'taxi_zone'
# and the total to 'hotspots' so the frame can be joined on the taxi data.
hotspot_df = (
    hotspot_df
    .groupby('location_id', as_index=False)['Counts']
    .sum()
    .rename(columns={'location_id': 'taxi_zone', 'Counts': 'hotspots'})
)
hotspot_df

Unnamed: 0,taxi_zone,hotspots
0,6,3
1,7,54
2,8,1
3,9,1
4,10,17
...,...,...
221,258,19
222,259,3
223,260,16
224,261,11


In [18]:
# Left-join the per-zone hotspot totals onto combine_df so that every taxi row is kept.
# validate='many_to_one' raises if hotspot_df has more than one row for a taxi_zone,
# which would otherwise silently duplicate taxi rows.
combine_df = combine_df.merge(hotspot_df, on='taxi_zone', how='left', validate='many_to_one')
# Zones with no hotspot record come out of the join as NaN; count them as 0.
# Only the new column is filled, so missing values in other columns are not masked.
combine_df['hotspots'] = combine_df['hotspots'].fillna(0)


In [19]:
# Render combine_df, which now carries the 'hotspots' column
combine_df

Unnamed: 0,taxi_zone,datetime,passenger_count,year_month,week,hour,borough,entertainment_and_recreation,financial_services,food_and_beverage,parking_and_automotive_services,professional_services,real_estate,retail_services,transportation,hospital,hotspots
0,1,2022-01-01 00:00:00,0.0,2022-01,5,0,EWR,0,0,0,0,0,0,0,0,0.0,0.0
1,1,2022-01-01 01:00:00,0.0,2022-01,5,1,EWR,0,0,0,0,0,0,0,0,0.0,0.0
2,1,2022-01-01 02:00:00,2.0,2022-01,5,2,EWR,0,0,0,0,0,0,0,0,0.0,0.0
3,1,2022-01-01 03:00:00,1.0,2022-01,5,3,EWR,0,0,0,0,0,0,0,0,0.0,0.0
4,1,2022-01-01 04:00:00,8.0,2022-01,5,4,EWR,0,0,0,0,0,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3038035,263,2023-04-30 19:00:00,247.0,2023-04,6,19,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0
3038036,263,2023-04-30 20:00:00,242.0,2023-04,6,20,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0
3038037,263,2023-04-30 21:00:00,210.0,2023-04,6,21,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0
3038038,263,2023-04-30 22:00:00,152.0,2023-04,6,22,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0


# connect school dataset

In [20]:

# Load the education facility counts from CSV and render them
school_df = pd.read_csv(
    'basic_EducationData.csv',
    sep=',',
    skipinitialspace=True,
    keep_default_na=True,
)
school_df

Unnamed: 0,Taxi_zone,Educational Facility,Counts
0,10,Education,8
1,101,Education,6
2,102,Education,3
3,103,Education,1
4,106,Education,1
...,...,...,...
218,92,Education,11
219,94,Education,12
220,95,Education,12
221,97,Education,12


In [21]:
# Reshape for the join: key column becomes 'taxi_zone', the count becomes 'school',
# and the 'Educational Facility' column is dropped.
school_df = (
    school_df
    .rename(columns={'Taxi_zone': 'taxi_zone', 'Counts': 'school'})
    .drop(columns='Educational Facility')
)

school_df

Unnamed: 0,taxi_zone,school
0,10,8
1,101,6
2,102,3
3,103,1
4,106,1
...,...,...
218,92,11
219,94,12
220,95,12
221,97,12


In [22]:
# Left-join the per-zone school counts onto combine_df so that every taxi row is kept.
# validate='many_to_one' raises if school_df has more than one row for a taxi_zone,
# which would otherwise silently duplicate taxi rows.
combine_df = combine_df.merge(school_df, on='taxi_zone', how='left', validate='many_to_one')
# Zones with no school record come out of the join as NaN; count them as 0.
# Only the new column is filled, so missing values in other columns are not masked.
combine_df['school'] = combine_df['school'].fillna(0)

In [23]:
# Render combine_df, which now carries the 'school' column
combine_df

Unnamed: 0,taxi_zone,datetime,passenger_count,year_month,week,hour,borough,entertainment_and_recreation,financial_services,food_and_beverage,parking_and_automotive_services,professional_services,real_estate,retail_services,transportation,hospital,hotspots,school
0,1,2022-01-01 00:00:00,0.0,2022-01,5,0,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0
1,1,2022-01-01 01:00:00,0.0,2022-01,5,1,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0
2,1,2022-01-01 02:00:00,2.0,2022-01,5,2,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0
3,1,2022-01-01 03:00:00,1.0,2022-01,5,3,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0
4,1,2022-01-01 04:00:00,8.0,2022-01,5,4,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3038035,263,2023-04-30 19:00:00,247.0,2023-04,6,19,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0,4.0
3038036,263,2023-04-30 20:00:00,242.0,2023-04,6,20,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0,4.0
3038037,263,2023-04-30 21:00:00,210.0,2023-04,6,21,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0,4.0
3038038,263,2023-04-30 22:00:00,152.0,2023-04,6,22,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0,4.0


# Add 'total_business' column to dataset

In [24]:
# Row-wise total of all per-zone count columns. The columns are listed by name rather than
# sliced by position (iloc[:, 7:]) so the result does not depend on column order and so
# re-running the cell cannot add 'total_business' into itself.
# NOTE(review): as in the positional version, the total includes hospital, hotspots and
# school in addition to the eight business types; confirm that is what is wanted.
total_source_columns = [
    'entertainment_and_recreation',
    'financial_services',
    'food_and_beverage',
    'parking_and_automotive_services',
    'professional_services',
    'real_estate',
    'retail_services',
    'transportation',
    'hospital',
    'hotspots',
    'school',
]
combine_df['total_business'] = combine_df[total_source_columns].sum(axis=1)

In [25]:
# Render the frame with the new 'total_business' column. pandas truncates the rendered
# output on its own, so no oversized head() row limit is needed.
combine_df

Unnamed: 0,taxi_zone,datetime,passenger_count,year_month,week,hour,borough,entertainment_and_recreation,financial_services,food_and_beverage,parking_and_automotive_services,professional_services,real_estate,retail_services,transportation,hospital,hotspots,school,total_business
0,1,2022-01-01 00:00:00,0.0,2022-01,5,0,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0
1,1,2022-01-01 01:00:00,0.0,2022-01,5,1,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0
2,1,2022-01-01 02:00:00,2.0,2022-01,5,2,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0
3,1,2022-01-01 03:00:00,1.0,2022-01,5,3,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0
4,1,2022-01-01 04:00:00,8.0,2022-01,5,4,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3038035,263,2023-04-30 19:00:00,247.0,2023-04,6,19,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0,4.0,363.0
3038036,263,2023-04-30 20:00:00,242.0,2023-04,6,20,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0,4.0,363.0
3038037,263,2023-04-30 21:00:00,210.0,2023-04,6,21,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0,4.0,363.0
3038038,263,2023-04-30 22:00:00,152.0,2023-04,6,22,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0,4.0,363.0


In [26]:
# Print the distinct values of the joined count columns and of the row total
for count_col in ('hospital', 'hotspots', 'school', 'total_business'):
    print(count_col + ": ", combine_df[count_col].unique().tolist(), "\n")


hospital:  [0.0, 1.0, 2.0, 4.0, 3.0, 7.0, 6.0] 

hotspots:  [0.0, 3.0, 54.0, 1.0, 17.0, 8.0, 2.0, 11.0, 23.0, 7.0, 5.0, 18.0, 43.0, 27.0, 10.0, 9.0, 69.0, 101.0, 13.0, 4.0, 39.0, 15.0, 32.0, 6.0, 20.0, 44.0, 82.0, 53.0, 65.0, 58.0, 22.0, 12.0, 16.0, 46.0, 30.0, 21.0, 51.0, 63.0, 25.0, 34.0, 38.0, 14.0, 26.0, 33.0, 24.0, 41.0, 52.0, 29.0, 45.0, 28.0, 42.0, 48.0, 40.0, 19.0] 

school:  [0.0, 6.0, 9.0, 2.0, 4.0, 11.0, 8.0, 5.0, 19.0, 17.0, 10.0, 3.0, 12.0, 13.0, 14.0, 1.0, 32.0, 18.0, 23.0, 28.0, 29.0, 7.0, 15.0, 27.0, 34.0, 16.0, 25.0, 30.0, 36.0, 22.0] 

total_business:  [0.0, 358.0, 359.0, 360.0, 361.0, 362.0, 363.0, 364.0, 365.0, 367.0, 368.0, 369.0, 370.0, 371.0, 372.0, 373.0, 374.0, 375.0, 376.0, 377.0, 378.0, 366.0, 161.0, 159.0, 149.0, 150.0, 151.0, 152.0, 153.0, 154.0, 155.0, 148.0, 56.0, 57.0, 58.0, 46.0, 318.0, 317.0, 319.0, 316.0, 320.0, 322.0, 323.0, 313.0, 314.0, 315.0, 308.0, 309.0, 310.0, 311.0, 781.0, 782.0, 783.0, 784.0, 785.0, 786.0, 787.0, 788.0, 789.0, 790.0, 791.0, 7

# connect the holiday dataset

In [27]:

import holidays

# Lookup from calendar date to holiday name for the US holidays of 2022 and 2023
us_holidays = dict(holidays.US(years=[2022, 2023]))

# Make sure 'datetime' really holds timestamps before taking its date part
combine_df['datetime'] = pd.to_datetime(combine_df['datetime'])

# Add a 'holiday' column: the holiday's name when the row's date is a US holiday,
# otherwise the string "No"
combine_df['holiday'] = combine_df['datetime'].dt.date.map(
    lambda day: us_holidays.get(day, "No")
)
combine_df

Unnamed: 0,taxi_zone,datetime,passenger_count,year_month,week,hour,borough,entertainment_and_recreation,financial_services,food_and_beverage,parking_and_automotive_services,professional_services,real_estate,retail_services,transportation,hospital,hotspots,school,total_business,holiday
0,1,2022-01-01 00:00:00,0.0,2022-01,5,0,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,New Year's Day
1,1,2022-01-01 01:00:00,0.0,2022-01,5,1,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,New Year's Day
2,1,2022-01-01 02:00:00,2.0,2022-01,5,2,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,New Year's Day
3,1,2022-01-01 03:00:00,1.0,2022-01,5,3,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,New Year's Day
4,1,2022-01-01 04:00:00,8.0,2022-01,5,4,EWR,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,New Year's Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3038035,263,2023-04-30 19:00:00,247.0,2023-04,6,19,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0,4.0,363.0,No
3038036,263,2023-04-30 20:00:00,242.0,2023-04,6,20,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0,4.0,363.0,No
3038037,263,2023-04-30 21:00:00,210.0,2023-04,6,21,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0,4.0,363.0,No
3038038,263,2023-04-30 22:00:00,152.0,2023-04,6,22,Manhattan,69,0,0,37,11,116,108,4,0.0,14.0,4.0,363.0,No


In [28]:
 # check null value
# Number of missing values in each column
combine_df.isna().sum()

taxi_zone                          0
datetime                           0
passenger_count                    0
year_month                         0
week                               0
hour                               0
borough                            0
entertainment_and_recreation       0
financial_services                 0
food_and_beverage                  0
parking_and_automotive_services    0
professional_services              0
real_estate                        0
retail_services                    0
transportation                     0
hospital                           0
hotspots                           0
school                             0
total_business                     0
holiday                            0
dtype: int64

In [29]:
# Inspect the column dtypes before the integer casts below
combine_df.dtypes

taxi_zone                                   int64
datetime                           datetime64[ns]
passenger_count                           float64
year_month                              period[M]
week                                        int32
hour                                        int32
borough                                    object
entertainment_and_recreation                int64
financial_services                          int64
food_and_beverage                           int64
parking_and_automotive_services             int64
professional_services                       int64
real_estate                                 int64
retail_services                             int64
transportation                              int64
hospital                                  float64
hotspots                                  float64
school                                    float64
total_business                            float64
holiday                                    object


In [30]:
# Cast the count columns that are stored as floats back to integers
combine_df = combine_df.astype({
    'passenger_count': 'int',
    'hospital': 'int',
    'hotspots': 'int',
    'school': 'int',
    'total_business': 'int',
})

In [31]:
# Inspect the column dtypes again after the integer casts
combine_df.dtypes

taxi_zone                                   int64
datetime                           datetime64[ns]
passenger_count                             int64
year_month                              period[M]
week                                        int32
hour                                        int32
borough                                    object
entertainment_and_recreation                int64
financial_services                          int64
food_and_beverage                           int64
parking_and_automotive_services             int64
professional_services                       int64
real_estate                                 int64
retail_services                             int64
transportation                              int64
hospital                                    int64
hotspots                                    int64
school                                      int64
total_business                              int64
holiday                                    object


In [32]:
# Write the assembled training data to CSV, without the index
combine_df.to_csv(path_or_buf="train_data_2.2.csv", index=False)

In [33]:
# Write the same training data to parquet, without the index
combine_df.to_parquet(path="train_data_2.2.parquet", index=False)

# Convert 'year_month' to 'month'

In [3]:
# Read the saved training CSV back into a new frame and render it
change_month = pd.read_csv(
    'train_data_2.2.csv',
    sep=',',
    skipinitialspace=True,
    keep_default_na=True,
)
change_month

Unnamed: 0,taxi_zone,datetime,passenger_count,year_month,week,hour,borough,entertainment_and_recreation,financial_services,food_and_beverage,parking_and_automotive_services,professional_services,real_estate,retail_services,transportation,hospital,hotspots,school,total_business,holiday
0,1,2022-01-01 00:00:00,0,2022-01,5,0,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
1,1,2022-01-01 01:00:00,0,2022-01,5,1,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
2,1,2022-01-01 02:00:00,2,2022-01,5,2,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
3,1,2022-01-01 03:00:00,1,2022-01,5,3,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
4,1,2022-01-01 04:00:00,8,2022-01,5,4,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3038035,263,2023-04-30 19:00:00,247,2023-04,6,19,Manhattan,69,0,0,37,11,116,108,4,0,14,4,363,No
3038036,263,2023-04-30 20:00:00,242,2023-04,6,20,Manhattan,69,0,0,37,11,116,108,4,0,14,4,363,No
3038037,263,2023-04-30 21:00:00,210,2023-04,6,21,Manhattan,69,0,0,37,11,116,108,4,0,14,4,363,No
3038038,263,2023-04-30 22:00:00,152,2023-04,6,22,Manhattan,69,0,0,37,11,116,108,4,0,14,4,363,No


In [4]:
# Replace each 'year_month' value with just its month number
change_month['year_month'] = change_month['year_month'].pipe(pd.to_datetime).dt.month
change_month

Unnamed: 0,taxi_zone,datetime,passenger_count,year_month,week,hour,borough,entertainment_and_recreation,financial_services,food_and_beverage,parking_and_automotive_services,professional_services,real_estate,retail_services,transportation,hospital,hotspots,school,total_business,holiday
0,1,2022-01-01 00:00:00,0,1,5,0,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
1,1,2022-01-01 01:00:00,0,1,5,1,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
2,1,2022-01-01 02:00:00,2,1,5,2,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
3,1,2022-01-01 03:00:00,1,1,5,3,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
4,1,2022-01-01 04:00:00,8,1,5,4,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3038035,263,2023-04-30 19:00:00,247,4,6,19,Manhattan,69,0,0,37,11,116,108,4,0,14,4,363,No
3038036,263,2023-04-30 20:00:00,242,4,6,20,Manhattan,69,0,0,37,11,116,108,4,0,14,4,363,No
3038037,263,2023-04-30 21:00:00,210,4,6,21,Manhattan,69,0,0,37,11,116,108,4,0,14,4,363,No
3038038,263,2023-04-30 22:00:00,152,4,6,22,Manhattan,69,0,0,37,11,116,108,4,0,14,4,363,No


In [6]:
# Print the distinct month numbers now stored in 'year_month'
print("year_month: ", pd.unique(change_month['year_month']).tolist(), "\n")

year_month:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] 



In [7]:
# The column now holds month numbers, so rename it from 'year_month' to 'month'
change_month = change_month.rename(columns={'year_month': 'month'})
change_month

Unnamed: 0,taxi_zone,datetime,passenger_count,month,week,hour,borough,entertainment_and_recreation,financial_services,food_and_beverage,parking_and_automotive_services,professional_services,real_estate,retail_services,transportation,hospital,hotspots,school,total_business,holiday
0,1,2022-01-01 00:00:00,0,1,5,0,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
1,1,2022-01-01 01:00:00,0,1,5,1,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
2,1,2022-01-01 02:00:00,2,1,5,2,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
3,1,2022-01-01 03:00:00,1,1,5,3,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
4,1,2022-01-01 04:00:00,8,1,5,4,EWR,0,0,0,0,0,0,0,0,0,0,0,0,New Year's Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3038035,263,2023-04-30 19:00:00,247,4,6,19,Manhattan,69,0,0,37,11,116,108,4,0,14,4,363,No
3038036,263,2023-04-30 20:00:00,242,4,6,20,Manhattan,69,0,0,37,11,116,108,4,0,14,4,363,No
3038037,263,2023-04-30 21:00:00,210,4,6,21,Manhattan,69,0,0,37,11,116,108,4,0,14,4,363,No
3038038,263,2023-04-30 22:00:00,152,4,6,22,Manhattan,69,0,0,37,11,116,108,4,0,14,4,363,No


In [9]:
# Write the month-converted training data to CSV, without the index
change_month.to_csv(path_or_buf="train_data_2.2.1_change_month.csv", index=False)