# Predicting CitiBike Demand

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

## Part 1: Data Loading and Preprocessing

### Load Data

In [2]:
# Read the ride records and the weather observations from the 2024_08 data folder.
# low_memory=False has pandas parse each file in one pass, so a column's dtype is
# inferred from the whole column instead of chunk by chunk.
df_train = pd.read_csv('data/2024_08/df_train.csv', low_memory=False)
df_weather = pd.read_csv('data/2024_08/weather_data_2024.csv', low_memory=False)

In [4]:
# Inspect column names, dtypes and memory footprint of the raw ride data
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4603575 entries, 0 to 4603574
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 456.6+ MB


In [27]:
# Preview the first ten rides
df_train.head(10)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,5CB4E29A011E918E,electric_bike,2024-08-13 22:28:13.065,2024-08-13 22:30:43.138,McKibbin St & Bogart St,5059.02,Wilson Ave & Troutman St,4864.09,40.706237,-73.933871,40.70166,-73.92754,member
1,6389E1E171CE17CD,classic_bike,2024-08-07 09:39:52.489,2024-08-07 09:43:14.975,Bialystoker Pl & Delancey St,5335.03,Norfolk St & Broome St,5374.01,40.716226,-73.982612,40.717227,-73.988021,member
2,3F4BBEBDFB7548C0,electric_bike,2024-08-10 21:04:35.143,2024-08-10 21:21:43.846,Rivington St & Chrystie St,5453.01,Kent Ave & Division Ave,5021.05,40.721101,-73.991925,40.706564,-73.968319,member
3,C0939F0CD7ED731E,classic_bike,2024-08-13 19:21:31.275,2024-08-13 19:33:43.790,Broadway & E 21 St,6098.1,1 Ave & E 39 St,6303.01,40.739888,-73.989586,40.74714,-73.97113,member
4,4CB3950095D804D6,electric_bike,2024-08-09 22:23:42.894,2024-08-09 22:58:04.455,E 34 St & Church Ave,3318.05,48 St & 2 Ave,3283.05,40.65116,-73.94577,40.650176,-74.015606,casual
5,585D1A3FEEF4867E,electric_bike,2024-08-02 16:58:20.753,2024-08-02 17:03:15.327,McKibbin St & Bogart St,5059.02,Suydam St & Broadway,4689.03,40.706237,-73.933871,40.69544,-73.93223,member
6,BAC50AFB465C607B,electric_bike,2024-08-07 23:12:24.982,2024-08-07 23:20:00.458,Broadway & E 21 St,6098.1,1 Ave & E 39 St,6303.01,40.739888,-73.989586,40.74714,-73.97113,member
7,4F4D162103E66917,electric_bike,2024-08-06 15:20:31.886,2024-08-06 15:32:23.192,8 Ave & W 16 St,6072.11,1 Ave & E 39 St,6303.01,40.740983,-74.001702,40.74714,-73.97113,member
8,DE6F03D235645CBF,classic_bike,2024-08-13 18:51:17.796,2024-08-13 19:06:27.537,Graham Ave & Grand St,5178.06,Stanton St & Norfolk St,5445.07,40.711863,-73.944024,40.720747,-73.986274,member
9,BC3A1D7C8884C727,electric_bike,2024-08-05 00:24:16.628,2024-08-05 00:58:38.898,8 Ave & W 16 St,6072.11,5 Ave & W 131 St,7735.05,40.740983,-74.001702,40.81014,-73.93973,member


In [29]:
# Inspect the weather table's columns, non-null counts and dtypes
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              768 non-null    int64  
 1   datetime          768 non-null    object 
 2   temp              768 non-null    float64
 3   feelslike         768 non-null    float64
 4   dew               768 non-null    float64
 5   humidity          768 non-null    float64
 6   precip            768 non-null    float64
 7   precipprob        768 non-null    int64  
 8   preciptype        78 non-null     object 
 9   snow              768 non-null    int64  
 10  snowdepth         768 non-null    int64  
 11  windgust          768 non-null    float64
 12  windspeed         768 non-null    float64
 13  winddir           768 non-null    float64
 14  sealevelpressure  768 non-null    float64
 15  cloudcover        768 non-null    float64
 16  visibility        768 non-null    float64
 1

In [24]:
# Preview the first ten weather records
df_weather.head(10)

Unnamed: 0,name,datetime,temp,feelslike,dew,humidity,precip,precipprob,preciptype,snow,...,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
0,10021,2024-07-31T00:00:00,77.0,77.0,70.1,79.36,0.0,0,,0,...,1013.9,99.8,9.9,0,0.0,0,,Overcast,cloudy,"72502594741,72505394728,KTEB,KLGA,F1417,KNYC,7..."
1,10021,2024-07-31T01:00:00,77.0,77.0,70.1,79.4,0.0,0,,0,...,1013.3,89.1,9.9,0,0.0,0,,Partially cloudy,partly-cloudy-night,"72502594741,72505394728,KTEB,KLGA,F1417,KNYC,7..."
2,10021,2024-07-31T02:00:00,77.0,77.0,71.1,81.9,0.0,0,,0,...,1013.1,100.0,9.9,0,0.0,0,,Overcast,cloudy,"72502594741,72505394728,KTEB,KLGA,F1417,KNYC,7..."
3,10021,2024-07-31T03:00:00,77.0,77.0,71.1,82.06,0.0,0,,0,...,1012.7,90.0,9.9,0,0.0,0,,Partially cloudy,partly-cloudy-night,"72502594741,72505394728,KTEB,KLGA,F1417,KNYC,7..."
4,10021,2024-07-31T04:00:00,77.0,77.0,71.1,82.1,0.0,0,,0,...,1012.1,100.0,9.9,0,0.0,0,,Overcast,cloudy,"72502594741,72505394728,KTEB,KLGA,F1417,KNYC,7..."
5,10021,2024-07-31T05:00:00,76.9,76.9,71.1,82.3,0.0,0,,0,...,1012.2,100.0,9.9,0,0.0,0,,Overcast,cloudy,"72502594741,72505394728,KTEB,KLGA,F1417,KNYC,7..."
6,10021,2024-07-31T06:00:00,77.0,77.0,72.0,84.43,0.0,0,,0,...,1012.6,100.0,9.9,15,0.1,0,,Overcast,cloudy,"72505394728,KTEB,KLGA,F1417,KNYC,72503014732"
7,10021,2024-07-31T07:00:00,78.1,78.1,71.1,79.21,0.0,0,,0,...,1012.4,89.8,9.9,83,0.3,1,,Partially cloudy,partly-cloudy-day,"72505394728,KTEB,KLGA,F1417,KNYC,72503014732"
8,10021,2024-07-31T08:00:00,78.2,78.2,71.1,78.83,0.0,0,,0,...,1012.3,99.3,9.9,104,0.4,1,,Overcast,cloudy,"72502594741,72505394728,KTEB,KLGA,F1417,KNYC,7..."
9,10021,2024-07-31T09:00:00,78.3,78.3,70.1,75.94,0.0,0,,0,...,1012.6,99.3,9.9,139,0.5,1,,Overcast,cloudy,"72505394728,KTEB,KLGA,F1417,KNYC,72503014732"


### Merge Dataframes

In [None]:
# Sort df_train by the 'started_at' column
#df_train = df_train.sort_values('started_at')

In [None]:
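# Add a 'day' column (datetime rounded to the nearest day) to both frames for merging
# NOTE(review): .dt.round('D') rounds to the NEAREST day, so timestamps after noon land on the following date; .dt.floor('D') keeps the calendar date. df_weather is hourly, so joining on 'day' would also match each ride to every weather row of that day.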
#df_train['day'] = pd.to_datetime(df_train['started_at']).dt.round('D')
#df_weather['day'] = pd.to_datetime(df_weather['datetime']).dt.round('D')


In [None]:
#df_merged = pd.merge(df_train, df_weather, on='day', how='left')

### Clean and Prepare Data

In [34]:
# NOTE(review): head(-10) returns every row EXCEPT the last 10 (the display is truncated);
# if the goal was to inspect the end of the frame, tail(10) is the usual call - confirm intent.
df_train.head(-10)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
1383398,9DB3BDC056248F66,classic_bike,2024-07-31 01:42:20.692,2024-08-01 02:42:15.120,Fairmount Pl & Prospect Ave,8272.05,,,40.843099,-73.889927,,,casual
1384468,139A97224FDBE43C,classic_bike,2024-07-31 03:32:45.226,2024-08-01 04:32:42.617,Broadway & E 14 St,5905.12,,,40.734546,-73.990741,,,casual
1377696,CB56138D86500796,classic_bike,2024-07-31 08:27:55.572,2024-08-01 09:27:46.466,Schermerhorn St & Hoyt St,4479.10,,,40.688626,-73.985191,,,member
1390889,F858DA99BCC494D5,classic_bike,2024-07-31 08:46:30.224,2024-08-01 09:46:19.770,Dean St & Hoyt St,4446.05,,,40.686444,-73.987591,40.680000,-73.990000,member
1384440,A2B45AF4616F9921,classic_bike,2024-07-31 09:25:30.091,2024-08-01 10:25:08.418,E 11 St & 3 Ave,5788.16,,,40.731270,-73.988490,,,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3213802,5C62C2514A7B9D59,electric_bike,2024-08-31 23:55:04.745,2024-08-31 23:57:00.431,Jerome Ave & W 184 St,8550.01,E 182 St & Morris Ave,8494.01,40.860814,-73.902541,40.856708,-73.902653,casual
3208308,E69383F821276E31,electric_bike,2024-08-31 23:55:04.999,2024-08-31 23:56:55.334,Jerome Ave & W 184 St,8550.01,E 182 St & Morris Ave,8494.01,40.860814,-73.902541,40.856708,-73.902653,member
3293416,785503E9A80230B9,electric_bike,2024-08-31 23:55:07.921,2024-08-31 23:57:01.590,W 95 St & Broadway,7541.01,W 100 St & Broadway,7580.01,40.793770,-73.971888,40.797372,-73.970412,member
3147653,6192E820FEE8100A,electric_bike,2024-08-31 23:55:08.485,2024-08-31 23:58:19.768,Anthony Ave & E Tremont Ave,8356.04,Webster Ave & Ford St,8472.08,40.848793,-73.903178,40.855560,-73.896150,member


In [3]:
# Check for duplicates
# duplicated() flags a row when every column matches an earlier row; the first
# occurrence itself is not flagged, so the sum counts only the surplus copies.
duplicates = df_train.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

Number of duplicate rows: 0


In [4]:
# Count the missing entries in every column
missing_values = df_train.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
ride_id                   0
rideable_type             0
started_at                0
ended_at                  0
start_station_name     2962
start_station_id       2962
end_station_name      12410
end_station_id        13353
start_lat              2962
start_lng              2962
end_lat               13332
end_lng               13332
member_casual             0
dtype: int64


In [5]:
# Share of rows (in percent) that are missing a value, per column.
# The mean of a boolean mask is the fraction of True entries.
missing_percentage = df_train.isna().mean().mul(100)
print("Percentage of missing values in each column:", missing_percentage, sep="\n")

Percentage of missing values in each column:
ride_id               0.000000
rideable_type         0.000000
started_at            0.000000
ended_at              0.000000
start_station_name    0.064341
start_station_id      0.064341
end_station_name      0.269573
end_station_id        0.290057
start_lat             0.064341
start_lng             0.064341
end_lat               0.289601
end_lng               0.289601
member_casual         0.000000
dtype: float64


In [6]:
# Drop rows that have a missing value in any column whose missing share is below 5%
threshold = 5.0
below_threshold = missing_percentage.lt(threshold)
cols_to_check = missing_percentage.loc[below_threshold].index
df_train = df_train.dropna(subset=cols_to_check)


In [7]:
# Re-count missing entries to confirm the drop removed them
missing_values = df_train.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
ride_id               0
rideable_type         0
started_at            0
ended_at              0
start_station_name    0
start_station_id      0
end_station_name      0
end_station_id        0
start_lat             0
start_lng             0
end_lat               0
end_lng               0
member_casual         0
dtype: int64


#### Borough Identification

In [8]:
# Load borough boundaries
from shapely.geometry import Point
from shapely import wkt

# Read the borough boundaries CSV
borough_df = pd.read_csv('data/2024_08/Borough_Boundaries_20251104.csv')

# Map each borough name to its Shapely geometry, parsed from the WKT text in 'the_geom'
boroughs = {
    name: wkt.loads(wkt_text)
    for name, wkt_text in zip(borough_df['BoroName'], borough_df['the_geom'])
}

print(f"Loaded {len(boroughs)} boroughs: {list(boroughs.keys())}")


Loaded 5 boroughs: ['Staten Island', 'Bronx', 'Brooklyn', 'Queens', 'Manhattan']


In [9]:
# Function to find which borough a point is in
def get_borough(lat, lng):
    """Return the name of the borough whose geometry contains the coordinate.

    Args:
        lat: Latitude of the location.
        lng: Longitude of the location.

    Returns:
        The key of the first entry in the module-level ``boroughs`` dict whose
        geometry contains the point, or ``'Unknown'`` when none does.
    """
    # Shapely points are (x, y), i.e. (longitude, latitude)
    location = Point(lng, lat)
    matches = (name for name, shape in boroughs.items() if shape.contains(location))
    return next(matches, 'Unknown')

In [10]:
# Execute the function for Start Locations
# Station coordinates repeat across many rides, so each distinct (lat, lng) pair
# is resolved once and the answer reused. This avoids running a point-in-polygon
# test for every row through DataFrame.apply(axis=1), which is very slow on a
# multi-million-row frame. The resulting column is the same.
start_borough_by_coord = {}
start_boroughs = []
for coord in zip(df_train['start_lat'], df_train['start_lng']):
    if coord not in start_borough_by_coord:
        start_borough_by_coord[coord] = get_borough(*coord)
    start_boroughs.append(start_borough_by_coord[coord])

# The list is built in row order, so positional assignment lines up with
# df_train whatever its index labels are.
df_train['start_borough'] = start_boroughs

print("\nBorough distribution:")
print(df_train['start_borough'].value_counts())


Borough distribution:
start_borough
Manhattan    2847392
Brooklyn     1263642
Queens        327405
Bronx         149429
Name: count, dtype: int64


In [11]:
# Execute the function for End Locations
# Station coordinates repeat across many rides, so each distinct (lat, lng) pair
# is resolved once and the answer reused. This avoids running a point-in-polygon
# test for every row through DataFrame.apply(axis=1), which is very slow on a
# multi-million-row frame. The resulting column is the same.
end_borough_by_coord = {}
end_boroughs = []
for coord in zip(df_train['end_lat'], df_train['end_lng']):
    if coord not in end_borough_by_coord:
        end_borough_by_coord[coord] = get_borough(*coord)
    end_boroughs.append(end_borough_by_coord[coord])

# The list is built in row order, so positional assignment lines up with
# df_train whatever its index labels are.
df_train['end_borough'] = end_boroughs

print("\nEnd borough distribution:")
print(df_train['end_borough'].value_counts())


End borough distribution:
end_borough
Manhattan    2840304
Brooklyn     1270190
Queens        326190
Bronx         150848
Unknown          336
Name: count, dtype: int64


In [12]:
# Preview the first ten rows, including the start_borough / end_borough columns
df_train.head(10)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_borough,end_borough
0,5CB4E29A011E918E,electric_bike,2024-08-13 22:28:13.065,2024-08-13 22:30:43.138,McKibbin St & Bogart St,5059.02,Wilson Ave & Troutman St,4864.09,40.706237,-73.933871,40.70166,-73.92754,member,Brooklyn,Brooklyn
1,6389E1E171CE17CD,classic_bike,2024-08-07 09:39:52.489,2024-08-07 09:43:14.975,Bialystoker Pl & Delancey St,5335.03,Norfolk St & Broome St,5374.01,40.716226,-73.982612,40.717227,-73.988021,member,Manhattan,Manhattan
2,3F4BBEBDFB7548C0,electric_bike,2024-08-10 21:04:35.143,2024-08-10 21:21:43.846,Rivington St & Chrystie St,5453.01,Kent Ave & Division Ave,5021.05,40.721101,-73.991925,40.706564,-73.968319,member,Manhattan,Brooklyn
3,C0939F0CD7ED731E,classic_bike,2024-08-13 19:21:31.275,2024-08-13 19:33:43.790,Broadway & E 21 St,6098.1,1 Ave & E 39 St,6303.01,40.739888,-73.989586,40.74714,-73.97113,member,Manhattan,Manhattan
4,4CB3950095D804D6,electric_bike,2024-08-09 22:23:42.894,2024-08-09 22:58:04.455,E 34 St & Church Ave,3318.05,48 St & 2 Ave,3283.05,40.65116,-73.94577,40.650176,-74.015606,casual,Brooklyn,Brooklyn
5,585D1A3FEEF4867E,electric_bike,2024-08-02 16:58:20.753,2024-08-02 17:03:15.327,McKibbin St & Bogart St,5059.02,Suydam St & Broadway,4689.03,40.706237,-73.933871,40.69544,-73.93223,member,Brooklyn,Brooklyn
6,BAC50AFB465C607B,electric_bike,2024-08-07 23:12:24.982,2024-08-07 23:20:00.458,Broadway & E 21 St,6098.1,1 Ave & E 39 St,6303.01,40.739888,-73.989586,40.74714,-73.97113,member,Manhattan,Manhattan
7,4F4D162103E66917,electric_bike,2024-08-06 15:20:31.886,2024-08-06 15:32:23.192,8 Ave & W 16 St,6072.11,1 Ave & E 39 St,6303.01,40.740983,-74.001702,40.74714,-73.97113,member,Manhattan,Manhattan
8,DE6F03D235645CBF,classic_bike,2024-08-13 18:51:17.796,2024-08-13 19:06:27.537,Graham Ave & Grand St,5178.06,Stanton St & Norfolk St,5445.07,40.711863,-73.944024,40.720747,-73.986274,member,Brooklyn,Manhattan
9,BC3A1D7C8884C727,electric_bike,2024-08-05 00:24:16.628,2024-08-05 00:58:38.898,8 Ave & W 16 St,6072.11,5 Ave & W 131 St,7735.05,40.740983,-74.001702,40.81014,-73.93973,member,Manhattan,Manhattan


In [15]:
# Keep only rides that both start and end in Manhattan or Brooklyn
boroughs_in_scope = ['Manhattan', 'Brooklyn']
starts_in_scope = df_train['start_borough'].isin(boroughs_in_scope)
ends_in_scope = df_train['end_borough'].isin(boroughs_in_scope)
df_train = df_train[starts_in_scope & ends_in_scope]

In [16]:
# Show the start and end borough counts that remain after the filter
borough_reports = (
    ("\nBorough distribution:", 'start_borough'),
    ("\nEnd borough distribution:", 'end_borough'),
)
for report_title, borough_column in borough_reports:
    print(report_title)
    print(df_train[borough_column].value_counts())


Borough distribution:
start_borough
Manhattan    2793422
Brooklyn     1220857
Name: count, dtype: int64

End borough distribution:
end_borough
Manhattan    2788330
Brooklyn     1225949
Name: count, dtype: int64


In [17]:
# Check the remaining row count, columns and dtypes
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4014279 entries, 0 to 4603571
Data columns (total 15 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
 13  start_borough       object 
 14  end_borough         object 
dtypes: float64(4), object(11)
memory usage: 490.0+ MB


In [None]:
# Export cleaned dataframe to CSV
# index=False leaves the row index out of the output file.
# NOTE(review): this snapshot is written before the trip_duration feature and the
# duration-outlier filter further down are applied - confirm that is intended.
df_train.to_csv('data/2024_08/df_train_cleaned.csv', index=False)

### Feature Engineering

In [20]:
# Parse the timestamp strings into datetimes
ride_start = pd.to_datetime(df_train['started_at'])
ride_end = pd.to_datetime(df_train['ended_at'])

# Replace both columns with their parsed versions and add the ride length in minutes
df_train = df_train.assign(
    started_at=ride_start,
    ended_at=ride_end,
    trip_duration=(ride_end - ride_start).dt.total_seconds() / 60,
)

In [21]:
# Preview the first five rows, including the trip_duration column
df_train.head(5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_borough,end_borough,trip_duration
0,5CB4E29A011E918E,electric_bike,2024-08-13 22:28:13.065,2024-08-13 22:30:43.138,McKibbin St & Bogart St,5059.02,Wilson Ave & Troutman St,4864.09,40.706237,-73.933871,40.70166,-73.92754,member,Brooklyn,Brooklyn,2.501217
1,6389E1E171CE17CD,classic_bike,2024-08-07 09:39:52.489,2024-08-07 09:43:14.975,Bialystoker Pl & Delancey St,5335.03,Norfolk St & Broome St,5374.01,40.716226,-73.982612,40.717227,-73.988021,member,Manhattan,Manhattan,3.374767
2,3F4BBEBDFB7548C0,electric_bike,2024-08-10 21:04:35.143,2024-08-10 21:21:43.846,Rivington St & Chrystie St,5453.01,Kent Ave & Division Ave,5021.05,40.721101,-73.991925,40.706564,-73.968319,member,Manhattan,Brooklyn,17.14505
3,C0939F0CD7ED731E,classic_bike,2024-08-13 19:21:31.275,2024-08-13 19:33:43.790,Broadway & E 21 St,6098.1,1 Ave & E 39 St,6303.01,40.739888,-73.989586,40.74714,-73.97113,member,Manhattan,Manhattan,12.208583
4,4CB3950095D804D6,electric_bike,2024-08-09 22:23:42.894,2024-08-09 22:58:04.455,E 34 St & Church Ave,3318.05,48 St & 2 Ave,3283.05,40.65116,-73.94577,40.650176,-74.015606,casual,Brooklyn,Brooklyn,34.35935


In [22]:
# Summary statistics for trip_duration (minutes)
print(df_train['trip_duration'].describe())

count    4.014279e+06
mean     1.370771e+01
std      2.010477e+01
min      1.000333e+00
25%      5.580983e+00
50%      9.673200e+00
75%      1.681856e+01
max      1.498657e+03
Name: trip_duration, dtype: float64


#### Remove outliers based on trip duration

In [None]:
# Calculate IQR
trip_minutes = df_train['trip_duration']
Q1, Q3 = (trip_minutes.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Define outlier bounds (1.5 * IQR beyond the quartiles)
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Find outliers: rides strictly below the lower or strictly above the upper bound
outlier_mask = trip_minutes.lt(lower_bound) | trip_minutes.gt(upper_bound)
outliers = df_train[outlier_mask]
print(f"\nNumber of outliers: {len(outliers)} ({len(outliers)/len(df_train)*100:.2f}%)")


Number of outliers: 231226 (5.76%)


In [24]:
# Remove outliers using IQR method
# between() is inclusive on both ends: lower_bound <= trip_duration <= upper_bound
within_bounds = df_train['trip_duration'].between(lower_bound, upper_bound)
df_train = df_train[within_bounds]

In [25]:
# Check the remaining row count, columns and dtypes after outlier removal
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3783053 entries, 0 to 4603571
Data columns (total 16 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       object        
 13  start_borough       object        
 14  end_borough         object        
 15  trip_duration       float64       
dtypes: datetime64[ns](2), float64(5), object(9)
memory usage: 490.7+ MB
