# Predicting CitiBike Demand

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

## Load Data

In [3]:
# Load the training CSV into memory. low_memory=False tells pandas to parse the
# file in a single pass rather than in chunks, so each column gets one inferred
# dtype instead of possibly mixed types.
df_train = pd.read_csv(
    'data/2024_08/df_train.csv',
    low_memory=False,
)

In [4]:
# Print a summary of the loaded frame: row count, column names, dtypes and
# memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4603575 entries, 0 to 4603574
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 456.6+ MB


In [5]:
# Preview the first 10 rows of the frame as loaded.
df_train.head(10)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,5CB4E29A011E918E,electric_bike,2024-08-13 22:28:13.065,2024-08-13 22:30:43.138,McKibbin St & Bogart St,5059.02,Wilson Ave & Troutman St,4864.09,40.706237,-73.933871,40.70166,-73.92754,member
1,6389E1E171CE17CD,classic_bike,2024-08-07 09:39:52.489,2024-08-07 09:43:14.975,Bialystoker Pl & Delancey St,5335.03,Norfolk St & Broome St,5374.01,40.716226,-73.982612,40.717227,-73.988021,member
2,3F4BBEBDFB7548C0,electric_bike,2024-08-10 21:04:35.143,2024-08-10 21:21:43.846,Rivington St & Chrystie St,5453.01,Kent Ave & Division Ave,5021.05,40.721101,-73.991925,40.706564,-73.968319,member
3,C0939F0CD7ED731E,classic_bike,2024-08-13 19:21:31.275,2024-08-13 19:33:43.790,Broadway & E 21 St,6098.1,1 Ave & E 39 St,6303.01,40.739888,-73.989586,40.74714,-73.97113,member
4,4CB3950095D804D6,electric_bike,2024-08-09 22:23:42.894,2024-08-09 22:58:04.455,E 34 St & Church Ave,3318.05,48 St & 2 Ave,3283.05,40.65116,-73.94577,40.650176,-74.015606,casual
5,585D1A3FEEF4867E,electric_bike,2024-08-02 16:58:20.753,2024-08-02 17:03:15.327,McKibbin St & Bogart St,5059.02,Suydam St & Broadway,4689.03,40.706237,-73.933871,40.69544,-73.93223,member
6,BAC50AFB465C607B,electric_bike,2024-08-07 23:12:24.982,2024-08-07 23:20:00.458,Broadway & E 21 St,6098.1,1 Ave & E 39 St,6303.01,40.739888,-73.989586,40.74714,-73.97113,member
7,4F4D162103E66917,electric_bike,2024-08-06 15:20:31.886,2024-08-06 15:32:23.192,8 Ave & W 16 St,6072.11,1 Ave & E 39 St,6303.01,40.740983,-74.001702,40.74714,-73.97113,member
8,DE6F03D235645CBF,classic_bike,2024-08-13 18:51:17.796,2024-08-13 19:06:27.537,Graham Ave & Grand St,5178.06,Stanton St & Norfolk St,5445.07,40.711863,-73.944024,40.720747,-73.986274,member
9,BC3A1D7C8884C727,electric_bike,2024-08-05 00:24:16.628,2024-08-05 00:58:38.898,8 Ave & W 16 St,6072.11,5 Ave & W 131 St,7735.05,40.740983,-74.001702,40.81014,-73.93973,member


## Clean and Prepare Data

In [9]:
# Reorder the rows by 'started_at', smallest value first, and keep the result
# under the same name.
# NOTE(review): 'started_at' is still a string (object) column at this point, so
# this is a lexicographic sort; presumably equivalent to chronological order for
# fixed-width ISO timestamps — confirm.
df_train = df_train.sort_values(by='started_at', ascending=True)

In [None]:
# Display the frame without its last 10 rows: a negative n in head() returns
# every row except the final |n|.
# NOTE(review): if the goal is only to spot-check the sort order, head(10) or
# tail(10) may be what was intended — confirm.
df_train.head(-10)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
1383398,9DB3BDC056248F66,classic_bike,2024-07-31 01:42:20.692,2024-08-01 02:42:15.120,Fairmount Pl & Prospect Ave,8272.05,,,40.843099,-73.889927,,,casual
1384468,139A97224FDBE43C,classic_bike,2024-07-31 03:32:45.226,2024-08-01 04:32:42.617,Broadway & E 14 St,5905.12,,,40.734546,-73.990741,,,casual
1377696,CB56138D86500796,classic_bike,2024-07-31 08:27:55.572,2024-08-01 09:27:46.466,Schermerhorn St & Hoyt St,4479.10,,,40.688626,-73.985191,,,member
1390889,F858DA99BCC494D5,classic_bike,2024-07-31 08:46:30.224,2024-08-01 09:46:19.770,Dean St & Hoyt St,4446.05,,,40.686444,-73.987591,40.680000,-73.990000,member
1384440,A2B45AF4616F9921,classic_bike,2024-07-31 09:25:30.091,2024-08-01 10:25:08.418,E 11 St & 3 Ave,5788.16,,,40.731270,-73.988490,,,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3213802,5C62C2514A7B9D59,electric_bike,2024-08-31 23:55:04.745,2024-08-31 23:57:00.431,Jerome Ave & W 184 St,8550.01,E 182 St & Morris Ave,8494.01,40.860814,-73.902541,40.856708,-73.902653,casual
3208308,E69383F821276E31,electric_bike,2024-08-31 23:55:04.999,2024-08-31 23:56:55.334,Jerome Ave & W 184 St,8550.01,E 182 St & Morris Ave,8494.01,40.860814,-73.902541,40.856708,-73.902653,member
3293416,785503E9A80230B9,electric_bike,2024-08-31 23:55:07.921,2024-08-31 23:57:01.590,W 95 St & Broadway,7541.01,W 100 St & Broadway,7580.01,40.793770,-73.971888,40.797372,-73.970412,member
3147653,6192E820FEE8100A,electric_bike,2024-08-31 23:55:08.485,2024-08-31 23:58:19.768,Anthony Ave & E Tremont Ave,8356.04,Webster Ave & Ford St,8472.08,40.848793,-73.903178,40.855560,-73.896150,member


In [10]:
# Check for duplicates: flag every row that is an exact copy (all columns equal)
# of an earlier row, then report how many were found.
# The sorted data lives in df_train (the sort cell reassigns that name), so that
# is the frame checked here; no cell defines a separate `df_sorted`.
duplicates = df_train.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

Number of duplicate rows: 0


In [11]:
# Tally the null entries in each column and print them under a header line.
missing_values = df_train.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

Missing values in each column:
ride_id                   0
rideable_type             0
started_at                0
ended_at                  0
start_station_name     2962
start_station_id       2962
end_station_name      12410
end_station_id        13353
start_lat              2962
start_lng              2962
end_lat               13332
end_lng               13332
member_casual             0
dtype: int64


In [12]:
# Express each column's null count as a percentage of the total row count and
# print the result under a header line.
missing_percentage = df_train.isna().sum().div(len(df_train)).mul(100)
print("Percentage of missing values in each column:", missing_percentage, sep="\n")

Percentage of missing values in each column:
ride_id               0.000000
rideable_type         0.000000
started_at            0.000000
ended_at              0.000000
start_station_name    0.064341
start_station_id      0.064341
end_station_name      0.269573
end_station_id        0.290057
start_lat             0.064341
start_lng             0.064341
end_lat               0.289601
end_lng               0.289601
member_casual         0.000000
dtype: float64


### Reminder: Decide what to do about missing values (every column is missing in fewer than 0.3% of rows, well under 5%, so dropping the affected rows entirely is an option)

In [None]:
# Parse both ride timestamp columns into pandas datetimes and round each value
# to the nearest minute, overwriting the original columns in place.
for ts_col in ('started_at', 'ended_at'):
    df_train[ts_col] = pd.to_datetime(df_train[ts_col]).dt.round('min')

## Feature Engineering

In [17]:
# Add a column for trip duration, rounded to the nearest whole minute.
# The column is stored as a Timedelta (e.g. "0 days 00:12:00"), not as a number
# of minutes; use .dt.total_seconds() / 60 where a numeric feature is needed.
# Subtracting the datetime columns yields the Timedelta directly, so no detour
# through float minutes is required, and .dt.round is the accessor that accepts
# a frequency string.
# NOTE(review): if 'started_at'/'ended_at' were already rounded to the minute
# upstream, this duration can differ from the true one by up to a minute — confirm
# that precision is acceptable.
df_train['trip_duration'] = (df_train['ended_at'] - df_train['started_at']).dt.round('min')

In [20]:
# Preview the first 10 rows to spot-check the new trip_duration column.
df_train.head(10)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,trip_duration
1383398,9DB3BDC056248F66,classic_bike,2024-07-31 01:42:00,2024-08-01 02:42:00,Fairmount Pl & Prospect Ave,8272.05,,,40.843099,-73.889927,,,casual,1 days 01:00:00
1384468,139A97224FDBE43C,classic_bike,2024-07-31 03:33:00,2024-08-01 04:33:00,Broadway & E 14 St,5905.12,,,40.734546,-73.990741,,,casual,1 days 01:00:00
1377696,CB56138D86500796,classic_bike,2024-07-31 08:28:00,2024-08-01 09:28:00,Schermerhorn St & Hoyt St,4479.1,,,40.688626,-73.985191,,,member,1 days 01:00:00
1390889,F858DA99BCC494D5,classic_bike,2024-07-31 08:47:00,2024-08-01 09:46:00,Dean St & Hoyt St,4446.05,,,40.686444,-73.987591,40.68,-73.99,member,1 days 00:59:00
1384440,A2B45AF4616F9921,classic_bike,2024-07-31 09:26:00,2024-08-01 10:25:00,E 11 St & 3 Ave,5788.16,,,40.73127,-73.98849,,,member,1 days 00:59:00
246606,5ACA793D529F4F9D,classic_bike,2024-07-31 09:41:00,2024-08-01 08:03:00,Lorimer St & Calyer St,5709.04,Nassau Ave & Russell St,5581.01,40.72795,-73.95414,40.72557,-73.94434,member,0 days 22:22:00
1381347,97FDC1C0AAE282EE,electric_bike,2024-07-31 10:04:00,2024-08-01 02:31:00,Beverley Rd & Nostrand Ave,3132.09,,,40.64507,-73.9488,,,casual,0 days 16:27:00
1381348,D89EE7AF1193FC2C,electric_bike,2024-07-31 11:41:00,2024-08-01 03:11:00,Beverley Rd & Nostrand Ave,3132.09,,,40.64507,-73.9488,,,casual,0 days 15:30:00
1375083,43E05EC67660DAB2,classic_bike,2024-07-31 12:21:00,2024-08-01 13:21:00,Lafayette St & Jersey St,5561.06,,,40.724561,-73.995653,,,member,1 days 01:00:00
1385977,8067826EB66A0175,electric_bike,2024-07-31 12:41:00,2024-08-01 13:41:00,W 31 St & 7 Ave,6331.01,,,40.749156,-73.9916,,,casual,1 days 01:00:00


### Note on test data: for the 2025 dataset to be viable, it would need to be cleaned a second time in exactly the same way as the data above. Consider simply splitting the 2024 data into train/validation/test instead.