## London bicycle sharing prediction

The purpose of this project is to predict the number of bicycles being rented across London in future based on the database provided by TFL having the following information: date and time, number of bicycle rented, humidity, wind speed, weather condition and season.

In [124]:
import numpy as np
import pandas as pd

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

In [125]:
data = pd.read_csv("london_merged.csv")

In [126]:
data.head()

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


In [127]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     17414 non-null  object 
 1   cnt           17414 non-null  int64  
 2   t1            17414 non-null  float64
 3   t2            17414 non-null  float64
 4   hum           17414 non-null  float64
 5   wind_speed    17414 non-null  float64
 6   weather_code  17414 non-null  float64
 7   is_holiday    17414 non-null  float64
 8   is_weekend    17414 non-null  float64
 9   season        17414 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 1.3+ MB


### Preprocessing

In [128]:
def preprocess_inputs(df):
    df=df.copy()
    
    # Extract month, day and hour features from the timestamp column
    
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['month'] = df['timestamp'].apply(lambda x:x.month)
    df['day'] = df['timestamp'].apply(lambda x:x.day)
    df['hour'] = df['timestamp'].apply(lambda x:x.hour)
    df = df.drop('timestamp', axis=1)
    
    # One-hot encode weather_code column
    weather_dummies= pd.get_dummies (df['weather_code'], prefix='weather')
    df= pd.concat([df, weather_dummies], axis=1)
    df = df.drop('weather_code', axis=1)
    
    # Split df into X and y
    y= df['cnt']
    X=df.drop('cnt', axis=1)
    
    # Train-Test split
    X_train, X_test, y_train, y_test= train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    # Scale X
    scaler =StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    return X_train, X_test, y_train, y_test

In [129]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [130]:
X_train

Unnamed: 0,t1,t2,hum,wind_speed,is_holiday,is_weekend,season,month,day,hour,weather_1.0,weather_2.0,weather_3.0,weather_4.0,weather_7.0,weather_10.0,weather_26.0
1930,-0.793215,-0.900317,-0.592599,-0.372328,-0.149651,-0.632510,-1.336593,-1.021919,1.055657,0.068510,-0.738682,-0.54635,1.983638,-0.307663,-0.375177,-0.028655,-0.061548
14312,1.536335,1.439022,0.739356,-0.941867,-0.149651,-0.632510,-0.441295,0.424858,1.169583,-1.231778,-0.738682,1.83033,-0.504124,-0.307663,-0.375177,-0.028655,-0.061548
2542,-1.330803,-1.051242,1.019767,-1.384842,-0.149651,-0.632510,-1.336593,-0.732564,0.486026,-0.798349,1.353763,-0.54635,-0.504124,-0.307663,-0.375177,-0.028655,-0.061548
16732,-0.434823,-0.221154,1.089870,-1.637971,-0.149651,-0.632510,1.349300,1.582280,-1.108942,0.357464,-0.738682,-0.54635,1.983638,-0.307663,-0.375177,-0.028655,-0.061548
5815,0.281962,0.382546,-0.662702,-0.372328,-0.149651,-0.632510,0.454002,0.714214,-1.450721,1.513276,1.353763,-0.54635,-0.504124,-0.307663,-0.375177,-0.028655,-0.061548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,-1.062009,-1.353093,0.073378,1.273008,-0.149651,-0.632510,-1.336593,-0.732564,-0.995016,-0.653872,-0.738682,-0.54635,-0.504124,-0.307663,2.665406,-0.028655,-0.061548
17289,-1.241205,-1.126705,0.809458,-1.194996,-0.149651,-0.632510,1.349300,1.582280,1.511362,1.079846,1.353763,-0.54635,-0.504124,-0.307663,-0.375177,-0.028655,-0.061548
5192,1.177942,1.137172,-0.592599,-1.005149,-0.149651,1.581003,-0.441295,0.424858,-0.881089,1.657752,1.353763,-0.54635,-0.504124,-0.307663,-0.375177,-0.028655,-0.061548
12172,0.013168,0.156159,0.879561,-0.878585,-0.149651,1.581003,-1.336593,-0.443208,1.397436,-1.520731,1.353763,-0.54635,-0.504124,-0.307663,-0.375177,-0.028655,-0.061548


In [131]:
y_train

1930      948
14312      81
2542      603
16732    1012
5815      614
         ... 
10955    1864
17289     650
5192      967
12172     388
235       947
Name: cnt, Length: 12189, dtype: int64

### Training

In [132]:
model = XGBRegressor()
model.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

### Results

In [133]:
y_pred = model.predict(X_test)

In [134]:
y_pred

array([ 565.2626 ,  857.9845 , 1522.6805 , ...,  218.54388,  252.3063 ,
       2775.347  ], dtype=float32)

In [135]:
length_y= len(y_pred)

In [136]:
length_y

5225

In [156]:
# y_test are the actual values
y_test

14999     201
5504      867
10259    1435
15150    1566
345       693
         ... 
11357     229
9217      341
3733      205
2248      236
14506    2065
Name: cnt, Length: 5225, dtype: int64

In [138]:
error= y_test - y_pred

In [139]:
error

14999   -364.262573
5504       9.015503
10259    -87.680542
15150    -57.217041
345      213.196869
            ...    
11357    145.312790
9217     -25.270508
3733     -13.543884
2248     -16.306305
14506   -710.346924
Name: cnt, Length: 5225, dtype: float64

In [140]:
squared_error=(y_test - y_pred)**2

In [141]:
squared_error

14999    132687.222265
5504         81.279293
10259      7687.877444
15150      3273.789783
345       45452.904907
             ...      
11357     21115.806913
9217        638.598565
3733        183.436801
2248        265.895581
14506    504592.752192
Name: cnt, Length: 5225, dtype: float64

In [142]:
np.mean(squared_error)

130617.37823466369

In [143]:
rmse= np.sqrt(np.mean(squared_error))

In [144]:
print ("RMSE= {:.2f}".format(rmse))

RMSE= 361.41


In [145]:
print("RMSE establish on average we have {:.0f} bicycles off every prediction".format(rmse))

RMSE establish on average we have 361 bicycles off every prediction


In [146]:
y_train.describe()

count    12189.000000
mean      1139.557142
std       1084.749581
min          0.000000
25%        256.000000
50%        838.000000
75%       1667.000000
max       7860.000000
Name: cnt, dtype: float64

Observation: when compared the RMSE with the y_train distribution, the RMSE is decent

In [147]:
np.sum((y_test - y_test.mean())**2)

6161196246.401914

In [148]:
np.sum((y_test - y_pred)**2)

682475801.2761276

In [149]:
np.sum(y_test - y_pred)**2

437309801.0422353

In [150]:
r2= 1-(np.sum(y_test - y_pred)**2)/(np.sum((y_test - y_test.mean())**2))

In [151]:
r2

0.9290219328271485

In [152]:
print ("R2= {:.2f}".format(r2))

R2= 0.93


In [153]:
r2_100=r2*100

In [154]:
print("There is {:.0f}% reduction in total error from the base line model to our model".format(r2_100))

There is 93% reduction in total error from the base line model to our model


In [155]:
fig= px.scatter(
    x=y_pred,
    y=y_test,
    labels= {'x': "Predicted", 'y':"Actual"},
    title= "Actual vs. Predicted Values",
    width=700,
    height=700
)
fig.show()

### Conclusions

The model seems to perform fairly well as the Predicted values are equal to the Actual values. Also a little dispersion can be noticed.