In [84]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [94]:
df = pd.read_csv('bike_cleaned_data.csv')

In [160]:
df.sample()

Unnamed: 0,day,date,month,year,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
8413,Friday,16,11,2018,920,13,14.4,49,3.5,890,1.71,0.0,0.0,Autumn,No Holiday,Yes


In [161]:
df[df['Rented Bike Count']>3500]

Unnamed: 0,day,date,month,year,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
4818,Tuesday,19,6,2018,3556,18,24.1,57,2.9,1301,0.56,0.0,0.0,Summer,No Holiday,Yes


In [154]:
 df.select_dtypes(include=['int32','int64', 'float64']).columns

Index(['date', 'month', 'year', 'Rented Bike Count', 'Hour', 'Temperature(°C)',
       'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)'],
      dtype='object')

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   day                      8760 non-null   object 
 1   date                     8760 non-null   int64  
 2   month                    8760 non-null   int64  
 3   year                     8760 non-null   int64  
 4   Rented Bike Count        8760 non-null   int64  
 5   Hour                     8760 non-null   int64  
 6   Temperature(°C)          8760 non-null   float64
 7   Humidity(%)              8760 non-null   int64  
 8   Wind speed (m/s)         8760 non-null   float64
 9   Visibility (10m)         8760 non-null   int64  
 10  Solar Radiation (MJ/m2)  8760 non-null   float64
 11  Rainfall(mm)             8760 non-null   float64
 12  Snowfall (cm)            8760 non-null   float64
 13  Seasons                  8760 non-null   object 
 14  Holiday                 

# training & testing

In [106]:
X = df.drop("Rented Bike Count",axis=1)
y = df['Rented Bike Count']

In [107]:
y_transformed = np.log1p(y)

In [89]:

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression , Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [108]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['date', 'month', 'year', 'Hour', 'Temperature(°C)',
       'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)']),
        ('cat', OrdinalEncoder(), ['Holiday', 'Functioning Day']),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['day', 'Seasons'])
    ]

)

In [109]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [110]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [111]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [112]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.963014,82.493206
6,extra trees,0.964841,83.079775
5,random forest,0.95476,98.459026
4,decision tree,0.908468,128.961187
7,gradient boosting,0.927827,158.656289
9,mlp,0.933501,176.59061
1,svr,0.87954,177.631354
2,ridge,0.791838,283.523397
0,linear_reg,0.791844,283.525083
8,adaboost,0.758439,357.631308


# hyperparameter tuning

In [113]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [114]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['date', 'month', 'year', 'Hour', 'Temperature(°C)',
       'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)']),
        ('cat', OrdinalEncoder(), ['Holiday', 'Functioning Day']),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['day', 'Seasons'])
    ]

)

In [115]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

In [116]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [117]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [118]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


Parameters: { "max_features", "max_samples" } are not used.



In [119]:
search.best_params_

{'regressor__max_depth': None,
 'regressor__max_features': 'auto',
 'regressor__max_samples': 0.1,
 'regressor__n_estimators': 300}

In [120]:
search.best_score_

0.9644444377038737

# exporting model

In [122]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['date', 'month', 'year', 'Hour', 'Temperature(°C)',
       'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)']),
        ('cat', OrdinalEncoder(), ['Holiday', 'Functioning Day']),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['day', 'Seasons'])
    ]

)

In [123]:
xgb_params = {
    'max_depth': None,  # No maximum depth for the trees
    'colsample_bytree': 1.0,  # Equivalent to 'auto' (using all features)
    'subsample': 0.1,  # Use 10% of the data for training each tree
    'n_estimators': 300  
}

In [124]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**xgb_params))
])

In [125]:
pipeline.fit(X,y_transformed)

In [126]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [127]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [128]:
X.head()

Unnamed: 0,day,date,month,year,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,Friday,1,12,2017,0,-5.2,37,2.2,2000,0.0,0.0,0.0,Winter,No Holiday,Yes
1,Friday,1,12,2017,1,-5.5,38,0.8,2000,0.0,0.0,0.0,Winter,No Holiday,Yes
2,Friday,1,12,2017,2,-6.0,39,1.0,2000,0.0,0.0,0.0,Winter,No Holiday,Yes
3,Friday,1,12,2017,3,-6.2,40,0.9,2000,0.0,0.0,0.0,Winter,No Holiday,Yes
4,Friday,1,12,2017,4,-6.0,36,2.3,2000,0.0,0.0,0.0,Winter,No Holiday,Yes


In [129]:
X.columns

Index(['day', 'date', 'month', 'year', 'Hour', 'Temperature(°C)',
       'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day'],
      dtype='object')

In [162]:
X.iloc[4818].values

array(['Tuesday', 19, 6, 2018, 18, 24.1, 57, 2.9, 1301, 0.56, 0.0, 0.0,
       'Summer', 'No Holiday', 'Yes'], dtype=object)

In [163]:
data = [['Tuesday', 19, 6, 2018, 18, 24.1, 57, 2.9, 1301, 0.56, 0.0, 0.0,
       'Summer', 'No Holiday', 'Yes']]
columns = ['day', 'date', 'month', 'year', 'Hour', 'Temperature(°C)',
       'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day']

In [164]:
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,day,date,month,year,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,Tuesday,19,6,2018,18,24.1,57,2.9,1301,0.56,0.0,0.0,Summer,No Holiday,Yes


In [166]:
np.expm1(pipeline.predict(one_df))

array([3563.9336], dtype=float32)