In [6]:
import numpy    as np
from numpy.testing._private.utils import decorate_methods
import pandas   as pd
import seaborn  as sns
import matplotlib.pyplot as plt
import sklearn  as skl
import time

from sklearn.pipeline import Pipeline      # Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import set_config
set_config(display='diagram') # Useful for display the pipeline

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor

In [7]:
data = pd.read_csv(r'data\london_merged.csv')
data['year'] = data['timestamp'].apply(lambda row: row[:4])
data['month'] = data['timestamp'].apply(lambda row: row.split('-')[2][:2] )
data['hour'] = data['timestamp'].apply(lambda row: row.split(':')[0][-2:] )
data.drop('timestamp', axis=1, inplace=True)
data.head()

Unnamed: 0,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,year,month,hour
0,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0,2015,4,0
1,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0,2015,4,1
2,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0,2015,4,2
3,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0,2015,4,3
4,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0,2015,4,4


In [8]:
def data_enhancement(data):
    
    gen_data = data
    
    for season in data['season'].unique():
        seasonal_data =  gen_data[gen_data['season'] == season]
        hum_std = seasonal_data['hum'].std()
        wind_speed_std = seasonal_data['wind_speed'].std()
        t1_std = seasonal_data['t1'].std()
        t2_std = seasonal_data['t2'].std()
        
        for i in gen_data[gen_data['season'] == season].index:
            if np.random.randint(2) == 1:
                gen_data['hum'].values[i] += hum_std/10
            else:
                gen_data['hum'].values[i] -= hum_std/10
                
            if np.random.randint(2) == 1:
                gen_data['wind_speed'].values[i] += wind_speed_std/10
            else:
                gen_data['wind_speed'].values[i] -= wind_speed_std/10
                
            if np.random.randint(2) == 1:
                gen_data['t1'].values[i] += t1_std/10
            else:
                gen_data['t1'].values[i] -= t1_std/10
                
            if np.random.randint(2) == 1:
                gen_data['t2'].values[i] += t2_std/10
            else:
                gen_data['t2'].values[i] -= t2_std/10

    return gen_data

print(data.head(3))
gen = data_enhancement(data)
print(gen.head(3) )

   cnt   t1   t2   hum  wind_speed  weather_code  is_holiday  is_weekend  \
0  182  3.0  2.0  93.0         6.0           3.0         0.0         1.0   
1  138  3.0  2.5  93.0         5.0           1.0         0.0         1.0   
2  134  2.5  2.5  96.5         0.0           1.0         0.0         1.0   

   season  year month hour  
0     3.0  2015    04   00  
1     3.0  2015    04   01  
2     3.0  2015    04   02  
   cnt        t1       t2        hum  wind_speed  weather_code  is_holiday  \
0  182  3.379372  1.51169  94.089517    5.109105           3.0         0.0   
1  138  3.379372  2.98831  91.910483    4.109105           1.0         0.0   
2  134  2.879372  2.98831  97.589517   -0.890895           1.0         0.0   

   is_weekend  season  year month hour  
0         1.0     3.0  2015    04   00  
1         1.0     3.0  2015    04   01  
2         1.0     3.0  2015    04   02  


In [9]:
y = data['cnt']
x = data.drop(['cnt'], axis=1)


cat_vars = ['season','is_weekend','is_holiday','year','month','weather_code']
num_vars = ['t1','t2','hum','wind_speed']


x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2,random_state=0) # Recommended for reproducibility

extra_sample = gen.sample(gen.shape[0] // 3)
x_train = pd.concat([x_train, extra_sample.drop(['cnt'], axis=1 ) ])
y_train = pd.concat([y_train, extra_sample['cnt'] ])

transformer = PowerTransformer()
y_train = transformer.fit_transform(y_train.values.reshape(-1,1))
y_val = transformer.transform(y_val.values.reshape(-1,1))                     

In [10]:
num_4_treeModels = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-9999)),
])

cat_4_treeModels = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder()) # handle_unknown='ignore' ONLY IN VERSION 0.24
])

tree_prepro = ColumnTransformer(transformers=[
    ('num', num_4_treeModels, num_vars),
    ('cat', cat_4_treeModels, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

In [11]:
tree_classifiers = {
  "Decision Tree": DecisionTreeRegressor(),
  "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
  "Random Forest": RandomForestRegressor(n_estimators=100),
  "AdaBoost":      AdaBoostRegressor(n_estimators=100),
  "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
  "XGBoost":       XGBRegressor(n_estimators=100),
  "LightGBM":      LGBMRegressor(n_estimators=100),
  "CatBoost":      CatBoostRegressor(n_estimators=100),
}

tree_classifiers = {name: make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

results = pd.DataFrame({'Model': [],'r_score':[], 'MSE': [], 'MAB': [], " % error": [], 'Time': []})

In [12]:
rang = abs(y_train.max()) + abs(y_train.min())
for model_name, model in tree_classifiers.items():
    
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(x_val)
    
    results = results.append({"Model":    model_name,
                              "r_score": r2_score(y_val, pred),
                              "MSE": mean_squared_error(y_val, pred),

                              "MAB": mean_absolute_error(y_val, pred),
                              " % error": mean_squared_error(y_val, pred) / rang,
                              "Time":     total_time},
                              ignore_index=True)

results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['MSE', 'MAE'], vmin=0, vmax=100, color='#5fba7d')

print(results_ord)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


Learning rate set to 0.42641
0:	learn: 0.8876335	total: 184ms	remaining: 18.2s
1:	learn: 0.8420592	total: 192ms	remaining: 9.42s
2:	learn: 0.8188574	total: 201ms	remaining: 6.5s
3:	learn: 0.8041772	total: 206ms	remaining: 4.94s
4:	learn: 0.7965034	total: 211ms	remaining: 4.02s
5:	learn: 0.7903478	total: 218ms	remaining: 3.42s
6:	learn: 0.7854029	total: 225ms	remaining: 2.99s
7:	learn: 0.7827981	total: 231ms	remaining: 2.66s
8:	learn: 0.7797221	total: 237ms	remaining: 2.4s
9:	learn: 0.7772832	total: 243ms	remaining: 2.19s
10:	learn: 0.7757979	total: 249ms	remaining: 2.02s
11:	learn: 0.7740608	total: 256ms	remaining: 1.88s
12:	learn: 0.7727595	total: 263ms	remaining: 1.76s
13:	learn: 0.7711789	total: 269ms	remaining: 1.65s
14:	learn: 0.7706299	total: 278ms	remaining: 1.57s
15:	learn: 0.7686473	total: 285ms	remaining: 1.49s
16:	learn: 0.7667073	total: 292ms	remaining: 1.43s
17:	learn: 0.7642243	total: 298ms	remaining: 1.36s
18:	learn: 0.7633221	total: 303ms	remaining: 1.29s
19:	learn: 0.7

In [13]:
results_ord

Unnamed: 0,Model,r_score,MSE,MAB,% error,Time
1,Extra Trees,0.619481,0.379716,0.371464,0.0688,4.593081
2,Random Forest,0.580571,0.418544,0.454154,0.075835,7.022218
3,XGBoost,0.47172,0.527165,0.549387,0.095515,1.580891
4,LightGBM,0.436336,0.562475,0.578765,0.101913,0.573351
5,CatBoost,0.427897,0.570896,0.579297,0.103439,1.292781
6,Skl GBM,0.37643,0.622255,0.612694,0.112744,2.009511
7,AdaBoost,0.305429,0.693106,0.663385,0.125582,0.616376
8,Decision Tree,0.275407,0.723064,0.505918,0.13101,0.231952
