## Import Libraries

In [1]:
import numpy    as np
from numpy.testing._private.utils import decorate_methods
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl
import time

from sklearn import pipeline            # Pipeline
from sklearn import preprocessing       # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection     # train_test_split
from sklearn import metrics             # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor

## Read CSV file

In [2]:
data = pd.read_csv(r'data/london_merged.csv')
data.head(5)

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


In [3]:
np.random.seed(0)

## Preprocess the Data

In [4]:
data['year'] = data['timestamp'].apply(lambda row: row[:4])
data['month'] = data['timestamp'].apply(lambda row: row.split('-')[2][:2] )
data['hour'] = data['timestamp'].apply(lambda row: row.split(':')[0][-2:] )

In [5]:
data.drop('timestamp', axis=1, inplace=True)
data.head(3)

Unnamed: 0,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,year,month,hour
0,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0,2015,4,0
1,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0,2015,4,1
2,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0,2015,4,2


## Enhance Data

In [6]:
def data_enhancement(data):
    
    gen_data = data
    
    for season in data['season'].unique():
        seasonal_data =  gen_data[gen_data['season'] == season]
        hum_std = seasonal_data['hum'].std()
        wind_speed_std = seasonal_data['wind_speed'].std()
        t1_std = seasonal_data['t1'].std()
        t2_std = seasonal_data['t2'].std()
        
        for i in gen_data[gen_data['season'] == season].index:
            if np.random.randint(2) == 1:
                gen_data['hum'].values[i] += hum_std/10
            else:
                gen_data['hum'].values[i] -= hum_std/10
                
            if np.random.randint(2) == 1:
                gen_data['wind_speed'].values[i] += wind_speed_std/10
            else:
                gen_data['wind_speed'].values[i] -= wind_speed_std/10
                
            if np.random.randint(2) == 1:
                gen_data['t1'].values[i] += t1_std/10
            else:
                gen_data['t1'].values[i] -= t1_std/10
                
            if np.random.randint(2) == 1:
                gen_data['t2'].values[i] += t2_std/10
            else:
                gen_data['t2'].values[i] -= t2_std/10

    return gen_data

In [7]:
gen = data_enhancement(data)
print(gen.head(3) )

   cnt        t1       t2        hum  wind_speed  weather_code  is_holiday  \
0  182  3.379372  1.51169  91.910483    6.890895           3.0         0.0   
1  138  3.379372  2.98831  94.089517    5.890895           1.0         0.0   
2  134  2.879372  2.01169  97.589517    0.890895           1.0         0.0   

   is_weekend  season  year month hour  
0         1.0     3.0  2015    04   00  
1         1.0     3.0  2015    04   01  
2         1.0     3.0  2015    04   02  


In [8]:
y = data['cnt']
x = data.drop(['cnt'], axis=1)

In [9]:
cat_vars = ['season','is_weekend','is_holiday','year','month','weather_code']
num_vars = ['t1','t2','hum','wind_speed']

In [10]:
x_train, x_val, y_train, y_val = model_selection.train_test_split(x, y, test_size=0.2, random_state=0  )

In [11]:
extra_sample = gen.sample(gen.shape[0] // 3)

In [12]:
x_train = pd.concat([x_train, extra_sample.drop(['cnt'], axis=1 ) ])
y_train = pd.concat([y_train, extra_sample['cnt'] ])

In [13]:
transformer = preprocessing.PowerTransformer()
y_train = transformer.fit_transform(y_train.values.reshape(-1,1))
y_val = transformer.transform(y_val.values.reshape(-1,1))

In [14]:
rang = abs(y_train.max()) + abs(y_train.min())

In [15]:
rang = abs(y_train.max()) + abs(y_train.min())

In [16]:
num_4_treeModels = pipeline.Pipeline(steps=[ ('imputer', impute.SimpleImputer(strategy='constant', fill_value=-9999)),])

In [17]:
cat_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', preprocessing.OrdinalEncoder()) # handle_unknown='ignore' ONLY IN VERSION 0.24
])

In [18]:
tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_treeModels, num_vars),
    ('cat', cat_4_treeModels, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

In [19]:
tree_classifiers = {
                        "Decision Tree": DecisionTreeRegressor(),
                        "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
                        "Random Forest": RandomForestRegressor(n_estimators=100),
                        "AdaBoost":      AdaBoostRegressor(n_estimators=100),
 
                    }

In [20]:
tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

In [21]:
results = pd.DataFrame({'Model': [], 'MSE': [], 'MAB': [], " % error": [], 'Time': []})

In [None]:
for model_name, model in tree_classifiers.items():
    
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(x_val)
    
    results = results.append({"Model":      model_name,
                              "MSE":        metrics.mean_squared_error(y_val, pred),
                              "MAB":        metrics.mean_absolute_error(y_val, pred),
                              " % error":   metrics.mean_squared_error(y_val, pred) / rang,
                              "Time":       total_time},
                              ignore_index=True)

In [23]:
results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
results_ord.index += 1 

results_ord.style.bar(subset=['MSE', 'MAB'], vmin=0, vmax=100, color='#5fba7d')

Unnamed: 0,Model,MSE,MAB,% error,Time
1,Extra Trees,0.387647,0.374526,0.070216,7.178797
2,Random Forest,0.427189,0.456718,0.077378,13.328864
3,AdaBoost,0.697742,0.665951,0.126385,0.810713
4,Decision Tree,0.750936,0.524668,0.13602,0.40009


In [24]:
print(results_ord)

           Model       MSE       MAB   % error       Time
1    Extra Trees  0.387647  0.374526  0.070216   7.178797
2  Random Forest  0.427189  0.456718  0.077378  13.328864
3       AdaBoost  0.697742  0.665951  0.126385   0.810713
4  Decision Tree  0.750936  0.524668  0.136020   0.400090


In [25]:
print(y_train.max())
print(y_train.min())
print(y_val[3])
print(tree_classifiers['Random Forest'].predict(x_val)[3])

2.766201853700867
-2.754580496720054
[0.56668132]
1.0077299589148156
