In [1]:
import numpy    as np
from numpy.testing._private.utils import decorate_methods
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl
import time

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor

In [2]:
""" Read CSV file """

data = pd.read_csv(r'data\london_merged.csv')

data.head(4)

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0


In [3]:
""" Set the seed. This will make the np.random predictable """
np.random.seed(0)

In [4]:
""" Split the data into x and y  """

# target = data['cnt']
# data = data.drop(['cnt'], axis=1)

# data.head(3)
#target.head(3)

' Split the data into x and y  '

In [5]:
# # Print data shape
# print(target.shape)
# print(data.shape)

# # Take a look at nulls 0 nulls
# print(target.isnull().sum())
# print(data.isnull().sum())

In [6]:
data['year'] = data['timestamp'].apply(lambda row: row[:4])
data['month'] = data['timestamp'].apply(lambda row: row.split('-')[2][:2] )
data['hour'] = data['timestamp'].apply(lambda row: row.split(':')[0][-2:] )

data.drop('timestamp', axis=1, inplace=True)

data.head(3)

Unnamed: 0,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,year,month,hour
0,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0,2015,4,0
1,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0,2015,4,1
2,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0,2015,4,2


In [7]:
def data_enhancement(data):
    
    gen_data = data
    
    for season in data['season'].unique():
        seasonal_data =  gen_data[gen_data['season'] == season]


        hum_std = seasonal_data['hum'].std()
        wind_speed_std = seasonal_data['wind_speed'].std()
        t1_std = seasonal_data['t1'].std()  
        t2_std = seasonal_data['t2'].std() 
        
        for i in gen_data[gen_data['season'] == season].index:
            if np.random.randint(2) == 1:
                gen_data['hum'].values[i] += hum_std/10
            else:
                gen_data['hum'].values[i] -= hum_std/10
                
            if np.random.randint(2) == 1:
                gen_data['wind_speed'].values[i] += wind_speed_std/10
            else:
                gen_data['wind_speed'].values[i] -= wind_speed_std/10
                
            if np.random.randint(2) == 1:
                gen_data['t1'].values[i] += t1_std/10
            else:
                gen_data['t1'].values[i] -= t1_std/10
                
            if np.random.randint(2) == 1:
                gen_data['t2'].values[i] += t2_std/10 
            else:
                gen_data['t2'].values[i] -= t2_std/10

    return gen_data

In [8]:
print(data.head(3))

gen = data_enhancement(data)
print(gen.head(3) )

#print(gen.shape)

   cnt   t1   t2   hum  wind_speed  weather_code  is_holiday  is_weekend  \
0  182  3.0  2.0  93.0         6.0           3.0         0.0         1.0   
1  138  3.0  2.5  93.0         5.0           1.0         0.0         1.0   
2  134  2.5  2.5  96.5         0.0           1.0         0.0         1.0   

   season  year month hour  
0     3.0  2015    04   00  
1     3.0  2015    04   01  
2     3.0  2015    04   02  
   cnt        t1       t2        hum  wind_speed  weather_code  is_holiday  \
0  182  3.379372  1.51169  91.910483    6.890895           3.0         0.0   
1  138  3.379372  2.98831  94.089517    5.890895           1.0         0.0   
2  134  2.879372  2.01169  97.589517    0.890895           1.0         0.0   

   is_weekend  season  year month hour  
0         1.0     3.0  2015    04   00  
1         1.0     3.0  2015    04   01  
2         1.0     3.0  2015    04   02  


In [9]:
#final_data = data
y = data['cnt']
x = data.drop(['cnt'], axis=1)

In [10]:
cat_vars = ['season','is_weekend','is_holiday','year','month','weather_code']
num_vars = ['t1','t2','hum','wind_speed']

x_train, x_val, y_train, y_val = model_selection.train_test_split(x, y, test_size=0.2, random_state=0 ) # Recommended for reproducibility

In [11]:
extra_sample = gen.sample(gen.shape[0] // 3)

x_train = pd.concat([x_train, extra_sample.drop(['cnt'], axis=1 ) ])
y_train = pd.concat([y_train, extra_sample['cnt'] ])


transformer = preprocessing.PowerTransformer()

y_train = transformer.fit_transform(y_train.values.reshape(-1,1))
y_val = transformer.transform(y_val.values.reshape(-1,1))

In [12]:

rang = abs(y_train.max()) - abs(y_train.min())

In [13]:
num_4_treeModels = pipeline.Pipeline(steps=[  ('imputer', impute.SimpleImputer(strategy='constant', fill_value=-9999)),])

cat_4_treeModels = pipeline.Pipeline(steps=[ ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),  ('ordinal', preprocessing.OrdinalEncoder()) ])

tree_prepro = compose.ColumnTransformer(transformers=[     ('num', num_4_treeModels, num_vars),  ('cat', cat_4_treeModels, cat_vars),  ], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

In [14]:
tree_classifiers = {
  
  "Decision Tree": DecisionTreeRegressor(),
  "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
  "Random Forest": RandomForestRegressor(n_estimators=100),
  "AdaBoost":      AdaBoostRegressor(n_estimators=100),
  "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
  "XGBoost":       XGBRegressor(n_estimators=100),
  "LightGBM":      LGBMRegressor(n_estimators=100),
  "CatBoost":      CatBoostRegressor(n_estimators=100),
}
### END SOLUTION


In [15]:
tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

results = pd.DataFrame({'Model': [], 'MSE': [], 'MAB': [], " % error": [], 'Time': []})

In [16]:
for model_name, model in tree_classifiers.items():
    
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(x_val)
    
    results = results.append({"Model":    model_name,
                              "MSE": metrics.mean_squared_error(y_val, pred),
                              "MAB": metrics.mean_absolute_error(y_val, pred),
                              " % error": metrics.mean_squared_error(y_val, pred) / rang,
                              "Time":     total_time},
                              ignore_index=True)
### END SOLUTION

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Learning rate set to 0.42641
0:	learn: 0.8888793	total: 155ms	remaining: 15.4s
1:	learn: 0.8442251	total: 167ms	remaining: 8.2s
2:	learn: 0.8215277	total: 178ms	remaining: 5.74s
3:	learn: 0.8078977	total: 189ms	remaining: 4.55s
4:	learn: 0.7997043	total: 199ms	remaining: 3.78s
5:	learn: 0.7936340	total: 209ms	remaining: 3.28s
6:	learn: 0.7888441	total: 222ms	remaining: 2.95s
7:	learn: 0.7868824	total: 232ms	remaining: 2.67s
8:	learn: 0.7817758	total: 257ms	remaining: 2.6s
9:	learn: 0.7801680	total: 264ms	remaining: 2.38s
10:	learn: 0.7778249	total: 274ms	remaining: 2.21s
11:	learn: 0.7757535	total: 288ms	remaining: 2.11s
12:	learn: 0.7743679	total: 301ms	remaining: 2.01s
13:	learn: 0.7720303	total: 322ms	remaining: 1.98s
14:	learn: 0.7695139	total: 341ms	remaining: 1.93s
15:	learn: 0.7686639	total: 352ms	remaining: 1.85s
16:	learn: 0.7672573	total: 359ms	remaining: 1.75s
17:	learn: 0.7665251	total: 376ms	remaining: 1.71s
18:	learn: 0.7647027	total: 393ms	remaining: 1.68s
19:	learn: 0.7

In [17]:

results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['MSE', 'MAE'], vmin=0, vmax=100, color='#5fba7d')

print(results_ord)

           Model       MSE       MAB    % error       Time
1    Extra Trees  0.387647  0.374526  33.356404  12.720417
2  Random Forest  0.427189  0.456718  36.758975  21.195471
3        XGBoost  0.532979  0.554602  45.862034   1.469127
4       LightGBM  0.563719  0.577875  48.507157   2.847920
5       CatBoost  0.581709  0.584727  50.055168   2.839109
6        Skl GBM  0.625755  0.614863  53.845264  10.921943
7       AdaBoost  0.697742  0.665951  60.039629   2.915734
8  Decision Tree  0.750936  0.524668  64.616883   0.636203


In [18]:
print(y_train.max())
print(y_train.min())
print(y_val[3]) 
print(tree_classifiers['Random Forest'].predict(x_val)[3])   

2.766201853700867
-2.754580496720054
[0.56668132]
1.0077299589148156
