# Modelling
<u>Tests using the following models :</u>
* Linear regression
* Random forest regressor
* Ridge and Lasso Regularization (add on to linear modelling?)

<u> Tests using the following variables:</u>
* Weather variables (rain, temperature, windspeed)
* Time variables (Day of week, month, year, time of day, public holiday)
* Sensor environment variables:
    * Sensor_id
    * Betweenness of the street 
    * Buildings in proximity to the sensor
    * Landmarks in proximity to the sensor  
    * Furniture in proximity to the sensor    
    * Lights in proximity to the sensor   


Normalise variables: should this be with MinMax or StandardScaler??


Process:
* Keep only data from sensors with relatively complete data
* Split data into training (75%) and test (25%)
* Define the models to use in testing (linear regression, random forest, xgboost)
* Define the error metrics to use in evaluating the model performance

In [1]:
import copy
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, mean_squared_error,r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
import time as thetime
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier, XGBRegressor
from time import time
from sklearn.inspection import permutation_importance

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

import multiprocessing

# To display tables in HTML output
from IPython.display import HTML, display

from Functions import *

In [None]:
# start = time()
# model_output = cross_validate(model, Xfull, Yfull, cv=10, scoring=error_metrics ,return_estimator=True, error_score="raise")
# end = time()

In [None]:
# r = permutation_importance(model_output['estimator'][0], Xfull,Yfull, n_repeats=2, random_state=0)

In [None]:
# len(feature_importances) 
# len(r.importances_mean.argsort()[::-1])

In [None]:
# import eli5
# from eli5.sklearn import PermutationImportance

# perm = PermutationImportance(model_output['estimator'][0], random_state=1).fit(Xfull, Yfull)
# eli5.show_weights(perm, feature_names = Yfull.columns.tolist())

In [2]:
def run_model_with_cv(model, model_name, metrics, cv, Xfull, Yfull, regex_name, regex_pattern):
    """Cross-validate ``model`` on a regex-filtered subset of the predictors.

    Columns of ``Xfull`` whose names match ``regex_pattern`` are dropped. The
    remaining predictors are standardised and the model is evaluated with
    ``cross_validate``. A progress message and the elapsed time are printed.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.
    model_name : str
        Label used in the progress message and in the score table index.
    metrics : list of str
        Scorer names passed to ``cross_validate``. Must include
        'neg_mean_absolute_error', 'r2' and 'neg_root_mean_squared_error',
        because those three scores are read back from the result.
    cv : int or cross-validation splitter
        Passed straight through to ``cross_validate``.
    Xfull : pandas.DataFrame
        All candidate predictor variables.
    Yfull : array-like
        Target values, aligned with the rows of ``Xfull``.
    regex_name : str
        Label for the variable subset (used in messages and the table index).
    regex_pattern : str
        Regular expression; columns whose names match it are removed.

    Returns
    -------
    list
        ``[estimators, df, feature_list]`` where ``estimators`` holds the
        fitted model from each fold, ``df`` is a one-row DataFrame with the
        mean 'mae', 'r2' and 'rmse' over the folds (rounded to 2 decimals),
        and ``feature_list`` names the predictors that were kept.
    """
    # Local import: the pipeline helper is not among the file-level imports
    from sklearn.pipeline import make_pipeline

    print("Running {} model, variables include {}".format(model_name, regex_name))

    # Remove every column whose name matches the regex pattern
    unwanted_columns = Xfull.filter(regex=regex_pattern).columns
    Xfull = Xfull.drop(columns=unwanted_columns)
    # Names of the predictors that remain after filtering
    feature_list = list(Xfull.columns)

    # Standardise inside a pipeline so the scaler is fitted once per fold, on
    # that fold's training rows only. Scaling the whole dataset up front would
    # leak validation-fold statistics into training.
    scaled_model = make_pipeline(StandardScaler(), model)

    # Perform cross validation, time how long it takes
    start = time()
    model_output = cross_validate(scaled_model, Xfull, Yfull, cv=cv, scoring=metrics,
                                  return_estimator=True, error_score="raise")
    end = time()

    # Create a dataframe containing scores for each performance metric.
    # The two error scorers are negated by scikit-learn, so abs() restores the
    # error magnitude. R2 keeps its sign: a negative R2 is a meaningful result.
    df = pd.DataFrame(
        {'mae': round(abs(model_output['test_neg_mean_absolute_error'].mean()), 2),
         'r2': round(model_output['test_r2'].mean(), 2),
         'rmse': round(abs(model_output['test_neg_root_mean_squared_error'].mean()), 2)},
        index=["{}_{}".format(model_name, regex_name)])

    # Hand back the fitted model itself (the last pipeline step) so callers can
    # still read attributes such as feature_importances_ directly
    estimators = [fitted_pipeline[-1] for fitted_pipeline in model_output['estimator']]

    print('Ran in {} minutes'.format(round((end - start) / 60, 2)))
    return [estimators, df, feature_list]


In [None]:
# from sklearn.inspection import permutation_importance
# r = permutation_importance(rf_model, X_val, y_val, n_repeats=30, random_state=0)

# import eli5
# from eli5.sklearn import PermutationImportance

# perm = PermutationImportance(rf_model, random_state=1).fit(X_test, Y_test)
# eli5.show_weights(perm, feature_names = val_x.columns.tolist())

In [None]:
# def run_model_with_cv(model,model_name, metrics, cv, Xfull, Yfull, regex_name, regex_pattern):
#     print("Running {} model, variables include {}".format(model_name,  regex_name))

#     # Filter columns using the regex pattern in function input
#     Xfull = Xfull[Xfull.columns.drop(list(Xfull.filter(regex=regex_pattern)))].copy()
#     # Get list of all features
#     feature_list = list(Xfull.columns)
    
#     # Perform cross validation, time how long it takes
#     start = time()
#     model_output = cross_validate(model, Xfull, Yfull, cv=cv, scoring=metrics ,return_estimator=True, error_score="raise")
#     end = time()
    
#     #  Create a dataframe containng scores for each performance metric
#     df =pd.DataFrame({'mae': round(abs(model_output['test_neg_mean_absolute_error'].mean()),2), 
#          'r2': round(abs(model_output['test_r2'].mean()),2), 'rmse': round(abs(model_output['test_neg_root_mean_squared_error'].mean()),2)},
#                      index =["{}_{}".format(model_name, regex_name)])
    
#     # Get the estimators 
#     estimators = model_output['estimator']
    
#     print('Ran in {} minutes'.format(round((end - start)/60),2))
#     return [estimators, df, feature_list]


### Read in formatted data

In [3]:
# Load the pre-formatted modelling table; index_col=False stops pandas from
# using the first column as the row index
data = pd.read_csv("formatted_data_for_modelling.csv", index_col = False)

### Keep only sensors with relatively complete data

In [4]:
# Restrict the table to the sensors known to have fairly complete records,
# then renumber the rows from zero
has_complete_record = data['sensor_id'].isin([2,6,9,10,14,18])
data = data[has_complete_record].reset_index(drop=True)

In [5]:
# data = data.drop(['Pressure', 'Humidity'],axis=1) # seem obviously irrelevant
# The sensor identifier is not wanted among the predictors
data = data.drop(columns=['sensor_id'])
# Discard every column whose largest non-NaN value is zero, i.e. columns in
# which none of the sensors have a value
all_zero_columns = [name for name in data.columns if np.nanmax(data[name]) == 0]
data = data.drop(columns=all_zero_columns)

## Prepare data for modelling - split into predictor/predictand variables

In [6]:
# The predictor variables
Xfull = data.drop(['hourly_counts'], axis =1)
# Xfull['random'] = np.random.random(size=len(Xfull))

# The variable to be predicted
Yfull = data['hourly_counts'].values

# Split data into training (75%) and test (25%) sets. test_size is the share
# held out for TESTING, so it must be 0.25 to give the intended 75/25 split.
# Xfull/Yfull are deliberately left unscaled here: they are passed to
# run_model_with_cv further down, which handles scaling itself.
X_train, X_test, Y_train, Y_test = train_test_split(Xfull, Yfull, test_size=0.25, random_state=123)

#### Standardize both training and testing data
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no information from the test set influences the scaling
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

## Define models (linear regression, random forest and XGBoost)

In [7]:
# Candidate regressors. The random seeds are fixed for the two tree-based models.
# NOTE(review): n_jobs is hard-coded (64 and 200); confirm these suit the
# machine this notebook runs on.
lr_model = LinearRegression()
rf_model = RandomForestRegressor(
    n_estimators=500,
    random_state=1,
    n_jobs=64,
)
xgb_model = XGBRegressor(
    random_state=1,
    n_jobs=200,
)

## Run models with cross-validation

#### Define the error metrics for the cross-validation to return, and the parameters of the cross-validation

In [8]:
# KFold is not among the names imported explicitly in the import cell, so
# import it here rather than depend on a wildcard import supplying it
from sklearn.model_selection import KFold

# Scorers reported for every cross-validation run. The two error scorers are
# negated by scikit-learn, hence the 'neg_' prefix.
error_metrics = ['neg_mean_absolute_error', 'r2', 'neg_root_mean_squared_error']
# 10 shuffled folds with a fixed seed, so every model is scored on the same splits
cv_parameters = KFold(n_splits=10, random_state=1, shuffle=True)

#### Define regexes that remove the columns not needed for each variable subset

In [11]:
# Maps a label for each variable subset to a regex. Columns whose names match
# the regex are REMOVED before modelling, so each pattern lists what to leave out.
column_regex_dict = {}
# Drop columns whose names end in buildings / furniture / landmarks
column_regex_dict['withsubtypes'] = 'buildings$|furniture$|landmarks$'
# Drop columns containing buildings_ / furniture_ / landmarks_ (and sensor_id)
column_regex_dict['nosubtyes'] = 'buildings_|furniture_|landmarks_|sensor_id'
# Drop the sensor-environment columns
column_regex_dict['time_and_weather'] = 'buildings|furniture|landmarks|h_|lights|avg_n_floors|betweenness'
# Drop the holiday, weather and Sin/Cos columns, plus the columns ending in
# buildings / furniture / landmarks
column_regex_dict['just_location_features'] = 'buildings$|furniture$|landmarks$|school_holiday|public_holiday|Temp|Humidity|Pressure|Rain|WindSpeed|Sin|Cos'

#### Loop through each combination of the models, and the variables to include in the modelling

In [16]:
# One-row score tables, one per model/variable-subset combination. They are
# combined into a single dataframe after the loop.
score_frames = []

# Dictionary to store dataframes of feature importance scores
feature_importance_scores = {}

models_dict = {"linear_regression": lr_model, "xgboost": xgb_model}
for model_name, model in models_dict.items():
    for regex_name, regex in column_regex_dict.items():
        # Run the model: return the estimators and a dataframe containing evaluation metrics
        estimators, error_metrics_df, feature_list = run_model_with_cv(
            model, model_name, error_metrics, cv_parameters, Xfull, Yfull, regex_name, regex)
        # Keep the evaluation metric scores for this model
        score_frames.append(error_metrics_df)

        # Create dataframe of feature importances (no feature importances for linear regression)
        if model_name != 'linear_regression':
            # One column per cross-validation fold, one row per feature.
            # index=feature_list (not [feature_list]) gives a plain index
            # rather than a one-level MultiIndex.
            feature_importances = pd.DataFrame(
                {'Estimator{}'.format(idx): estimator.feature_importances_
                 for idx, estimator in enumerate(estimators)},
                index=feature_list)
            feature_importance_scores["{}_{}".format(model_name, regex_name)] = feature_importances

# Dataframe of the scores for each model. DataFrame.append was removed in
# pandas 2.0, so the rows are concatenated once here instead.
error_metric_scores = pd.concat(score_frames)

Running linear_regression model, variables include withsubtypes
Ran in 0 minutes
Running linear_regression model, variables include nosubtyes
Ran in 0 minutes
Running linear_regression model, variables include time_and_weather
Ran in 0 minutes
Running linear_regression model, variables include just_location_features
Ran in 0 minutes
Running xgboost model, variables include withsubtypes
Ran in 2 minutes
Running xgboost model, variables include nosubtyes
Ran in 1 minutes
Running xgboost model, variables include time_and_weather
Ran in 1 minutes
Running xgboost model, variables include just_location_features
Ran in 1 minutes


In [17]:
error_metric_scores

Unnamed: 0,mae,r2,rmse
linear_regression_withsubtypes,260.3,0.48,354.47
linear_regression_nosubtyes,264.22,0.45,363.75
linear_regression_time_and_weather,305.05,0.25,425.36
linear_regression_just_location_features,332.51,0.18,443.75
xgboost_withsubtypes,99.31,0.87,177.73
xgboost_nosubtyes,119.82,0.82,209.14
xgboost_time_and_weather,275.87,0.33,401.24
xgboost_just_location_features,330.65,0.19,442.23


## Fit best performing model and get feature importances?

### Create dataframe containing the feature importances from each of the estimators

### Find the best model  
Use k-fold cross validation to evaluate a range of regression algorithms on the training data. Use a pipeline for evaluation which first scales the (weather) data. Print the results and assess which models perform best.

The following models were trialled:

* Decision Tree
* Random Forest
* Extra Trees
* Dummy Regressor
* Elastic Net CV
* Passive Aggressive
* RANSAC
* SGD
* TheilSen (dropped in code below because it takes too long)
* K Neighbours
* LinearRegression
* XGBoost

In [None]:
# # Define a list of all the models to use
# Models = {'LinearRegression': LinearRegression,'DecisionTree' : DecisionTreeRegressor,
#           'RandomForest': RandomForestRegressor, 'ExtraTrees' : ExtraTreesRegressor,
#           'DummyRegressor' :DummyRegressor, 'ElasticNetCV' : ElasticNetCV, 
#           'PassiveAggressive' : PassiveAggressiveRegressor, #RANSAC': RANSACRegressor, # This one is terrible too
#           'SGD': SGDRegressor, #'TheilSen': TheilSenRegressor, # Drop this - it isn't great and takes too long
#           'KN': KNeighborsRegressor}#, 'XGBoost': xgb.XGBRegressor}
 
# # Now just run each model, but do this in multiple processes simultaneously to save time    
# # Now call that function simultaneously for each model
# p = Pool(processes=None) # A pool of processes (one for each core)
# results = p.map(run_model, [(name, model_type) for name, model_type in Models.items()])

# # Sort the results by median mse (that's item 5 in the tuple)
# results.sort(key=lambda x: x[5], reverse=True)

# # Put the results in a nice dictionary and print them
# results_dict = {}
# txt = "<table><thead><td>Name</td><td>Median R2</td><td>Median MSE</td><td>runtime (sec)</td></thead>"
# for name, model, all_r2, r2, all_mse, mse, runtime in results:
#     txt += "<tr><td>{}</td><td>{}</td><td>{}</td><td>{}</td></tr>".format(name, r2, mse, runtime)
#     results_dict[name] = (model, all_r2, r2, all_mse, mse, runtime)
# txt += "</table>"
# display(HTML(txt)) # print as html

# min_mse = min([mse for (name, model, all_r2, r2, all_mse, mse, runtime) in results])
               
# x =  [ name for (name, model, all_r2, r2, all_mse, mse, runtime) in results]
# y1 = [ mse-min_mse   for (name, model, all_r2, r2, all_mse, mse, runtime) in results]
# y2 = [ r2 if r2 > 0 else 0 for (name, model, all_r2, r2, all_mse, mse, runtime) in results]

# fig, (ax1,ax2) = plt.subplots(1, 2, figsize=(15, 7))

# ax1.set_title("MSE")
# #ax1.invert_yaxis()
# ax1.bar(range(len(x)), y1)
# ax1.set_xticks(range(len(x)))
# ax1.set_xticklabels(x, rotation=90)
# ax1.set_ylim([27000000000, 29000000000])

# ax2.set_title("R^2")
# ax2.bar(range(len(x)), y2)
# ax2.set_xticks(range(len(x)))
# ax2.set_xticklabels(x, rotation=90)

# plt.show()

# #del x,y1, y2

# ## Set up a dictionary containing the hyperparameters we want to tune
# hyperparameters_rf = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
#                   'randomforestregressor__max_depth': [None, 5, 3, 1]}
# # hyperparameters_xgb = {'xgbregressor__max_depth': range(1, 11, 2),
# #                    'xgbregressor__n_estimators' : range(50, 400, 50),
# #                    'xgbregressor__learning_rate' : [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]}
# hyperparameters_lr = {}

# # Set up the pipeline containing the scalers
# pipeline_rf = make_pipeline(MinMaxScaler(feature_range = (0,1)), 
#                          RandomForestRegressor(n_estimators=100))
# # pipeline_xgb = make_pipeline(MinMaxScaler(feature_range = (0,1)),
# #                          xgb.XGBRegressor(n_estimators=100))
# pipeline_lr = make_pipeline(MinMaxScaler(feature_range = (0,1)),
#                          LinearRegression())

# # Store the scores in a results dictionary (and print them)
# final_results = {}
# for model_values in [(pipeline_rf,  hyperparameters_rf,  'RandomForest'),
# #                      (pipeline_xgb, hyperparameters_xgb, 'XGBoost'),
#                      (pipeline_lr,  hyperparameters_lr,  'LinearRegression')]:
    
#     clf = GridSearchCV(model_values[0], model_values[1], 
#                        #cv = None, # Cross-validation method. None means default (3-fold)
#                        cv = 10, # positive intiger means k-fold (e.g. 10-fold)
#                        #scoring  = 'neg_mean_squared_error', # MSE to calculate score
#                        scoring  = 'r2', # MSE to calculate score
#                        n_jobs=multiprocessing.cpu_count()) # Run on multiple cores
    
#     #clf = GridSearchCV(model_values[0], model_values[1], cv = 10, scoring  = 'r2')
#     clf.fit(X_validate, Y_validate)
#     name = model_values[2]
#     final_results[name] = clf
#     print ("Hyperparameter results for {}".format(name))
#     print ("\tBest Score: {}".format(clf.best_score_))
#     print ("\tBest params: {}".format(clf.best_params_))