# Model selection

Cross-validation is used here to select the best model. In this script it is used to test the best buffer size to draw around the sensors from within which to draw the environment variables. 

Tests the performance of a <u>Random Forest Regressor</u>

<u> The following variables are included in the model:</u>
* Weather variables (rain, temperature, windspeed)
* Time variables (Day of week, month, year, time of day, public holiday)
* Sensor environment variables:
    * Betweenness of the street 
    * Buildings in proximity to the sensor
    * Landmarks in proximity to the sensor  
    * Furniture in proximity to the sensor    
    * Lights in proximity to the sensor   


<u> Model performance is evaluated for a range of buffer sizes around the sensors within which the environment variables are counted</u>:
   * 50
   * 100
   * 200
   * 400
   * 500
   * 600
   * 1000

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor 
import xgboost as xgb
from sklearn.pipeline import Pipeline
import folium
import branca.colormap as cm
from eli5.sklearn import PermutationImportance
import joblib
import os
import psutil

from Functions import *

## Run models with cross-validation

### Define the error metrics for the cross-validation to return, and the parameters of the cross validation

In [2]:
error_metrics = ['neg_mean_absolute_error', 'r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_percentage_error']
cv_parameters = KFold(n_splits=10, random_state=1, shuffle=True)

### Use CV to return error metrics for the datasets produced with different buffer sizes

In [13]:
# # Dataframe to store the scores for all the models
# error_metric_scores = pd.DataFrame()

# # Set up model pipeline
# model = Pipeline(steps=[['scaler',StandardScaler()],['rf_regressor', RandomForestRegressor(random_state = 1, n_jobs = 10)]])

# # Define parameters
# model_name = 'rf_regressor'

# # Loop through each buffer size option
# for buffer_size_m in [50,100,200,400,500,600,1000]:
#     # Prepare the input data
#     Xfull, Yfull = prepare_x_y_data(buffer_size_m)
    
#     # Use cross_validate to return the error scores associated with this model and this data
#     start = time()
#     model_output = cross_validate(rf_model_pipeline, Xfull, Yfull, cv=cv_parameters, scoring=error_metrics, error_score="raise")
#     end = time()
#     print('Ran in {} minutes'.format(round((end - start)/60),2))
    
#     # Formulate the different error scores into a dataframe
#     error_metrics_df =pd.DataFrame({'mae': round(abs(model_output['test_neg_mean_absolute_error'].mean()),2), 
#                   'mape': round(abs(model_output['test_neg_mean_absolute_percentage_error'].mean()),2),
#                   'r2': round(abs(model_output['test_r2'].mean()),2), 
#                   'rmse': round(abs(model_output['test_neg_root_mean_squared_error'].mean()),2)},
#                  index =["{}".format(buffer_size_m)])
        
#     # Add evaluation metric scores for this model to the dataframe containing the metrics for each model
#     error_metric_scores = error_metric_scores.append(error_metrics_df)
#     # Save error scores for this distance to file
#     error_metrics_df.to_csv('PickleFiles/CV/{}/{}_error_metrics_{}m.csv'.format(buffer_size_m, model_name,buffer_size_m),index=False)    

# # Save dataframes of error metrics for each buffer distance 
# error_metric_scores.to_csv('error_metric_scores.csv')   

In [5]:
# Dataframe to store the scores for all the models
error_metric_scores = pd.DataFrame()

# Set up model pipeline
model_pipeline = Pipeline(steps=[['scaler',StandardScaler()],['rf_regressor', RandomForestRegressor(random_state = 1, n_jobs = 10)]])

# Define parameters
model_name = 'rf_regressor'

# Loop through each buffer size option
for buffer_size_m in [500]:
    # Prepare the input data
    Xfull, Yfull, data_time_columns = prepare_x_y_data(buffer_size_m)
    
    # Use cross_validate to return the error scores associated with this model and this data
    start = time()
    model_output = cross_validate(model_pipeline, Xfull, Yfull, cv=cv_parameters, scoring=error_metrics, error_score="raise")
    end = time()
    print('Ran in {} minutes'.format(round((end - start)/60),2))
    
    # Formulate the different error scores into a dataframe
    error_metrics_df =pd.DataFrame({'mae': round(abs(model_output['test_neg_mean_absolute_error'].mean()),2), 
                  'mape': round(abs(model_output['test_neg_mean_absolute_percentage_error'].mean()),2),
                  'r2': round(abs(model_output['test_r2'].mean()),2), 
                  'rmse': round(abs(model_output['test_neg_root_mean_squared_error'].mean()),2)},
                 index =["{}".format(buffer_size_m)])
        
    # Add evaluation metric scores for this model to the dataframe containing the metrics for each model
    error_metric_scores = error_metric_scores.append(error_metrics_df)
    # Save error scores for this distance to file
    #error_metrics_df.to_csv('PickleFiles/CV/{}/{}_error_metrics_{}m.csv'.format(buffer_size_m, model_name,buffer_size_m),index=False)    

# Save dataframes of error metrics for each buffer distance 
#error_metric_scores.to_csv('error_metric_scores.csv')   

Ran in 185 minutes


In [6]:
error_metric_scores

Unnamed: 0,mae,mape,r2,rmse
500,75.64,2108923000000000.0,0.91,149.93


### Print table showing error metrics associated with each buffer size

In [32]:
error_metric_scores

Unnamed: 0,mae,map,r2,rmse
50,61.34,1493958000000000.0,0.93,127.24
100,60.79,1508953000000000.0,0.93,126.02
200,60.27,1513131000000000.0,0.94,124.99
400,60.12,1521340000000000.0,0.94,125.01
500,59.55,1513783000000000.0,0.94,123.22
600,59.77,1508504000000000.0,0.94,123.83
1000,61.05,1474851000000000.0,0.93,126.53


In [11]:
def prepare_x_y_data(buffer_size_m):
    # Read in formatted data
    data = pd.read_csv("../Cleaned_data/FormattedDataForModelling/formatted_data_for_modelling_allsensors_{}_withsincos.csv".format(buffer_size_m), 
                       index_col = False)
    data = data.fillna(0)
    
    ### Delete unneeded columns - we currently include data from all sensors (even incomplete ones)
    sensor_ids = data['sensor_id']
    data = data.drop(['sensor_id'],axis=1) # don't want this included
    # Get rid of columns in which none of the sensors have a value
    for column in data.columns:
        if np.nanmax(data[column]) ==0:
            del data[column]
            
    # Filter columns using the regex pattern in function input
    regex_pattern = 'buildings$|furniture$|landmarks$'
    data = data[data.columns.drop(list(data.filter(regex=regex_pattern)))].copy()
    
    ### Add a random variable (to compare performance of other variables against)
    rng = np.random.RandomState(seed=42)
    data['random'] = np.random.random(size=len(data))
    data["random_cat"] = rng.randint(3, size=data.shape[0])
    
    ## Prepare data for modelling 
    ### Split into predictor/predictand variables
    Xfull = data.drop(['hourly_counts'], axis =1)
    Yfull = data['hourly_counts'].values
       
    ### Store the (non Sin/Cos) time columns and then remove them (Need them later to segment the results by hour of the day)
    data_time_columns = Xfull[['day_of_month_num', 'time', 'weekday_num', 'time_of_day']]
    Xfull = Xfull.drop(['day_of_month_num', 'time', 'weekday_num', 'time_of_day','year','datetime', 'month_num'],axis=1)
    return Xfull, Yfull, data_time_columns

In [23]:
Xfull, Yfull, data_time_columns = prepare_x_y_data(buffer_size_m)
Xfull.drop(['Sin_month_num', 'Cos_month_num', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Saturday', 'Sunday'], axis=1, inplace =True)

# Use cross_validate to return the error scores associated with this model and this data
start = time()
model_output = cross_validate(model_pipeline, Xfull, Yfull, cv=cv_parameters, scoring=error_metrics, error_score="raise")
end = time()
print('Ran in {} minutes'.format(round((end - start)/60),2))

# Formulate the different error scores into a dataframe
error_metrics_df =pd.DataFrame({'mae': round(abs(model_output['test_neg_mean_absolute_error'].mean()),2), 
              'mape': round(abs(model_output['test_neg_mean_absolute_percentage_error'].mean()),2),
              'r2': round(abs(model_output['test_r2'].mean()),2), 
              'rmse': round(abs(model_output['test_neg_root_mean_squared_error'].mean()),2)},
             index =["{}".format(buffer_size_m)])

# Add evaluation metric scores for this model to the dataframe containing the metrics for each model
error_metric_scores = error_metric_scores.append(error_metrics_df)

Ran in 172 minutes


In [None]:
Xfull, Yfull, data_time_columns = prepare_x_y_data(buffer_size_m)
Xfull.drop(['Sin_weekday_num', 'Cos_weekday_num', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7',
           "month_8", "month_9", "month_10", "month_11", "month_12"], axis=1, inplace =True)
Xfull.columns

In [31]:
# Use cross_validate to return the error scores associated with this model and this data
start = time()
model_output = cross_validate(model_pipeline, Xfull, Yfull, cv=cv_parameters, scoring=error_metrics, error_score="raise")
end = time()
print('Ran in {} minutes'.format(round((end - start)/60),2))

# Formulate the different error scores into a dataframe
error_metrics_df =pd.DataFrame({'mae': round(abs(model_output['test_neg_mean_absolute_error'].mean()),2), 
              'mape': round(abs(model_output['test_neg_mean_absolute_percentage_error'].mean()),2),
              'r2': round(abs(model_output['test_r2'].mean()),2), 
              'rmse': round(abs(model_output['test_neg_root_mean_squared_error'].mean()),2)},
             index =["{}_sin_wd_nm, months".format(buffer_size_m)])

# Add evaluation metric scores for this model to the dataframe containing the metrics for each model
error_metric_scores = error_metric_scores.append(error_metrics_df)

Ran in 166 minutes


In [32]:
error_metric_scores

Unnamed: 0,mae,mape,r2,rmse
500,75.64,2108923000000000.0,0.91,149.93
500,74.29,2077179000000000.0,0.91,147.42
"500_sin_wd_nm, months",74.44,2089302000000000.0,0.91,147.96


In [35]:
Xfull, Yfull, data_time_columns = prepare_x_y_data(buffer_size_m)
Xfull.drop([ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Saturday', 'Sunday', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7',
           "month_8", "month_9", "month_10", "month_11", "month_12"], axis=1, inplace =True

In [36]:
# Use cross_validate to return the error scores associated with this model and this data
start = time()
model_output = cross_validate(model_pipeline, Xfull, Yfull, cv=cv_parameters, scoring=error_metrics, error_score="raise")
end = time()
print('Ran in {} minutes'.format(round((end - start)/60),2))

# Formulate the different error scores into a dataframe
error_metrics_df =pd.DataFrame({'mae': round(abs(model_output['test_neg_mean_absolute_error'].mean()),2), 
              'mape': round(abs(model_output['test_neg_mean_absolute_percentage_error'].mean()),2),
              'r2': round(abs(model_output['test_r2'].mean()),2), 
              'rmse': round(abs(model_output['test_neg_root_mean_squared_error'].mean()),2)},
             index =["{}_sin_wd_nm, months".format(buffer_size_m)])

# Add evaluation metric scores for this model to the dataframe containing the metrics for each model
error_metric_scores = error_metric_scores.append(error_metrics_df)

Ran in 162 minutes


In [37]:
error_metric_scores

Unnamed: 0,mae,mape,r2,rmse
500,75.64,2108923000000000.0,0.91,149.93
500,74.29,2077179000000000.0,0.91,147.42
"500_sin_wd_nm, months",74.44,2089302000000000.0,0.91,147.96
"500_sin_wd_nm, months",73.03,2068655000000000.0,0.91,145.32
