# Model selection

Cross-validation is used here to compare several candidate machine learning models on the same data and to select the one that performs best in this context.

<u>The following models are tested:</u>
* Linear regression
* Random forest regressor
* XGBoost
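* Extra Trees Regressor (a pipeline is defined below, but it is not currently included in the `models_dict` that is cross-validated)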

<u> The following variables are included in the model:</u>
* Weather variables (rain, temperature, humidity, pressure, wind speed)
* Time variables (day of week, month, time of day, public holiday, school holiday; year is loaded but removed before modelling)
* Sensor environment variables (within a 500m buffer of the sensor):
    * Betweenness of the street 
    * Buildings in proximity to the sensor
    * Landmarks in proximity to the sensor  
    * Furniture in proximity to the sensor    
    * Lights in proximity to the sensor   

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor 
import xgboost as xgb
from sklearn.pipeline import Pipeline
import folium
import branca.colormap as cm
from eli5.sklearn import PermutationImportance
import joblib
import os
import psutil

from Functions import *

In [2]:
# Radius (in metres) of the buffer around each sensor; it selects which
# pre-formatted modelling file is loaded.
buffer_size_m = 500
input_csv = (
    "../Cleaned_data/FormattedDataForModelling/"
    "formatted_data_for_modelling_allsensors_{}.csv"
).format(buffer_size_m)

## Run models with cross-validation

### Define the error metrics for the cross-validation to return, and the cross-validation parameters

In [3]:
# Scorer names passed to cross-validation. The error-based scorers are the
# negated ("neg_") variants, so their sign is flipped back when reporting.
error_metrics = [
    'neg_mean_absolute_error',
    'r2',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_percentage_error',
]
# 10 shuffled folds with a fixed seed, so every model is scored on the same splits.
cv_parameters = KFold(n_splits=10, shuffle=True, random_state=1)

In [4]:
def _scaled_pipeline(step_name, estimator):
    """Return a Pipeline that standardises the features, then fits *estimator*."""
    return Pipeline(steps=[['scaler', StandardScaler()], [step_name, estimator]])


# One pipeline per candidate model. The tree-based estimators use
# random_state=1 and run in parallel (n_jobs).
lr_model_pipeline = _scaled_pipeline('linear_regressor', LinearRegression())
rf_model_pipeline = _scaled_pipeline(
    'rf_regressor', RandomForestRegressor(random_state=1, n_jobs=10)
)
xgb_model_pipeline = _scaled_pipeline(
    'xgb_regressor', xgb.XGBRegressor(random_state=1, n_jobs=16)
)
et_model_pipeline = _scaled_pipeline(
    'et_regressor', ExtraTreesRegressor(random_state=1, n_jobs=16)
)

In [5]:
# Models that are cross-validated, keyed by the name used to label their
# results. Iteration follows insertion order. et_model_pipeline is not included.
models_dict = dict(
    linear_regressor=lr_model_pipeline,
    xgb_regressor=xgb_model_pipeline,
    rf_regressor=rf_model_pipeline,
)

### Prepare data for modelling

In [6]:
# Build the modelling inputs from the CSV at input_csv. The helper returns
# three values, unpacked here as Xfull, Yfull and data_time_columns.
# NOTE(review): prepare_x_y_data is not defined in this notebook; presumably it
# comes from `from Functions import *`, with Xfull as the feature table and
# Yfull as the target — confirm there.
Xfull, Yfull, data_time_columns = prepare_x_y_data(input_csv)

In [21]:
# Re-read the modelling CSV and attach each sensor's Latitude/Longitude with a
# left join on sensor_id, so rows without a matching sensor are kept.
# NOTE(review): melbourne_sensors is not defined in any visible cell; it must
# be loaded (or provided by the Functions star-import) before this cell runs.
test = pd.read_csv(input_csv)
test = pd.merge(test, melbourne_sensors[['Latitude', 'Longitude', 'sensor_id']], on='sensor_id', how='left')

In [64]:
# Scratch cell: slice X by position into the train/test rows of one spatial fold.
# NOTE(review): X, train_spatial_index and test_spatial_index are only bound by
# the spatial cross-validation cell further down, so this cell fails with a
# NameError on a fresh top-to-bottom run.
X_train_spatial, X_test_spatial = X.iloc[train_spatial_index,], X.iloc[test_spatial_index,]

In [None]:
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Experimental nested cross-validation: an outer 3-fold split, and inside each
# outer training set a forward-chaining TimeSeriesSplit. The model is fitted on
# the earlier rows of each temporal split and scored (MSE) on the later rows.

# Feature matrix and target
X = Xfull
y = Yfull
# scikit-learn splitters return row positions, so the target is indexed through
# a plain array. Indexing a pandas Series with `y[positions]` is label-based and
# breaks once a fold subset no longer carries labels 0..n-1.
y_values = np.asarray(y)

# Sensor coordinates (not used by the splitter below)
coordinates = test[['Latitude','Longitude']]

# Outer splitter.
# NOTE(review): StratifiedKFold stratifies on the class labels of y, not on
# location, and rejects continuous targets. A genuinely spatial split would
# group rows by sensor (e.g. GroupKFold on sensor_id) — confirm the intent.
spatial_cv = StratifiedKFold(n_splits=3)

# Number of temporal blocks for the inner time-series split
n_temporal_blocks = 5
# Built once: the splitter is stateless, so it is reused for every outer fold.
# NOTE(review): TimeSeriesSplit assumes rows are in chronological order — confirm
# that the rows of Xfull are sorted by time.
temporal_cv = TimeSeriesSplit(n_splits=n_temporal_blocks)

# Seeded like the other estimators in this notebook so the scores are reproducible
model = RandomForestRegressor(random_state=1)

# Scoring metric name(s) of interest (not consumed by the manual loop below)
scoring = ['neg_mean_squared_error']

# One MSE per (outer fold, temporal split)
mse_scores = []

for train_spatial_index, test_spatial_index in spatial_cv.split(X, y):
    # Outer split by row position. The outer test fold is kept for inspection
    # but is not scored in this loop.
    X_train_spatial, X_test_spatial = X.iloc[train_spatial_index], X.iloc[test_spatial_index]
    y_train_spatial, y_test_spatial = y_values[train_spatial_index], y_values[test_spatial_index]

    for train_temporal_index, val_temporal_index in temporal_cv.split(X_train_spatial):
        # Validate on the rows TimeSeriesSplit holds out (the rows that follow
        # the training window). The previous `train_temporal_index+1` reused the
        # training rows shifted by one, so validation overlapped training almost
        # entirely and the scores were optimistically biased.
        X_train = X_train_spatial.iloc[train_temporal_index]
        X_val = X_train_spatial.iloc[val_temporal_index]
        y_train = y_train_spatial[train_temporal_index]
        y_val = y_train_spatial[val_temporal_index]

        # Fit on the earlier block and score on the later block
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        mse = mean_squared_error(y_val, y_pred)
        mse_scores.append(mse)

# Summarise the scores across all splits
mean_mse = np.mean(mse_scores)
std_mse = np.std(mse_scores)

### Remove year

In [7]:
# Remove the 'year' feature from the feature table in place.
Xfull.drop(columns=['year'], inplace=True)

### Choose which month_num and weekday_num option to include

In [8]:
# Weekday and month are each present in two encodings; only one is kept.
# To keep the dummy variables instead, drop the cyclical columns:
# Xfull.drop(['Cos_month_num', 'Sin_month_num', 'Cos_weekday_num', 'Sin_weekday_num'], axis=1, inplace=True)
# Here the cyclical (sin/cos) columns are kept, so the weekday and month dummy
# columns are removed in place.
weekday_dummies = ['Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
month_dummies = ['month_{}'.format(month_num) for month_num in range(2, 13)]
Xfull.drop(columns=weekday_dummies + month_dummies, inplace=True)

In [13]:
# Show the feature columns currently in Xfull, as a check on the drops.
print(Xfull.columns)

Index(['Temp', 'Humidity', 'Pressure', 'Rain', 'WindSpeed',
       'Rainfall amount (millimetres)', 'public_holiday', 'school_holiday',
       'betweenness', 'lights', 'street_inf', 'memorials', 'trees',
       'bus-stops', 'tram-stops', 'metro-stations', 'taxi-ranks',
       'big-car-parks', 'street_inf_Bicycle Rails', 'street_inf_Bollard',
       'street_inf_Drinking Fountain', 'street_inf_Floral Crate/Planter Box',
       'street_inf_Horse Trough', 'street_inf_Information Pillar',
       'street_inf_Litter Bin', 'street_inf_Seat', 'street_inf_Tree Guard',
       'landmarks_Community Use', 'landmarks_Mixed Use',
       'landmarks_Place Of Assembly', 'landmarks_Place of Worship',
       'landmarks_Retail', 'landmarks_Transport', 'landmarks_Education Centre',
       'landmarks_Leisure/Recreation', 'landmarks_Office',
       'street_inf_Barbeque', 'street_inf_Hoop', 'street_inf_Picnic Setting',
       'landmarks_Specialist Residential Accommodation',
       'landmarks_Vacant Land', 'lan

### Run model with cross validation

In [9]:
# Cross-validate every pipeline in models_dict on the same folds and collect
# one row of averaged scores per model in error_metric_scores.
# NOTE(review): time() and cross_validate are not imported in this notebook;
# presumably they come from `from Functions import *` — confirm.
error_metric_scores = pd.DataFrame()
# One single-row frame per finished model. DataFrame.append was removed in
# pandas 2.0, so the table is rebuilt with pd.concat instead.
score_frames = []

for model_name, model_pipeline in models_dict.items():
    print(model_name)
    # Use cross_validate to return the error scores associated with this model and this data
    start = time()
    model_output = cross_validate(model_pipeline, Xfull, Yfull, cv=cv_parameters, scoring=error_metrics, error_score="raise")
    end = time()
    # The precision belongs to round(), not format(): previously the stray
    # parenthesis rounded the duration to whole minutes.
    print('Ran in {} minutes'.format(round((end - start)/60, 2)))

    # Average each metric over the folds. The "neg_" scorers are negated
    # errors, so abs() restores the positive error. R2 keeps its sign: a
    # negative R2 means the model is worse than predicting the mean and must
    # not be reported as a positive score.
    # NOTE(review): MAPE divides by the true value, so targets at or near zero
    # inflate it enormously — treat the 'mape' column with caution.
    error_metrics_df = pd.DataFrame(
        {'mae': round(abs(model_output['test_neg_mean_absolute_error'].mean()), 2),
         'mape': round(abs(model_output['test_neg_mean_absolute_percentage_error'].mean()), 2),
         'r2': round(model_output['test_r2'].mean(), 2),
         'rmse': round(abs(model_output['test_neg_root_mean_squared_error'].mean()), 2)},
        index=[model_name])

    # Rebuild the table after every model so that partial results survive if a
    # long run is interrupted.
    score_frames.append(error_metrics_df)
    error_metric_scores = pd.concat(score_frames)
    # Save error scores for this distance to file
    #error_metrics_df.to_csv('Results/CV/ComparingModels/{}_{}m_error_metric_scores.csv'.format(model_name,buffer_size_m),index=False)

# Save dataframes of error metrics for each buffer distance
#error_metric_scores.to_csv('Results/CV/ComparingModels/comparingmodels_error_metric_scores.csv')

linear_regressor
Ran in 4 minutes
xgb_regressor
Ran in 123 minutes
rf_regressor
Ran in 195 minutes


### Print table showing error metrics associated with each model

In [14]:
# Display the table of cross-validation scores (one row per model).
error_metric_scores

Unnamed: 0,mae,mape,r2,rmse
linear_regressor,261.31,1.459393e+16,0.47,358.5
xgb_regressor,112.8,4262695000000000.0,0.85,191.91
rf_regressor,73.33,2087694000000000.0,0.91,146.02


In [2]:
# df= error_metric_scores.copy()
# df = df.reindex(['linear_regressor', 'rf_regressor', 'xgb_regressor'])