## Including `sensor_id` as a feature gives quite a good model - but such a model can only make predictions at the locations of the existing sensors

In [210]:
import os
from time import time

import branca.colormap as cm
import folium
import geopy.distance
import joblib
import numpy as np
import pandas as pd
import psutil
import xgboost as xgb
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from Functions import *

In [211]:
# Buffer size (in metres) that selects which pre-formatted modelling file is loaded.
# NOTE(review): presumably the radius used when aggregating spatial features around each sensor - confirm.
buffer_size_m = 400
# Path to the formatted modelling data for all sensors at the chosen buffer size.
input_csv = f"../Cleaned_data/FormattedDataForModelling/formatted_data_for_modelling_allsensors_{buffer_size_m}.csv"

## Run models with cross-validation

### Define the error metrics that cross-validation should return, and the cross-validation parameters

In [212]:
# Scorer names passed to cross_validate; the "neg_" scorers return negated errors,
# so their sign is flipped again when the results are summarised.
error_metrics = [
    'neg_mean_absolute_error',
    'r2',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_percentage_error',
]
# Two shuffled folds with a fixed seed so that the splits are reproducible.
cv_parameters = KFold(n_splits=2, shuffle=True, random_state=1)

In [213]:
# Standardise the features, then fit a random forest (seeded, using 10 parallel jobs).
rf_model_pipeline = Pipeline(
    steps=[
        ['scaler', StandardScaler()],
        ['rf_regressor', RandomForestRegressor(n_jobs=10, random_state=1)],
    ]
)

### Prepare data for modelling

In [214]:
# Build the modelling inputs from the formatted CSV with the project helper
# `prepare_x_y_data` (presumably provided by `from Functions import *`).
# Xfull and Yfull are used below as the predictors and target for cross-validation.
# NOTE(review): what data_time_columns holds is not visible in this notebook - see the helper.
Xfull, Yfull, data_time_columns = prepare_x_y_data(input_csv)

### Add sensor ID

In [215]:
# Re-attach the sensor identifier to the predictors. Only the `sensor_id` column is
# parsed (usecols) so the rest of the large CSV is not loaded into memory a second time.
sensor_ids = pd.read_csv(input_csv, usecols=['sensor_id'])['sensor_id']
# NOTE(review): this assignment aligns on index labels, so it assumes Xfull still has
# the CSV's default row index (i.e. prepare_x_y_data did not drop/reorder rows) - confirm.
Xfull['sensor_id'] = sensor_ids
# Alternative (disabled): one-hot encode the sensor id instead of keeping it as a single column.
# sensor_dummy =  pd.get_dummies(Xfull['sensor_id'], drop_first = True)
# sensor_dummy.columns= prepend(sensor_dummy.columns.values, 'sensor_')
# Xfull = pd.concat([Xfull, sensor_dummy],axis=1)
# del Xfull['sensor_id']
# Xfull.columns

In [216]:
# Optional experiment (disabled): remove the `year` column from the predictors.
# del Xfull['year']

### Choose which month_num and weekday_num option to include

In [217]:
# If using the dummy variables
# Xfull.drop(['Cos_month_num', 'Sin_month_num', 'Cos_weekday_num', 'Sin_weekday_num'], axis=1)
# If using the cyclical variables
# -> remove the weekday and month dummy columns so only the sin/cos encodings remain.
weekday_dummy_columns = ['Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
month_dummy_columns = ['month_{}'.format(month_number) for month_number in range(2, 13)]
Xfull.drop(columns=weekday_dummy_columns + month_dummy_columns, inplace=True)

### Remove spatial features

In [218]:
# Columns describing the built environment around each sensor, grouped by family.
general_spatial_columns = [
    'transport_stops', 'betweenness', 'lights', 'street_inf', 'memorials', 'trees',
    'bus-stops', 'tram-stops', 'metro-stations', 'taxi-ranks', 'big-car-parks',
    'avg_n_floors',
]
street_infrastructure_columns = [
    'street_inf_Bicycle Rails', 'street_inf_Bollard', 'street_inf_Drinking Fountain',
    'street_inf_Floral Crate/Planter Box', 'street_inf_Horse Trough',
    'street_inf_Information Pillar', 'street_inf_Litter Bin', 'street_inf_Seat',
    'street_inf_Tree Guard', 'street_inf_Barbeque', 'street_inf_Hoop',
    'street_inf_Picnic Setting',
]
landmark_columns = [
    'landmarks_Community Use', 'landmarks_Mixed Use', 'landmarks_Place Of Assembly',
    'landmarks_Place of Worship', 'landmarks_Retail', 'landmarks_Transport',
    'landmarks_Education Centre', 'landmarks_Leisure/Recreation', 'landmarks_Office',
    'landmarks_Specialist Residential Accommodation', 'landmarks_Vacant Land',
    'landmarks_Purpose Built', 'landmarks_Health Services',
]
building_columns = [
    'buildings_Community Use', 'buildings_Education', 'buildings_Entertainment',
    'buildings_Events', 'buildings_Hospital/Clinic', 'buildings_Office',
    'buildings_Parking', 'buildings_Public Display Area', 'buildings_Residential',
    'buildings_Retail', 'buildings_Storage', 'buildings_Unoccupied',
    'buildings_Working', 'buildings_Transport',
]

# Remove every spatial feature from the predictors in one in-place call.
Xfull.drop(
    columns=(
        general_spatial_columns
        + street_infrastructure_columns
        + landmark_columns
        + building_columns
    ),
    inplace=True,
)

### Add distance from centre to data on each sensor

In [219]:
# Load the Melbourne sensor location table, expose the description column as `Name`,
# and keep only the first record for each sensor_id.
melbourne_sensors = (
    pd.read_csv("../Data/FootfallData/melbourne_locations.csv")
    .rename(columns={'sensor_description': 'Name'})
    .drop_duplicates(subset='sensor_id', keep='first')
)

In [220]:
# Coordinates of 'centre' of CBD (done on google maps)
coords_1 = (-37.812187461761596, 144.962265054567)

# Geodesic distance (km) from the CBD centre to every sensor, in row order.
# Iterate over the coordinate columns positionally: melbourne_sensors has had duplicate
# rows dropped, so its index labels may contain gaps and looking rows up by
# range(len(...)) labels would raise KeyError (or pick the wrong row).
distances = [
    geopy.distance.geodesic(coords_1, (latitude, longitude)).km
    for latitude, longitude in zip(melbourne_sensors['Latitude'], melbourne_sensors['Longitude'])
]

In [221]:
# Attach the computed distances and keep only the merge key plus the new feature.
melbourne_sensors = melbourne_sensors.assign(distance_from_centre=distances)[
    ['sensor_id', 'distance_from_centre']
]
melbourne_sensors

Unnamed: 0,sensor_id,distance_from_centre
0,16,0.471417
1,50,1.625052
2,73,0.889019
3,66,0.262052
4,59,0.441733
...,...,...
86,51,0.504507
87,63,0.415290
88,87,1.427802
89,52,0.046840


In [223]:
# Add each row's distance_from_centre by looking up its sensor_id in the sensor table.
# validate='many_to_one' makes pandas raise if sensor_id is not unique in
# melbourne_sensors; without it a duplicated sensor would silently duplicate rows of
# Xfull and break its row-for-row correspondence with Yfull.
Xfull = pd.merge(Xfull, melbourne_sensors, on=['sensor_id'], how='left', validate='many_to_one')
# The identifier itself is not used as a predictor in this variant of the model.
del Xfull['sensor_id']

Unnamed: 0,year,Temp,Humidity,Pressure,Rain,WindSpeed,Rainfall amount (millimetres),public_holiday,school_holiday,Sin_time,Cos_time,Sin_month_num,Cos_month_num,Sin_weekday_num,Cos_weekday_num,random,random_cat,distance_from_centre
0,2011,23.0,57.0,1008.0,0,6.0,0.0,0.0,0.0,0.000000e+00,1.0,0.500000,0.866025,-0.781831,0.62349,0.452578,2,1.785631
1,2011,23.0,57.0,1008.0,0,6.0,0.0,0.0,0.0,0.000000e+00,1.0,0.500000,0.866025,-0.781831,0.62349,0.991102,0,0.312417
2,2011,23.0,57.0,1008.0,0,6.0,0.0,0.0,0.0,0.000000e+00,1.0,0.500000,0.866025,-0.781831,0.62349,0.483220,2,0.259036
3,2011,23.0,57.0,1008.0,0,6.0,0.0,0.0,0.0,0.000000e+00,1.0,0.500000,0.866025,-0.781831,0.62349,0.850175,2,0.960237
4,2011,23.0,57.0,1008.0,0,6.0,0.0,0.0,0.0,0.000000e+00,1.0,0.500000,0.866025,-0.781831,0.62349,0.927746,0,1.303438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4129077,2022,24.0,60.0,995.0,0,30.5,0.0,0.0,0.0,-2.449294e-16,1.0,-0.866025,0.500000,0.781831,0.62349,0.294216,1,0.599761
4129078,2022,24.0,60.0,995.0,0,30.5,0.0,0.0,0.0,-2.449294e-16,1.0,-0.866025,0.500000,0.781831,0.62349,0.365612,2,0.529163
4129079,2022,24.0,60.0,995.0,0,30.5,0.0,0.0,0.0,-2.449294e-16,1.0,-0.866025,0.500000,0.781831,0.62349,0.907096,0,1.625052
4129080,2022,24.0,60.0,995.0,0,30.5,0.0,0.0,0.0,-2.449294e-16,1.0,-0.866025,0.500000,0.781831,0.62349,0.174914,1,0.286180


In [239]:
# Cross-validate the model pipeline on the current predictors and add its mean scores
# as one labelled row to the running table `error_metric_scores`.

# Define the model under test explicitly so the cell runs on a fresh kernel.
# NOTE(review): the recorded output of this cell printed "rf_regressor", so the
# random-forest pipeline is assumed to be the intended model - confirm.
model_name = 'rf_regressor'
model_pipeline = rf_model_pipeline
# Row label for this run in the results table.
# NOTE(review): change this to match the feature set currently held in Xfull.
feature_set_label = 'sensor_ids'

print(model_name)
# Use cross_validate to return the error scores associated with this model and this data
start = time()
model_output = cross_validate(model_pipeline, Xfull, Yfull, cv=cv_parameters, scoring=error_metrics, error_score="raise")
end = time()
# Elapsed time in minutes, rounded to 2 decimal places.
print('Ran in {} minutes'.format(round((end - start) / 60, 2)))

# Formulate the different error scores into a dataframe.
# The "neg_*" scorers return negated errors, so abs() restores the positive error.
# R2 is reported as-is: it can legitimately be negative and abs() would hide that.
# NOTE(review): sklearn's MAPE becomes astronomically large when the target contains
# zeros - check before relying on the 'mape' column.
error_metrics_df = pd.DataFrame(
    {
        'mae': round(abs(model_output['test_neg_mean_absolute_error'].mean()), 2),
        'mape': round(abs(model_output['test_neg_mean_absolute_percentage_error'].mean()), 2),
        'r2': round(model_output['test_r2'].mean(), 2),
        'rmse': round(abs(model_output['test_neg_root_mean_squared_error'].mean()), 2),
    },
    index=[feature_set_label],
)

# Add evaluation metric scores for this model to the dataframe containing the metrics for each model.
# The table is created on the first run and extended on later runs (pd.concat replaces
# DataFrame.append, which no longer exists in pandas 2.x).
try:
    error_metric_scores = pd.concat([error_metric_scores, error_metrics_df])
except NameError:
    error_metric_scores = error_metrics_df.copy()
# Save error scores for this distance to file
#error_metrics_df.to_csv('Results/CV/ComparingModels/{}_{}m_error_metric_scores.csv'.format(model_name,buffer_size_m),index=False)    

# Save dataframes of error metrics for each buffer distance 
#error_metric_scores.to_csv('Results/CV/ComparingModels/comparingmodels_error_metric_scores.csv')   

rf_regressor
Ran in 5 minutes


In [241]:
# Disabled one-off relabelling of the rows recorded by earlier runs (kept for reference):
# error_metric_scores.index = ['distance_from_centre', 'random_variable', 'no_extra_category']
# Display the accumulated cross-validation scores (one row per recorded run).
error_metric_scores

Unnamed: 0,mae,mape,r2,rmse
distance_from_centre,64.69,1689712000000000.0,0.93,132.22
random_variable,279.87,5096450000000000.0,0.25,428.52
no_extra_category,279.17,4798112000000000.0,0.22,434.85
sensor_ids,62.96,1587872000000000.0,0.93,129.16
