In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor 
import xgboost as xgb
from sklearn.pipeline import Pipeline
import folium
import branca.colormap as cm
from eli5.sklearn import PermutationImportance
import joblib
import os
import psutil

from Functions import *

In [2]:
# One pipeline per candidate regressor. Each pipeline standardises the features
# with StandardScaler before fitting the estimator.
# The stochastic estimators are seeded with random_state=1.
# Note that n_jobs differs between them (10 for the random forest, 16 for
# XGBoost and extra-trees).
lr_model_pipeline = Pipeline(
    steps=[
        ['scaler', StandardScaler()],
        ['linear_regressor', LinearRegression()],
    ]
)
rf_model_pipeline = Pipeline(
    steps=[
        ['scaler', StandardScaler()],
        ['rf_regressor', RandomForestRegressor(n_jobs=10, random_state=1)],
    ]
)
xgb_model_pipeline = Pipeline(
    steps=[
        ['scaler', StandardScaler()],
        ['xgb_regressor', xgb.XGBRegressor(n_jobs=16, random_state=1)],
    ]
)
et_model_pipeline = Pipeline(
    steps=[
        ['scaler', StandardScaler()],
        ['et_regressor', ExtraTreesRegressor(n_jobs=16, random_state=1)],
    ]
)

In [3]:
# Scorer names handed to cross-validation. The error scorers are the negated
# ("neg_") variants, so downstream code takes abs() of their means.
error_metrics = [
    'neg_mean_absolute_error',
    'r2',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_percentage_error',
]

# 10-fold split, shuffled with a fixed seed so the folds are repeatable.
cv_parameters = KFold(n_splits=10, shuffle=True, random_state=1)

In [9]:
# Inspect the feature columns of the design matrix.
# NOTE(review): the loading lines below are commented out, so this cell relies
# on `Xfull` already existing in the kernel. In this notebook `Xfull` is only
# assigned in the cross-validation cell further down, so on a fresh
# top-to-bottom run this would fail unless `Functions` provides it -- confirm.
# buffer_size_m = 500
# Xfull, Yfull = prepare_x_y_data_old(buffer_size_m)
Xfull.columns

Index(['year', 'Temp', 'Humidity', 'Pressure', 'Rain', 'WindSpeed',
       'Rainfall amount (millimetres)', 'public_holiday', 'school_holiday',
       'betweenness', 'lights', 'memorials', 'trees', 'bus-stops',
       'tram-stops', 'metro-stations', 'taxi-ranks', 'big-car-parks',
       'street_inf_Bicycle Rails', 'street_inf_Bollard',
       'street_inf_Drinking Fountain', 'street_inf_Floral Crate/Planter Box',
       'street_inf_Horse Trough', 'street_inf_Information Pillar',
       'street_inf_Litter Bin', 'street_inf_Seat', 'street_inf_Tree Guard',
       'landmarks_Community Use', 'landmarks_Mixed Use',
       'landmarks_Place Of Assembly', 'landmarks_Place of Worship',
       'landmarks_Retail', 'landmarks_Transport', 'landmarks_Education Centre',
       'landmarks_Leisure/Recreation', 'landmarks_Office',
       'street_inf_Barbeque', 'street_inf_Hoop', 'street_inf_Picnic Setting',
       'landmarks_Specialist Residential Accommodation',
       'landmarks_Vacant Land', 'landmarks

In [8]:
# Cross-validate the selected pipeline for each buffer size and collect one
# row of fold-averaged error metrics (mae, mape, r2, rmse) per buffer size in
# `error_metric_scores`.
# NOTE(review): `time`, `cross_validate` and `prepare_x_y_data_old` are not
# imported explicitly in this notebook; presumably they are provided by
# `from Functions import *` -- confirm.

# Dataframe to store the scores for each model
error_metric_scores = pd.DataFrame()

# model_name = {"xgb_regressor":xgb_model_pipeline}
model = rf_model_pipeline
model_name = 'rf_regressor'
regex_name = 'withsubtypes'
regex = 'buildings$|furniture$|landmarks$'

# One single-row frame per buffer size. They are concatenated once after the
# loop because DataFrame.append was removed in pandas 2.0.
per_buffer_metrics = []

for buffer_size_m in [500]:
    Xfull, Yfull = prepare_x_y_data_old(buffer_size_m)

    start = time()
    print("running cross_validate")
    # Evaluate `model` (not a hard-coded pipeline) so that changing the
    # selection above actually changes what is cross-validated.
    model_output = cross_validate(model, Xfull, Yfull, cv=cv_parameters,
                                  scoring=error_metrics, error_score="raise")
    print("ran cross_validate")
    end = time()
    # Round the elapsed minutes to 2 decimals. Previously the `2` was passed
    # to format() instead of round(), so the value was rounded to whole minutes.
    print('Ran in {} minutes'.format(round((end - start) / 60, 2)))

    # The "neg_*" scorers return negated errors, hence abs(). R^2 is reported
    # with its sign: taking abs() would disguise a negative (worse than
    # predicting the mean) score as a positive one.
    # NOTE(review): the recorded MAPE for this cell is ~1e15, presumably
    # because the target contains zeros -- confirm before relying on it.
    error_metrics_df = pd.DataFrame(
        {
            'mae': round(abs(model_output['test_neg_mean_absolute_error'].mean()), 2),
            'mape': round(abs(model_output['test_neg_mean_absolute_percentage_error'].mean()), 2),
            'r2': round(model_output['test_r2'].mean(), 2),
            'rmse': round(abs(model_output['test_neg_root_mean_squared_error'].mean()), 2),
        },
        index=[str(buffer_size_m)],
    )

    # Add evaluation metric scores for this buffer size to the collected rows
    per_buffer_metrics.append(error_metrics_df)
    #error_metrics_df.to_csv('PickleFiles/CV/{}/{}_error_metrics_{}m.csv'.format(buffer_size_m, model_name,buffer_size_m),index=False)

# Build the summary table in one step (index = buffer size as a string).
if per_buffer_metrics:
    error_metric_scores = pd.concat(per_buffer_metrics)

#error_metric_scores.to_csv('error_metric_scores.csv')

In [8]:
# Display the table of cross-validation metrics (one row per buffer size).
error_metric_scores

Unnamed: 0,mae,mape,r2,rmse
500,61.11,1480037000000000.0,0.94,125.58


In [7]:
# Display the cross-validation metric table again.
# NOTE(review): the output recorded for this cell differs from the previous
# display and has an earlier execution count, so the two outputs appear to come
# from different runs -- confirm which result is current.
error_metric_scores

Unnamed: 0,mae,mape,r2,rmse
500,57.77,1402749000000000.0,0.94,119.62


In [11]:
# Re-inspect the feature columns.
# NOTE(review): the recorded output no longer lists 'year', so the `del` below
# appears to have been executed and then commented out. On a fresh kernel the
# column would still be present -- confirm whether 'year' should be dropped.
# del Xfull['year']
Xfull.columns

Index(['Temp', 'Humidity', 'Pressure', 'Rain', 'WindSpeed',
       'Rainfall amount (millimetres)', 'public_holiday', 'school_holiday',
       'betweenness', 'lights', 'memorials', 'trees', 'bus-stops',
       'tram-stops', 'metro-stations', 'taxi-ranks', 'big-car-parks',
       'street_inf_Bicycle Rails', 'street_inf_Bollard',
       'street_inf_Drinking Fountain', 'street_inf_Floral Crate/Planter Box',
       'street_inf_Horse Trough', 'street_inf_Information Pillar',
       'street_inf_Litter Bin', 'street_inf_Seat', 'street_inf_Tree Guard',
       'landmarks_Community Use', 'landmarks_Mixed Use',
       'landmarks_Place Of Assembly', 'landmarks_Place of Worship',
       'landmarks_Retail', 'landmarks_Transport', 'landmarks_Education Centre',
       'landmarks_Leisure/Recreation', 'landmarks_Office',
       'street_inf_Barbeque', 'street_inf_Hoop', 'street_inf_Picnic Setting',
       'landmarks_Specialist Residential Accommodation',
       'landmarks_Vacant Land', 'landmarks_Purpose

In [None]:
# Re-run the cross-validation on the `Xfull` / `Yfull` already held in the
# kernel (this cell does not reload them) and collect one row of fold-averaged
# error metrics (mae, mape, r2, rmse) per buffer size in `error_metric_scores`.
# NOTE(review): `time` and `cross_validate` are not imported explicitly in this
# notebook; presumably they are provided by `from Functions import *` -- confirm.

# Dataframe to store the scores for each model
error_metric_scores = pd.DataFrame()

# model_name = {"xgb_regressor":xgb_model_pipeline}
model = rf_model_pipeline
model_name = 'rf_regressor'
regex_name = 'withsubtypes'
regex = 'buildings$|furniture$|landmarks$'

# One single-row frame per buffer size. They are concatenated once after the
# loop because DataFrame.append was removed in pandas 2.0.
per_buffer_metrics = []

# Here the buffer size only labels the result row; the data is not rebuilt.
for buffer_size_m in [500]:

    start = time()
    print("running cross_validate")
    # Evaluate `model` (not a hard-coded pipeline) so that changing the
    # selection above actually changes what is cross-validated.
    model_output = cross_validate(model, Xfull, Yfull, cv=cv_parameters,
                                  scoring=error_metrics, error_score="raise")
    print("ran cross_validate")
    end = time()
    # Round the elapsed minutes to 2 decimals. Previously the `2` was passed
    # to format() instead of round(), so the value was rounded to whole minutes.
    print('Ran in {} minutes'.format(round((end - start) / 60, 2)))

    # The "neg_*" scorers return negated errors, hence abs(). R^2 is reported
    # with its sign: taking abs() would disguise a negative (worse than
    # predicting the mean) score as a positive one.
    error_metrics_df = pd.DataFrame(
        {
            'mae': round(abs(model_output['test_neg_mean_absolute_error'].mean()), 2),
            'mape': round(abs(model_output['test_neg_mean_absolute_percentage_error'].mean()), 2),
            'r2': round(model_output['test_r2'].mean(), 2),
            'rmse': round(abs(model_output['test_neg_root_mean_squared_error'].mean()), 2),
        },
        index=[str(buffer_size_m)],
    )

    # Add evaluation metric scores for this buffer size to the collected rows
    per_buffer_metrics.append(error_metrics_df)
    #error_metrics_df.to_csv('PickleFiles/CV/{}/{}_error_metrics_{}m.csv'.format(buffer_size_m, model_name,buffer_size_m),index=False)

# Build the summary table in one step (index = buffer size as a string).
if per_buffer_metrics:
    error_metric_scores = pd.concat(per_buffer_metrics)

#error_metric_scores.to_csv('error_metric_scores.csv')

running cross_validate
