# Baseline Model- without crime data

In [None]:
# Switch the working directory to the project folder.
# NOTE(review): the path looks like a Colab-mounted Google Drive location;
# confirm the drive is mounted before this cell runs.
import os
path = "/content/drive/MyDrive/COMPSCI760"
os.chdir(path)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the dataset that has no crime features and display its column names.
f = pd.read_csv('/content/drive/MyDrive/COMPSCI760/data_withoutcrime_27.csv')
f.columns

Index(['averge_room', 'Male', 'Female', 'Average Age', 'European', 'Māori',
       'Pacific Peoples', 'Asian', 'Middle Eastern / Latin American / African',
       'Other Ethnicity', 'New Zealander(19)', 'Other Ethnicity nec(19)',
       'Median personal income($)', 'Employed Full time', 'Employed Part time',
       'Unemployed', 'Paid employee', 'Employer',
       'Self employed and without employees', 'Unpaid family worker',
       'mean_CV', 'mean_FA', 'mean_AR', 'mean_bed', 'mean_Bath', 'mean_Price',
       'mean_school'],
      dtype='object')

In [None]:
# Relabel the housing aggregate columns from 'mean_*' to 'median_*'
# (the other columns are left untouched), then display the new column names.
f = f.rename(
    columns={
        'mean_CV': 'median_CV',
        'mean_FA': 'median_FA',
        'mean_AR': 'median_AR',
        'mean_bed': 'median_Bed',
        'mean_Bath': 'median_Bath',
        'mean_Price': 'median_Price',
        'mean_school': 'median_School',
    }
)
f.columns

Index(['averge_room', 'Male', 'Female', 'Average Age', 'European', 'Māori',
       'Pacific Peoples', 'Asian', 'Middle Eastern / Latin American / African',
       'Other Ethnicity', 'New Zealander(19)', 'Other Ethnicity nec(19)',
       'Median personal income($)', 'Employed Full time', 'Employed Part time',
       'Unemployed', 'Paid employee', 'Employer',
       'Self employed and without employees', 'Unpaid family worker',
       'median_CV', 'median_FA', 'median_AR', 'median_Bed', 'median_Bath',
       'median_Price', 'median_School'],
      dtype='object')

In [None]:
# 'median_Price' is the regression target.
yf = f['median_Price']
# Two feature matrices: every remaining column, and the same set minus 'median_CV'.
xf = f.drop(columns=['median_Price'])
xf_nocv = f.drop(columns=['median_CV', 'median_Price'])
print(xf.shape, xf_nocv.shape, yf.shape)

(7938, 26) (7938, 25) (7938,)


## Dataset `f` (without crime data): features with `median_CV` dropped

### 5-fold cross validation in three regression models

In [None]:
## Candidate regressors, all with default hyperparameters
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
]

## Score every model with the 5-fold cross validation wrapped in 'cross_val_score'
rmse_f = {}
for estimator in models:
    label = type(estimator).__name__
    neg_mse = cross_val_score(estimator, X=xf_nocv, y=yf, cv=5, scoring="neg_mean_squared_error")
    rmse_f[label] = np.sqrt(-neg_mse)    ## scorer returns negated MSE; convert to RMSE
    print(label + ' is finished')

## Display the per-fold results as a DataFrame, with the fold average in a final 'RMSE' row
rmse_f = pd.DataFrame(rmse_f, index=['cv{}'.format(k) for k in range(1, 6)])
rmse_f.loc['RMSE'] = rmse_f.apply(lambda col: col.mean())
print(rmse_f)

LinearRegression is finished
DecisionTreeRegressor is finished
RandomForestRegressor is finished
      LinearRegression  DecisionTreeRegressor  RandomForestRegressor
cv1           0.303679               0.395883               0.273104
cv2           0.253434               0.358464               0.254088
cv3           0.327144               0.441553               0.337490
cv4           0.253893               0.355217               0.252093
cv5           0.266035               0.385243               0.253397
RMSE          0.280837               0.387272               0.274035


### Tuning hyperparameters in DTR model and RFR model

In [None]:
## Hyperparameter tuning for the Decision Tree Regressor
## Exhaustive grid search over tree depth and minimum split size

from sklearn.model_selection import GridSearchCV
parameters_dt = {
    'max_depth': range(1, 24),
    'min_samples_split': range(10, 100, 10),
}
model_dt = DecisionTreeRegressor()
gs_dt = GridSearchCV(
    estimator=model_dt,
    param_grid=parameters_dt,
    cv=5,
    scoring='neg_mean_squared_error',
)
gs_dt.fit(xf_nocv, yf)

## best_score_ is the mean negated MSE over the folds, so this RMSE is the
## square root of the averaged MSE (not the average of per-fold RMSEs).
print("Best_params for DTR model:{0}\nBest_RMSE for DTR model:{1}".format(gs_dt.best_params_, np.sqrt(-gs_dt.best_score_)))

Best_params for DTR model:{'max_depth': 6, 'min_samples_split': 80}
Best_RMSE for DTR model:0.3049867604004201


In [None]:
## Hyperparameter tuning for the Random Forest Regressor (grid search)
## Step 1: search only over the number of trees (n_estimators)

from sklearn.model_selection import GridSearchCV
parameters_rf_test1 = {'n_estimators': range(10, 100, 10)}
model_rf = RandomForestRegressor()
gs_rf = GridSearchCV(
    estimator=model_rf,
    param_grid=parameters_rf_test1,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,    ## use every available core
)
gs_rf.fit(xf_nocv, yf)

print("Best_params for RFR model:{0}\nBest_RMSE for RFR model:{1}".format(gs_rf.best_params_, np.sqrt(-gs_rf.best_score_)))

Best_params for RFR model:{'n_estimators': 70}
Best_RMSE for RFR model:0.2758608982531706


In [None]:
## Step 2: fix n_estimators at 70 (the best value found by the previous search),
## then search over max_depth and min_samples_split.

parameters_rf_test2 = {'max_depth': range(2,20), 'min_samples_split': range(10, 100, 10)}
model_rf2 = RandomForestRegressor(n_estimators = 70)
## The search must wrap model_rf2: passing the default-constructed model_rf
## would silently ignore n_estimators=70 while the message below still claims it.
gs_rf2 = GridSearchCV(model_rf2, parameters_rf_test2, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
gs_rf2.fit(xf_nocv, yf)

print("Best_params for RFR model(n_estimators=70):{0}\nBest_RMSE for RFR model(n_estimators=70):{1}".format(gs_rf2.best_params_, np.sqrt(-gs_rf2.best_score_)))

Best_params for RFR model(n_estimators=70):{'max_depth': 19, 'min_samples_split': 10}
Best_RMSE for RFR model(n_estimators=70):0.27500179820432485


### 5-fold cross validation in three regression models (with tuning parameters)

In [None]:
## Same three regressors, now using the hyperparameters chosen by the grid searches
models2 = [
    LinearRegression(),
    DecisionTreeRegressor(max_depth=6, min_samples_split=80),
    RandomForestRegressor(n_estimators=70, max_depth=19, min_samples_split=10),
]

## Score every model with the 5-fold cross validation wrapped in 'cross_val_score'
rmse_f2 = {}
for estimator in models2:
    label = type(estimator).__name__
    neg_mse = cross_val_score(estimator, X=xf_nocv, y=yf, cv=5, scoring="neg_mean_squared_error")
    rmse_f2[label] = np.sqrt(-neg_mse)    ## scorer returns negated MSE; convert to RMSE
    print(label + ' is finished')

## Display the per-fold results as a DataFrame, with the fold average in a final 'RMSE' row
rmse_f2 = pd.DataFrame(rmse_f2, index=['cv{}'.format(k) for k in range(1, 6)])
rmse_f2.loc['RMSE'] = rmse_f2.apply(lambda col: col.mean())
print("The cross validation results in three models are as follows:\n{}".format(rmse_f2))

LinearRegression is finished
DecisionTreeRegressor is finished
RandomForestRegressor is finished
The cross validation results in three models are as follows:
      LinearRegression  DecisionTreeRegressor  RandomForestRegressor
cv1           0.303679               0.303937               0.274240
cv2           0.253434               0.294738               0.255992
cv3           0.327144               0.363934               0.335738
cv4           0.253893               0.273264               0.251815
cv5           0.266035               0.280562               0.250807
RMSE          0.280837               0.303287               0.273718


# Baseline Model- with crime data

In [None]:
# Load the dataset that includes the crime features and display its column names.
fc = pd.read_csv('/content/drive/MyDrive/COMPSCI760/data_withcrime_31.csv')
fc.columns

Index(['averge_room', 'Male', 'Female', 'Average Age', 'European', 'Māori',
       'Pacific Peoples', 'Asian', 'Middle Eastern / Latin American / African',
       'Other Ethnicity', 'New Zealander(19)', 'Other Ethnicity nec(19)',
       'Median personal income($)', 'Employed Full time', 'Employed Part time',
       'Unemployed', 'Paid employee', 'Employer',
       'Self employed and without employees', 'Unpaid family worker',
       'median_CV', 'median_FA', 'median_AR', 'median_bed', 'median_Bath',
       'median_Price', 'median_school', 'weighted_weapon', 'weighted_crime',
       'Number_of_happens', 'Number of Victimisations'],
      dtype='object')

In [None]:
## Normalise the housing column names so they match the no-crime dataset `f`.
## The crime CSV already ships 'median_*' names, so a mapping keyed only on
## 'mean_*' matches nothing and leaves the frame unchanged. The columns that
## actually differ from `f` are 'median_bed' and 'median_school' (lower-case),
## so they are mapped explicitly. The 'mean_*' keys are kept in case an older
## export with those names is loaded; rename() ignores keys that are absent.
fc = fc.rename(
    columns={
        'mean_CV': 'median_CV',
        'mean_FA': 'median_FA',
        'mean_AR': 'median_AR',
        'mean_bed': 'median_Bed',
        'mean_Bath': 'median_Bath',
        'mean_Price': 'median_Price',
        'mean_school': 'median_School',
        'median_bed': 'median_Bed',
        'median_school': 'median_School',
    }
)
fc.columns

Index(['averge_room', 'Male', 'Female', 'Average Age', 'European', 'Māori',
       'Pacific Peoples', 'Asian', 'Middle Eastern / Latin American / African',
       'Other Ethnicity', 'New Zealander(19)', 'Other Ethnicity nec(19)',
       'Median personal income($)', 'Employed Full time', 'Employed Part time',
       'Unemployed', 'Paid employee', 'Employer',
       'Self employed and without employees', 'Unpaid family worker',
       'median_CV', 'median_FA', 'median_AR', 'median_bed', 'median_Bath',
       'median_Price', 'median_school', 'weighted_weapon', 'weighted_crime',
       'Number_of_happens', 'Number of Victimisations'],
      dtype='object')

In [None]:
# 'median_Price' is the regression target.
yfc = fc['median_Price']
# Two feature matrices: every remaining column, and the same set minus 'median_CV'.
xfc = fc.drop(columns=['median_Price'])
xfc_nocv = fc.drop(columns=['median_CV', 'median_Price'])
print(xfc.shape, xfc_nocv.shape, yfc.shape)

(7417, 30) (7417, 29) (7417,)


## Dataset `fc` (with crime data): features with `median_CV` dropped

### 5-fold cross validation in three regression models with crime data

In [None]:
## Candidate regressors, all with default hyperparameters
models_fc = [LinearRegression(),
      DecisionTreeRegressor(),
      RandomForestRegressor(),
      ]

## Use the 5-fold cross validation wrapped in the 'cross_val_score' function
rmse_fc = dict()
for m in models_fc:
    m_name = type(m).__name__
    scores = cross_val_score(m, X=xfc_nocv, y=yfc, cv = 5, scoring="neg_mean_squared_error")
    rmse_fc[m_name] = np.sqrt(-scores)    ## Convert from (negated) MSE to RMSE
    print(m_name + ' is finished')

## Display the model results as a DataFrame
rmse_fc = pd.DataFrame(rmse_fc)
rmse_fc.index = ['cv' + str(x) for x in range(1, 6)]
## The summary row must average THIS table's folds. Averaging rmse_f (the
## no-crime results) would copy the no-crime RMSE row into the crime table
## and hide any effect of the crime features.
rmse_fc.loc['RMSE'] = rmse_fc.mean()
print(rmse_fc)

LinearRegression is finished
DecisionTreeRegressor is finished
RandomForestRegressor is finished
      LinearRegression  DecisionTreeRegressor  RandomForestRegressor
cv1           0.305318               0.411453               0.271443
cv2           0.246244               0.348718               0.249756
cv3           0.326694               0.425611               0.340553
cv4           0.253919               0.347882               0.252549
cv5           0.264898               0.368605               0.250137
RMSE          0.280837               0.387272               0.274035


### Tuning hyperparameters in DTR model and RFR model with crime data

In [None]:
## Hyperparameter tuning for the Decision Tree Regressor (crime dataset)
## Exhaustive grid search over tree depth and minimum split size

from sklearn.model_selection import GridSearchCV
parameters_dt_c = {
    'max_depth': range(5, 10),
    'min_samples_split': range(10, 100, 10),
}
model_dt_c = DecisionTreeRegressor()
gs_dt_c = GridSearchCV(
    estimator=model_dt_c,
    param_grid=parameters_dt_c,
    cv=5,
    scoring='neg_mean_squared_error',
)
gs_dt_c.fit(xfc_nocv, yfc)

## best_score_ is the mean negated MSE over the folds, so this RMSE is the
## square root of the averaged MSE (not the average of per-fold RMSEs).
print("Best_params for DTR model:{0}\nBest_RMSE for DTR model:{1}".format(gs_dt_c.best_params_, np.sqrt(-gs_dt_c.best_score_)))

Best_params for DTR model:{'max_depth': 5, 'min_samples_split': 30}
Best_RMSE for DTR model:0.3105236020330802


In [None]:
## Hyperparameter tuning for the Random Forest Regressor (crime dataset, grid search)
## Step 1: search only over the number of trees (n_estimators)

from sklearn.model_selection import GridSearchCV
parameters_rf_test1_c = {'n_estimators': range(10, 100, 10)}
model_rf = RandomForestRegressor()    ## note: rebinds the notebook-level name 'model_rf'
gs_rf_c = GridSearchCV(
    estimator=model_rf,
    param_grid=parameters_rf_test1_c,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,    ## use every available core
)
gs_rf_c.fit(xfc_nocv, yfc)

print("Best_params for RFR model:{0}\nBest_RMSE for RFR model:{1}".format(gs_rf_c.best_params_, np.sqrt(-gs_rf_c.best_score_)))

Best_params for RFR model:{'n_estimators': 70}
Best_RMSE for RFR model:0.2751467492819437


In [None]:
## Step 2: fix n_estimators at 70 (the best value found by the previous search),
## then search over max_depth and min_samples_split.

parameters_rf_test2_c = {'max_depth': range(2,20), 'min_samples_split': range(10, 100, 10)}
model_rf2 = RandomForestRegressor(n_estimators = 70)
## Use the estimator and grid defined in THIS cell. model_rf would drop
## n_estimators=70, and parameters_rf_test2 belongs to the no-crime section,
## which made the cell depend on state left over from an earlier cell.
gs_rf2_c = GridSearchCV(model_rf2, parameters_rf_test2_c, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
gs_rf2_c.fit(xfc_nocv, yfc)

print("Best_params for RFR model(n_estimators=70):{0}\nBest_RMSE for RFR model(n_estimators=70):{1}".format(gs_rf2_c.best_params_, np.sqrt(-gs_rf2_c.best_score_)))

Best_params for RFR model(n_estimators=70):{'max_depth': 15, 'min_samples_split': 10}
Best_RMSE for RFR model(n_estimators=70):0.2739484817633714


### 5-fold cross validation in three regression models (with tuning parameters) with crime data

In [None]:
## Same three regressors, now using the hyperparameters chosen by the grid searches
models2_c = [
    LinearRegression(),
    DecisionTreeRegressor(max_depth=5, min_samples_split=30),
    RandomForestRegressor(n_estimators=70, max_depth=15, min_samples_split=10),
]

## Score every model with the 5-fold cross validation wrapped in 'cross_val_score'
rmse_f2_c = {}
for estimator in models2_c:
    label = type(estimator).__name__
    neg_mse = cross_val_score(estimator, X=xfc_nocv, y=yfc, cv=5, scoring="neg_mean_squared_error")
    rmse_f2_c[label] = np.sqrt(-neg_mse)    ## scorer returns negated MSE; convert to RMSE
    print(label + ' is finished')

## Display the per-fold results as a DataFrame, with the fold average in a final 'RMSE' row
rmse_f2_c = pd.DataFrame(rmse_f2_c, index=['cv{}'.format(k) for k in range(1, 6)])
rmse_f2_c.loc['RMSE'] = rmse_f2_c.apply(lambda col: col.mean())
print("The cross validation results in three models are as follows:\n{}".format(rmse_f2_c))

LinearRegression is finished
DecisionTreeRegressor is finished
RandomForestRegressor is finished
The cross validation results in three models are as follows:
      LinearRegression  DecisionTreeRegressor  RandomForestRegressor
cv1           0.305318               0.311047               0.270066
cv2           0.246244               0.292014               0.251066
cv3           0.326694               0.384405               0.340112
cv4           0.253919               0.270697               0.250023
cv5           0.264898               0.281896               0.249312
RMSE          0.279414               0.308012               0.272116
