In [2]:
import pandas as pd 
import numpy as np
import pickle
pd.set_option('display.max_columns', None)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder


In [13]:
data = pd.read_pickle('data_final2.pkl')

In [14]:
def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f'The mae of the model is {mae}.')
    print(f'The mse of the model is {mse}, using weighted average.')
    print(f'The rmse of the model is {rmse}, using weighted average.')
    print(f'The r2 of the model is {r2}, using weighted average.')
    
    return rmse

In [15]:
# 86/14 split

In [16]:
test = data[data['year'] == 2019]
train = data[data['year']<2019]

In [17]:
lab = LabelEncoder()

labels_lsoa = data['LSOA name'].unique()
lab.fit(labels_lsoa)
train[['LSOA name']] = train[['LSOA name']].apply(lab.fit_transform)
test[['LSOA name']] = test[['LSOA name']].apply(lab.fit_transform)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [18]:
labels_dist = data['name'].unique()
lab.fit(labels_dist)
train[['name']] = train[['name']].apply(lab.fit_transform)
test[['name']] = test[['name']].apply(lab.fit_transform)

In [19]:
labels_force = data['Force Name'].unique()
lab.fit(labels_force)
train[['Force Name']] = train[['Force Name']].apply(lab.fit_transform)
test[['Force Name']] = test[['Force Name']].apply(lab.fit_transform)

In [20]:
test.shape[0]/train.shape[0]

0.1392892759762532

In [22]:
X_train = train[['year', 'LSOA population', 'name', 'Force Name', 'population', 'headcount', 'LSOA name', 'month', 'IMD score', 'GDP']]
y_train = train['amount_of_crimes']

X_test = test[['year', 'LSOA population', 'name', 'Force Name', 'population', 'headcount', 'LSOA name', 'month', 'IMD score', 'GDP']]
y_test = test['amount_of_crimes']

In [23]:
cl = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=42)

In [24]:
evaluate(cl, X_test, y_test)

The mae of the model is 6.026346735406362.
The mse of the model is 180.94061179906652, using weighted average.
The rmse of the model is 13.45141672089102, using weighted average.
The r2 of the model is 0.6856745600004959, using weighted average.


13.45141672089102

In [25]:
cl.feature_importances_

array([0.01302943, 0.14030739, 0.02562346, 0.03305125, 0.06271517,
       0.03977202, 0.21063538, 0.0457592 , 0.37372059, 0.05538611])

In [26]:
X_train

Unnamed: 0,year,LSOA population,name,Force Name,population,headcount,LSOA name,month,IMD score,GDP
0,2012,1968.0,14,0,159369,1179,1344,1,49.525,23203
1,2012,1968.0,14,0,159369,1179,1344,2,49.525,23203
2,2012,1968.0,14,0,159369,1179,1344,3,49.525,23203
3,2012,1968.0,14,0,159369,1179,1344,4,49.525,23203
4,2012,1968.0,14,0,159369,1179,1344,5,49.525,23203
...,...,...,...,...,...,...,...,...,...,...
2766755,2018,1624.0,285,36,498064,1021,28071,8,16.378,26113
2766756,2018,1624.0,285,36,498064,1021,28071,9,16.378,26113
2766757,2018,1624.0,285,36,498064,1021,28071,10,16.378,26113
2766758,2018,1624.0,285,36,498064,1021,28071,11,16.378,26113


In [27]:
#67/33 split

In [28]:
test2 = data[data['year'] > 2017]
train2 = data[data['year']<2018]

In [29]:
lab = LabelEncoder()

labels_lsoa = data['LSOA name'].unique()
lab.fit(labels_lsoa)
train2[['LSOA name']] = train2[['LSOA name']].apply(lab.fit_transform)
test2[['LSOA name']] = test2[['LSOA name']].apply(lab.fit_transform)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [30]:
labels_dist = data['name'].unique()
lab.fit(labels_dist)
train2[['name']] = train2[['name']].apply(lab.fit_transform)
test2[['name']] = test2[['name']].apply(lab.fit_transform)

In [31]:
labels_force = data['Force Name'].unique()
lab.fit(labels_force)
train2[['Force Name']] = train2[['Force Name']].apply(lab.fit_transform)
test2[['Force Name']] = test2[['Force Name']].apply(lab.fit_transform)

In [32]:
test2.shape[0]/train2.shape[0]

0.329649877025901

In [33]:
X_train2 = train2[['year', 'LSOA population', 'name', 'Force Name', 'population', 'headcount', 'LSOA name', 'month', 'IMD score', 'GDP']]
y_train2 = train2['amount_of_crimes']

X_test2 = test2[['year', 'LSOA population', 'name', 'Force Name', 'population', 'headcount', 'LSOA name', 'month', 'IMD score', 'GDP']]
y_test2 = test2['amount_of_crimes']

In [34]:
cl2 = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl2.fit(X_train2, y_train2)

RandomForestRegressor(n_estimators=10, random_state=42)

In [35]:
evaluate(cl2, X_test2, y_test2)

The mae of the model is 6.692002064209149.
The mse of the model is 240.60155394422844, using weighted average.
The rmse of the model is 15.511336304272062, using weighted average.
The r2 of the model is 0.570994214993488, using weighted average.


15.511336304272062

In [36]:
cl2.feature_importances_

array([0.01067915, 0.14621783, 0.03048273, 0.03887407, 0.06238558,
       0.04028772, 0.21083009, 0.04651981, 0.35976931, 0.0539537 ])

In [37]:
X_train2

Unnamed: 0,year,LSOA population,name,Force Name,population,headcount,LSOA name,month,IMD score,GDP
0,2012,1968.0,14,0,159369,1179,1344,1,49.525,23203
1,2012,1968.0,14,0,159369,1179,1344,2,49.525,23203
2,2012,1968.0,14,0,159369,1179,1344,3,49.525,23203
3,2012,1968.0,14,0,159369,1179,1344,4,49.525,23203
4,2012,1968.0,14,0,159369,1179,1344,5,49.525,23203
...,...,...,...,...,...,...,...,...,...,...
2766613,2017,1644.0,285,36,496043,1007,28071,8,16.378,25198
2766614,2017,1644.0,285,36,496043,1007,28071,9,16.378,25198
2766615,2017,1644.0,285,36,496043,1007,28071,10,16.378,25198
2766616,2017,1644.0,285,36,496043,1007,28071,11,16.378,25198
