In [1]:
import pandas as pd 
import numpy as np
import pickle
pd.set_option('display.max_columns', None)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder


In [2]:
data = pd.read_pickle('data_final.pkl')

In [None]:
def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f'The mae of the model is {mae}.')
    print(f'The mse of the model is {mse}, using weighted average.')
    print(f'The rmse of the model is {rmse}, using weighted average.')
    print(f'The r2 of the model is {r2}, using weighted average.')
    
    return rmse

In [50]:
# 86/14 split

In [25]:
test = data[data['year'] == 2019]
train = data[data['year']<2019]

In [26]:
lab = LabelEncoder()

labels_lsoa = data['LSOA name'].unique()
lab.fit(labels_lsoa)
train[['LSOA name']] = train[['LSOA name']].apply(lab.fit_transform)
test[['LSOA name']] = test[['LSOA name']].apply(lab.fit_transform)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [27]:
labels_dist = data['name'].unique()
lab.fit(labels_dist)
train[['name']] = train[['name']].apply(lab.fit_transform)
test[['name']] = test[['name']].apply(lab.fit_transform)

In [28]:
labels_force = data['Force Name'].unique()
lab.fit(labels_force)
train[['Force Name']] = train[['Force Name']].apply(lab.fit_transform)
test[['Force Name']] = test[['Force Name']].apply(lab.fit_transform)

In [29]:
test.shape[0]/train.shape[0]

0.1392892759762532

In [30]:
X_train = train[['year', 'LSOA population', 'name', 'Force Name', 'population', 'headcount', 'LSOA name', 'month', 'IMD score']]
y_train = train['amount_of_crimes']

X_test = test[['year', 'LSOA population', 'name', 'Force Name', 'population', 'headcount', 'LSOA name', 'month', 'IMD score']]
y_test = test['amount_of_crimes']

In [31]:
cl = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=42)

In [39]:
evaluate(cl, X_test, y_test)

The mae of the model is 5.83184246233161.
The mse of the model is 190.6349049163269, using weighted average.
The rmse of the model is 13.807059966420328, using weighted average.
The r2 of the model is 0.6688338799604014, using weighted average.


13.807059966420328

In [35]:
cl.feature_importances_

array([0.01491093, 0.15113223, 0.03716676, 0.0359944 , 0.08322728,
       0.04747549, 0.25286059, 0.0458436 , 0.33138873])

In [34]:
X_train

Unnamed: 0,year,LSOA population,name,Force Name,population,headcount,LSOA name,month,IMD score
0,2012,1968.0,14,0,159369,1179,1344,1,49.525
1,2012,1968.0,14,0,159369,1179,1344,2,49.525
2,2012,1968.0,14,0,159369,1179,1344,3,49.525
3,2012,1968.0,14,0,159369,1179,1344,4,49.525
4,2012,1968.0,14,0,159369,1179,1344,5,49.525
...,...,...,...,...,...,...,...,...,...
2766886,2018,1624.0,285,36,498064,1021,28071,8,16.378
2766887,2018,1624.0,285,36,498064,1021,28071,9,16.378
2766888,2018,1624.0,285,36,498064,1021,28071,10,16.378
2766889,2018,1624.0,285,36,498064,1021,28071,11,16.378


In [51]:
#67/33 split

In [40]:
test2 = data[data['year'] > 2017]
train2 = data[data['year']<2018]

In [41]:
lab = LabelEncoder()

labels_lsoa = data['LSOA name'].unique()
lab.fit(labels_lsoa)
train2[['LSOA name']] = train2[['LSOA name']].apply(lab.fit_transform)
test2[['LSOA name']] = test2[['LSOA name']].apply(lab.fit_transform)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [42]:
labels_dist = data['name'].unique()
lab.fit(labels_dist)
train2[['name']] = train2[['name']].apply(lab.fit_transform)
test2[['name']] = test2[['name']].apply(lab.fit_transform)

In [43]:
labels_force = data['Force Name'].unique()
lab.fit(labels_force)
train2[['Force Name']] = train2[['Force Name']].apply(lab.fit_transform)
test2[['Force Name']] = test2[['Force Name']].apply(lab.fit_transform)

In [44]:
test2.shape[0]/train2.shape[0]

0.329649877025901

In [45]:
X_train2 = train2[['year', 'LSOA population', 'name', 'Force Name', 'population', 'headcount', 'LSOA name', 'month', 'IMD score']]
y_train2 = train2['amount_of_crimes']

X_test2 = test2[['year', 'LSOA population', 'name', 'Force Name', 'population', 'headcount', 'LSOA name', 'month', 'IMD score']]
y_test2 = test2['amount_of_crimes']

In [46]:
cl2 = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl2.fit(X_train2, y_train2)

RandomForestRegressor(n_estimators=10, random_state=42)

In [47]:
evaluate(cl2, X_test2, y_test2)

The mae of the model is 6.315487982833242.
The mse of the model is 239.0227636932433, using weighted average.
The rmse of the model is 15.46036104666522, using weighted average.
The r2 of the model is 0.5738092847213482, using weighted average.


15.46036104666522

In [48]:
cl2.feature_importances_

array([0.01270737, 0.15367738, 0.0357377 , 0.03496749, 0.08771683,
       0.04503517, 0.25874197, 0.04658025, 0.32483585])

In [49]:
X_train2

Unnamed: 0,year,LSOA population,name,Force Name,population,headcount,LSOA name,month,IMD score
0,2012,1968.0,14,0,159369,1179,1344,1,49.525
1,2012,1968.0,14,0,159369,1179,1344,2,49.525
2,2012,1968.0,14,0,159369,1179,1344,3,49.525
3,2012,1968.0,14,0,159369,1179,1344,4,49.525
4,2012,1968.0,14,0,159369,1179,1344,5,49.525
...,...,...,...,...,...,...,...,...,...
2766874,2017,1644.0,285,36,496043,1007,28071,8,16.378
2766875,2017,1644.0,285,36,496043,1007,28071,9,16.378
2766876,2017,1644.0,285,36,496043,1007,28071,10,16.378
2766877,2017,1644.0,285,36,496043,1007,28071,11,16.378
