In [1]:
import pandas as pd 
import numpy as np
import pickle
pd.set_option('display.max_columns', None)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder


In [2]:
data = pd.read_pickle('data_final3.pkl')

In [3]:
def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f'The mae of the model is {mae}.')
    print(f'The mse of the model is {mse}.')
    print(f'The rmse of the model is {rmse}.')
    print(f'The r2 of the model is {r2}.')
    
    return rmse

In [4]:
# 86/14 split

In [7]:
test = data[data['year'] == 2019]
train = data[data['year']<2019]

In [8]:
lab = LabelEncoder()

labels_lsoa = data['LSOA name'].unique()
lab.fit(labels_lsoa)
train[['LSOA name']] = train[['LSOA name']].apply(lab.transform)
test[['LSOA name']] = test[['LSOA name']].apply(lab.transform)

In [9]:
labels_dist = data['name'].unique()
lab.fit(labels_dist)
train[['name']] = train[['name']].apply(lab.transform)
test[['name']] = test[['name']].apply(lab.transform)

In [10]:
labels_force = data['Force Name'].unique()
lab.fit(labels_force)
train[['Force Name']] = train[['Force Name']].apply(lab.transform)
test[['Force Name']] = test[['Force Name']].apply(lab.transform)

In [11]:
test.shape[0]/train.shape[0]

0.1392892759762532

In [14]:
X_train = train[['year', 'LSOA population_last_year', 'name', 'Force Name', 'population_last_year', 'headcount_last_year', 'LSOA name', 'month', 'IMD score', 'GDP_last_year']]
y_train = train['amount_of_crimes']

X_test = test[['year', 'LSOA population_last_year', 'name', 'Force Name', 'population_last_year', 'headcount_last_year', 'LSOA name', 'month', 'IMD score', 'GDP_last_year']]
y_test = test['amount_of_crimes']

In [15]:
cl = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=42)

In [16]:
evaluate(cl, X_test, y_test)

The mae of the model is 6.337779833925052.
The mse of the model is 209.74326613082016.
The rmse of the model is 14.482515877112656.
The r2 of the model is 0.6356393196751463.


14.482515877112656

In [None]:
#The mae of the model is 6.337779833925052.
#The mse of the model is 209.74326613082016.
#The rmse of the model is 14.482515877112656.
#The r2 of the model is 0.6356393196751463.

In [17]:
cl.feature_importances_

array([0.01070261, 0.14632143, 0.03034203, 0.0336744 , 0.0698704 ,
       0.03898161, 0.21306211, 0.04588179, 0.35451534, 0.05664827])

In [None]:
#array([0.01070261, 0.14632143, 0.03034203, 0.0336744 , 0.0698704 ,
#       0.03898161, 0.21306211, 0.04588179, 0.35451534, 0.05664827])


In [18]:
X_train

Unnamed: 0,year,LSOA population_last_year,name,Force Name,population_last_year,headcount_last_year,LSOA name,month,IMD score,GDP_last_year
0,2012,1937.0,14,0,157840,1238,1344,1,49.525,22533
1,2012,1937.0,14,0,157840,1238,1344,2,49.525,22533
2,2012,1937.0,14,0,157840,1238,1344,3,49.525,22533
3,2012,1937.0,14,0,157840,1238,1344,4,49.525,22533
4,2012,1937.0,14,0,157840,1238,1344,5,49.525,22533
...,...,...,...,...,...,...,...,...,...,...
2766886,2018,1644.0,285,36,496043,1007,28071,8,16.378,25198
2766887,2018,1644.0,285,36,496043,1007,28071,9,16.378,25198
2766888,2018,1644.0,285,36,496043,1007,28071,10,16.378,25198
2766889,2018,1644.0,285,36,496043,1007,28071,11,16.378,25198


In [50]:
#67/33 split

In [19]:
test2 = data[data['year'] > 2017]
train2 = data[data['year']<2018]

In [20]:
lab = LabelEncoder()

labels_lsoa = data['LSOA name'].unique()
lab.fit(labels_lsoa)
train2[['LSOA name']] = train2[['LSOA name']].apply(lab.transform)
test2[['LSOA name']] = test2[['LSOA name']].apply(lab.transform)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [21]:
labels_dist = data['name'].unique()
lab.fit(labels_dist)
train2[['name']] = train2[['name']].apply(lab.transform)
test2[['name']] = test2[['name']].apply(lab.transform)

In [22]:
labels_force = data['Force Name'].unique()
lab.fit(labels_force)
train2[['Force Name']] = train2[['Force Name']].apply(lab.transform)
test2[['Force Name']] = test2[['Force Name']].apply(lab.transform)

In [23]:
test2.shape[0]/train2.shape[0]

0.329649877025901

In [24]:
X_train2 = train2[['year', 'LSOA population_last_year', 'name', 'Force Name', 'population_last_year', 'headcount_last_year', 'LSOA name', 'month', 'IMD score', 'GDP_last_year']]
y_train2 = train2['amount_of_crimes']

X_test2 = test2[['year', 'LSOA population_last_year', 'name', 'Force Name', 'population_last_year', 'headcount_last_year', 'LSOA name', 'month', 'IMD score', 'GDP_last_year']]
y_test2 = test2['amount_of_crimes']

In [25]:
cl2 = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl2.fit(X_train2, y_train2)

RandomForestRegressor(n_estimators=10, random_state=42)

In [26]:
evaluate(cl2, X_test2, y_test2)

The mae of the model is 6.7745503129113676.
The mse of the model is 245.0768458855034.
The rmse of the model is 15.65493040180963.
The r2 of the model is 0.5630145236701103.


15.65493040180963

In [27]:
#The mae of the model is 6.7745503129113676.
#The mse of the model is 245.0768458855034.
#The rmse of the model is 15.65493040180963.
#The r2 of the model is 0.5630145236701103.

In [28]:
cl2.feature_importances_

array([0.01061901, 0.14362526, 0.02724101, 0.03331124, 0.07014297,
       0.03592539, 0.21783454, 0.04660415, 0.36082067, 0.05387575])

In [29]:
#array([0.01061901, 0.14362526, 0.02724101, 0.03331124, 0.07014297,
#       0.03592539, 0.21783454, 0.04660415, 0.36082067, 0.05387575]

In [30]:
X_train2

Unnamed: 0,year,LSOA population_last_year,name,Force Name,population_last_year,headcount_last_year,LSOA name,month,IMD score,GDP_last_year
0,2012,1937.0,14,0,157840,1238,1344,1,49.525,22533
1,2012,1937.0,14,0,157840,1238,1344,2,49.525,22533
2,2012,1937.0,14,0,157840,1238,1344,3,49.525,22533
3,2012,1937.0,14,0,157840,1238,1344,4,49.525,22533
4,2012,1937.0,14,0,157840,1238,1344,5,49.525,22533
...,...,...,...,...,...,...,...,...,...,...
2766874,2017,1697.0,285,36,492240,1028,28071,8,16.378,24643
2766875,2017,1697.0,285,36,492240,1028,28071,9,16.378,24643
2766876,2017,1697.0,285,36,492240,1028,28071,10,16.378,24643
2766877,2017,1697.0,285,36,492240,1028,28071,11,16.378,24643
