In [13]:
import pandas as pd 
import numpy as np
import pickle
pd.set_option('display.max_columns', None)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder


In [14]:
data = pd.read_pickle('crime_data_final.pkl')

In [15]:
def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f'The mae of the model is {mae}.')
    print(f'The mse of the model is {mse}.')
    print(f'The rmse of the model is {rmse}.')
    print(f'The r2 of the model is {r2}.')
    
    return rmse

In [16]:
# 86/14 split

In [17]:
test = data[data['year'] == 2019]
train = data[data['year']<2019]

In [18]:
lab = LabelEncoder()

labels_lsoa = data['LSOA name'].unique()
lab.fit(labels_lsoa)
train[['LSOA name']] = train[['LSOA name']].apply(lab.transform)
test[['LSOA name']] = test[['LSOA name']].apply(lab.transform)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [19]:
labels_dist = data['name'].unique()
lab.fit(labels_dist)
train[['name']] = train[['name']].apply(lab.transform)
test[['name']] = test[['name']].apply(lab.transform)

In [20]:
labels_force = data['Force Name'].unique()
lab.fit(labels_force)
train[['Force Name']] = train[['Force Name']].apply(lab.transform)
test[['Force Name']] = test[['Force Name']].apply(lab.transform)

In [21]:
test.shape[0]/train.shape[0]

0.13945309714156623

# Looking for the best features for Random Forest

## Model 1: 'LSOA population_last_year', 'population_last_year', 'headcount_last_year', 'LSOA name', 'month', 'IMD score', 'GDP_last_year'

In [23]:
X_train2 = train[['LSOA population_last_year', 'population_last_year', 'headcount_last_year', 'LSOA name', 'month', 'IMD score', 'GDP_last_year', 'prev_month_crimes', 'prev_year_crimes']]
y_train2 = train['amount_of_crimes']

X_test2 = test[['LSOA population_last_year', 'population_last_year', 'headcount_last_year', 'LSOA name', 'month', 'IMD score', 'GDP_last_year', 'prev_month_crimes', 'prev_year_crimes']]
y_test2 = test['amount_of_crimes']

In [24]:
cl2 = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl2.fit(X_train2, y_train2)

RandomForestRegressor(n_estimators=10, random_state=42)

In [25]:
evaluate(cl2, X_test2, y_test2)

The mae of the model is 4.68072312663141.
The mse of the model is 53.294884105226124.
The rmse of the model is 7.300334520090578.
The r2 of the model is 0.9074174795280304.


7.300334520090578

In [26]:
cl2.feature_importances_

array([0.01416206, 0.01099461, 0.00964008, 0.01119042, 0.00877015,
       0.0140782 , 0.01149575, 0.87873326, 0.04093546])

# BEST FEATURES

## Model 2: 'LSOA population_last_year', 'population_last_year', 'LSOA name', 'IMD score', 'GDP_last_year', 'prev_month_crimes', 'prev_year_crimes'

In [28]:
X_train3 = train[['LSOA population_last_year', 'population_last_year', 'LSOA name', 'IMD score', 'GDP_last_year', 'prev_month_crimes', 'prev_year_crimes']]
y_train3 = train['amount_of_crimes']

X_test3 = test[['LSOA population_last_year', 'population_last_year',  'LSOA name', 'IMD score', 'GDP_last_year', 'prev_month_crimes', 'prev_year_crimes']]
y_test3 = test['amount_of_crimes']

In [29]:
cl3 = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl3.fit(X_train3, y_train3)

RandomForestRegressor(n_estimators=10, random_state=42)

In [30]:
evaluate(cl3, X_test3, y_test3)

The mae of the model is 4.701100277068165.
The mse of the model is 53.055407986637164.
The rmse of the model is 7.283914331363128.
The r2 of the model is 0.9078334913652648.


7.283914331363128

In [40]:
cl3.feature_importances_

array([0.01729696, 0.01436548, 0.01362286, 0.01651003, 0.01479012,
       0.88050452, 0.04291003])

## Model 3: 'LSOA population_last_year', 'LSOA name', 'IMD score', 'GDP_last_year', 'prev_month_crimes', 'prev_year_crimes'

In [33]:
X_train4 = train[['LSOA population_last_year', 'LSOA name', 'IMD score', 'GDP_last_year', 'prev_month_crimes', 'prev_year_crimes']]
y_train4 = train['amount_of_crimes']

X_test4 = test[['LSOA population_last_year', 'LSOA name', 'IMD score', 'GDP_last_year', 'prev_month_crimes', 'prev_year_crimes']]
y_test4 = test['amount_of_crimes']

In [34]:
cl4 = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl4.fit(X_train4, y_train4)

RandomForestRegressor(n_estimators=10, random_state=42)

In [35]:
evaluate(cl4, X_test4, y_test4)

The mae of the model is 4.702217087730027.
The mse of the model is 53.4310556825813.
The rmse of the model is 7.309654963305813.
The r2 of the model is 0.9071809257187887.


7.309654963305813

In [36]:
cl4.feature_importances_

array([0.02023342, 0.01697617, 0.01934325, 0.01892915, 0.88094723,
       0.04357078])

## Model 4: 'LSOA population_last_year', 'LSOA name', 'prev_month_crimes', 'prev_year_crimes'

In [31]:
X_train5 = train[['LSOA population_last_year', 'LSOA name', 'prev_month_crimes', 'prev_year_crimes']]
y_train5 = train['amount_of_crimes']

X_test5 = test[['LSOA population_last_year', 'LSOA name', 'prev_month_crimes', 'prev_year_crimes']]
y_test5 = test['amount_of_crimes']

In [37]:
cl5 = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl5.fit(X_train5, y_train5)

RandomForestRegressor(n_estimators=10, random_state=42)

In [38]:
evaluate(cl5, X_test5, y_test5)

The mae of the model is 4.897153456326829.
The mse of the model is 57.28416132651994.
The rmse of the model is 7.568630082552585.
The r2 of the model is 0.900487408354978.


7.568630082552585

In [39]:
cl5.feature_importances_

array([0.0347294 , 0.03879046, 0.8816137 , 0.04486644])