In [1]:
import pandas as pd 
import numpy as np
import pickle
pd.set_option('display.max_columns', None)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the crime dataset from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
DATA_PATH = 'crime_data_final.pkl'
data = pd.read_pickle(DATA_PATH)

In [3]:
def evaluate(model, X_test, y_test):
    """Print MAE, MSE, RMSE and R^2 of ``model`` on a test set.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: features passed to ``model.predict``.
        y_test: true target values the predictions are compared against.

    Returns:
        The root mean squared error of the predictions.
    """
    predictions = model.predict(X_test)

    squared_error = mean_squared_error(y_test, predictions)
    # Insertion order of this dict is the order the metrics are printed in.
    metrics = {
        'mae': mean_absolute_error(y_test, predictions),
        'mse': squared_error,
        'rmse': np.sqrt(squared_error),
        'r2': r2_score(y_test, predictions),
    }

    for metric_name, metric_value in metrics.items():
        print(f'The {metric_name} of the model is {metric_value}.')

    return metrics['rmse']

# Train-test split

In [4]:
# ~88/12 train/test split (the test set is ~14% the size of the training set)

In [5]:
# Temporal hold-out: rows from 2019 form the test set, all earlier years the training set.
# .copy() gives each split its own DataFrame, so the column assignments made later
# in the notebook write to these frames directly instead of to a slice of `data`
# (which is what raises pandas' SettingWithCopyWarning).
test = data[data['year'] == 2019].copy()
train = data[data['year'] < 2019].copy()

In [26]:
# Number of test rows relative to the number of training rows.
len(test) / len(train)

0.13945309714156623

# Encoding variables

In [6]:
lab = LabelEncoder()

# Fit once on every 'LSOA name' in the full dataset, then only *transform* the splits.
# Calling fit_transform on each split would refit the encoder on that split's own
# values, so the same name could receive different codes in train and test whenever
# the two splits do not contain exactly the same set of names.
labels_lsoa = data['LSOA name'].unique()
lab.fit(labels_lsoa)

# assign() returns a new DataFrame, so nothing is written to a slice of `data`.
# NOTE: run this cell once per kernel session; afterwards the column holds integer
# codes, which the encoder (fitted on the original names) would reject.
train = train.assign(**{'LSOA name': lab.transform(train['LSOA name'])})
test = test.assign(**{'LSOA name': lab.transform(test['LSOA name'])})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [7]:
# Refit the shared encoder on every value of 'name' in the full dataset, then
# transform both splits with that single mapping so the codes agree between them.
# (fit_transform per split would refit on each split's own values instead.)
labels_dist = data['name'].unique()
lab.fit(labels_dist)

# assign() returns a new DataFrame, so nothing is written to a slice of `data`.
# NOTE: run once per kernel session; the column holds integer codes afterwards.
train = train.assign(**{'name': lab.transform(train['name'])})
test = test.assign(**{'name': lab.transform(test['name'])})

In [8]:
# Refit the shared encoder on every value of 'Force Name' in the full dataset, then
# transform both splits with that single mapping so the codes agree between them.
# (fit_transform per split would refit on each split's own values instead.)
labels_force = data['Force Name'].unique()
lab.fit(labels_force)

# assign() returns a new DataFrame, so nothing is written to a slice of `data`.
# NOTE: run once per kernel session; the column holds integer codes afterwards.
train = train.assign(**{'Force Name': lab.transform(train['Force Name'])})
test = test.assign(**{'Force Name': lab.transform(test['Force Name'])})

# Random forest models

## Model 1: 'LSOA population_last_year', 'population_last_year',  'LSOA name', 'IMD score', 'GDP_last_year'

In [12]:
# Model 1 predictors; one list is shared so train and test always use the same columns.
features1 = [
    'LSOA population_last_year',
    'population_last_year',
    'LSOA name',
    'IMD score',
    'GDP_last_year',
]

# The target column is 'amount_of_crimes' for both splits.
X_train1, y_train1 = train[features1], train['amount_of_crimes']
X_test1, y_test1 = test[features1], test['amount_of_crimes']

In [13]:
# Random forest with 10 trees; random_state is fixed for reproducible fits.
cl1 = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl1.fit(X_train1, y_train1)

RandomForestRegressor(n_estimators=10, random_state=42)

In [14]:
# Score model 1 on the 2019 test split; the cell's value is the RMSE returned by evaluate().
evaluate(cl1, X_test1, y_test1)

The mae of the model is 5.808081820131146.
The mse of the model is 181.94503293731836.
The rmse of the model is 13.488700194507933.
The r2 of the model is 0.683929705083257.


13.488700194507933

In [15]:
# Importances are listed in the column order of X_train1, the frame cl1 was fitted on.
cl1.feature_importances_

array([0.14949181, 0.09739342, 0.27560448, 0.40161509, 0.0758952 ])

In [None]:
# Feature names in the order model 1 was trained on them.
X_train1.columns

## Model 2: 'LSOA population_last_year', 'population_last_year', 'LSOA name', 'IMD score', 'GDP_last_year', 'prev_year_crimes'

In [16]:
# Model 2 predictors; one list is shared so train and test always use the same columns.
features2 = [
    'LSOA population_last_year',
    'population_last_year',
    'LSOA name',
    'IMD score',
    'GDP_last_year',
    'prev_year_crimes',
]

# The target column is 'amount_of_crimes' for both splits.
X_train2, y_train2 = train[features2], train['amount_of_crimes']
X_test2, y_test2 = test[features2], test['amount_of_crimes']

In [17]:
# Random forest with 10 trees; random_state is fixed for reproducible fits.
cl2 = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl2.fit(X_train2, y_train2)

RandomForestRegressor(n_estimators=10, random_state=42)

In [18]:
# Score model 2 on the 2019 test split; the cell's value is the RMSE returned by evaluate().
evaluate(cl2, X_test2, y_test2)

The mae of the model is 5.168946050970327.
The mse of the model is 83.01068836520388.
The rmse of the model is 9.111020160509133.
The r2 of the model is 0.8557959383157726.


9.111020160509133

### Model 2 relies heavily on the 'prev_year_crimes' variable

In [21]:
# Importances are listed in the column order of X_train2, the frame cl2 was fitted on.
cl2.feature_importances_

array([0.0294898 , 0.02424477, 0.02068817, 0.02842445, 0.02401873,
       0.87313408])

In [29]:
# Feature names in the order model 2 was trained on them.
X_train2.columns

Index(['LSOA population_last_year', 'population_last_year', 'LSOA name',
       'IMD score', 'GDP_last_year', 'prev_year_crimes'],
      dtype='object')

## Model 3: 'LSOA population_last_year', 'population_last_year', 'LSOA name', 'IMD score', 'GDP_last_year', 'prev_month_crimes'

In [22]:
# Model 3 predictors; one list is shared so train and test always use the same columns.
features3 = [
    'LSOA population_last_year',
    'population_last_year',
    'LSOA name',
    'IMD score',
    'GDP_last_year',
    'prev_month_crimes',
]

# The target column is 'amount_of_crimes' for both splits.
X_train3, y_train3 = train[features3], train['amount_of_crimes']
X_test3, y_test3 = test[features3], test['amount_of_crimes']

In [23]:
# Random forest with 10 trees; random_state is fixed for reproducible fits.
cl3 = RandomForestRegressor(n_estimators = 10, random_state = 42)
cl3.fit(X_train3, y_train3)

RandomForestRegressor(n_estimators=10, random_state=42)

In [24]:
# Score model 3 on the 2019 test split; the cell's value is the RMSE returned by evaluate().
evaluate(cl3, X_test3, y_test3)

The mae of the model is 4.839634101906336.
The mse of the model is 56.922277554977136.
The rmse of the model is 7.544685384757746.
The r2 of the model is 0.9011160636612019.


7.544685384757746

### Model 3 relies heavily on the 'prev_month_crimes' variable

In [25]:
# Importances are listed in the column order of X_train3, the frame cl3 was fitted on.
cl3.feature_importances_

array([0.02287467, 0.01831815, 0.0155889 , 0.02136357, 0.01800401,
       0.9038507 ])

In [28]:
# Feature names in the order model 3 was trained on them.
X_train3.columns

Index(['LSOA population_last_year', 'population_last_year', 'LSOA name',
       'IMD score', 'GDP_last_year', 'prev_month_crimes'],
      dtype='object')