In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
sns.set()

In [2]:
#reading the dataset
df2=pd.read_csv("all_variables.csv")
df2_data=df2[['Year','LSOA_code','Type_I_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December','Type_II_crime_amount']]

In [3]:
#removing the COVID data
df2_data=df2_data[df2_data['Year']<2020]

In [4]:
#extracting the years to loop over them later
years=df2_data['Year'].unique()

## Random Forest implementation-final


In [5]:
#Creating the tables for the LSOA codes which we will focus on
set1=df2_data[df2_data['LSOA_code']=='E01000005']
set2=df2_data[df2_data['LSOA_code']=='E01030759']
set3=df2_data[df2_data['LSOA_code']=='E01020795']

In [6]:
#removing the LSOA code feature in each dataset
set1=set1.drop(['LSOA_code'],axis=1)
set2=set2.drop(['LSOA_code'],axis=1)
set3=set3.drop(['LSOA_code'],axis=1)

In [7]:
#Numeric array for the months of a year
months=[1,2,3,4,5,6,7,8,9,10,11,12]

In [8]:
#hyperparameter tuning

#choosing the parameters
n_estimators=[int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_features=['auto','sqrt']
max_depth=[int(x) for x in np.linspace(10,110, num=11)]
max_depth.append(None)
min_samples_split=[2,5,10]
min_samples_leaf=[1,2,4]
bootstrap=[True,False]

#creating a dictionary to use in the hyperparameter tuning
random_grid={'n_estimators':n_estimators,
                'max_features':max_features,
                'max_depth':max_depth,
                'min_samples_split':min_samples_split,
                'min_samples_leaf':min_samples_leaf,
                'bootstrap':bootstrap}



In [9]:
#tuning the hyperparameters for the type 1 crimes for area 1

#selecting the data for the type 1 cries for the area 1
type1_crimes=set1[['Year','Type_I_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December']]
#getting the training and the test years for the tuning
train_year_type1=type1_crimes[type1_crimes['Year']==2012]
test_year_type1=type1_crimes[type1_crimes['Year']==2013]
#dropping the year feature from the data
train_year_type1=train_year_type1.drop(['Year'],axis=1)
test_year_type1=test_year_type1.drop(['Year'],axis=1)
#creating a new feature for the months
train_year_type1['Month']=months
test_year_type1['Month']=months
#removing the one hot encoding for the months
train_year_type1.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
test_year_type1.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
#creating the X and the Y for both training and test sets
X_type1_train=train_year_type1.drop(['Type_I_crime_amount'],axis=1)
y_type1_train=train_year_type1['Type_I_crime_amount']
X_type1_test=test_year_type1.drop(['Type_I_crime_amount'],axis=1)
y_type1_test=test_year_type1['Type_I_crime_amount']
#creating the random forest regressor and the method through which we will tune the hyperparameters
rf=RandomForestRegressor()
rf_random=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=5,verbose=2,random_state=35,n_jobs=-1)
rf_random.fit(X_type1_train,y_type1_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=35, verbose=2)

In [10]:
#hyperparameter tuning for the type 2 crimes for area 1
type2_crimes=set1[['Year','Type_II_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December']]
#getting the training and the test years for the tuning
train_year_type2=type2_crimes[type2_crimes['Year']==2012]
test_year_type2=type2_crimes[type1_crimes['Year']==2013]
#dropping the year feature from the data
train_year_type2=train_year_type2.drop(['Year'],axis=1)
test_year_type2=test_year_type2.drop(['Year'],axis=1)
#creating a new feature for the months
train_year_type2['Month']=months
test_year_type2['Month']=months
#removing the one hot encoding for the months
train_year_type2.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
test_year_type2.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
#creating the X and the Y for both training and test sets
X_type1_train=train_year_type2.drop(['Type_II_crime_amount'],axis=1)
y_type1_train=train_year_type2['Type_II_crime_amount']
X_type1_test=test_year_type2.drop(['Type_II_crime_amount'],axis=1)
y_type1_test=test_year_type2['Type_II_crime_amount']
#creating the random forest regressor and the method through which we will tune the hyperparameters
rf=RandomForestRegressor()
rf_random=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=5,verbose=2,random_state=35,n_jobs=-1)
rf_random.fit(X_type1_train,y_type1_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=35, verbose=2)

In [None]:
#this prints the best parameters for the random forest regressor (must be run after each dataset was tuned)
rf_random.best_params_

In [12]:
#area 1
#Separating the crimes data in this are for each crime type
type1_crimes=set1[['Year','Type_I_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December']]
type2_crimes=set1[['Year','Type_II_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December']]
#arrays where we will keep the results of our evaluations
r2_type1=[]
r2_type2=[]
rmse_type1=[]
rmse_type2=[]
mae_type1=[]
mae_type2=[]
year_sofar=[]
#Looping through the years where we will retrain the model based on the data of said year
for i in range(1,len(years)):
    #adding in the year to the array of years which have been used so far
    year_sofar.append(years[i])
    #training the data on year X-1 if we want to predict year X (e.g if we want to predict the crime numbers for year 2013, we will train on 2012)
    train_year_type1=type1_crimes[type1_crimes['Year']==years[i-1]]
    train_year_type2=type2_crimes[type2_crimes['Year']==years[i-1]]
    test_year_type1=type1_crimes[type1_crimes['Year']==years[i]]
    test_year_type2=type2_crimes[type2_crimes['Year']==years[i]]
    #dropping the year feature from the data
    train_year_type1=train_year_type1.drop(['Year'],axis=1)
    train_year_type2=train_year_type2.drop(['Year'],axis=1)
    test_year_type1=test_year_type1.drop(['Year'],axis=1)
    test_year_type2=test_year_type2.drop(['Year'],axis=1)
    #creating a new feature for the months
    train_year_type1['Month']=months
    train_year_type2['Month']=months
    test_year_type1['Month']=months
    test_year_type2['Month']=months
    #removing the one hot encoding for the months
    train_year_type1.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
    train_year_type2.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
    test_year_type1.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
    test_year_type2.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
    #creating the X and the Y for both training and test sets for both types of crimes
    X_type1_train=train_year_type1.drop(['Type_I_crime_amount'],axis=1)
    X_type2_train=train_year_type2.drop(['Type_II_crime_amount'],axis=1)
    y_type1_train=train_year_type1['Type_I_crime_amount']
    y_type2_train=train_year_type2['Type_II_crime_amount']
    X_type1_test=test_year_type1.drop(['Type_I_crime_amount'],axis=1)
    X_type2_test=test_year_type2.drop(['Type_II_crime_amount'],axis=1)
    y_type1_test=test_year_type1['Type_I_crime_amount']
    y_type2_test=test_year_type2['Type_II_crime_amount']
    #Creating two random forest regressors for each type of crime
    rf_type1=RandomForestRegressor(n_estimators=400,min_samples_split=2,min_samples_leaf=4,max_features='auto',max_depth=20,bootstrap=True,random_state=0)
    rf_type2=RandomForestRegressor(n_estimators=400,min_samples_split=10,min_samples_leaf=2,max_features='sqrt',max_depth=100,bootstrap=True,random_state=0)
    rf_type1.fit(X_type1_train,y_type1_train)
    rf_type2.fit(X_type2_train,y_type2_train)
    #predicting the crime numbers for the test set for both types of crimes
    y_pred_type1=rf_type1.predict(X_type1_test)
    y_pred_type2=rf_type2.predict(X_type2_test)
    #calculating the r2 score for both types of crimes
    r2_type1.append(r2_score(y_type1_test,y_pred_type1))
    r2_type2.append(r2_score(y_type2_test,y_pred_type2))
    #calculating the rmse score for both types of crimes
    rmse_type1.append(np.sqrt(mean_squared_error(y_type1_test,y_pred_type1)))
    rmse_type2.append(np.sqrt(mean_squared_error(y_type2_test,y_pred_type2)))
    #calculating the mae score for both types of crimes
    mae_type2.append(mean_absolute_error(y_type2_test,y_pred_type2,multioutput='raw_values'))
    mae_type1.append(mean_absolute_error(y_type1_test,y_pred_type1,multioutput='raw_values'))
#combining the years array with the score array for each score for each crime type
rmse_area1_type1=list(zip(year_sofar,rmse_type1))
rmse_area1_type2=list(zip(year_sofar,rmse_type2))
mae_area1_type1=list(zip(year_sofar,mae_type1))
mae_area1_type2=list(zip(year_sofar,mae_type2))
r2_area1_type2=list(zip(year_sofar,r2_type2))
r2_area1_type1=list(zip(year_sofar,r2_type1))
#creating dataframes for the scores for each crime type
rmse_area1_type1_table=pd.DataFrame(rmse_area1_type1,columns=['Year','RMSE_Type_I'])
rmse_area1_type2_table=pd.DataFrame(rmse_area1_type2,columns=['Year','RMSE_Type_II'])
mae_area1_type1_table=pd.DataFrame(mae_area1_type1,columns=['Year','MAE_Type_I'])
mae_area1_type2_table=pd.DataFrame(mae_area1_type2,columns=['Year','MAE_Type_II'])
r2_area1_type1_table=pd.DataFrame(r2_area1_type1,columns=['Year','R2_Type_I'])
r2_area1_type2_table=pd.DataFrame(r2_area1_type2,columns=['Year','R2_Type_II'])
#outputting the data to csv files
rmse_area1_type1_table.to_csv('rmse_area1_type1.csv')
rmse_area1_type2_table.to_csv('rmse_area1_type2.csv')
mae_area1_type1_table.to_csv('mae_area1_type1.csv')
mae_area1_type2_table.to_csv('mae_area1_type2.csv')
r2_area1_type1_table.to_csv('r2_area1_type1.csv')
r2_area1_type2_table.to_csv('r2_area1_type2.csv')


In [13]:
#hyperparameter tuning for the second area

#creating the arrays for the first crime type
type1_crimes=set2[['Year','Type_I_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December']]
#creating the arrays for the training and test year 
train_year_type1=type1_crimes[type1_crimes['Year']==2012]
test_year_type1=type1_crimes[type1_crimes['Year']==2013]
#dropping the year column from both the training and test sets
train_year_type1=train_year_type1.drop(['Year'],axis=1)
test_year_type1=test_year_type1.drop(['Year'],axis=1)
#adding the numerical value for months in each set
train_year_type1['Month']=months
test_year_type1['Month']=months
#dropping the one hot encoded month columns
train_year_type1.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
test_year_type1.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
#creating the X and Y arrays for the training and test sets
X_type1_train=train_year_type1.drop(['Type_I_crime_amount'],axis=1)
y_type1_train=train_year_type1['Type_I_crime_amount']
X_type1_test=test_year_type1.drop(['Type_I_crime_amount'],axis=1)
y_type1_test=test_year_type1['Type_I_crime_amount']
#creating the random forest regressor and the hyperparameter which will be used for training
rf=RandomForestRegressor()
rf_random=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=5,verbose=2,random_state=35,n_jobs=-1)
rf_random.fit(X_type1_train,y_type1_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=35, verbose=2)

In [14]:
#creating the array for the second crime type
type2_crimes=set2[['Year','Type_II_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December']]
#creating the arrays for the training and test year
train_year_type2=type2_crimes[type2_crimes['Year']==2012]
test_year_type2=type2_crimes[type1_crimes['Year']==2013]
#dropping the year column from both the training and test sets
train_year_type2=train_year_type2.drop(['Year'],axis=1)
test_year_type2=test_year_type2.drop(['Year'],axis=1)
#adding the numerical value for months in each set
train_year_type2['Month']=months
test_year_type2['Month']=months
#dropping the one hot encoded month columns
train_year_type2.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
test_year_type2.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
#creating the X and Y arrays for the training and test sets
X_type1_train=train_year_type2.drop(['Type_II_crime_amount'],axis=1)
y_type1_train=train_year_type2['Type_II_crime_amount']
X_type1_test=test_year_type2.drop(['Type_II_crime_amount'],axis=1)
y_type1_test=test_year_type2['Type_II_crime_amount']
#creating the random forest regressor and the hyperparameter which will be used for training
rf=RandomForestRegressor()
rf_random=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=5,verbose=2,random_state=35,n_jobs=-1)
rf_random.fit(X_type1_train,y_type1_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=35, verbose=2)

In [None]:
#Outputting the best parameters after tuning
rf_random.best_params_

In [16]:
#creating the arrays for each crime type
type1_crimes=set2[['Year','Type_I_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December']]
type2_crimes=set2[['Year','Type_II_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December']]
#creating the arrays to store our evaluation results
r2_type1=[]
r2_type2=[]
rmse_type1=[]
rmse_type2=[]
mae_type1=[]
mae_type2=[]
year_sofar=[]
#loop to iterate through the years
for i in range(1,len(years)):
    #adding the already evaluated years to the year_sofar array
    year_sofar.append(years[i])
    #creating the arrays for the training and test year
    train_year_type1=type1_crimes[type1_crimes['Year']==years[i-1]]
    train_year_type2=type2_crimes[type2_crimes['Year']==years[i-1]]
    test_year_type1=type1_crimes[type1_crimes['Year']==years[i]]
    test_year_type2=type2_crimes[type2_crimes['Year']==years[i]]
    #dropping the year column from both the training and test sets
    train_year_type1=train_year_type1.drop(['Year'],axis=1)
    train_year_type2=train_year_type2.drop(['Year'],axis=1)
    test_year_type1=test_year_type1.drop(['Year'],axis=1)
    test_year_type2=test_year_type2.drop(['Year'],axis=1)
    #adding the numerical value for months in each set
    train_year_type1['Month']=months
    train_year_type2['Month']=months
    test_year_type1['Month']=months
    test_year_type2['Month']=months
    #dropping the one hot encoded month columns
    train_year_type1.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
    train_year_type2.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
    test_year_type1.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
    test_year_type2.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
    #creating the X and Y arrays for the training and test sets
    X_type1_train=train_year_type1.drop(['Type_I_crime_amount'],axis=1)
    X_type2_train=train_year_type2.drop(['Type_II_crime_amount'],axis=1)
    y_type1_train=train_year_type1['Type_I_crime_amount']
    y_type2_train=train_year_type2['Type_II_crime_amount']
    X_type1_test=test_year_type1.drop(['Type_I_crime_amount'],axis=1)
    X_type2_test=test_year_type2.drop(['Type_II_crime_amount'],axis=1)
    y_type1_test=test_year_type1['Type_I_crime_amount']
    y_type2_test=test_year_type2['Type_II_crime_amount']
    #creating two separate random forest regressor for each crime type
    rf_type1=RandomForestRegressor(n_estimators=400,min_samples_split=10,min_samples_leaf=1,max_features='sqrt',max_depth=18,bootstrap=False,random_state=0)
    rf_type2=RandomForestRegressor(n_estimators=200,min_samples_split=10,min_samples_leaf=2,max_features='auto',max_depth=60,bootstrap=True,random_state=0)
    rf_type1.fit(X_type1_train,y_type1_train)
    rf_type2.fit(X_type2_train,y_type2_train)
    #predicting the values for the test set
    y_pred_type1=rf_type1.predict(X_type1_test)
    y_pred_type2=rf_type2.predict(X_type2_test)
    #calculating the r2 score for each crime type
    r2_type1.append(r2_score(y_type1_test,y_pred_type1))
    r2_type2.append(r2_score(y_type2_test,y_pred_type2))
    #calculating the rmse for each crime type
    rmse_type1.append(np.sqrt(mean_squared_error(y_type1_test,y_pred_type1)))
    rmse_type2.append(np.sqrt(mean_squared_error(y_type2_test,y_pred_type2)))
    #calculating the mae for each crime type
    mae_type1.append(mean_absolute_error(y_type1_test,y_pred_type1,multioutput='raw_values'))
    mae_type2.append(mean_absolute_error(y_type2_test,y_pred_type2,multioutput='raw_values'))

#combining the array for the navigated years with the evaluation results for each metric    
rmse_area2_type1=list(zip(year_sofar,rmse_type1))
rmse_area2_type2=list(zip(year_sofar,rmse_type2))
mae_area2_type1=list(zip(year_sofar,mae_type1))
mae_area2_type2=list(zip(year_sofar,mae_type2))
r2_area2_type2=list(zip(year_sofar,r2_type2))
r2_area2_type1=list(zip(year_sofar,r2_type1))
#creating dataframes for the evaluation results
rmse_area2_type1_table=pd.DataFrame(rmse_area2_type1,columns=['Year','RMSE_Type_I'])
rmse_area2_type2_table=pd.DataFrame(rmse_area2_type2,columns=['Year','RMSE_Type_II'])
mae_area2_type1_table=pd.DataFrame(mae_area2_type1,columns=['Year','MAE_Type_I'])
mae_area2_type2_table=pd.DataFrame(mae_area2_type2,columns=['Year','MAE_Type_II'])
r2_area2_type1_table=pd.DataFrame(r2_area2_type1,columns=['Year','R2_Type_I'])
r2_area2_type2_table=pd.DataFrame(r2_area2_type2,columns=['Year','R2_Type_II'])
#outputting the data to csv files
rmse_area2_type1_table.to_csv('rmse_area2_type1.csv')
rmse_area2_type2_table.to_csv('rmse_area2_type2.csv')
mae_area2_type1_table.to_csv('mae_area2_type1.csv')
mae_area2_type2_table.to_csv('mae_area2_type2.csv')
r2_area2_type1_table.to_csv('r2_area2_type1.csv')
r2_area2_type2_table.to_csv('r2_area2_type2.csv')

In [17]:
#creating the dataset which will be used for tuning
type1_crimes=set3[['Year','Type_I_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December']]
#Separating the data into test and training years
train_year_type1=type1_crimes[type1_crimes['Year']==2012]
test_year_type1=type1_crimes[type1_crimes['Year']==2013]
#dropping the year column
train_year_type1=train_year_type1.drop(['Year'],axis=1)
test_year_type1=test_year_type1.drop(['Year'],axis=1)
#adding the numerical values for months
train_year_type1['Month']=months
test_year_type1['Month']=months
#dropping the one-hot encoded columns
train_year_type1.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
test_year_type1.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
#creating the X and Y variables for both training and test data
X_type1_train=train_year_type1.drop(['Type_I_crime_amount'],axis=1)
y_type1_train=train_year_type1['Type_I_crime_amount']
X_type1_test=test_year_type1.drop(['Type_I_crime_amount'],axis=1)
y_type1_test=test_year_type1['Type_I_crime_amount']
#creating the random forest regressor and the hyperparameter tuning method
rf=RandomForestRegressor()
rf_random=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=5,verbose=2,random_state=35,n_jobs=-1)
rf_random.fit(X_type1_train,y_type1_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=35, verbose=2)

In [18]:
#creating the dataframe for the tuning of the hyperparameters
type2_crimes=set3[['Year','Type_II_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December']]
#Separating the data into test and training years
train_year_type2=type2_crimes[type2_crimes['Year']==2012]
test_year_type2=type2_crimes[type1_crimes['Year']==2013]
#dropping the year column
train_year_type2=train_year_type2.drop(['Year'],axis=1)
test_year_type2=test_year_type2.drop(['Year'],axis=1)
#adding the numerical values for months
train_year_type2['Month']=months
test_year_type2['Month']=months
#dropping the one-hot encoded columns
train_year_type2.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
test_year_type2.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
#creating the X and Y variables for both training and test data
X_type1_train=train_year_type2.drop(['Type_II_crime_amount'],axis=1)
y_type1_train=train_year_type2['Type_II_crime_amount']
X_type1_test=test_year_type2.drop(['Type_II_crime_amount'],axis=1)
y_type1_test=test_year_type2['Type_II_crime_amount']
#creating the random forest regressor and the hyperparameter tuning method
rf=RandomForestRegressor()
rf_random=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=5,verbose=2,random_state=35,n_jobs=-1)
rf_random.fit(X_type1_train,y_type1_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=35, verbose=2)

In [None]:
#printing the best parameters after tuning (must be run after each tune for each crime type)
rf_random.best_params_

In [20]:
#creating the dataframe for each crime type
type1_crimes=set3[['Year','Type_I_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December']]
type2_crimes=set3[['Year','Type_II_crime_amount','January','February','March','April','May','June','July','August','September','October','November','December']]
#creating arrays where we will store the results
year_sofar=[]
r2_type1=[]
r2_type2=[]
rmse_type1=[]
rmse_type2=[]
mae_type1=[]
mae_type2=[]
year_sofar=[]
#looping through the years
for i in range(1,len(years)):
    #addng the evaluated years to the array
    year_sofar.append(years[i])
    #creating the data for the training and test years
    train_year_type1=type1_crimes[type1_crimes['Year']==years[i-1]]
    train_year_type2=type2_crimes[type2_crimes['Year']==years[i-1]]
    test_year_type1=type1_crimes[type1_crimes['Year']==years[i]]
    test_year_type2=type2_crimes[type2_crimes['Year']==years[i]]
    #dropping the year column
    train_year_type1=train_year_type1.drop(['Year'],axis=1)
    train_year_type2=train_year_type2.drop(['Year'],axis=1)
    test_year_type1=test_year_type1.drop(['Year'],axis=1)
    test_year_type2=test_year_type2.drop(['Year'],axis=1)
    #adding the numerical values for months
    train_year_type1['Month']=months
    train_year_type2['Month']=months
    test_year_type1['Month']=months
    test_year_type2['Month']=months
    #dropping the one-hot encoded columns
    train_year_type1.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
    train_year_type2.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
    test_year_type1.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
    test_year_type2.drop(['January','February','March','April','May','June','July','August','September','October','November','December'],axis=1,inplace=True)
    #creating the X and Y variables for both training and test data
    X_type1_train=train_year_type1.drop(['Type_I_crime_amount'],axis=1)
    X_type2_train=train_year_type2.drop(['Type_II_crime_amount'],axis=1)
    y_type1_train=train_year_type1['Type_I_crime_amount']
    y_type2_train=train_year_type2['Type_II_crime_amount']
    X_type1_test=test_year_type1.drop(['Type_I_crime_amount'],axis=1)
    X_type2_test=test_year_type2.drop(['Type_II_crime_amount'],axis=1)
    y_type1_test=test_year_type1['Type_I_crime_amount']
    y_type2_test=test_year_type2['Type_II_crime_amount']
    #creating the random forest regressors for each crime tyoe
    rf_type1=RandomForestRegressor(n_estimators=600,min_samples_split=10,min_samples_leaf=2,max_features='sqrt',max_depth=None,bootstrap=True,random_state=0)
    rf_type2=RandomForestRegressor(n_estimators=800,min_samples_split=5,min_samples_leaf=4,max_features='auto',max_depth=110,bootstrap=True,random_state=0)
    rf_type1.fit(X_type1_train,y_type1_train)
    rf_type2.fit(X_type2_train,y_type2_train)
    #predicting the results for each crime type
    y_pred_type1=rf_type1.predict(X_type1_test)
    y_pred_type2=rf_type2.predict(X_type2_test)
    #calculating the r2 score for each crime type
    r2_type1.append(r2_score(y_type1_test,y_pred_type1))
    r2_type2.append(r2_score(y_type2_test,y_pred_type2))
    #calculating the rmse for each crime type
    rmse_type1.append(np.sqrt(mean_squared_error(y_type1_test,y_pred_type1)))
    rmse_type2.append(np.sqrt(mean_squared_error(y_type2_test,y_pred_type2)))
    #calculating the mae for each crime type
    mae_type1.append(mean_absolute_error(y_type1_test,y_pred_type1,multioutput='raw_values'))
    mae_type2.append(mean_absolute_error(y_type2_test,y_pred_type2,multioutput='raw_values'))
#combining the data for the years for each evaluation metric
rmse_area3_type1=list(zip(year_sofar,rmse_type1))
rmse_area3_type2=list(zip(year_sofar,rmse_type2))
mae_area3_type1=list(zip(year_sofar,mae_type1))
mae_area3_type2=list(zip(year_sofar,mae_type2))
r2_area3_type2=list(zip(year_sofar,r2_type2))
r2_area3_type1=list(zip(year_sofar,r2_type1))
#creating the dataframes for the results
rmse_area3_type1_table=pd.DataFrame(rmse_area3_type1,columns=['Year','RMSE_Type_I'])
rmse_area3_type2_table=pd.DataFrame(rmse_area3_type2,columns=['Year','RMSE_Type_II'])
mae_area3_type1_table=pd.DataFrame(mae_area3_type1,columns=['Year','MAE_Type_I'])
mae_area3_type2_table=pd.DataFrame(mae_area3_type2,columns=['Year','MAE_Type_II'])
r2_area3_type1_table=pd.DataFrame(r2_area3_type1,columns=['Year','R2_Type_I'])
r2_area3_type2_table=pd.DataFrame(r2_area3_type2,columns=['Year','R2_Type_II'])
#outputting the dataframes to csv files
rmse_area3_type1_table.to_csv('rmse_area3_type1.csv')
rmse_area3_type2_table.to_csv('rmse_area3_type2.csv')
mae_area3_type1_table.to_csv('mae_area3_type1.csv')
mae_area3_type2_table.to_csv('mae_area3_type2.csv')
r2_area3_type1_table.to_csv('r2_area3_type1.csv')
r2_area3_type2_table.to_csv('r2_area3_type2.csv')