In [80]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
import math
import warnings
from sklearn import linear_model

from IPython.display import display

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.decomposition import PCA
from sklearn import neighbors

from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2, f_classif

from timeit import default_timer as timer

import pydotplus
from sklearn import tree


In [81]:
# importing data sets

# 2013
crimestats_baseline_2013 = pd.read_csv('Thinkful 2_4 dataframe.csv')
# Work on an independent copy: the cleaning steps rename columns and drop
# rows/columns of crimestats13 in place, and a plain alias would apply those
# mutations to the raw baseline frame as well.
crimestats13 = crimestats_baseline_2013.copy()



In [82]:
# cleaning 2013 data

# Replace the non-standard header text with identifier-style column names.
crimestats13.columns = [
    'City',
    'Population',
    'Violent_Crime',
    'Murder_and_Nonnegligent_Manslaughter',
    'Rape_Revised',
    'Rape_Legacy',
    'Robbery',
    'Aggravated_Assault',
    'Property_Crime',
    'Burglary',
    'Larceny_Theft',
    'Motor_Vehicle_Theft',
    'Arson',
]

# Row labels 0-3 (top of the sheet) and 352-354 (bottom of the sheet) hold
# non-numeric title text rather than city records.
crimestats13.drop([0, 1, 2, 3, 352, 353, 354], axis=0, inplace=True)

# Both rape columns are removed (the original note cites a column without any
# contents), and Arson is removed because it has so many NaN values.
crimestats13.drop(['Rape_Revised', 'Rape_Legacy', 'Arson'], axis=1, inplace=True)

# Outlier rows, removed by index label:
#   293 - Seneca Falls Town: massive differential in per-1000-people burglary,
#         property crime, and larceny relative to other cities.
#   220 - New York (city): far outside the range of the remaining cities.
#   7, 11, 39, 130, 276, 314, 350 - outliers that showed up in model execution.
crimestats13.drop([293, 220, 7, 11, 39, 130, 276, 314, 350], axis=0, inplace=True)

# Convert a comma-formatted string column into integers.
def comma_cleaning_and_int(x):
    """Strip ',' from column ``x`` of ``crimestats13`` and cast it to int.

    The column is overwritten in the module-level ``crimestats13`` frame;
    nothing is returned.
    """
    without_commas = crimestats13[x].str.replace(',', '')
    crimestats13[x] = without_commas.astype(int)

# Every column listed here is run through comma_cleaning_and_int, in this order.
for count_column in (
    'Population',
    'Violent_Crime',
    'Murder_and_Nonnegligent_Manslaughter',
    'Robbery',
    'Aggravated_Assault',
    'Property_Crime',
    'Burglary',
    'Larceny_Theft',
    'Motor_Vehicle_Theft',
):
    comma_cleaning_and_int(count_column)

# adding in features requested in the drill
def yesno(x):
    """Return 1 when ``x`` is greater than zero, otherwise 0."""
    return 1 if x > 0 else 0

# Binary flags: 1 when the city recorded at least one such crime, else 0.
crimestats13['murder_yesno'] = crimestats13['Murder_and_Nonnegligent_Manslaughter'].apply(yesno)
crimestats13['robbery_yesno'] = crimestats13['Robbery'].apply(yesno)

# creating features using the ratio of crime to population
def popratio(x,y):
    """Return ``x`` expressed per 1000 units of ``y``, i.e. ``1000 * (x / y)``."""
    share = x / y
    return 1000 * share

# (new rate column, source count column) pairs; each rate is
# 1000 * count / Population, added in the order listed.
per1000_sources = [
    ('violent_crime_per1000', 'Violent_Crime'),
    ('murder_per1000', 'Murder_and_Nonnegligent_Manslaughter'),
    ('robbery_per1000', 'Robbery'),
    ('assault_per1000', 'Aggravated_Assault'),
    ('propertycrime_per1000', 'Property_Crime'),
    ('burglary_per1000', 'Burglary'),
    ('larceny_per1000', 'Larceny_Theft'),
    ('motorvehicle_crime_per1000', 'Motor_Vehicle_Theft'),
]
for rate_column, count_column in per1000_sources:
    crimestats13[rate_column] = 1000 * crimestats13[count_column] / crimestats13['Population']

# Square root of the property-crime rate, kept as its own feature.
property_rate = crimestats13['propertycrime_per1000']
crimestats13['propertycrime_per1000_sqrt'] = property_rate ** .5

# Collapse each group of per-1000 rates into a single PCA component.
# The same PCA object is refit for each group, so after this cell `pca`
# holds the fit for the violent-crime columns.
pca = PCA(n_components=1)

property_rate_columns = [
    'propertycrime_per1000_sqrt',
    'burglary_per1000',
    'larceny_per1000',
    'motorvehicle_crime_per1000',
]
X_reg1 = pca.fit_transform(crimestats13[property_rate_columns])
crimestats13['property_crime_pca'] = X_reg1

violent_rate_columns = [
    'violent_crime_per1000',
    'robbery_per1000',
    'assault_per1000',
    'murder_per1000',
]
X_reg2 = pca.fit_transform(crimestats13[violent_rate_columns])
crimestats13['violent_crime_pca'] = X_reg2

In [83]:
# creating base training & testing dataframes without city names
X_train_all = crimestats13.drop('City', axis=1)

# creating training sets
# Everything is derived from X_train_all. The previous code read from
# `crimestats_train`, a name that is never defined in this notebook, so the
# cell only worked with leftover kernel state and failed on a fresh run.
# NOTE(review): X_train_all still contains 'Property_Crime' (the target) and
# columns computed from it, so models fitted on X_train_all or X_train_best can
# see the answer in their inputs - confirm whether that is intended.
# NOTE(review): f_classif is an ANOVA F-test meant for class labels, while
# Y_train is a crime count; f_regression may be the intended scorer - confirm.
Y_train = X_train_all['Property_Crime'].values.reshape(-1, 1)
X_train_picked = X_train_all[['Population', 'property_crime_pca', 'violent_crime_pca']]
X_train_best = SelectKBest(f_classif, k=10).fit_transform(X_train_all, Y_train)


  f = msb / msw


In [84]:
# Creating the model objects
rfc = ensemble.RandomForestRegressor()
decision_tree = tree.DecisionTreeRegressor(max_depth=5)

# Set parameters and create Grid Search object for Random Forest algorithms
# max_features entries are all floats on purpose: scikit-learn treats a float
# as a fraction of the features but an int as an absolute count, so the former
# integer 1 searched "one feature per split" instead of "all features".
grid_param_RF = {
    'n_estimators': [100,150,200,250,300],
    'criterion': ['mse', 'mae'],
    'max_features' : [.2,.4,.6,.8,1.0],
}

grid_search_RF = GridSearchCV(estimator = rfc,  
                              param_grid = grid_param_RF,
                              scoring = 'neg_mean_squared_error',
                              cv = 10)

# Set parameters and create Grid Search object for Decision Tree algorithms
# (max_features uses 1.0 rather than 1 for the same reason as above).
grid_param_DT = {
    'max_depth': [1,2,3,4,5,6,7,8,9,10,15,20,30],
    'criterion': ['mse', 'mae','friedman_mse'],
    'max_features' : [.2,.4,.6,.8,1.0],
}

grid_search_DT = GridSearchCV(estimator = decision_tree,  
                              param_grid = grid_param_DT,
                              scoring = 'neg_mean_squared_error',
                              cv = 10)



In [57]:
# Run each grid search against each feature set and report the best parameters.
# Order matches the report: all Decision Tree fits first, then all Random Forest fits.

feature_sets = [
    ('X_train_all', X_train_all),
    ('X_train_picked', X_train_picked),
    ('X_train_best', X_train_best),
]
searches = [
    ('Decision Tree', grid_search_DT),
    ('Random Forest', grid_search_RF),
]

for model_label, search in searches:
    for set_label, features in feature_sets:
        # Refitting reuses the same search object, so best_params_ always
        # reflects the most recent feature set.
        search.fit(features, Y_train)
        print(set_label + ' recommended parameters, ' + model_label + ' regression:')
        print(search.best_params_)
        print(' ')





X_train_all recommended parameters, Decision Tree regression:
{'criterion': 'mse', 'max_depth': 20, 'max_features': 0.8}
 




X_train_picked recommended parameters, Decision Tree regression:
{'criterion': 'mse', 'max_depth': 8, 'max_features': 0.8}
 




X_train_best recommended parameters, Decision Tree regression:
{'criterion': 'friedman_mse', 'max_depth': 10, 'max_features': 0.8}
 




X_train_all recommended parameters, Random Forest regression:
{'criterion': 'mae', 'max_features': 0.8, 'n_estimators': 250}
 




X_train_picked recommended parameters, Random Forest regression:
{'criterion': 'mae', 'max_features': 0.8, 'n_estimators': 200}
 




X_train_best recommended parameters, Random Forest regression:
{'criterion': 'mae', 'max_features': 0.8, 'n_estimators': 100}
 


In [119]:
# Decision Tree fitted to X_train_all (every cleaned numeric 2013 column)

decision_tree = tree.DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=.8)
repeats = 50
score = 0
actual = Y_train  # the ground truth is the same on every repeat

start = timer()

for trial in range(repeats):
    decision_tree.fit(X_train_all, Y_train)
    # These CV scores are discarded; the call only adds to the timed work.
    cross_val_score(decision_tree, X_train_all, Y_train, cv=10)
    predicted = decision_tree.predict(X_train_all)
    # The error is measured on the same rows the tree was just fitted to.
    score += sklearn.metrics.mean_squared_error(actual, predicted)

end = timer()

average_runtime = (end - start) / repeats
average_mse = score / repeats

print('Decision Tree predictions of Property Crime, all features:')
print(cross_val_score(decision_tree, X_train_all, Y_train, cv=10))
print('Average runtime: ' + str(average_runtime) + ' seconds')
print('Average Mean Squared Error: ' + str(average_mse))

Decision Tree predictions of Property Crime, all features:
[0.97881985 0.80957932 0.98918268 0.9884727  0.99258803 0.99885301
 0.97779593 0.99839716 0.98967847 0.93753081]
Average runtime: 0.05680933391995495 seconds
Average Mean Squared Error: 0.0


GridSearch CV recommended parameters for X_train_all, using Decision Tree regression:  max_depth=20, max_features = .8, criterion = 'mse'

These parameters resulted in an average runtime of .054 seconds and a MSE of 0.
Using the default parameters for DecisionTreeRegressor, the result was an average of .0579 seconds and an MSE of 0.

In [120]:
# Decision Tree fitted to X_train_picked (Population plus the two PCA features)

decision_tree = tree.DecisionTreeRegressor(criterion='mse', max_depth=8, max_features=.8)
repeats = 50
score = 0
actual = Y_train  # the ground truth is the same on every repeat

start = timer()

for trial in range(repeats):
    decision_tree.fit(X_train_picked, Y_train)
    # These CV scores are discarded; the call only adds to the timed work.
    cross_val_score(decision_tree, X_train_picked, Y_train, cv=10)
    predicted = decision_tree.predict(X_train_picked)
    # The error is measured on the same rows the tree was just fitted to.
    score += sklearn.metrics.mean_squared_error(actual, predicted)

end = timer()

average_runtime = (end - start) / repeats
average_mse = score / repeats

print('Decision Tree predictions of Property Crime, using PCA features:')
print(cross_val_score(decision_tree, X_train_picked, Y_train, cv=10))
print('Average runtime: ' + str(average_runtime) + ' seconds')
print('Average Mean Squared Error: ' + str(average_mse))

Decision Tree predictions of Property Crime, using PCA features:
[0.97394673 0.74019463 0.68000526 0.86581228 0.92368882 0.8528206
 0.87779242 0.63112426 0.87063062 0.75325385]
Average runtime: 0.036057757000016864 seconds
Average Mean Squared Error: 454.22383805496594


GridSearch CV recommended parameters for X_train_picked, using Decision Tree regression:  max_depth= 8, max_features = .8, criterion = 'mse'

These parameters resulted in an average runtime of .0436 seconds and an average MSE of 271.03.
Using the default parameters for DecisionTreeRegressor, the result was an average of .04 seconds and an MSE of 1191.42.

In [129]:
# Decision Tree trained on X_train_best, the 10 features chosen by SelectKBest

decision_tree = tree.DecisionTreeRegressor(max_depth= 10, max_features = .8, criterion = 'friedman_mse')
score = 0

start = timer()

for n in range(0,50):
    decision_tree.fit(X_train_best, Y_train)
    # These CV scores are discarded; the call only adds to the timed work.
    cross_val_score(decision_tree, X_train_best, Y_train, cv=10)
    actual = Y_train
    # The error is measured on the same rows the tree was just fitted to.
    predicted = decision_tree.predict(X_train_best)
    score += sklearn.metrics.mean_squared_error(actual, predicted)

end = timer()

# The heading names the SelectKBest feature set; it previously said
# "using PCA features", copied from the X_train_picked cell.
print('Decision Tree predictions of Property Crime, using SelectKBest features:')
print(cross_val_score(decision_tree, X_train_best, Y_train, cv=10))
print('Average runtime: '+ str((end - start)/50) + ' seconds')
print('Average Mean Squared Error: ' + str(score/50))

Decision Tree predictions of Property Crime, using PCA features:
[0.98081048 0.95952988 0.99611835 0.98787402 0.99849933 0.99671784
 0.99793724 0.99351202 0.99064074 0.98459368]
Average runtime: 0.02392657071999565 seconds
Average Mean Squared Error: 0.04119984080161071


GridSearch CV recommended parameters for X_train_best, using Decision Tree regression:  max_depth= 10, max_features = .8, criterion = 'friedman_mse'

These parameters resulted in an average runtime of .0222 seconds and an average MSE of .0411.
Using the default parameters for DecisionTreeRegressor, the result was an average of .0235 seconds and an MSE of 0.

In [134]:
# Random Forest fitted to X_train_all (every cleaned numeric 2013 column)

rfc = ensemble.RandomForestRegressor(criterion='mse', n_estimators=250, max_features=.8)
repeats = 25
score = 0
actual = Y_train  # the ground truth is the same on every repeat

start = timer()

for trial in range(repeats):
    rfc.fit(X_train_all, Y_train)
    # These CV scores are discarded; the call only adds to the timed work.
    cross_val_score(rfc, X_train_all, Y_train, cv=10)
    predicted = rfc.predict(X_train_all)
    # The error is measured on the same rows the forest was just fitted to.
    score += sklearn.metrics.mean_squared_error(actual, predicted)

end = timer()

average_runtime = (end - start) / repeats
average_mse = score / repeats

print('Random Forest predictions of Property Crime, entirety of the cleaned 2013 dataset:')
print(cross_val_score(rfc, X_train_all, Y_train, cv=10))
print('Average runtime: ' + str(average_runtime) + ' seconds')
print('Average Mean Squared Error: ' + str(average_mse))


Random Forest predictions of Property Crime, entirety of the cleaned 2013 dataset:
[0.99700386 0.9481509  0.99907163 0.99748476 0.99896764 0.99960329
 0.9894518  0.99981205 0.99214894 0.9994063 ]
Average runtime: 5.990344341220043 seconds
Average Mean Squared Error: 255.10616439787606


GridSearch CV recommended parameters for X_train_all, using Random Forest regression:  n_estimators = 250, max_features = .8, criterion = 'mse'

These parameters resulted in an average runtime of 5.99 seconds and an average MSE of 255.11.
Using the default parameters for RandomForestRegressor, the result was an average of .34 seconds and an MSE of 443.22.

In [139]:
# Random Forest trained on X_train_picked, using population and PCA features

rfc = ensemble.RandomForestRegressor(n_estimators = 200, max_features = .8, criterion = 'mae')
score = 0

start = timer()

for n in range(0,25):
    rfc.fit(X_train_picked, Y_train)  
    # These CV scores are discarded; the call only adds to the timed work.
    cross_val_score(rfc, X_train_picked, Y_train, cv=10)
    actual = Y_train
    # The error is measured on the same rows the forest was just fitted to.
    predicted = rfc.predict(X_train_picked)
    score += sklearn.metrics.mean_squared_error(actual, predicted)

end = timer()

# The heading names the feature set actually used; it previously claimed the
# "entirety of the cleaned 2013 dataset", copied from the X_train_all cell.
print('Random Forest predictions of Property Crime, using PCA features:')
print(cross_val_score(rfc, X_train_picked, Y_train, cv=10))
print('Average runtime: '+ str((end - start)/25) + ' seconds')
print('Average Mean Squared Error: ' + str(score/25))




Random Forest predictions of Property Crime, entirety of the cleaned 2013 dataset:
[0.97632936 0.92045763 0.97501787 0.93133827 0.95153778 0.96563046
 0.96355918 0.84381568 0.96772118 0.98604475]
Average runtime: 0.18611074032000033 seconds
Average Mean Squared Error: 1705.32972979351


GridSearch CV recommended parameters for X_train_picked, using Random Forest regression:  n_estimators = 200, max_features = .8, criterion = 'mae'

These parameters resulted in an average runtime of 6.962 seconds and an average MSE of 1216.
Using the default parameters for RandomForestRegressor, the result was an average of .197 seconds and an MSE of 1764.

In [138]:
# Random Forest trained on X_train_best, the 10 features chosen by SelectKBest

# Default hyperparameters are used for this run.
rfc = ensemble.RandomForestRegressor()
score = 0

start = timer()

for n in range(0,25):
    rfc.fit(X_train_best, Y_train)  
    # These CV scores are discarded; the call only adds to the timed work.
    cross_val_score(rfc, X_train_best, Y_train, cv=10)
    actual = Y_train
    # The error is measured on the same rows the forest was just fitted to.
    predicted = rfc.predict(X_train_best)
    score += sklearn.metrics.mean_squared_error(actual, predicted)

end = timer()

# The heading names the feature set actually used; it previously claimed the
# "entirety of the cleaned 2013 dataset", copied from the X_train_all cell.
print('Random Forest predictions of Property Crime, using SelectKBest features:')
print(cross_val_score(rfc, X_train_best, Y_train, cv=10))
print('Average runtime: '+ str((end - start)/25) + ' seconds')
print('Average Mean Squared Error: ' + str(score/25))



Random Forest predictions of Property Crime, entirety of the cleaned 2013 dataset:
[0.9997741  0.95984919 0.99909822 0.99388549 0.99744806 0.99856756
 0.99379989 0.99639888 0.992617   0.99391021]
Average runtime: 0.20949420488002943 seconds
Average Mean Squared Error: 409.70017345132743


GridSearch CV recommended parameters for X_train_best, using Random Forest regression:  n_estimators = 100, max_features = .8, criterion = 'mae'

These parameters resulted in an average runtime of 5.89 seconds and an average MSE of 181.4.
Using the default parameters for RandomForestRegressor, the result was an average of .209 seconds and an MSE of 409.7.

Grid Search CV suggests more time-intensive parameters, albeit ones that improve performance notably.  

The Decision Tree algorithm was able to function with nearly zero error using the SelectKBest features, with +/- 2% variation in the cross-validation tests. 

Random Forest using SelectKBest and default parameters produced an MSE of ~400, very little variation in cross-validation, and an average runtime of ~.2 seconds, the best combination of time and accuracy of all the Random Forest trials.  