In [128]:
import pandas as pd
import numpy as np
import pprint as pp
from collections import OrderedDict
import requests
import json
import datetime
import spacy
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib


# File to Load (Remember to Change These)
file_to_load = "db/schoolShootingData_withGeoCoordinates.csv"
schoolData = pd.read_csv(file_to_load, encoding="utf-8-sig")
schoolData.reset_index()
schoolData.fillna(1)
schoolData.head()
df = schoolData[['Wounded','Gender of Victims (M/F/Both)','Targeted Specific Victim(s)','Suicide (or attempted suicide) by Shooter (Y/N)',
                              'Pre-planned school attack','Day of week (formula)',
                               'School Type','Number of Shooters',
                              'Killed']]

df.loc[df['Gender of Victims (M/F/Both)'] == 'Male', 'Gender of Victims (M/F/Both)'] = 0
df.loc[df['Gender of Victims (M/F/Both)'] == 'Female', 'Gender of Victims (M/F/Both)'] = 1
df.loc[df['Gender of Victims (M/F/Both)'] == 'Both', 'Gender of Victims (M/F/Both)'] = 2
df.loc[df['Gender of Victims (M/F/Both)'] == 'No Victims', 'Gender of Victims (M/F/Both)'] = -1
df.loc[df['Gender of Victims (M/F/Both)'] == 'Unknown', 'Gender of Victims (M/F/Both)'] = -1



df.loc[df['Targeted Specific Victim(s)'] == 'N', 'Targeted Specific Victim(s)'] = 0
df.loc[df['Targeted Specific Victim(s)'] == 'Y', 'Targeted Specific Victim(s)'] = 1
df.loc[df['Targeted Specific Victim(s)'] == 'Unknown', 'Targeted Specific Victim(s)'] = -1
df.loc[df['Targeted Specific Victim(s)'] == 'Officer Involved', 'Targeted Specific Victim(s)'] = -2


df.loc[df['Suicide (or attempted suicide) by Shooter (Y/N)'] == 'N', 'Suicide (or attempted suicide) by Shooter (Y/N)'] = 0
df.loc[df['Suicide (or attempted suicide) by Shooter (Y/N)'] == 'Y', 'Suicide (or attempted suicide) by Shooter (Y/N)'] = 1
df.loc[df['Suicide (or attempted suicide) by Shooter (Y/N)'] == 'Officer Involved', 'Suicide (or attempted suicide) by Shooter (Y/N)'] = -1


df.loc[df['Pre-planned school attack'] == 'N', 'Pre-planned school attack'] = 0
df.loc[df['Pre-planned school attack'] == 'Y', 'Pre-planned school attack'] = 1
df.loc[df['Pre-planned school attack'] == 'Officer Involved', 'Pre-planned school attack'] = -1
df.loc[df['Pre-planned school attack'] == 'Other', 'Pre-planned school attack'] = -1
df.loc[df['Pre-planned school attack'] == 'Unknown', 'Pre-planned school attack'] = -1


df.loc[df['Day of week (formula)'] == 'Mon', 'Day of week (formula)'] = 0
df.loc[df['Day of week (formula)'] == 'Tue', 'Day of week (formula)'] = 1
df.loc[df['Day of week (formula)'] == 'Wed', 'Day of week (formula)'] = 2
df.loc[df['Day of week (formula)'] == 'Thu', 'Day of week (formula)'] = 3
df.loc[df['Day of week (formula)'] == 'Fri', 'Day of week (formula)'] = 4
df.loc[df['Day of week (formula)'] == 'Sat', 'Day of week (formula)'] = 5
df.loc[df['Day of week (formula)'] == 'Sun', 'Day of week (formula)'] = 6


#df.loc[df['Number of Shots Fired'] == None , 'Number of Shots Fired'] = 0


df.loc[df['School Type'] == 'Elementary', 'School Type'] = 0
df.loc[df['School Type'] == 'Junior High', 'School Type'] = 1
df.loc[df['School Type'] == 'High', 'School Type'] = 2
df.loc[df['School Type'] == 'Middle', 'School Type'] = 1
df.loc[df['School Type'] == 'Other', 'School Type'] = 3
df.loc[df['School Type'] == 'K-8', 'School Type'] = 1
df.loc[df['School Type'] == 'Unknown', 'School Type'] = -1


y = df.Killed
X = df.drop('Killed', axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [131]:
df.head()

Unnamed: 0,Wounded,Gender of Victims (M/F/Both),Targeted Specific Victim(s),Suicide (or attempted suicide) by Shooter (Y/N),Pre-planned school attack,Day of week (formula),School Type,Number of Shooters,Killed
0,0.0,0,0,0,0,0,2,1.0,1
1,1.0,0,0,0,0,0,1,1.0,0
2,0.0,-1,1,0,0,0,2,1.0,0
3,1.0,0,1,0,0,4,2,1.0,0
4,2.0,0,0,0,0,0,2,2.0,0


In [130]:
df.to_csv("machineLearntCsv.csv")

In [127]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123)
X_train_scaled = preprocessing.scale(X_train)

print (X_trained_scaled)

print (X_train_scaled.mean(axis=0))

print (X_train_scaled.std(axis=0))

scaler = preprocessing.StandardScaler().fit(X_train)

X_test_scaled = scaler.transform(X_test)
 
print (X_test_scaled.mean(axis=0))

print (X_test_scaled.std(axis=0))


pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}


clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
clf.fit(X_train, y_train)

print (clf.best_params_)
print (clf.refit)
y_pred = clf.predict(X_test)

joblib.dump(clf, 'rf_regressor.pkl')
clf2 = joblib.load('rf_regressor.pkl')
clf2.predict(X_test)


ValueError: could not convert string to float: 'N '