In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

In [46]:
final = pd.read_csv('df_final.csv')

In [47]:
y = final.INJ_SEV
x = final.drop(['INJ_SEV','Unnamed: 0','MAKE'],axis=1)
x.MOD_YEAR = 2019 - x.MOD_YEAR
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [49]:
x

Unnamed: 0,HOUR,BODY_TYP,MOD_YEAR,AGE,REST_USE,EJECTION,VE_FORMS,VSURCOND,ROLLOVER,LGT_COND
0,22,1,11.0,37,0,0,2,2,0,2
1,13,2,15.0,58,1,0,1,1,0,1
2,13,2,15.0,26,0,1,1,1,0,1
3,22,2,8.0,23,1,0,1,1,0,2
4,9,2,5.0,73,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
49362,10,2,5.0,63,1,0,2,4,0,1
49363,18,0,25.0,61,0,2,1,4,1,2
49364,13,0,17.0,34,0,1,1,1,9,1
49365,13,0,17.0,36,0,0,1,1,9,1


In [51]:
train_y.unique()

array(['No Apparent Injury', 'Possible Injury', 'Suspected Minor Injury',
       'Fatal Injury', 'Suspected Serious Injury'], dtype=object)

In [50]:
#Normalizing Inputs to speed up SVM, and boost its performance
scaler = StandardScaler()
scaler.fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

## SVM

In [52]:
#For each INJ Severity, set 1 of the inj_sev labels into 1, and rest into 0
for i in train_y.unique():

    train_y1 = pd.get_dummies(train_y)[i]

    test_y1 = pd.get_dummies(test_y)[i]

    svm = SVC(gamma = 0.5, C = 0.5)
    svm.fit(train_x, train_y1)

    test_predictions = svm.predict(test_x)
    conf_mat = confusion_matrix(test_y1, test_predictions)
    print('Accuracy:' + i)
    print(accuracy_score(test_y1, test_predictions))
    print('Confusion Matrix:')
    print(conf_mat)

Accuracy:No Apparent Injury
0.7827628114239417
Confusion Matrix:
[[6674  425]
 [1720 1055]]
Accuracy:Possible Injury
0.9044966578894065
Confusion Matrix:
[[8928    1]
 [ 942    3]]
Accuracy:Suspected Minor Injury
0.8738100060765647
Confusion Matrix:
[[8627    0]
 [1246    1]]
Accuracy:Fatal Injury
0.7646343933562892
Confusion Matrix:
[[5358  753]
 [1571 2192]]
Accuracy:Suspected Serious Injury
0.8841401660927689
Confusion Matrix:
[[8730    0]
 [1144    0]]


#### Testing Radial SVM

In [44]:
i = 'Suspected Serious Injury'
train_y1 = pd.get_dummies(train_y)[i]
test_y1 = pd.get_dummies(test_y)[i]

svm = SVC(gamma = 0.5, C = 0.5)
svm.fit(train_x, train_y1)
test_predictions = svm.predict(test_x)
conf_mat = confusion_matrix(test_y1, test_predictions)
print('Accuracy:' + i)
print(accuracy_score(test_y1, test_predictions))
print('Confusion Matrix:')
print(conf_mat)

Accuracy:Suspected Serious Injury
0.8841401660927689
Confusion Matrix:
[[8730    0]
 [1144    0]]


#### Testing Linear SVM

In [32]:
i = 'Fatal Injury'
train_y1 = pd.get_dummies(train_y)[i]
test_y1 = pd.get_dummies(test_y)[i]

svm = SVC(kernel = 'linear')
svm.fit(train_x, train_y1)
test_predictions = svm.predict(test_x)
conf_mat = confusion_matrix(test_y1, test_predictions)
print('Accuracy:' + i)
print(accuracy_score(test_y1, test_predictions))
print('Confusion Matrix:')
print(conf_mat)

Accuracy:Suspected Serious Injury
0.7174397407332388
Confusion Matrix:
[[5299  812]
 [1978 1785]]


## Decision Trees

In [23]:
for i in train_y.unique():

    train_y1 = pd.get_dummies(train_y)[i]

    test_y1 = pd.get_dummies(test_y)[i]
    
    dt = DecisionTreeClassifier(criterion = 'gini', splitter = 'best')
    #print(cross_val_score(dt, train_x, train_y1, cv=10))
    dt.fit(train_x, train_y1)
    test_preds = dt.predict(test_x)
    conf_mat = confusion_matrix(test_y1, test_preds)
    print('Accuracy:' +i)
    print(accuracy_score(test_y1, test_preds))
    print('Confusion Matrix:')
    print(conf_mat)

Accuracy:No Apparent Injury
0.734656674093579
Confusion Matrix:
[[5805 1294]
 [1326 1449]]
Accuracy:Possible Injury
0.8422118695564108
Confusion Matrix:
[[8107  822]
 [ 736  209]]
Accuracy:Suspected Minor Injury
0.7898521369252582
Confusion Matrix:
[[7492 1135]
 [ 940  307]]
Accuracy:Fatal Injury
0.6747012355681588
Confusion Matrix:
[[4529 1582]
 [1630 2133]]
Accuracy:Suspected Serious Injury
0.791877658497063
Confusion Matrix:
[[7590 1140]
 [ 915  229]]


## Log Regression

In [117]:
model = LogisticRegression(solver = 'lbfgs', multi_class = 'ovr', max_iter = 1000)
# create the RFE model and select 8 attributes
rfe = RFE(model, 8)
rfe = rfe.fit(x, y)
# summarize the selection of the attributes
print('Selected features: %s' % list(x.columns[rfe.support_]))

Selected features: ['BODY_TYP', 'MOD_YEAR', 'REST_USE', 'EJECTION', 'VE_FORMS', 'VSURCOND', 'ROLLOVER', 'LGT_COND']


In [116]:
cols = ['BODY_TYP', 'MOD_YEAR', 'REST_USE', 'EJECTION', 'VE_FORMS', 'VSURCOND', 'ROLLOVER', 'LGT_COND']
y = final.INJ_SEV
x = final[cols]
x.MOD_YEAR = 2019 - x.MOD_YEAR
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state = 42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [104]:
test_x.shape

(9874, 8)

In [114]:
#LR does one vs rest classification on its own...
model = LogisticRegression(solver = 'lbfgs', multi_class = 'auto', max_iter = 1000)
model.fit(train_x, train_y)
test_preds = model.predict(test_x)
conf_mat = confusion_matrix(test_y, test_preds)
print('Accuracy:')
print(accuracy_score(test_y, test_preds))
print('Confusion Matrix:')
print(conf_mat)
print(classification_report(test_y,test_preds))

Accuracy:
0.5060765647154142
Confusion Matrix:
[[2683 1059    2   19    0]
 [ 473 2285   16    1    0]
 [ 275  653    4   12    1]
 [ 501  718    3   25    0]
 [ 578  543    2   21    0]]
                          precision    recall  f1-score   support

            Fatal Injury       0.59      0.71      0.65      3763
      No Apparent Injury       0.43      0.82      0.57      2775
         Possible Injury       0.15      0.00      0.01       945
  Suspected Minor Injury       0.32      0.02      0.04      1247
Suspected Serious Injury       0.00      0.00      0.00      1144

                accuracy                           0.51      9874
               macro avg       0.30      0.31      0.25      9874
            weighted avg       0.40      0.51      0.41      9874



In [115]:
for i in train_y.unique():

    train_y1 = pd.get_dummies(train_y)[i]

    test_y1 = pd.get_dummies(test_y)[i]
    
    model = LogisticRegression(solver='lbfgs',max_iter = 1000)
    
    model.fit(train_x, train_y1)
    test_preds = model.predict(test_x)
    conf_mat = confusion_matrix(test_y1, test_preds)
    print('Accuracy:' + i)
    print(accuracy_score(test_y1, test_preds))
    print('Confusion Matrix:')
    print(conf_mat)

Accuracy:No Apparent Injury
0.7304030787927891
Confusion Matrix:
[[6761  338]
 [2324  451]]
Accuracy:Possible Injury
0.9039902774964553
Confusion Matrix:
[[8925    4]
 [ 944    1]]
Accuracy:Suspected Minor Injury
0.8734049017622038
Confusion Matrix:
[[8624    3]
 [1247    0]]
Accuracy:Fatal Injury
0.7271622442779015
Confusion Matrix:
[[5447  664]
 [2030 1733]]
Accuracy:Suspected Serious Injury
0.8841401660927689
Confusion Matrix:
[[8730    0]
 [1144    0]]


In [60]:
LogisticRegression?