In [313]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

In [314]:
final = pd.read_csv('df_final_speed.csv')

In [315]:
final

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,HOUR,MAKE,BODY_TYP,MOD_YEAR,AGE,INJ_SEV,REST_USE,EJECTION,VE_FORMS,VSURCOND,ROLLOVER,TRAV_SP,VSPD_LIM,LGT_COND
0,7,7,22,Toyota,Pickups,2011.0,23,0,1,0,1,1,0,75,70,2
1,9,9,0,Chevrolet,Pickups,2004.0,40,0,1,0,2,1,0,70,70,2
2,11,11,12,Jeep,SUVs,2016.0,38,4,1,0,3,2,0,65,70,1
3,12,12,12,Jeep,SUVs,2016.0,30,3,1,0,3,2,0,65,70,1
4,20,20,21,Hyundai,Sedans,2016.0,24,4,1,0,2,1,0,30,55,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13634,57395,58224,8,Ford,Pickups,1977.0,18,4,1,1,2,4,0,35,70,1
13635,57396,58225,9,Chevrolet,Pickups,2008.0,39,1,1,0,1,4,1,50,80,1
13636,57397,58226,9,Chevrolet,Pickups,2008.0,38,4,0,1,1,4,1,50,80,1
13637,57398,58227,9,Chevrolet,Pickups,2008.0,41,1,1,0,1,4,1,50,80,1


In [316]:
inj_str = ['No Apparent Injury', 'Possible Injury', 'Suspected Minor Injury', 
        'Suspected Serious Injury', 'Fatal Injury', 'Injured, Severity Unknown', 
        'Died Prior to Crash', '','','Unknown']
def change_inj(injury):
    for i in range(0,10):
        if injury == i:
            injury = inj_str[i]
    return injury

In [317]:
light = [1]
dark = [2,3,6]
part_dark = [4,5]
def change_lgt(LGT_COND):
    if LGT_COND in light:
        LGT_COND = 0
    if LGT_COND in part_dark:
        LGT_COND = 1
    if LGT_COND in dark:
        LGT_COND = 2
    return LGT_COND

In [318]:
dry = [1]
wet = [2,3,4]
def change_vsur(VSURCOND):
    if VSURCOND in dry:
        VSURCOND = 0
    if VSURCOND in wet:
        VSURCOND = 1
    return VSURCOND

In [319]:
def change_ro(ROLLOVER):
    if ROLLOVER == 9:
        ROLLOVER = 3
    return ROLLOVER

In [320]:
final = final[final.VSURCOND <= 4]
final = final[final.VSURCOND != 0]
final = final[(final.INJ_SEV <= 4)]
final = final[(final.LGT_COND <= 6)]

In [321]:
final['INJ_SEV'] = final.INJ_SEV.apply(change_inj)
final.LGT_COND = final.LGT_COND.apply(change_lgt)
final.ROLLOVER = final.ROLLOVER.apply(change_ro)
final.VSURCOND = final.VSURCOND.apply(change_vsur)
final.MOD_YEAR = 2019 - final.MOD_YEAR

In [322]:
final.LGT_COND.value_counts()

0    6885
2    5934
1     593
Name: LGT_COND, dtype: int64

In [334]:
final['DIF_SP'] = final.TRAV_SP - final.VSPD_LIM

In [336]:
#final = final.drop('Unnamed: 0', axis = 1)
final

Unnamed: 0,HOUR,MAKE,BODY_TYP,MOD_YEAR,AGE,INJ_SEV,REST_USE,EJECTION,VE_FORMS,VSURCOND,ROLLOVER,TRAV_SP,VSPD_LIM,LGT_COND,DIF_SP
0,22,Toyota,Pickups,8.0,23,No Apparent Injury,1,0,1,0,0,75,70,2,5
1,0,Chevrolet,Pickups,15.0,40,No Apparent Injury,1,0,2,0,0,70,70,2,0
2,12,Jeep,SUVs,3.0,38,Fatal Injury,1,0,3,1,0,65,70,0,-5
3,12,Jeep,SUVs,3.0,30,Suspected Serious Injury,1,0,3,1,0,65,70,0,-5
4,21,Hyundai,Sedans,3.0,24,Fatal Injury,1,0,2,0,0,30,55,2,-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13634,8,Ford,Pickups,42.0,18,Fatal Injury,1,1,2,1,0,35,70,0,-35
13635,9,Chevrolet,Pickups,11.0,39,Possible Injury,1,0,1,1,1,50,80,0,-30
13636,9,Chevrolet,Pickups,11.0,38,Fatal Injury,0,1,1,1,1,50,80,0,-30
13637,9,Chevrolet,Pickups,11.0,41,Possible Injury,1,0,1,1,1,50,80,0,-30


In [341]:
final.to_csv("df_final_speed_encoded.csv")

In [325]:
final.INJ_SEV.value_counts()

Fatal Injury                4513
No Apparent Injury          3982
Suspected Minor Injury      1885
Suspected Serious Injury    1647
Possible Injury             1385
Name: INJ_SEV, dtype: int64

In [332]:
y = final.INJ_SEV
x = final.drop(['INJ_SEV'],axis=1)
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2)

In [327]:
x.LGT_COND.value_counts()

0    6885
2    5934
1     593
Name: LGT_COND, dtype: int64

In [280]:
x

Unnamed: 0,HOUR,MAKE,BODY_TYP,MOD_YEAR,AGE,REST_USE,EJECTION,VE_FORMS,VSURCOND,ROLLOVER,TRAV_SP,VSPD_LIM,LGT_COND
0,22,Toyota,Pickups,8.0,23,1,0,1,0,0,75,70,2
1,0,Chevrolet,Pickups,15.0,40,1,0,2,0,0,70,70,2
2,12,Jeep,SUVs,3.0,38,1,0,3,1,0,65,70,0
3,12,Jeep,SUVs,3.0,30,1,0,3,1,0,65,70,0
4,21,Hyundai,Sedans,3.0,24,1,0,2,0,0,30,55,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,8,Ford,Pickups,42.0,18,1,1,2,1,0,35,70,0
13481,9,Chevrolet,Pickups,11.0,39,1,0,1,1,1,50,80,0
13482,9,Chevrolet,Pickups,11.0,38,0,1,1,1,1,50,80,0
13483,9,Chevrolet,Pickups,11.0,41,1,0,1,1,1,50,80,0


In [328]:
x.VSURCOND.value_counts()

0    11425
1     1987
Name: VSURCOND, dtype: int64

1. Daylight
2. Dark – Not Lighted
3. Dark – Lighted
4. Dawn
5. Dusk
6. Dark – Unknown Lighting
7. Other
8. Not Reported
9. Reported as Unknown

In [329]:
x.VSPD_LIM.value_counts()

55    3868
45    2588
65    1457
35    1219
70    1170
40     923
60     535
50     492
25     441
30     414
75     187
80      54
15      28
20      14
0       12
5        6
10       4
Name: VSPD_LIM, dtype: int64

In [273]:
x.describe()

Unnamed: 0,HOUR,MOD_YEAR,AGE,REST_USE,EJECTION,VE_FORMS,VSURCOND,ROLLOVER,TRAV_SP,VSPD_LIM,LGT_COND
count,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0
mean,13.297812,11.538821,39.823878,0.73541,0.11472,2.035743,0.147794,0.248869,45.497664,51.030033,0.929477
std,6.94596,6.998098,20.079475,0.441131,0.368752,1.232675,0.354909,0.565286,24.919114,12.408499,0.975065
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,8.0,5.0,23.0,0.0,0.0,1.0,0.0,0.0,30.0,45.0,0.0
50%,14.0,12.0,36.0,1.0,0.0,2.0,0.0,0.0,50.0,55.0,0.0
75%,19.0,16.0,55.0,1.0,0.0,2.0,0.0,0.0,64.0,60.0,2.0
max,99.0,64.0,98.0,1.0,3.0,14.0,1.0,3.0,130.0,80.0,2.0


In [330]:
train_y.unique()

array(['No Apparent Injury', 'Possible Injury', 'Fatal Injury',
       'Suspected Minor Injury', 'Suspected Serious Injury'], dtype=object)

In [25]:
#Normalizing Inputs to speed up SVM, and boost its performance
scaler = StandardScaler()
scaler.fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

## SVM

In [None]:
#For each INJ Severity, set 1 of the inj_sev labels into 1, and rest into 0
for i in train_y.unique():

    train_y1 = pd.get_dummies(train_y)[i]

    test_y1 = pd.get_dummies(test_y)[i]

    svm = SVC(gamma = 0.5, C = 0.5)
    svm.fit(train_x, train_y1)

    test_predictions = svm.predict(test_x)
    conf_mat = confusion_matrix(test_y1, test_predictions)
    print('Accuracy:' + i)
    print(accuracy_score(test_y1, test_predictions))
    print('Confusion Matrix:')
    print(conf_mat)

Accuracy:No Apparent Injury
0.7291877658497063
Confusion Matrix:
[[7040   59]
 [2615  160]]
Accuracy:Possible Injury
0.904294105732226
Confusion Matrix:
[[8928    1]
 [ 944    1]]
Accuracy:Suspected Minor Injury
0.8739112821551549
Confusion Matrix:
[[8627    0]
 [1245    2]]
Accuracy:Fatal Injury
0.634190804132064
Confusion Matrix:
[[5988  123]
 [3489  274]]


In [191]:
SVC?

#### Testing Radial SVM

In [1]:
i = 'Fatal Injury'
train_y1 = pd.get_dummies(train_y)[i]
test_y1 = pd.get_dummies(test_y)[i]

svm = SVC(gamma = 0.5, C = 0.5)
svm.fit(train_x, train_y1)
test_predictions = svm.predict(test_x)
conf_mat = confusion_matrix(test_y1, test_predictions)
print('Accuracy:' + i)
print(accuracy_score(test_y1, test_predictions))
print('Confusion Matrix:')
print(conf_mat)

NameError: name 'pd' is not defined

#### Testing Linear SVM

In [32]:
i = 'Fatal Injury'
train_y1 = pd.get_dummies(train_y)[i]
test_y1 = pd.get_dummies(test_y)[i]

svm = SVC(kernel = 'linear')
svm.fit(train_x, train_y1)
test_predictions = svm.predict(test_x)
conf_mat = confusion_matrix(test_y1, test_predictions)
print('Accuracy:' + i)
print(accuracy_score(test_y1, test_predictions))
print('Confusion Matrix:')
print(conf_mat)

Accuracy:Suspected Serious Injury
0.7174397407332388
Confusion Matrix:
[[5299  812]
 [1978 1785]]


## Decision Trees

In [None]:
dt = DecisionTreeClassifier(criterion = 'gini', splitter = 'best')
#print(cross_val_score(dt, train_x, train_y1, cv=10))
dt.fit(train_x, train_y1)
test_preds = dt.predict(test_x)
conf_mat = confusion_matrix(test_y1, test_preds)
print('Accuracy:' +i)
print(accuracy_score(test_y1, test_preds))
print('Confusion Matrix:')
print(conf_mat)

In [23]:
for i in train_y.unique():

    train_y1 = pd.get_dummies(train_y)[i]

    test_y1 = pd.get_dummies(test_y)[i]
    
    dt = DecisionTreeClassifier(criterion = 'gini', splitter = 'best')
    #print(cross_val_score(dt, train_x, train_y1, cv=10))
    dt.fit(train_x, train_y1)
    test_preds = dt.predict(test_x)
    conf_mat = confusion_matrix(test_y1, test_preds)
    print('Accuracy:' +i)
    print(accuracy_score(test_y1, test_preds))
    print('Confusion Matrix:')
    print(conf_mat)

Accuracy:No Apparent Injury
0.734656674093579
Confusion Matrix:
[[5805 1294]
 [1326 1449]]
Accuracy:Possible Injury
0.8422118695564108
Confusion Matrix:
[[8107  822]
 [ 736  209]]
Accuracy:Suspected Minor Injury
0.7898521369252582
Confusion Matrix:
[[7492 1135]
 [ 940  307]]
Accuracy:Fatal Injury
0.6747012355681588
Confusion Matrix:
[[4529 1582]
 [1630 2133]]
Accuracy:Suspected Serious Injury
0.791877658497063
Confusion Matrix:
[[7590 1140]
 [ 915  229]]


In [129]:
TEST = pd.read_csv("df_final_speed.csv")
TEST.TRAV_SP.value_counts()

998    27690
999     4813
55      3096
0       2905
45      2245
       ...  
135        1
122        1
139        1
145        1
137        1
Name: TRAV_SP, Length: 137, dtype: int64

In [121]:
RFE?

## Log Regression

In [333]:
model = LogisticRegression(solver = 'lbfgs', multi_class = 'ovr', max_iter = 1000)
# create the RFE model and select 8 attributes
rfe = RFE(model, 6)
rfe = rfe.fit(x, y)
# summarize the selection of the attributes
print('Selected features: %s' % list(x.columns[rfe.support_]))

ValueError: could not convert string to float: 'Toyota'

In [None]:
LogisticRegression

In [116]:
cols = ['BODY_TYP', 'MOD_YEAR', 'REST_USE', 'EJECTION', 'VE_FORMS', 'VSURCOND', 'ROLLOVER', 'LGT_COND']
y = final.INJ_SEV
x = final[cols]
x.MOD_YEAR = 2019 - x.MOD_YEAR
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state = 42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [104]:
test_x.shape

(9874, 8)

In [8]:
LogisticRegression?

In [114]:
#LR does one vs rest classification on its own...
model = LogisticRegression(solver = 'lbfgs', multi_class = 'auto', max_iter = 1000)
model.fit(train_x, train_y)
test_preds = model.predict(test_x)
conf_mat = confusion_matrix(test_y, test_preds)
print('Accuracy:')
print(accuracy_score(test_y, test_preds))
print('Confusion Matrix:')
print(conf_mat)
print(classification_report(test_y,test_preds))

Accuracy:
0.5060765647154142
Confusion Matrix:
[[2683 1059    2   19    0]
 [ 473 2285   16    1    0]
 [ 275  653    4   12    1]
 [ 501  718    3   25    0]
 [ 578  543    2   21    0]]
                          precision    recall  f1-score   support

            Fatal Injury       0.59      0.71      0.65      3763
      No Apparent Injury       0.43      0.82      0.57      2775
         Possible Injury       0.15      0.00      0.01       945
  Suspected Minor Injury       0.32      0.02      0.04      1247
Suspected Serious Injury       0.00      0.00      0.00      1144

                accuracy                           0.51      9874
               macro avg       0.30      0.31      0.25      9874
            weighted avg       0.40      0.51      0.41      9874



In [115]:
for i in train_y.unique():

    train_y1 = pd.get_dummies(train_y)[i]

    test_y1 = pd.get_dummies(test_y)[i]
    
    model = LogisticRegression(solver='lbfgs',max_iter = 1000)
    
    model.fit(train_x, train_y1)
    test_preds = model.predict(test_x)
    conf_mat = confusion_matrix(test_y1, test_preds)
    print('Accuracy:' + i)
    print(accuracy_score(test_y1, test_preds))
    print('Confusion Matrix:')
    print(conf_mat)

Accuracy:No Apparent Injury
0.7304030787927891
Confusion Matrix:
[[6761  338]
 [2324  451]]
Accuracy:Possible Injury
0.9039902774964553
Confusion Matrix:
[[8925    4]
 [ 944    1]]
Accuracy:Suspected Minor Injury
0.8734049017622038
Confusion Matrix:
[[8624    3]
 [1247    0]]
Accuracy:Fatal Injury
0.7271622442779015
Confusion Matrix:
[[5447  664]
 [2030 1733]]
Accuracy:Suspected Serious Injury
0.8841401660927689
Confusion Matrix:
[[8730    0]
 [1144    0]]


In [60]:
LogisticRegression?