In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Read the raw dataset from the Excel workbook and preview its first rows.
DATA_FILE = 'DataSetTA.xlsx'
df = pd.read_excel(DATA_FILE)
df.head()

Unnamed: 0,id,weight,dead,airbag,seatbelt,frontal,sex,ageOFocc,yearacc,yearVeh-year,abcat,occRole,deploy,injSeverity,AgeCar
0,1,25.069,alive,none,belted,1,f,26,1997,1990.0,unavail,driver,0,3.0,7.0
1,2,25.069,alive,airbag,belted,1,f,72,1997,1995.0,deploy,driver,1,1.0,2.0
2,3,32.379,alive,none,none,1,f,69,1997,1988.0,unavail,driver,0,4.0,9.0
3,4,495.444,alive,airbag,belted,1,f,53,1997,1995.0,deploy,driver,1,1.0,2.0
4,5,25.069,alive,none,belted,1,f,32,1997,1988.0,unavail,driver,0,3.0,9.0


In [3]:
# Remove every row that contains at least one missing value, then renumber
# the index from 0 so it is contiguous again.
df = df.dropna(axis=0).reset_index(drop=True)

In [4]:
# Drop the id column since it is not going to be used for the classification.
df = df.drop(columns=['id'])

In [5]:
# Count how many rows fall into each injury-severity class.
df['injSeverity'].value_counts()

3.0    5873
0.0    4364
1.0    3903
2.0    2895
4.0     788
5.0      88
6.0       2
Name: injSeverity, dtype: int64

In [6]:
# Drop severity classes 5 and 6: with only 88 and 2 samples respectively
# there is too little data to learn to predict them.
rare_classes = [5.0, 6.0]
df.drop(df.index[df['injSeverity'].isin(rare_classes)], inplace=True)

In [7]:
# Re-check the class counts now that the rare severity classes have been removed.
df['injSeverity'].value_counts()

3.0    5873
0.0    4364
1.0    3903
2.0    2895
4.0     788
Name: injSeverity, dtype: int64

In [8]:
# A weight of 0 is treated here as a missing value rather than a real
# observation, so every row whose weight equals 0 is removed.
zero_weight_rows = df.index[df['weight'] == 0]
df.drop(zero_weight_rows, inplace=True)

In [9]:
# Encode the categorical columns (and the target) as integer codes using
# scikit-learn's LabelEncoder.

from sklearn.preprocessing import LabelEncoder

headers = ['dead','airbag','seatbelt','sex','abcat','occRole','deploy','injSeverity']

# One fitted encoder is kept per column so the integer codes can be translated
# back to the original labels later, e.g.
#     encoders['sex'].inverse_transform(df['sex'])
# Previously a single encoder was refit on every column, so all mappings except
# the last one were lost.
encoders = {}

for col in headers:
    # Fresh encoder for this column. After the loop `le` still refers to the
    # encoder fitted on the last column of `headers`, as before.
    le = LabelEncoder()

    # pop() removes the original column; assigning afterwards appends the
    # encoded version at the end of the frame. The encoded columns therefore end
    # up on the right-hand side of the frame, in `headers` order.
    df[col] = le.fit_transform(df.pop(col))
    encoders[col] = le

df.head()

Unnamed: 0,weight,frontal,ageOFocc,yearacc,yearVeh-year,AgeCar,dead,airbag,seatbelt,sex,abcat,occRole,deploy,injSeverity
0,25.069,1,26,1997,1990.0,7.0,0,1,0,0,2,0,0,3
1,25.069,1,72,1997,1995.0,2.0,0,0,0,0,0,0,1,1
2,32.379,1,69,1997,1988.0,9.0,0,1,1,0,2,0,0,4
3,495.444,1,53,1997,1995.0,2.0,0,0,0,0,0,0,1,1
4,25.069,1,32,1997,1988.0,9.0,0,1,0,0,2,0,0,3


In [10]:
import copy

# Take two independent deep copies of the cleaned dataset so that later
# experiments can modify their own copy without touching `df`.
df_clean = df.copy(deep=True)
df_clean_dead = df.copy(deep=True)

### Train Test Split

In [11]:
# Split the frame into a feature matrix (every column except the target)
# and the target vector, both as NumPy arrays.
target_col = 'injSeverity'
X = df.drop(columns=[target_col]).to_numpy()
y = df[target_col].to_numpy()

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=50,
)

In [13]:
# Dimensions of the training feature matrix: (rows, feature columns).
X_train.shape


(10574, 13)

In [14]:
# Dimensions of the held-out test feature matrix: (rows, feature columns).
X_test.shape

(7050, 13)

# Merge into two classes

In [15]:
# As mentioned, we merge classes 0 and 1 into a single class called "possible injury",
# and classes 2, 3 and 4 into a single class called "injury".


In [16]:
# Merge the five encoded severity levels into a binary target:
#   0, 1    -> 0  ("possible injury")
#   2, 3, 4 -> 1  ("injury")
# The mapping is applied in a single step and reads from the untouched 5-class
# labels in `df`, so re-running this cell always gives the same result.
# Chained replace() calls on df_clean itself would map the merged class 1 back
# to 0 on a second run and leave every label equal to 0.
binary_severity = {0: 0, 1: 0, 2: 1, 3: 1, 4: 1}
df_clean['injSeverity'] = df['injSeverity'].map(binary_severity)

In [17]:
# Class counts of the merged (binary) target.
df_clean['injSeverity'].value_counts()

1    9480
0    8144
Name: injSeverity, dtype: int64

In [18]:
# We can clearly see that class imbalance is no longer a problem:
# the first class has 9480 samples and the second class has 8144, which is well balanced for training.

In [19]:
# Rebuild the feature matrix and target vector from the frame that holds the
# merged binary labels.
target_col = 'injSeverity'
X = df_clean.drop(columns=[target_col]).to_numpy()
y = df_clean[target_col].to_numpy()

In [20]:
from sklearn.model_selection import train_test_split

# Same 60/40 split and seed as for the 5-class data, now applied to the
# binary-target arrays.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=50,
)

In [21]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

# Support-vector classifier with default kernel settings, fitted on the
# training split. fit() returns the estimator itself.
# NOTE(review): decision_function_shape is ignored by scikit-learn for binary
# problems, so 'ovo' has no effect here. The features are also passed in
# unscaled — consider standardising them before fitting an SVC; TODO confirm.
clf = SVC(decision_function_shape='ovo').fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test)

for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

[[1506 1771]
 [ 526 3247]]
              precision    recall  f1-score   support

           0       0.74      0.46      0.57      3277
           1       0.65      0.86      0.74      3773

    accuracy                           0.67      7050
   macro avg       0.69      0.66      0.65      7050
weighted avg       0.69      0.67      0.66      7050



In [22]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Shallow entropy-based tree: depth capped at 5 and at least 100 samples per
# leaf. The seed is fixed so the fitted tree is reproducible.
tree_params = dict(criterion='entropy', max_depth=5, min_samples_leaf=100, random_state=42)
decision_tree = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = decision_tree.predict(X_test)

for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

[[1875 1402]
 [ 723 3050]]
              precision    recall  f1-score   support

           0       0.72      0.57      0.64      3277
           1       0.69      0.81      0.74      3773

    accuracy                           0.70      7050
   macro avg       0.70      0.69      0.69      7050
weighted avg       0.70      0.70      0.69      7050



In [23]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the binary target. Despite its name, `dt_reg` is a
# classifier; the name is kept because the evaluation cell refers to it.
# random_state is fixed so the fitted forest, and the metrics computed from it,
# are reproducible from one run to the next.
dt_reg = RandomForestClassifier(
    max_depth=20,
    max_features='sqrt',
    min_samples_split=100,
    n_estimators=400,
    random_state=42,
)
dt_reg.fit(X_train, y_train)

# Save the model to disk. The context manager guarantees the file handle is
# flushed and closed, even if pickling raises.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(dt_reg, model_file)



In [24]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the fitted random forest on the held-out split.
y_pred = dt_reg.predict(X_test)

for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

[[2013 1264]
 [ 790 2983]]
              precision    recall  f1-score   support

           0       0.72      0.61      0.66      3277
           1       0.70      0.79      0.74      3773

    accuracy                           0.71      7050
   macro avg       0.71      0.70      0.70      7050
weighted avg       0.71      0.71      0.71      7050



In [25]:
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier

# Gradient-boosted trees with the library's default hyper-parameters.
Xgb = XGBClassifier()
Xgb.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = Xgb.predict(X_test)

for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

[[2061 1216]
 [ 933 2840]]
              precision    recall  f1-score   support

           0       0.69      0.63      0.66      3277
           1       0.70      0.75      0.73      3773

    accuracy                           0.70      7050
   macro avg       0.69      0.69      0.69      7050
weighted avg       0.69      0.70      0.69      7050

