In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pylab as plt
from imblearn.over_sampling import SMOTE
#%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold

from scipy.stats import norm
from scipy import stats
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score

#DS
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#KNN
from sklearn.neighbors import KNeighborsClassifier

#GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier

#LogisticRegressionCV
from sklearn.linear_model import LogisticRegressionCV

#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn



## 1. Loading the dataset

In [2]:
#loading the data from CSV file 
data=pd.read_csv('final_Binary.csv')
data.head()

'''dataFeatures= ['Side Chest Airbag-Driver', 'Side Chest Airbag-Passenger',
       'AEB Vulnerable Road Users', 'Side Head Airbag-Driver',
       'Side Head Airbag-Passenger', 'Seatbelt Reminder-Passenger',
       'AEB Car-to-Car', 'Belt Loadlimiter-Rear', 'Belt Pretensioner-Rear',
       'Side Head Airbag-Rear', 'Lane Assist System', 'Seatbelt Reminder-Rear',
       'Safety Assist', 'Speed Assistance', 'Adult Occupant',
       'Centre Airbag-Driver', 'Child Occupant', 'Tested Model',
       'Isofix/i-Size-Passenger'] '''

dataFeatures= ['Adult Occupant', 'Belt Pretensioner-Rear', 'Safety Assist',
       'Child Occupant', 'Class', 'Side Pelvis Airbag-Rear',
       'AEB Vulnerable Road Users']


In [5]:
# print the columns in the dataset
data.columns

Index(['Adult Occupant', 'Belt Pretensioner-Rear', 'Safety Assist',
       'Child Occupant', 'Class', 'Side Pelvis Airbag-Rear',
       'AEB Vulnerable Road Users', 'Rate'],
      dtype='object')

In [3]:
'''# Create a MinMaxScaler object for numrical data
scaler = MinMaxScaler()

# Scaling the raw input features 
feature_cols=data.columns[:-1]
X= scaler.fit_transform(data[feature_cols])

print(f"The range of feature inputs are within {X.min()} to {X.max()}")'''

The range of feature inputs are within 0.0 to 1.0


## 2. Split the dataset 

In [3]:

from sklearn.model_selection import StratifiedShuffleSplit

feature_cols=data.columns[:-1]
# Get the split indexes
strat_shuf_split = StratifiedShuffleSplit(n_splits=1, 
                                          test_size=0.3, random_state=0)

train_idx, test_idx = next(strat_shuf_split.split(data[feature_cols], data['Rate']))

# Create the dataframes


X_train = data.loc[train_idx, dataFeatures]
y_train = data.loc[train_idx, 'Rate']

X_test  = data.loc[test_idx, dataFeatures]
y_test  = data.loc[test_idx, 'Rate']

print(f"Training dataset shape, X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Testing dataset shape, X_test: {X_test.shape}, y_test: {y_test.shape}")

Training dataset shape, X_train: (217, 7), y_train: (217,)
Testing dataset shape, X_test: (94, 7), y_test: (94,)


## 3. Smoot 

In [4]:
X= data.loc[:,dataFeatures]
y= data.loc[:,["Rate"]]

def splitSmote (model):
    # Initialize the Stratified K-fold Cross-validator with 5 splits
    sk=StratifiedKFold(n_splits=5)

    # Initialize the array to store the accuracy scores
    accuracy_scores = []

    # Perform cross-validation
    for train_index, test_index in sk.split(X, y):
        # Split the data into training and test sets
        x_train_fold, x_test_fold = X.loc[train_index,:], X.loc[test_index,:]
        y_train_fold, y_test_fold = y.loc[train_index,:], y.loc[test_index,:]

        #smote = SMOTE(sampling_strategy='minority')
        smote = SMOTE(sampling_strategy=0.5)
        x_sm, y_sm = smote.fit_resample(x_train_fold, y_train_fold)
        #Fit the model to the training data
        model.fit(x_sm, y_sm)
        # Make predictions on the test data
        y_pred = model.predict(x_test_fold)
        # Calculate the accuracy score and append it to the list
        accuracy_scores.append(accuracy_score(y_test_fold, y_pred))



    # Print the accuracy scores for each fold
    print("Accuracy scores for each fold: ", accuracy_scores)

    # Calculate the mean accuracy scores
    print("Mean accuracy score: ", np.mean(accuracy_scores))

## 4. Data normalization 

In [5]:
y_train.value_counts(normalize=True)

5    0.718894
4    0.161290
3    0.092166
1    0.009217
0    0.009217
2    0.009217
Name: Rate, dtype: float64

In [6]:
y_test.value_counts(normalize=True)

5    0.723404
4    0.159574
3    0.085106
2    0.010638
1    0.010638
0    0.010638
Name: Rate, dtype: float64

## 4. Models 

### 4.1 Decision Tree 

before optimization

In [5]:
# DecisionTreeClassifier
#importing the classfier
metrics=[]

clf=DecisionTreeClassifier(random_state=0)
#clf2=clf.fit(X_train,y_train)


#y_pred.append(pd.Series(clf2.predict(X_test), name='DecisionTreeClassifier'))
# Preciision, recall, f-score from the multi-class support function

# precision, recall, fscore, _ = score(y_test, clf2.predict(X_test), average='weighted')
# accuracy = accuracy_score(y_test, clf2.predict(X_test))
# metrics.append(pd.Series({'precision':precision, 'recall':recall,'fscore':fscore, 'accuracy':accuracy}))


splitSmote (clf)

Accuracy scores for each fold:  [0.9523809523809523, 0.967741935483871, 0.7903225806451613, 0.9838709677419355, 0.7258064516129032]
Mean accuracy score:  0.8840245775729647


After optimization

In [8]:
# DecisionTree opt


#optimization
param_grid={
"max_depth":[2,4,6],
"min_samples_split":[2,5,10],
"min_samples_leaf":[1,2,4]}
grid_search= GridSearchCV(estimator=clf,param_grid=param_grid,cv=5)

# grid_search.fit(X_train,y_train)
# print("Best hyper-param: ",grid_search.best_params_ )
# print("Best estimator: ",grid_search.best_estimator_ )
# print("Best score: ",grid_search.best_score_ )


# #precision, recall, fscore, _ = score(y_test, grid_search.predict(X_test), average='weighted')
# accuracy = accuracy_score(y_test, grid_search.predict(X_test))
# metrics.append(pd.Series({'precision':precision, 'recall':recall,'fscore':fscore, 'accuracy':accuracy}))


splitSmote (kCls)

Accuracy scores for each fold:  [0.9365079365079365, 0.8870967741935484, 0.7903225806451613, 0.967741935483871, 0.7741935483870968]
Mean accuracy score:  0.8711725550435228


In [14]:
#DT aftar opt

print("grid_search")

splitSmote(grid_search)

grid_search
Accuracy scores for each fold:  [0.9841269841269841, 0.9838709677419355, 0.7741935483870968, 0.9516129032258065, 0.6935483870967742]
Mean accuracy score:  0.8774705581157194


### 4.2 KNN

Before optimization

In [7]:
# KNN
#importing the classfier

kCls=KNeighborsClassifier()
splitSmote (kCls)
#kCls.fit(X_train,y_train)

#y_pred.append(pd.Series(kCls.predict(X_test), name='KNeighborsClassifier'))

#precision, recall, fscore, _ = score(y_test, kCls.predict(X_test), average='weighted')
#accuracy = accuracy_score(y_test, kCls.predict(X_test))
#metrics.append(pd.Series({'precision':precision, 'recall':recall,'fscore':fscore, 'accuracy':accuracy}))

#kCls=KNeighborsClassifier(n_neighbors=9)


Accuracy scores for each fold:  [0.9206349206349206, 0.9193548387096774, 0.7903225806451613, 0.9838709677419355, 0.7580645161290323]
Mean accuracy score:  0.8744495647721454


After optimization

In [None]:
#TODO: Write the code here

### 4.3 GradientBoosting

Before optimization

In [18]:
# GradientBoostingClassifier
#importing the classfier

gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.1, max_features=4, max_depth=2, random_state=0)
#gb_clf2.fit(X_train, y_train)

#y_pred.append(pd.Series(gb_clf2.predict(X_test), name='GradientBoostingClassifier'))

#precision, recall, fscore, _ = score(y_test, gb_clf2.predict(X_test), average='weighted')
#accuracy = accuracy_score(y_test, gb_clf2.predict(X_test))
#metrics.append(pd.Series({'precision':precision, 'recall':recall,'fscore':fscore, 'accuracy':accuracy}))

splitSmote (gb_clf2)



Accuracy scores for each fold:  [0.9523809523809523, 0.967741935483871, 0.7903225806451613, 0.9516129032258065, 0.7419354838709677]
Mean accuracy score:  0.8807987711213517


After optimization

In [None]:
#TODO: Write the code here

### 4.4 LogisticRegression

Before optimization

In [20]:
#LogisticRegression

#Grid search cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2","elasticnet"], 'solver'  : ['newton-cg', 'lbfgs', 'liblinear']}
logreg=LogisticRegression()
lr=GridSearchCV(logreg,grid,cv=10)
# lr.fit(X_train,y_train)

# a=lr.best_params_
# b=lr.best_score_
# print("tuned hpyerparameters :(best parameters) ",lr.best_params_)
# print("accuracy :",lr.best_score_)
# print("Best estimator: ",lr.best_estimator_ )

#y_pred.append(pd.Series(lr.predict(X_test), name='LogisticRegression'))

# precision, recall, fscore, _ = score(y_test, lr.predict(X_test), average='weighted')
# accuracy = accuracy_score(y_test, lr.predict(X_test))
# metrics.append(pd.Series({'precision':precision, 'recall':recall,'fscore':fscore, 'accuracy':accuracy}))

#LogisticRegression
#from sklearn.linear_model import LogisticRegressionCV
#lr= LogisticRegressionCV(Cs=a['C'], penalty=a['penalty'], solver=a['solver']).fit(X_train, y_train)

splitSmote (lr)

Accuracy scores for each fold:  [0.9047619047619048, 0.9516129032258065, 0.7903225806451613, 0.9354838709677419, 0.6451612903225806]
Mean accuracy score:  0.8454685099846391


After optimization

In [None]:
#TODO: write LogisticRegression optimization code here

### 4.5 Random Forest

Before optimization

In [21]:
#RandomForestClassifier
RF= RandomForestClassifier(criterion="gini",
                           max_depth=8,
                           min_samples_split=10,
                           random_state= 0)
# RF.fit(X_train,y_train)

# #y_pred.append(pd.Series(RF.predict(X_test), name='RandomForestClassifier'))

# precision, recall, fscore, _ = score(y_test, RF.predict(X_test), average='weighted')
# accuracy = accuracy_score(y_test, RF.predict(X_test))
# metrics.append(pd.Series({'precision':precision, 'recall':recall,'fscore':fscore, 'accuracy':accuracy}))


splitSmote (RF)

  model.fit(x_sm, y_sm)
  model.fit(x_sm, y_sm)
  model.fit(x_sm, y_sm)
  model.fit(x_sm, y_sm)


Accuracy scores for each fold:  [0.9365079365079365, 1.0, 0.7903225806451613, 0.9838709677419355, 0.7419354838709677]
Mean accuracy score:  0.8905273937532001


  model.fit(x_sm, y_sm)


After optimization

In [None]:
#TODO: Write RandomForest optimization code here

## 5. Training

### 5.1 Before Optimazation 

In [22]:
# Base models 
kCls=KNeighborsClassifier()
print("KNN")
splitSmote (kCls)

gb_clf2 = GradientBoostingClassifier()
print("GradientBoosting")
splitSmote (gb_clf2)

logreg=LogisticRegression()
print("LogisticRegression")
splitSmote (logreg)

RF= RandomForestClassifier()
print("RandomForest")
splitSmote (RF)

clf=DecisionTreeClassifier()
print("DecisionTree")
splitSmote (clf)






KNN
Accuracy scores for each fold:  [0.9365079365079365, 0.9193548387096774, 0.7903225806451613, 0.9516129032258065, 0.7741935483870968]
Mean accuracy score:  0.8743983614951356
GradientBoosting
Accuracy scores for each fold:  [0.9682539682539683, 1.0, 0.7903225806451613, 0.9838709677419355, 0.7419354838709677]
Mean accuracy score:  0.8968766001024064
LogisticRegression
Accuracy scores for each fold:  [0.9047619047619048, 1.0, 0.8225806451612904, 0.967741935483871, 0.6612903225806451]
Mean accuracy score:  0.8712749615975423
RandomForest


  model.fit(x_sm, y_sm)
  model.fit(x_sm, y_sm)
  model.fit(x_sm, y_sm)
  model.fit(x_sm, y_sm)


Accuracy scores for each fold:  [0.9523809523809523, 1.0, 0.7903225806451613, 0.967741935483871, 0.7741935483870968]
Mean accuracy score:  0.8969278033794164
DecisionTree
Accuracy scores for each fold:  [0.9841269841269841, 0.967741935483871, 0.7903225806451613, 0.9516129032258065, 0.7096774193548387]
Mean accuracy score:  0.8806963645673322


  model.fit(x_sm, y_sm)


### After Optimization

In [None]:
#TODO: write the code here

## 5. Evaluation

### 5.1 Before Optimization

In [13]:

def report( y_test, pred ):
    #report 
    print(classification_report(y_test,pred,target_names=['0','1','2','3','4','5']))


def confusionMatrix():
    #confusion_matrix
    #the result will show how mwny sucessful predition and wrong from each class

    cm = confusion_matrix(y_test, pred)
    plt.figure(figsize=(10,7))

    sns.heatmap(cm, annot=True, fmt='.2g', cmap='Blues')

    # TN   FP
    # FN   TP

In [15]:
metrics = pd.concat(metrics, axis=1,names=['DecisionTree','DecisionTreeOpt','KNN','GradientBoosting','LogisticRegression','RandomForest'])


TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"

In [16]:
metrics.columns=['DecisionTree','DecisionTreeOpt','KNN','GradientBoosting','LogisticRegression','RandomForest']
metrics

Unnamed: 0,DecisionTree,DecisionTreeOpt,KNN,GradientBoosting,LogisticRegression,RandomForest
precision,1.0,1.0,0.73349,0.963374,0.961147,0.944874
recall,1.0,1.0,0.776596,0.968085,0.968085,0.968085
fscore,1.0,1.0,0.74923,0.963933,0.962621,0.954647
accuracy,1.0,1.0,0.776596,0.968085,0.968085,0.968085


### 5.2 After Optimization

In [None]:
#TODO: write the code here