## _Exploratory Analysis_

### _Import Libraries_

In [1]:
import os
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
# !pip install seaborn
import seaborn as sns

#!pip install imblearn
#if the above command does not work to install imblearn package run the following command in your terminal
# conda install -c glemaitre imbalanced-learn
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, recall_score, precision_score

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# Define custom function to print accuracy, precision and recall

def convert_for_sklearn(label_list):
    return [1 if i == 'yes' else 0 for i in label_list]


def accuracy_precision_recall_metrics(y_true, y_pred):
    
    y_test_scoring = convert_for_sklearn(y_true)
    test_pred_scoring = convert_for_sklearn(y_pred)

    acc = accuracy_score(y_true= y_test_scoring, y_pred = test_pred_scoring)
    prec = precision_score(y_true= y_test_scoring, y_pred = test_pred_scoring)
    rec = recall_score(y_true= y_test_scoring, y_pred = test_pred_scoring)
    
    print("Test Precision: ",prec)
    print("Test Recall: ",rec)
    print("Test Accuracy: ",acc)

### _Read in the data_

In [3]:
train_data = pd.read_csv("/Users/nadeemm/Downloads/Kaggle Data sets/HackerEarth/3c055e822d5b11ea/train.csv", sep=',', header=0, na_values='unknown')
test_data =  pd.read_csv("/Users/nadeemm/Downloads/Kaggle Data sets/HackerEarth/3c055e822d5b11ea/test.csv", sep=',', header=0, na_values='unknown')

print(train_data.shape)
print(test_data.shape)

train_data.head()

(10000, 12)
(2500, 11)


Unnamed: 0,Severity,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric,Accident_ID
0,Minor_Damage_And_Injuries,49.223744,14,22,71.285324,0.272118,78.04,2,31335.476824,3,0.424352,7570
1,Minor_Damage_And_Injuries,62.465753,10,27,72.288058,0.423939,84.54,2,26024.711057,2,0.35235,12128
2,Significant_Damage_And_Fatalities,63.059361,13,16,66.362808,0.322604,78.86,7,39269.053927,3,0.003364,2181
3,Significant_Damage_And_Serious_Injuries,48.082192,11,9,74.703737,0.337029,81.79,3,42771.4992,1,0.211728,5946
4,Significant_Damage_And_Fatalities,26.484018,13,25,47.948952,0.54114,77.16,3,35509.228515,2,0.176883,9054


In [4]:
print(list(train_data.columns))

['Severity', 'Safety_Score', 'Days_Since_Inspection', 'Total_Safety_Complaints', 'Control_Metric', 'Turbulence_In_gforces', 'Cabin_Temperature', 'Accident_Type_Code', 'Max_Elevation', 'Violations', 'Adverse_Weather_Metric', 'Accident_ID']


In [5]:
# What are the data types?
train_data.dtypes

Severity                    object
Safety_Score               float64
Days_Since_Inspection        int64
Total_Safety_Complaints      int64
Control_Metric             float64
Turbulence_In_gforces      float64
Cabin_Temperature          float64
Accident_Type_Code           int64
Max_Elevation              float64
Violations                   int64
Adverse_Weather_Metric     float64
Accident_ID                  int64
dtype: object

In [6]:
train_data.describe()

Unnamed: 0,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric,Accident_ID
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,41.876406,12.9311,6.5643,65.145324,0.381495,79.969331,3.8149,32001.803282,2.0122,0.255635,6266.5542
std,16.138072,3.539803,6.971982,11.882934,0.121301,2.759739,1.902577,9431.995196,1.03998,0.381128,3610.170288
min,0.0,1.0,0.0,0.0,0.134,74.74,1.0,831.695553,0.0,0.000316,2.0
25%,30.593607,11.0,2.0,56.927985,0.293665,77.96,2.0,25757.636908,1.0,0.012063,3139.75
50%,41.278539,13.0,4.0,65.587967,0.365879,79.54,4.0,32060.336419,2.0,0.074467,6280.5
75%,52.511416,15.0,9.0,73.336372,0.451346,81.56,5.0,38380.641513,3.0,0.354059,9391.5
max,100.0,23.0,54.0,100.0,0.882648,97.51,7.0,64297.651218,5.0,2.365378,12500.0


In [7]:
train_data.describe(include=['object'])

Unnamed: 0,Severity
count,10000
unique,4
top,Highly_Fatal_And_Damaging
freq,3049


In [8]:
train_data.Severity.value_counts()

Highly_Fatal_And_Damaging                  3049
Significant_Damage_And_Serious_Injuries    2729
Minor_Damage_And_Injuries                  2527
Significant_Damage_And_Fatalities          1695
Name: Severity, dtype: int64

In [9]:
train_data.groupby('Severity').mean()

Unnamed: 0_level_0,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric,Accident_ID
Severity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Highly_Fatal_And_Damaging,33.353611,12.764513,6.33716,66.541401,0.367432,80.023526,3.467366,31940.145431,1.963267,0.302307,6307.96917
Minor_Damage_And_Injuries,47.800305,13.170162,6.827859,64.350232,0.386255,79.903961,3.26949,32225.568306,1.981797,0.311014,6292.467353
Significant_Damage_And_Fatalities,47.102113,13.071386,6.486726,60.059293,0.412759,80.041516,4.500885,31300.789137,2.175811,0.247845,6181.897345
Significant_Damage_And_Serious_Injuries,42.667443,12.808721,6.622206,67.48075,0.373382,79.924478,4.282155,32298.893626,1.993404,0.157048,6248.868816


In [10]:
train_data.groupby('Severity').mean()

Unnamed: 0_level_0,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric,Accident_ID
Severity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Highly_Fatal_And_Damaging,33.353611,12.764513,6.33716,66.541401,0.367432,80.023526,3.467366,31940.145431,1.963267,0.302307,6307.96917
Minor_Damage_And_Injuries,47.800305,13.170162,6.827859,64.350232,0.386255,79.903961,3.26949,32225.568306,1.981797,0.311014,6292.467353
Significant_Damage_And_Fatalities,47.102113,13.071386,6.486726,60.059293,0.412759,80.041516,4.500885,31300.789137,2.175811,0.247845,6181.897345
Significant_Damage_And_Serious_Injuries,42.667443,12.808721,6.622206,67.48075,0.373382,79.924478,4.282155,32298.893626,1.993404,0.157048,6248.868816


In [11]:
train_data.Severity.value_counts()

Highly_Fatal_And_Damaging                  3049
Significant_Damage_And_Serious_Injuries    2729
Minor_Damage_And_Injuries                  2527
Significant_Damage_And_Fatalities          1695
Name: Severity, dtype: int64

#### _Type Casting_

In [12]:
for col in ['Severity']:
    train_data[col] = train_data[col].astype('category')

In [13]:
train_data.dtypes

Severity                   category
Safety_Score                float64
Days_Since_Inspection         int64
Total_Safety_Complaints       int64
Control_Metric              float64
Turbulence_In_gforces       float64
Cabin_Temperature           float64
Accident_Type_Code            int64
Max_Elevation               float64
Violations                    int64
Adverse_Weather_Metric      float64
Accident_ID                   int64
dtype: object

In [14]:
cat_attr = list(train_data.select_dtypes("category").columns)
num_attr = list(train_data.columns.difference(cat_attr))
cat_attr.pop()

'Severity'

In [15]:
train_data.isnull().sum()

Severity                   0
Safety_Score               0
Days_Since_Inspection      0
Total_Safety_Complaints    0
Control_Metric             0
Turbulence_In_gforces      0
Cabin_Temperature          0
Accident_Type_Code         0
Max_Elevation              0
Violations                 0
Adverse_Weather_Metric     0
Accident_ID                0
dtype: int64

In [16]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_attr),
        ('cat', categorical_transformer, cat_attr)])

In [17]:
clf_logreg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

### _Train-Test Split_

In [19]:
y = train_data['Severity']
x = train_data.drop(['Severity'],axis = 1)

X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 1234)

In [22]:
X_train.head()

Unnamed: 0,Safety_Score,Days_Since_Inspection,Total_Safety_Complaints,Control_Metric,Turbulence_In_gforces,Cabin_Temperature,Accident_Type_Code,Max_Elevation,Violations,Adverse_Weather_Metric,Accident_ID
7408,50.502283,10,0,74.567001,0.242186,82.58,3,27627.739934,2,0.13755,3909
358,41.643836,13,2,80.993619,0.353618,82.19,2,39903.496728,5,0.541605,7095
9390,40.182648,16,7,59.890611,0.287264,79.45,2,43459.931755,1,0.585156,74
1745,36.164384,11,13,65.405652,0.353257,79.73,4,34120.059024,1,0.062135,6537
3470,55.068493,9,5,62.534184,0.34316,82.32,7,42528.134178,2,0.004017,6009


### _Build Logistic Regression Model - 1_

In [20]:
clf_logreg.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

### _Evaluate Model_

In [21]:
train_pred = clf_logreg.predict(X_train)
test_pred = clf_logreg.predict(X_test)

print("Train_pred",clf_logreg.score(X_train, y_train))
print("Test_pred",clf_logreg.score(X_test, y_test))

score = 100*(f1_score(y_true = y_train, y_pred = train_pred, average = 'weighted'))
print("F1_score :",score)

Train_pred 0.6518571428571428
Test_pred 0.6436666666666667
F1_score : 64.16496784740583


### _Build Decision Tree Model - 2_

In [22]:
%%time
clf_dt = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier())])

dt_param_grid = {'classifier__criterion': ['entropy', 'gini'], 'classifier__max_depth': [6,8,10,12], 
                 "classifier__min_samples_split": [2, 10, 20],"classifier__min_samples_leaf": [1, 5, 10]}

dt_grid = GridSearchCV(clf_dt, param_grid=dt_param_grid, cv=5)

dt_grid.fit(X_train,y_train)

CPU times: user 21.3 s, sys: 343 ms, total: 21.6 s
Wall time: 21.8 s


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                    

In [23]:
dt_grid.best_params_

{'classifier__criterion': 'entropy',
 'classifier__max_depth': 12,
 'classifier__min_samples_leaf': 1,
 'classifier__min_samples_split': 10}

In [24]:
train_pred = dt_grid.predict(X_train)
test_pred = dt_grid.predict(X_test)

print(dt_grid.score(X_train, y_train))
print(dt_grid.score(X_test, y_test))

0.9744285714285714
0.9346666666666666


In [25]:
score_1 = 100*(f1_score(y_true = y_train, y_pred = train_pred, average = 'weighted'))
print("F1_score :",score_1)

F1_score : 97.44352335210937


### _Build Random Forest Model - 3_ (Using Stratified KFold)

__Stratified K-Folds cross-validator__

This cross-validation object is a **variation** of KFold that returns stratified folds. The folds are made by **preserving the percentage of samples for each class**.

In [26]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

In [27]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=143)

param_grid = {"classifier__n_estimators" : [150, 250, 300],
              "classifier__max_depth" : [5,8,10],
              "classifier__max_features" : [3, 5, 7],
              "classifier__min_samples_leaf" : [4, 6, 8, 10]}

rf_grid = GridSearchCV(clf, param_grid= param_grid, cv=kfold)

In [29]:
%%time
rf_grid.fit(X_train,y_train)

CPU times: user 23min 22s, sys: 17.9 s, total: 23min 40s
Wall time: 25min 17s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=143, shuffle=True),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                

In [30]:
rf_grid.best_params_

{'classifier__max_depth': 10,
 'classifier__max_features': 7,
 'classifier__min_samples_leaf': 4,
 'classifier__n_estimators': 150}

In [31]:
train_pred = rf_grid.predict(X_train)
test_pred = rf_grid.predict(X_test)

print(rf_grid.score(X_train, y_train))
print(rf_grid.score(X_test, y_test))

score_2 = 100*(f1_score(y_true = y_train, y_pred = train_pred, average = 'weighted'))
print("F1_score :",score_2)

0.965
0.9383333333333334
F1_score : 96.50569440568366


### _Build Gradient Boosting - 4_

In [32]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('GBM',GradientBoostingClassifier())])

In [33]:
%%time
gbm_param_grid = {'GBM__max_depth': [8,10,12,14], 'GBM__subsample': [0.8, 0.6,], 'GBM__max_features':[0.2, 0.3], 
              'GBM__n_estimators': [10, 20, 30]}

gbm_grid = GridSearchCV(clf, param_grid=gbm_param_grid, cv=3)

gbm_grid.fit(X_train,y_train)

CPU times: user 5min 42s, sys: 2.56 s, total: 5min 45s
Wall time: 5min 55s


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                    

In [34]:
gbm_grid.best_params_

{'GBM__max_depth': 10,
 'GBM__max_features': 0.3,
 'GBM__n_estimators': 30,
 'GBM__subsample': 0.8}

In [35]:
train_pred = gbm_grid.predict(X_train)
test_pred = gbm_grid.predict(X_test)

print(gbm_grid.score(X_train, y_train))
print(gbm_grid.score(X_test, y_test))

score_3 = 100*(f1_score(y_true = y_train, y_pred = train_pred, average = 'weighted'))
print("F1_score :",score_3)

0.9997142857142857
0.9183333333333333
F1_score : 99.97142575619442


In [37]:
rf1 = RandomForestClassifier(n_estimators=10)

In [38]:
rf1.fit(X_train,y_train)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [39]:
y_pred=rf1.predict(X_test)

In [40]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score


In [41]:
y_train_pred=rf1.predict(X_train)
print("F1_Score for Train:",f1_score(y_train,y_train_pred,average='macro'))

F1_Score for Train: 0.9965165703082025


In [42]:
print("Accuracy :",accuracy_score(y_test,y_pred))

Accuracy : 0.893


In [44]:
sub=rf1.predict(test_data)

In [45]:
final= pd.DataFrame({'Accident_ID' : test_data['Accident_ID'],
                     'Severity' : sub})

In [46]:
#final.to_csv("submission_copy2.csv",index=False)
#final.shape

(2500, 2)

### _Build Model   ANN_