In [None]:
# Notebook-wide imports and configuration.
import numpy as np
# Seed numpy's global RNG so that a fresh Restart & Run All repeats the experiment.
np.random.seed(255)

import pandas as pd
# Limit how many rows a displayed DataFrame shows.
pd.options.display.max_rows = 22

import matplotlib.pyplot as plt
plt.style.use('classic')


import seaborn as sns
# NOTE(review): sns.set() runs after plt.style.use('classic') and presumably
# overrides parts of that style -- confirm which look is intended.
sns.set()

from sklearn.metrics import classification_report, confusion_matrix 

# Toggle for the expensive SVM grid-search cells.
# Note: the grid-search cells further down assign this flag again themselves.
enable_grid_search=False


In [None]:
# Load the input CSV into `df`; the commented-out line is an alternative input file.
#df = pd.read_csv('NCDB_2016.csv', engine = 'python')
df = pd.read_csv('data01_simple.csv', engine = 'python')
# Bare last expression: rich display of the loaded frame.
df

In [None]:
# Total number of missing cells across the whole frame (per-column counts, summed).
missing_cells = df.isnull().sum().sum()
print(missing_cells)

In [None]:
# Count the cells whose text contains at least one non-digit character.
# The earlier version applied the pattern to the row *index* labels and then
# summed the selected rows, so with a default integer index it printed 0 no
# matter what the columns contained.
# Note: missing values become the text 'nan' here and are therefore counted too.
non_numeric_mask = df.astype(str).apply(lambda column: column.str.contains('[^0-9]', regex=True))
print(non_numeric_mask.sum().sum())

In [None]:
# Copy of the data with every column cast to the pandas category dtype.
df_cat = df.astype('category').copy()

In [None]:
# Copy of the data with every column cast to int.
df_int = df.astype('int').copy()

In [None]:
# List the column labels; later cells treat the last column as the class label.
print(df.columns)

## Convert Class Variable to Binary

In [None]:
## Collapse the class variable to two classes
### Injury and fatality are merged into a single class;
### the results will be compared.
df_binary_class = df_cat.copy()

# Relabel in two passes (numeric code -> severity name -> binary label)
# to avoid any unwanted side effects.
severity_names = {1: 'safe', 2: 'injury', 3:'fatal'}
binary_labels = {'safe': '0', 'injury': '1', 'fatal':'1'}
for relabel in (severity_names, binary_labels):
    df_binary_class['P_ISEV'] = df_binary_class['P_ISEV'].map(relabel)

# Row count for class '0', then class '1', then the distinct labels present.
for label in ('0', '1'):
    print((df_binary_class['P_ISEV'] == label).sum())
print(df_binary_class['P_ISEV'].unique())

## Split Training and Testing for Binary class

In [None]:
# Separate the class label (last column) from the features (every other column).
binary_columns = df_binary_class.columns
Ybinary = df_binary_class[binary_columns[-1]]
Xbinary = df_binary_class[binary_columns[:-1]]
#print(Xbinary, Ybinary)

#### Split Train (70%) and Test (30%) for Binary class

In [None]:
# Hold out 30% of the rows for testing; the remaining 70% are used for training.
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
binary_split = train_test_split(Xbinary, Ybinary, test_size=0.3, random_state=0)
Xbinary_train, Xbinary_test, Ybinary_train, Ybinary_test = binary_split

In [None]:
# Plain-text dump of the four split objects as a quick sanity check.
print(Xbinary_train, Xbinary_test, Ybinary_train, Ybinary_test)

## Undersample the majority for the 3 class evaluation

#### Subset Data into class Fatal

In [None]:
# Rows whose severity code is 3 (fatal): boolean mask, row count, and subset.
is_fatal = df_cat['P_ISEV'] == 3
is_fatal_count = is_fatal.sum()
df_cat_fatal = df_cat[is_fatal]
print(is_fatal_count)
display(df_cat_fatal)


#### Subset Data into class Injury

In [None]:
# Rows whose severity code is 2 (injury): boolean mask, row count, and subset.
is_injury = df_cat['P_ISEV'] == 2
is_injury_count = is_injury.sum()
df_cat_injury = df_cat[is_injury]
print(is_injury_count)
display(df_cat_injury)


#### Subset Data into class safe

In [None]:
# Rows whose severity code is 1 (safe / no injury): boolean mask, row count, and subset.
is_safe = df_cat['P_ISEV'] == 1
is_safe_count = is_safe.sum()
df_cat_safe = df_cat[is_safe]
print(is_safe_count)
display(df_cat_safe)

In [None]:
# Number of rows in the fatal dataframe; used below as the per-class sample size.
min_size = len(df_cat_fatal.index)
print(min_size)

In [None]:
# size of the safe (no-injury) subset
df_cat_safe.index.size

In [None]:
# size of the injury subset
df_cat_injury.index.size

## Create Final dataframe for ternary class modeling

In [None]:
# Undersample: draw min_size rows at random from each of the injury and safe
# subsets so they match the size of the fatal subset (combined in a later cell).
# NOTE(review): no random_state is passed; presumably the draw relies on the
# global numpy seed set in the first cell -- confirm.
df_cat_injury_select = df_cat_injury.sample(n=min_size)
df_cat_safe_select = df_cat_safe.sample(n=min_size)


In [None]:
# Shape check: the injury sample should have min_size rows.
df_cat_injury_select.shape

In [None]:
# Shape check: the safe sample should have min_size rows.
df_cat_safe_select.shape

In [None]:
# Stack the fatal rows with the two equally sized samples into one dataframe.
balanced_parts = [df_cat_fatal, df_cat_injury_select, df_cat_safe_select]
df_final = pd.concat(balanced_parts)
df_final.shape

In [None]:
# Distinct class codes present in the combined frame.
df_final.P_ISEV.unique()

## Write cleaned data to file for future use.

In [None]:
# Write the combined frame to disk for future use; index=False leaves the row index out of the file.
df_final.to_csv('cleandata.csv', encoding='utf-8', index=False)

In [None]:
# Separate the class label (last column) from the features (every other column).
# Column labels are taken from df_cat, the frame df_final was built from.

column_labels = df_cat.columns
Y = df_final[column_labels[-1]]
X = df_final[column_labels[:-1]]
#print(X,Y)

## Clustering based on K-Means Clustering

In [None]:
from sklearn.cluster import KMeans
# Three clusters, random initialisation, ten initialisation runs.
kmeans = KMeans(n_clusters=3, init='random', n_init=10, max_iter=300, tol=1e-04)
# The return value of fit() is kept as ykm; the next cells read
# cluster_centers_ and labels_ from it.
ykm = kmeans.fit(X)

In [None]:
# Centres of the three fitted clusters.
ykm.cluster_centers_

In [None]:
# Cluster label assigned to each row of X.
display(ykm.labels_)

## Feature selection using Random Forest

### Feature Selection 

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 1000-tree forest on all of X and Y (no train/test split at this point);
# the following cells read its feature importances.
forest = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
forest.fit(X, Y)

### Get the important features from random forest

In [None]:
# Per-feature importance scores from the fitted forest, in the column order of X.
importFeatures = forest.feature_importances_

### List the features by importance

In [None]:
# Show the raw importance array.
importFeatures

In [None]:
# Column positions ordered from the most to the least important feature.
indices = np.argsort(importFeatures)[::-1]
print(indices)
featureLabel = X.columns[0:]
print(featureLabel)

# Feature names in that same order, then a numbered table of name and importance.
rankedFeature = [featureLabel[position] for position in indices]
for rank, position in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30,  featureLabel[position], importFeatures[position]))
print(rankedFeature)

### Reduce the number of features

In [None]:
# Keep the ten highest-ranked features (rankedFeature is ordered by importance).
#[df_cat.columns[0:df_cat.columns.size -1]]
X_Selected = X[rankedFeature[0:10]]
display(X_Selected)
# NOTE(review): this bare expression is not the last line of the cell, so it
# presumably displays nothing -- confirm whether the shape should be shown.
X_Selected.shape

# Split into train and test 70/30
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_Selected, Y, test_size=0.3, random_state=0)

### SVM GridSearch for Optimal Parameters

In [None]:
# Grid search over C and gamma for an SVC on the selected features.
# This operation is computationally expensive.
# Enable as required.
enable_grid_search = False
if enable_grid_search:
    # GridSearchCV is imported from sklearn.model_selection, the module this
    # notebook already uses for train_test_split; the old sklearn.grid_search
    # module no longer exists in current scikit-learn.
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import confusion_matrix
    from sklearn.svm import SVC
    param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001]}
    grid = GridSearchCV(SVC(), param_grid, verbose=3)
    grid.fit(X_train, Y_train)
    print(grid.best_params_)
    #{'C': 1000, 'gamma': 0.001}
    print(grid.best_estimator_)
    #SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
    #  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    #  max_iter=-1, probability=False, random_state=None, shrinking=True,
    #  tol=0.001, verbose=False)
    grid_predictions = grid.predict(X_test)
    # Keep the matrix under its own name so the imported confusion_matrix
    # function is not replaced by its result.
    grid_confusion = confusion_matrix(Y_test, grid_predictions)
    print(grid_confusion)
    print(classification_report(Y_test,grid_predictions))

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with C=1, trained on the selected-feature training split.
lr = LogisticRegression(C=1, random_state=0)
lr.fit(X_train, Y_train)

In [None]:
# Predict the held-out rows, then report accuracy on both splits.
y_pred = lr.predict(X_test)
#display(y_pred)
lr_train_accuracy = lr.score(X_train, Y_train)
lr_test_accuracy = lr.score(X_test, Y_test)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr_train_accuracy))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lr_test_accuracy))

In [None]:
# Show the intercepts (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
# NOTE(review): whether the fit really is one-vs-rest presumably depends on the
# solver / scikit-learn version in use -- confirm.
lr.intercept_

In [None]:
# Show the coefficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
# NOTE(review): whether the fit really is one-vs-rest presumably depends on the
# solver / scikit-learn version in use -- confirm.
lr.coef_

In [None]:
# Confusion matrix and per-class report for the logistic regression predictions.
from sklearn.metrics import confusion_matrix
# The result gets its own name: assigning it to `confusion_matrix` would replace
# the imported function with an array for every cell that runs afterwards.
lr_confusion = confusion_matrix(Y_test, y_pred)
print(lr_confusion)
print(classification_report(Y_test,y_pred))

In [None]:
# Logistic regression with L1 regularization (C=1000, i.e. a weak penalty).
from sklearn.linear_model import LogisticRegression
# penalty='l1' needs a solver that supports it. 'liblinear' does; the default
# solver in current scikit-learn ('lbfgs') rejects an L1 penalty with a ValueError.
lr = LogisticRegression(penalty='l1', C=1000, solver='liblinear', random_state=0)
lr.fit(X_train, Y_train)

In [None]:
# Accuracy, confusion matrix and per-class report for the L1-regularized model.
y_pred = lr.predict(X_test)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lr.score(X_test, Y_test)))

from sklearn.metrics import confusion_matrix
# The result gets its own name: assigning it to `confusion_matrix` would replace
# the imported function with an array for every cell that runs afterwards.
lr_l1_confusion = confusion_matrix(Y_test, y_pred)
print(lr_l1_confusion)
print(classification_report(Y_test,y_pred))

### Support Vector Machines

In [None]:
from sklearn.svm import SVC
# Support vector classifier with a sigmoid kernel and C=1.
svm = SVC(kernel='sigmoid', C=1, random_state=0)
svm.fit(X_train, Y_train)

In [None]:
# Predict the held-out rows with the SVM and report its accuracy on both splits.
# The train-set line previously scored and named the logistic regression model
# (`lr`) instead of the SVM fitted in the cell above.
y_pred = svm.predict(X_test)
print('Accuracy of SVM regression classifier on train set: {:.2f}'.format(svm.score(X_train, Y_train)))
print('Accuracy of SVM regression classifier on test set: {:.2f}'.format(svm.score(X_test, Y_test)))

In [None]:
# Confusion matrix and per-class report for the sigmoid-kernel SVM predictions.
from sklearn.metrics import confusion_matrix
# The result gets its own name: assigning it to `confusion_matrix` would replace
# the imported function with an array for every cell that runs afterwards.
svm_confusion = confusion_matrix(Y_test, y_pred)
print(svm_confusion)
print(classification_report(Y_test,y_pred))

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree using the entropy criterion, limited to a depth of 5.
tree = DecisionTreeClassifier(criterion='entropy',max_depth=5, random_state=0)
tree.fit(X_train, Y_train)

In [None]:
# Predict the held-out rows with the decision tree and report its accuracy on both splits.
# The train-set line previously scored and named the logistic regression model
# (`lr`) instead of the tree fitted in the cell above.
y_pred = tree.predict(X_test)
print('Accuracy of Tree classifier on train set: {:.2f}'.format(tree.score(X_train, Y_train)))
print('Accuracy of Tree classifier on test set: {:.2f}'.format(tree.score(X_test, Y_test)))

In [None]:
# Confusion matrix and per-class report for the decision tree predictions.
from sklearn.metrics import confusion_matrix
# The result gets its own name: assigning it to `confusion_matrix` would replace
# the imported function with an array for every cell that runs afterwards.
tree_confusion = confusion_matrix(Y_test, y_pred)
print(tree_confusion)
print(classification_report(Y_test,y_pred))

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 1000-tree random forest (entropy criterion) trained on the selected-feature
# training split. This rebinds `forest`, which earlier held the forest used
# for feature ranking.
forest = RandomForestClassifier(criterion='entropy', n_estimators=1000, random_state=0, n_jobs=2)
forest.fit(X_train, Y_train)

In [None]:
# Predict the held-out rows with the random forest and report its accuracy on both splits.
# The train-set line previously scored and named the logistic regression model
# (`lr`) instead of the forest fitted in the cell above.
y_pred = forest.predict(X_test)
print('Accuracy of RandomForest classifier on train set: {:.2f}'.format(forest.score(X_train, Y_train)))
print('Accuracy of RandomForest classifier on test set: {:.2f}'.format(forest.score(X_test, Y_test)))

In [None]:
# Confusion matrix and per-class report for the random forest predictions.
from sklearn.metrics import confusion_matrix
# The result gets its own name: assigning it to `confusion_matrix` would replace
# the imported function with an array for every cell that runs afterwards.
forest_confusion = confusion_matrix(Y_test, y_pred)
print(forest_confusion)
print(classification_report(Y_test,y_pred))

### K-N-N

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours with k=5 and the Minkowski metric at p=2.
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
knn.fit(X_train, Y_train)

In [None]:
# Predict the held-out rows with k-NN and report its accuracy on both splits.
# Both lines previously reported other models: the train line scored `lr` and
# the test line scored `forest` under a RandomForest label.
y_pred = knn.predict(X_test)
print('Accuracy of KNN classifier on train set: {:.2f}'.format(knn.score(X_train, Y_train)))
print('Accuracy of KNN classifier on test set: {:.2f}'.format(knn.score(X_test, Y_test)))

In [None]:
# Confusion matrix and per-class report for the k-NN predictions.
from sklearn.metrics import confusion_matrix
# The result gets its own name: assigning it to `confusion_matrix` would replace
# the imported function with an array for every cell that runs afterwards.
knn_confusion = confusion_matrix(Y_test, y_pred)
print(knn_confusion)
print(classification_report(Y_test,y_pred))

### SVM

In [None]:
from sklearn.svm import SVC
#svm = SVC(C=1, random_state=0, kernel='sigmoid', verbose=True)
#svm = SVC(C=1, random_state=0, kernel='linear', verbose=True, cache_size=200)
# Second SVM run with SVC's default settings; rebinds `svm`, which earlier held
# the sigmoid-kernel model.
# NOTE(review): verbose is given as 3; presumably any truthy value simply turns
# on solver logging -- confirm.
svm = SVC(verbose = 3)

In [None]:
# Train the default-settings SVC on the training split.
svm.fit(X_train, Y_train)

In [None]:
# Predict the held-out rows with the SVM and report its accuracy on both splits.
# The train-set line previously scored and named the logistic regression model
# (`lr`) instead of the SVM fitted in the cell above.
y_pred = svm.predict(X_test)
print('Accuracy of SVM regression classifier on train set: {:.2f}'.format(svm.score(X_train, Y_train)))
print('Accuracy of SVM regression classifier on test set: {:.2f}'.format(svm.score(X_test, Y_test)))

In [None]:
# Confusion matrix and per-class report for the default-settings SVM predictions.
from sklearn.metrics import confusion_matrix
# The result gets its own name: assigning it to `confusion_matrix` would replace
# the imported function with an array for every cell that runs afterwards.
svm_default_confusion = confusion_matrix(Y_test, y_pred)
print(svm_default_confusion)
print(classification_report(Y_test,y_pred))

### Performance Tuning using GridSearch

In [None]:
# Grid search over C and gamma for an SVC; expensive, so it is off by default.
enable_grid_search = False
if enable_grid_search:
    # GridSearchCV is imported from sklearn.model_selection, the module this
    # notebook already uses for train_test_split; the old sklearn.grid_search
    # module no longer exists in current scikit-learn.
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import confusion_matrix
    from sklearn.svm import SVC
    param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001]}
    grid = GridSearchCV(SVC(), param_grid, verbose=3)
    grid.fit(X_train, Y_train)
    # Inside an `if` block a bare expression displays nothing, so print explicitly.
    print(grid.best_params_)
    #Result: {'C': 1000, 'gamma': 0.001}
    print(grid.best_estimator_)
        #SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
        #decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
        #max_iter=-1, probability=False, random_state=None, shrinking=True,
        #tol=0.001, verbose=False)
    grid_predictions = grid.predict(X_test)
    # Keep the matrix under its own name so the imported confusion_matrix
    # function is not replaced by its result.
    grid_confusion = confusion_matrix(Y_test, grid_predictions)
    print(grid_confusion)
    print(classification_report(Y_test,grid_predictions))
    
    
#[[4810 1501  697]
# [3549 2112 1362]
# [ 801  905 5036]]
#             precision    recall  f1-score   support
#
#          1       0.53      0.69      0.60      7008
#          2       0.47      0.30      0.37      7023
#          3       0.71      0.75      0.73      6742
#
#avg / total       0.57      0.58      0.56     20773

## ANN - Multilayer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier
#mlpc = MLPClassifier(alpha=1)
# Multilayer perceptron with three hidden layers of 12 units each and at most 1000 iterations.
# NOTE(review): no random_state is given, so results presumably vary between
# runs -- confirm whether a fixed seed is wanted.
mlpc = MLPClassifier(hidden_layer_sizes=(12, 12, 12), max_iter=1000)

#mlp = multilayer_perceptron(n_hidden =2, activation='logistic', algorithm='sgd', random_state=3)

In [None]:
# Train the perceptron on the training split.
mlpc.fit(X_train, Y_train)

In [None]:
# Predictions of the perceptron for the held-out rows.
y_mlcp_pred = mlpc.predict(X_test)

In [None]:
# Confusion matrix and per-class report for the perceptron predictions.
from sklearn.metrics import confusion_matrix
# The result gets its own name: assigning it to `confusion_matrix` would replace
# the imported function with an array for every cell that runs afterwards.
mlpc_confusion = confusion_matrix(Y_test, y_mlcp_pred)
print(mlpc_confusion)
print(classification_report(Y_test,y_mlcp_pred))

In [None]:

#check sigmoid and rbf
#from sklearn.ensemble import BaggingClassifier
#from sklearn.svm import SVC
#clf = BaggingClassifier(SVC(C=1.0,
#        cache_size=200,
#        class_weight=None,
#        coef0=0.0,
#        decision_function_shape=None,
#        degree=3,
#        gamma='auto',
#        kernel='linear',
#        max_iter=-1,
#        probability=False,
#        random_state=None,
#        shrinking=True,
#        tol=0.001,
#        verbose=False,
#        ))

In [None]:
#clf.fit(X_train, Y_train)