In [22]:
import numpy as np
#in case we need to repeat experiment
#np.random.seed(255)

import pandas as pd
pd.options.display.max_rows = 22

import matplotlib.pyplot as plt
plt.style.use('classic')

import seaborn as sns
sns.set()

#sklearn imports
from sklearn.cluster import KMeans
from sklearn import model_selection
#from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from pandas.api.types import CategoricalDtype
from sklearn.naive_bayes import GaussianNB
import pickle # allows for model to be saved/load to file


# Run configuration: every tunable used by the cells below is set here.

# Use print instead of display when run as a python script
# (read by the K-Means cell to choose between print() and display()).
pyscript = True

# Classifier verbosity, passed as `verbose=` to the estimators that support it
verbose_level=3

# Number of rows drawn from the dataset for modelling (passed to DataFrame.sample).
# NOTE(review): the figure 5963272 below is presumably the row count of an
# earlier, larger dataset -- confirm; the file loaded here reports 3655334 rows.
#Sample Size (5963272)
#sampleN = 3000000
#SampleN = 5963000
sampleN = 3500000

import time

# Input CSV; the commented line is the multi-class variant of the same data.
#datafile = 'NCDB_FULL_Removed_All_Missing_Values_Multi_Class.csv'
datafile = 'NCDB_FULL_Removed_All_Missing_Values_Binary_Class_Transformed.csv'

# Model store: one pickle file per fitted estimator.
file_lr = 'lr_dec_02.model'
# NOTE(review): this is the L1-regularised model, but the filename says "l2"
# and "dec_01" unlike its siblings -- confirm this is intentional.
file_lr_l1 = 'lr_l2_dec_01.model'
file_dt = 'dt_dec_02.model'
file_svm = 'svm_dec_02.model'
file_knn = 'knn_dec_02.model'
file_mlp = 'mlp_dec_02.model'
file_kmean = 'kmean_dec_02.model'
file_nbayes = 'nbayes_dec_02.model'

# CSV snapshots of the final train/test partitions, kept for later model evaluation.
file_final_train = 'final_train_dec_02.data'
file_final_test = 'final_test_dec_02.data'

# Enable optimization algorithms.
# svm_c / svm_gamma are overwritten by the grid-search cell when it is enabled.
enable_grid_search = False
svm_c = 1
svm_gamma = 1
# NOTE(review): feature_all and defaultFeatures are only echoed / never read in
# the cells visible here -- feature selection appears not to be wired up yet.
feature_all = True
defaultFeatures = ['P_AGE', 'V_YEAR', 'C_HOUR', 'C_YEAR', 'C_MNTH', 'C_CONF', 'C_WDAY', 'C_VEHS', 'P_USER', 'P_SEX']

# Logistic regression with L1 regularization: fit / evaluate switches
enable_lr_l1 = True
predict_lr_l1 = True

# enable_* : fit the model and pickle it to its file_* path
enable_lr = True
enable_dt = True
enable_svm = True
enable_knn = True
enable_mlp = True
enable_kmean = True
enable_nbayes = True

# predict_* : evaluate the model on the train and test partitions
predict_lr = True
predict_dt = True
predict_svm = True
predict_knn = True
predict_mlp = True
predict_nbayes = True


# Multiclass classification; binary if False
multiclass = False

In [23]:
# Echo the run configuration so the log records which options produced the
# results below.
print("Sample size: {}".format(sampleN))

# (label, flag) pairs, reported in order. "Logistic Regression" reports
# enable_lr (it previously reported enable_lr_l1), and the L1 and Naive Bayes
# switches now get their own lines.
config_flags = [
    ("Multi-Class Classification", multiclass),
    ("Grid Search", enable_grid_search),
    ("All Features", feature_all),
    ("K-means", enable_kmean),
    ("Logistic Regression", enable_lr),
    ("Logistic Regression with L1 Regularization", enable_lr_l1),
    ("Naive Bayes", enable_nbayes),
    ("Decision Tree", enable_dt),
    ("Support Vector Machines", enable_svm),
    ("KNN", enable_knn),
    ("MLP", enable_mlp),
]

for label, flag in config_flags:
    print("{}: {}".format(label, "Enabled" if flag else "Disabled"))


Sample size: 3500000
Multi-Class Classification: Disabled
Grid Search: Disabled
All Features: Enabled
K-means: Enabled
Logistic Regression: Enabled
Decision Tree: Enabled
Support Vector Machines: Enabled
KNN: Enabled
MLP: Enabled


In [24]:
# Record the wall-clock start of the run and log it in human-readable local time.
t_start = time.time()
print(time.ctime(t_start))

Sat Dec  1 02:54:37 2018


In [25]:
# Load the dataset named in the configuration cell and preview its first two rows.
# NOTE(review): the pure-python parser engine is requested explicitly; it is
# presumably much slower than the default C engine on a file this size -- confirm
# whether it is still needed.
df = pd.read_csv(filepath_or_buffer=datafile, engine='python')
print(df.iloc[:2])

   C_YEAR  C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  \
0       1       1       1       1       2      34       2       1       1   
1       1       1       1       1       2      34       2       1       1   

   C_RALN  C_TRAF  V_YEAR  P_SEX  P_AGE  P_PSN  P_USER  P_ISEV  
0       1       1       2      0     33      1       1       2  
1       1       1       2      0     70      1       1       1  


In [26]:
# Total number of missing cells across the whole frame.
print(df.isna().sum().sum())

0


In [27]:
# Count the cells whose text form contains any non-digit character.
print(df.apply(lambda col: col.astype(str).str.contains('[^0-9]')).sum().sum())

0


In [28]:
# Categorical view of the data. astype() already returns a new DataFrame, so the
# extra .copy() (a second full copy of a multi-million-row frame) is dropped.
df_cat = df.astype('category')

In [29]:
# Number of rows available before any sampling.
totalRows = len(df_cat.index)
print("Number of Rows in the dataset: {}".format(totalRows))

Number of Rows in the dataset: 3655334


In [30]:
# List the column names; the last one is used as the class label by the split cell below.
print(df_cat.columns)

Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
       'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_YEAR', 'P_SEX', 'P_AGE',
       'P_PSN', 'P_USER', 'P_ISEV'],
      dtype='object')


In [31]:
#One-Hot-Encoding of categorical
#TBD

## Convert Class Variable to Binary if multi class disabled

In [32]:
## Recode the class variable P_ISEV and draw the modelling sample (df_sample).
### Binary mode merges Injury and Fatality into a single class ('1') against
### Non-Injury ('0'); multiclass mode keeps three classes ('0' safe, '1' injury,
### '2' fatal) on a class-balanced undersample, so the results can be compared.
### NOTE: DataFrame.sample is called without random_state, so each run draws a
### different sample.
df_class = df_cat.copy()  # work on a copy so df_cat keeps the raw class codes

if multiclass:
    #Undersample the larger classes for the 3 class evaluation

    # subset fatal class
    is_fatal = df_class['P_ISEV']==3
    is_fatal_count = is_fatal.sum()
    print("Number of Fatal: {}".format(is_fatal_count))
    df_class_fatal = df_class[is_fatal]
    print(df_class_fatal.head(2))

    # subset injury class
    is_injury = df_class['P_ISEV']==2
    is_injury_count = is_injury.sum()
    print("Number of Injury: {}".format(is_injury_count))
    df_class_injury = df_class[is_injury]
    print(df_class_injury.head(2))

    # subset non_injury class
    is_safe = df_class['P_ISEV']==1
    is_safe_count = is_safe.sum()
    print("Number of Non-Injury: {}".format(is_safe_count))
    df_class_safe = df_class[is_safe]
    print(df_class_safe.head(2))

    # report the size of each subset
    fatal_size = df_class_fatal.index.size
    injury_size = df_class_injury.index.size
    safe_size = df_class_safe.index.size
    print("Size of Fatal Subset: {}".format(fatal_size))
    print("Size of injury Subset: {}".format(injury_size))
    print("Size of non-fatal Subset: {}".format(safe_size))

    # Balance on the smallest class. The fatal class is expected to be the
    # smallest, but taking the minimum keeps sample(n=...) from raising if
    # another class happens to be smaller.
    min_size = min(fatal_size, injury_size, safe_size)
    if fatal_size > min_size:
        df_class_fatal = df_class_fatal.sample(n=min_size)

    # randomly sample min_size rows of injury and non-injury to join the fatal rows
    df_class_injury_select = df_class_injury.sample(n=min_size)
    print("Shape of injury sampled dataframe: {}".format(df_class_injury_select.shape))
    df_class_safe_select = df_class_safe.sample(n=min_size)
    print("Shape of nom-injury sampled dataframe: {}".format(df_class_safe_select.shape))

    #concat the three dataframes
    df_underSample = pd.concat([df_class_fatal, df_class_injury_select, df_class_safe_select])
    print(df_underSample.shape)

    # Shuffle, and cap at sampleN rows when the balanced set is larger than that.
    df_sample = df_underSample.sample(n=min(sampleN, df_underSample.index.size))

    #perform the conversion in two steps to avoid any unwanted side effects
    df_sample['P_ISEV'] = df_sample['P_ISEV'].map({1: 'safe', 2: 'injury', 3:'fatal'})
    df_sample['P_ISEV'] = df_sample['P_ISEV'].map({'safe': '0', 'injury': '1', 'fatal':'2'})
    print((df_sample['P_ISEV']=='0').sum())
    print((df_sample['P_ISEV']=='1').sum())
    print((df_sample['P_ISEV']=='2').sum())
    print(df_sample['P_ISEV'].unique())
else:
    #perform the conversion in two steps to avoid any unwanted side effects
    df_class['P_ISEV'] = df_class['P_ISEV'].map({1: 'safe', 2: 'injury', 3:'fatal'})
    df_class['P_ISEV'] = df_class['P_ISEV'].map({'safe': '0', 'injury': '1', 'fatal':'1'})
    print((df_class['P_ISEV']=='0').sum())
    print((df_class['P_ISEV']=='1').sum())
    print(df_class['P_ISEV'].unique())

    # Cap the request at the number of available rows: sample(n=...) raises
    # ValueError when n exceeds the population (the multiclass branch already
    # guarded against this; the binary branch did not).
    df_sample = df_class.sample(n=min(sampleN, df_class.index.size))

print("Size of dataframe for modeling: {}".format(df_sample.index.size))

1570775
2084559
['1' '0']
Size of dataframe for modeling: 3500000


In [33]:
# Re-run the non-digit check on the sample now that the class label is recoded.
print(df_sample.apply(lambda col: col.astype(str).str.contains('[^0-9]')).sum().sum())

0


In [34]:
# Convert every column to the dtype used for modelling.
df_sample = df_sample.astype('category')

# Flag these columns as ordered categoricals.
ordered_columns = ['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'V_YEAR']
for column in ordered_columns:
    df_sample[column] = df_sample[column].astype(CategoricalDtype(ordered=True))

# Age and the class label are converted to integers.
df_sample['P_AGE'] = df_sample['P_AGE'].astype('int')
df_sample['P_ISEV'] = df_sample['P_ISEV'].astype('int')

# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3500000 entries, 2687805 to 2542334
Data columns (total 17 columns):
C_YEAR    category
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
V_YEAR    category
P_SEX     category
P_AGE     int32
P_PSN     category
P_USER    category
P_ISEV    int32
dtypes: category(15), int32(2)
memory usage: 103.5 MB
None


## Split Training and Testing for Binary class

In [35]:
# Separate features from the class label: the label is the last column,
# every column before it is a feature.
feature_columns = df_sample.columns[:-1]
target_column = df_sample.columns[-1]

X = df_sample[feature_columns]
Y = df_sample[target_column]

In [36]:
# Sanity check: show the distinct class labels present in the target.
print(Y.unique())

[1 0]


In [37]:
# Preview the first three feature rows.
print(X.head(3))

        C_YEAR C_MNTH C_WDAY C_HOUR C_VEHS C_CONF C_RCFG C_WTHR C_RSUR C_RALN  \
2687805     13      8      1      3      2     36      2      1      1      1   
2739281     13     11      2      4      1      6      2      3      2      1   
2624349     13      3      6      1      2     36      2      1      1      1   

        C_TRAF V_YEAR P_SEX  P_AGE P_PSN P_USER  
2687805      1      2     1     42     1      2  
2739281      1      2     0     38     1      1  
2624349      1      3     0     25     1      1  


#### Split Train (70%) and Test (30%) for Binary class

In [38]:
# Split into train and test 70/30.
# No random_state is passed, so every run produces a different partition;
# the partition actually used is saved to disk by the next cell.
split_parts = model_selection.train_test_split(X, Y, test_size=0.3)
X_train, X_test, Y_train, Y_test = split_parts

### save the final train and test data for future model evaluation.

In [39]:
# Re-attach the class label to each partition and write it to CSV so that the
# exact train/test rows can be reused for future model evaluation.
FullTrain = X_train.assign(P_ISEV=Y_train)
FullTrain.to_csv(file_final_train, encoding='utf-8', index=False)
print(FullTrain.head(10))

FullTest = X_test.assign(P_ISEV=Y_test)
FullTest.to_csv(file_final_test, encoding='utf-8', index=False)
print(FullTest.head(10))

        C_YEAR C_MNTH C_WDAY C_HOUR C_VEHS C_CONF C_RCFG C_WTHR C_RSUR C_RALN  \
617039       3      9      2      4      2     21      2      1      1      1   
3277496     16     11      5      0      1      6      2      3      2      1   
1429986      7      3      7      2      4     21      1      1      1      2   
14031        1      1      6      4      2     22      1      4      3      1   
1788391      8     11      5      1      2     35      2      1      1      2   
3151005     16      2      4      1      2     33      1      4      3      3   
1146192      5     12      2      2      2     36      2      1      1      2   
1159432      5     12      7      2      2     31      1      4      3      1   
3042484     15      7      5      3      2     21      1      1      1      2   
779849       4      5      7      0      2     21      1      1      1      1   

        C_TRAF V_YEAR P_SEX  P_AGE P_PSN P_USER  P_ISEV  
617039       1      3     1     27     1      1   

## K-Means Clustering

In [40]:
# Unsupervised K-Means (3 clusters, 10 random initialisations) on the training
# features; the fitted model is pickled to file_kmean.
if enable_kmean:
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    print("K-Means Clustering: Start")
    kmeans = KMeans(n_clusters=3, init='random', n_init=10, tol=1e-04, verbose=verbose_level, max_iter=1000)
    print(kmeans)

    print("K-Means Clustering: Build")
    ykm = kmeans.fit(X_train)  # fit() returns the estimator itself

    if pyscript:
        print(ykm.cluster_centers_)
        print(ykm.labels_)
    else:
        # display() is only available when running under IPython/Jupyter
        display(ykm.cluster_centers_)
        display(ykm.labels_)

    # save model to file; the context manager closes the handle even if dump fails
    with open(file_kmean, "wb") as model_file:
        pickle.dump(ykm, model_file)

    print("K-Means Clustering: End")
    # stored as t_end so the start time is no longer overwritten
    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))


Sat Dec  1 02:57:17 2018
K-Means Clustering: Start
KMeans(algorithm='auto', copy_x=True, init='random', max_iter=1000,
    n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=3)
K-Means Clustering: Build
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 660871183.6981761
start iteration
done sorting
end inner loop
Iteration 1, inertia 576012641.6652817
start iteration
done sorting
end inner loop
Iteration 2, inertia 570340124.3896588
start iteration
done sorting
end inner loop
Iteration 3, inertia 564987630.1290531
start iteration
done sorting
end inner loop
Iteration 4, inertia 559653648.2239835
start iteration
done sorting
end inner loop
Iteration 5, inertia 557995855.3770834
start iteration
done sorting
end inner loop
Iteration 6, inertia 557395407.562223
start iteration
done sorting
end inner loop
Iteration 7, inertia 557223723.7739966
start iteration
done sorting
end inner loop
Iterati

Iteration 12, inertia 557148805.0122266
start iteration
done sorting
end inner loop
Iteration 13, inertia 557147223.1871184
center shift 3.729982e-02 within tolerance 3.278924e-03
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 627885115.6519587
start iteration
done sorting
end inner loop
Iteration 1, inertia 568959453.3149986
start iteration
done sorting
end inner loop
Iteration 2, inertia 564098481.6821291
start iteration
done sorting
end inner loop
Iteration 3, inertia 562840439.5990058
start iteration
done sorting
end inner loop
Iteration 4, inertia 562513840.642583
start iteration
done sorting
end inner loop
Iteration 5, inertia 562461211.2686478
start iteration
done sorting
end inner loop
Iteration 6, inertia 562429651.2111588
start iteration
done sorting
end inner loop
Iteration 7, inertia 562409176.2886137
start iteration
done sorting
end inner loop
Iteration 8, inertia 562394362.8847816
start iteration
done sorting
end inner loop
Iterat

### SVM Grid Search for Optimal Parameters

In [41]:
# Exhaustive cross-validated search over the SVC hyper-parameters C and gamma.
# This operation is computationally expensive, so it only runs when enabled.
if enable_grid_search:
    t_start = time.time()
    print(time.ctime(t_start))

    param_grid = {
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    }
    grid = GridSearchCV(SVC(), param_grid, verbose=verbose_level)
    print(grid)
    grid.fit(X_train, Y_train)

    # Keep the winning parameters in the configuration variables.
    best_params = grid.best_params_
    print(best_params)
    svm_c = best_params.get('C')
    svm_gamma = best_params.get('gamma')
    print(grid.best_estimator_)

    # Evaluate the best estimator on the held-out test partition.
    grid_predictions = grid.predict(X_test)
    cfn_matrix_grid = confusion_matrix(Y_test, grid_predictions)
    print(cfn_matrix_grid)
    print(classification_report(Y_test, grid_predictions))

    t_end = time.time()
    print(time.ctime(t_end))

## Logistic Regression Model

In [42]:
# Logistic regression (default L2 penalty, saga solver).
# enable_lr fits the model and pickles it to file_lr; predict_lr reloads that
# file and evaluates it, so evaluation also works when fitting is disabled.
if enable_lr:
    print("Logistic Regression: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    lr = LogisticRegression(C=1, random_state=0, solver='saga', multi_class='auto',
                            verbose=verbose_level, n_jobs=10, max_iter=1000)
    print(lr)
    print("Logistic Regression: Fit")
    lr.fit(X_train, Y_train)

    # save model to file; the context manager closes the handle
    with open(file_lr, "wb") as model_file:
        pickle.dump(lr, model_file)


if predict_lr:
    # load model from file (only unpickle model files produced by this notebook:
    # pickle.load can execute arbitrary code from an untrusted file)
    with open(file_lr, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("Logistic Regression: Predict")
    # Evaluate the reloaded model; the in-memory `lr` does not exist (or is
    # stale) when enable_lr is False.
    y_pred = loaded_model.predict(X_test)

    print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    # print the intercept (a single value for the binary problem, as in the
    # recorded output below)
    print("Logistic Regression: Intercept")
    print(loaded_model.intercept_)

    # print the coefficients (a single row, one weight per feature, for the
    # binary problem, as in the recorded output below)
    print("Logistic Regression: Coefficients")
    print(loaded_model.coef_)

    print("Logistic Regression: Confusion Matrix")
    cnf_matrix_lg = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_lg)

    print("Logistic Regression: Classification Report")
    print(classification_report(Y_test, y_pred))

    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

print("Logistic Regression: End")

Logistic Regression: Start
Sat Dec  1 02:59:35 2018
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='auto',
          n_jobs=10, penalty='l2', random_state=0, solver='saga',
          tol=0.0001, verbose=3, warm_start=False)
Logistic Regression: Fit


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 13 epochs took 20 seconds
Logistic Regression: Predict


[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   20.5s finished


Accuracy of logistic regression classifier on train set: 0.62
Accuracy of logistic regression classifier on test set: 0.62
Logistic Regression: Intercept
[1.68091606]
Logistic Regression: Coefficients
[[ 0.00436564  0.00073385 -0.00414846 -0.071603   -0.43396847  0.00112751
  -0.12727912 -0.01778397  0.1304307   0.15589815  0.04017879 -0.1227636
  -0.6848408   0.0059682  -0.51629328  0.25351272]]
Logistic Regression: Confusion Matrix
[[195411 256055]
 [140429 458105]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.58      0.43      0.50    451466
           1       0.64      0.77      0.70    598534

   micro avg       0.62      0.62      0.62   1050000
   macro avg       0.61      0.60      0.60   1050000
weighted avg       0.62      0.62      0.61   1050000

Sat Dec  1 03:00:02 2018
Logistic Regression: End


### Logistic Regression with L1 Regularization

In [43]:
# Logistic regression with L1 regularization (saga solver).
# enable_lr_l1 fits the model and pickles it to file_lr_l1; predict_lr_l1
# reloads that file and evaluates it, so evaluation also works when fitting is
# disabled.
if enable_lr_l1:
    # with L1 regularization
    print("Logistic Regression with L1 Regularization: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    lr = LogisticRegression(penalty='l1', C=1, solver='saga', multi_class='auto',
                            verbose=verbose_level, n_jobs=10, max_iter=1000)
    print(lr)
    print("Logistic Regression with L1 Regularization: Fit")
    lr.fit(X_train, Y_train)

    # save model to file; the context manager closes the handle
    with open(file_lr_l1, "wb") as model_file:
        pickle.dump(lr, model_file)

if predict_lr_l1:
    # load model from file (only unpickle model files produced by this notebook:
    # pickle.load can execute arbitrary code from an untrusted file)
    with open(file_lr_l1, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("Logistic Regression with L1 Regularization: Predict")
    # Evaluate the reloaded model rather than whatever `lr` currently refers to.
    y_pred = loaded_model.predict(X_test)

    print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    print("Logistic Regression with L1 Regularization: Confusion Matrix")
    cnf_matrix_lg_l1 = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_lg_l1)

    # The header is printed before the report (it used to come after it).
    print("Logistic Regression with L1 Regularization: Classification Report")
    print(classification_report(Y_test, y_pred))

    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

print("Logistic Regression with L1 Regularization: End")

Logistic Regression with L1 Regularization: Start
Sat Dec  1 03:00:02 2018
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='auto',
          n_jobs=10, penalty='l1', random_state=None, solver='saga',
          tol=0.0001, verbose=3, warm_start=False)
Logistic Regression with L1 Regularization: Fit


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 15 epochs took 26 seconds
Logistic Regression with L1 Regularization: Predict


[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   26.1s finished


Accuracy of logistic regression classifier on train set: 0.62
Accuracy of logistic regression classifier on test set: 0.62
Logistic Regression with L1 Regularization: Confusion Matrix
[[195176 256290]
 [140236 458298]]
              precision    recall  f1-score   support

           0       0.58      0.43      0.50    451466
           1       0.64      0.77      0.70    598534

   micro avg       0.62      0.62      0.62   1050000
   macro avg       0.61      0.60      0.60   1050000
weighted avg       0.62      0.62      0.61   1050000

Logistic Regression with L1 Regularization: Classification Report
Sat Dec  1 03:00:34 2018
Logistic Regression with L1 Regularization: End


### Naive Bayes

In [44]:
# Gaussian Naive Bayes Classification
# enable_nbayes fits the model and pickles it to file_nbayes; predict_nbayes
# reloads that file and evaluates it, so evaluation also works when fitting is
# disabled.
if enable_nbayes:
    print("Naive Bayes: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    nbayes = GaussianNB()
    print(nbayes)
    print("Naive Bayes: Fit")
    nbayes.fit(X_train, Y_train)
    # save model to file; the context manager closes the handle
    with open(file_nbayes, "wb") as model_file:
        pickle.dump(nbayes, model_file)

if predict_nbayes:
    # load model from file (only unpickle model files produced by this notebook:
    # pickle.load can execute arbitrary code from an untrusted file)
    with open(file_nbayes, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("Naive Bayes: Predict")
    # Evaluate the reloaded model; the in-memory `nbayes` does not exist when
    # enable_nbayes is False.
    y_pred = loaded_model.predict(X_test)
    print('Accuracy of Naive Bayes classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of Naove Nayes classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    # Stored under its own name; it used to be written to the decision tree's
    # cnf_matrix_dt variable.
    cnf_matrix_nbayes = confusion_matrix(Y_test, y_pred)
    print("Naive Bayes: Confusion Matrix")
    print(cnf_matrix_nbayes)
    print("Naive Bayes: Classification Report")
    print(classification_report(Y_test, y_pred))
    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))
    print("Naive Bayes: End")

Naive Bayes: Start
Sat Dec  1 03:00:35 2018
GaussianNB(priors=None, var_smoothing=1e-09)
Naive Bayes: Fit
Naive Bayes: Predict
Accuracy of Naive Bayes classifier on train set: 0.59
Accuracy of Naove Nayes classifier on test set: 0.59
Naive Bayes: Confusion Matrix
[[280528 170938]
 [257559 340975]]
Naive Bayes: Classification Report
              precision    recall  f1-score   support

           0       0.52      0.62      0.57    451466
           1       0.67      0.57      0.61    598534

   micro avg       0.59      0.59      0.59   1050000
   macro avg       0.59      0.60      0.59   1050000
weighted avg       0.60      0.59      0.59   1050000

Sat Dec  1 03:00:43 2018
Naive Bayes: End


### Decision Tree

In [45]:
# Decision tree (entropy criterion, depth capped at 50).
# enable_dt fits the model and pickles it to file_dt; predict_dt reloads that
# file and evaluates it, so evaluation also works when fitting is disabled.
if enable_dt:
    print("Decision Tree: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=50)
    print(tree)
    print("Decision Tree: Fit")
    tree.fit(X_train, Y_train)
    # save model to file; the context manager closes the handle
    with open(file_dt, "wb") as model_file:
        pickle.dump(tree, model_file)

if predict_dt:
    # load model from file (only unpickle model files produced by this notebook:
    # pickle.load can execute arbitrary code from an untrusted file)
    with open(file_dt, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("Decision Tree: Predict")
    # Evaluate the reloaded model; the in-memory `tree` does not exist when
    # enable_dt is False.
    y_pred = loaded_model.predict(X_test)
    print('Accuracy of Decision Tree classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    cnf_matrix_dt = confusion_matrix(Y_test, y_pred)
    print("Decision Tree: Confusion Matrix")
    print(cnf_matrix_dt)
    print("Decision Tree: Classification Report")
    print(classification_report(Y_test, y_pred))
    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))
    print("Decision Tree: End")

Decision Tree: Start
Sat Dec  1 03:00:43 2018
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=50,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Decision Tree: Fit
Decision Tree: Predict
Accuracy of Decision Tree classifier on train set: 1.00
Accuracy of Decision Tree classifier on test set: 0.59
Decision Tree: Confusion Matrix
[[237492 213974]
 [221283 377251]]
Decision Tree: Classification Report
              precision    recall  f1-score   support

           0       0.52      0.53      0.52    451466
           1       0.64      0.63      0.63    598534

   micro avg       0.59      0.59      0.59   1050000
   macro avg       0.58      0.58      0.58   1050000
weighted avg       0.59      0.59      0.59   1050000

Sat Dec  1 03:01:25

### K-N-N

In [46]:
# K-nearest neighbours (k=5, Minkowski metric with p=2).
# enable_knn fits the model and pickles it to file_knn; predict_knn reloads
# that file and evaluates it, so evaluation also works when fitting is disabled.
if enable_knn:
    print("KNN: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski', n_jobs=10)
    print(knn)
    print("KNN: Fit")
    knn.fit(X_train, Y_train)

    # save model to file; the context manager closes the handle
    with open(file_knn, "wb") as model_file:
        pickle.dump(knn, model_file)

if predict_knn:
    # load model from file (only unpickle model files produced by this notebook:
    # pickle.load can execute arbitrary code from an untrusted file)
    with open(file_knn, "rb") as model_file:
        loaded_model = pickle.load(model_file)

    print("KNN: Predict")
    # Evaluate the reloaded model; the in-memory `knn` does not exist when
    # enable_knn is False.
    y_pred = loaded_model.predict(X_test)
    print('Accuracy of KNN classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of KNN classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    print("KNN: Confusion Matrix")
    cnf_matrix_knn = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_knn)

    print("KNN: Classification Report")
    print(classification_report(Y_test, y_pred))

    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

    print("KNN: End")

KNN: Start
Sat Dec  1 03:01:25 2018
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=10, n_neighbors=5, p=2,
           weights='uniform')
KNN: Fit
KNN: Predict
Accuracy of KNN classifier on train set: 0.74
Accuracy of KNN classifier on test set: 0.60
KNN: Confusion Matrix
[[228725 222741]
 [197477 401057]]
KNN: Classification Report
              precision    recall  f1-score   support

           0       0.54      0.51      0.52    451466
           1       0.64      0.67      0.66    598534

   micro avg       0.60      0.60      0.60   1050000
   macro avg       0.59      0.59      0.59   1050000
weighted avg       0.60      0.60      0.60   1050000

Sat Dec  1 03:34:37 2018
KNN: End


## ANN - Multilayer Perceptron

In [47]:
# Multilayer perceptron with three hidden layers of 25 units.
# enable_mlp fits the model and pickles it to file_mlp; predict_mlp reloads
# that file and evaluates it, so evaluation also works when fitting is disabled.
if enable_mlp:
    print("Multilayer Preceptron: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))

    # Earlier configurations tried:
    #mlpc = MLPClassifier(alpha=1)
    #mlpc = MLPClassifier(hidden_layer_sizes=(12, 12, 12), max_iter=100, verbose=verbose_level)
    mlpc = MLPClassifier(hidden_layer_sizes=(25, 25, 25), verbose=verbose_level, max_iter=1000)
    print(mlpc)
    print("Multilayer Preceptron: fit")
    mlpc.fit(X_train, Y_train)

    # save model to file; the context manager closes the handle
    with open(file_mlp, "wb") as model_file:
        pickle.dump(mlpc, model_file)

if predict_mlp:

    # load model from file (only unpickle model files produced by this notebook:
    # pickle.load can execute arbitrary code from an untrusted file)
    with open(file_mlp, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("Multilayer Preceptron: Predict")
    # Evaluate the reloaded model; the in-memory `mlpc` does not exist when
    # enable_mlp is False.
    y_pred = loaded_model.predict(X_test)

    print('Accuracy of Multilayer Perceptron classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of Multilayer Perceptron classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    print("Multilayer Preceptron: Confusion Matrix")
    cnf_matrix_mlp = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_mlp)

    print("Multilayer Preceptron: Classificiation Report")
    print(classification_report(Y_test, y_pred))

    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

    print("Multilayer Preceptron: End")

Multilayer Preceptron: Start
Sat Dec  1 03:34:37 2018
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(25, 25, 25), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=3, warm_start=False)
Multilayer Preceptron: fit
Iteration 1, loss = 0.63178562
Iteration 2, loss = 0.62142621
Iteration 3, loss = 0.61737828
Iteration 4, loss = 0.61530847
Iteration 5, loss = 0.61405752
Iteration 6, loss = 0.61340958
Iteration 7, loss = 0.61283719
Iteration 8, loss = 0.61227489
Iteration 9, loss = 0.61186819
Iteration 10, loss = 0.61149607
Iteration 11, loss = 0.61119114
Iteration 12, loss = 0.61093501
Iteration 13, loss = 0.61067899
Iteration 14, loss = 0.61051348
Iteration 15, loss = 0

### SVM

In [48]:
# Support vector classifier.
# enable_svm fits the model and pickles it to file_svm; predict_svm reloads
# that file and evaluates it, so evaluation also works when fitting is disabled.
if enable_svm:
    print("SVM: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    # Earlier configurations tried:
    #svm = SVC(C=1, random_state=0, kernel='sigmoid', verbose=True)
    #svm = SVC(C=1, random_state=0, kernel='linear', verbose=True, cache_size=200)
    # An unbounded fit (max_iter=-1) takes far too long on this data, so the
    # solver is capped at 1000 iterations.
    if enable_grid_search:
        # Use the C / gamma selected by the grid-search cell; previously the
        # search result was stored in svm_c / svm_gamma but never used.
        svm = SVC(C=svm_c, gamma=svm_gamma, verbose=verbose_level, max_iter=1000)
    else:
        svm = SVC(C=1, gamma='auto', verbose=verbose_level, max_iter=1000)
    print(svm)
    print("SVM: Fit")
    svm.fit(X_train, Y_train)

    # save model to file; the context manager closes the handle
    with open(file_svm, "wb") as model_file:
        pickle.dump(svm, model_file)

if predict_svm:
    # load model from file (only unpickle model files produced by this notebook:
    # pickle.load can execute arbitrary code from an untrusted file)
    with open(file_svm, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("SVM: Predict")
    # Evaluate the reloaded model; the in-memory `svm` does not exist when
    # enable_svm is False.
    y_pred = loaded_model.predict(X_test)

    print('Accuracy of SVM classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of SVM classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    print("SVM: Confusion Matrix")
    cnf_matrix_svm = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_svm)

    print("SVM: Classfication Report")
    print(classification_report(Y_test, y_pred))

    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

    print("SVM: End")

SVM: Start
Sat Dec  1 03:43:15 2018
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=3)
SVM: Fit
[LibSVM]



SVM: Predict
Accuracy of SVM classifier on train set: 0.43
Accuracy of SVM classifier on test set: 0.43
SVM: Confusion Matrix
[[442180   9286]
 [585958  12576]]
SVM: Classfication Report
              precision    recall  f1-score   support

           0       0.43      0.98      0.60    451466
           1       0.58      0.02      0.04    598534

   micro avg       0.43      0.43      0.43   1050000
   macro avg       0.50      0.50      0.32   1050000
weighted avg       0.51      0.43      0.28   1050000

Sat Dec  1 03:49:29 2018
SVM: End


In [49]:
# Record and log the wall-clock time at which the whole run finished.
t_end = time.time()
print(time.ctime(t_end))

Sat Dec  1 03:49:29 2018
