In [35]:
import numpy as np
#in case we need to repeat experiment
#np.random.seed(255)

import pandas as pd
pd.options.display.max_rows = 22

import matplotlib.pyplot as plt
plt.style.use('classic')

import seaborn as sns
sns.set()

#sklearn imports
from sklearn.cluster import KMeans
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from pandas.api.types import CategoricalDtype
from sklearn.naive_bayes import GaussianNB
import pickle # allows for model to be saved/load to file
import time

#Use print instead of display when run as python script
pyscript = True

#Classifier verborsity where supported
verbose_level=3

sampleN = 4300000

model_max_iter = 1000
datafile = 'CKME136X10_2018_Data_CTF.csv'
datestr = 'dec_03_binary_run_01'

#Model Store
file_lr = 'lr_' + datestr + '.model'
file_lr_l1 = 'lr_l2_' + datestr + '.model'
file_dt = 'dt_' + datestr + '.model'
file_svm = 'svm_' + datestr + '.model'
file_knn = 'knn_' + datestr + '.model'
file_mlp = 'mlp_' + datestr + '.model'
file_kmean = 'kmean_' + datestr + '.model'
file_nbayes = 'nbayes_' + datestr + '.model'

file_final_train = 'final_train_' + datestr + '.csv'
file_final_test = 'final_test_' + datestr + '.csv'

#Enable Optimization Algorithms
enable_grid_search = False
svm_c = 1
svm_gamma = 1
feature_all = True
defaultFeatures = ['P_AGE', 'V_YEAR', 'C_HOUR', 'C_YEAR', 'C_MNTH', 'C_CONF', 'C_WDAY', 'C_VEHS', 'P_USER', 'P_SEX']

enable_lr_l1 = True
predict_lr_l1 = True

# Enable Algorithms
enable_lr = True
enable_dt = True
enable_svm = True
enable_knn = True
enable_mlp = True
enable_kmean = True
enable_nbayes = True

predict_lr = True
predict_dt = True
predict_svm = True
predict_knn = True
predict_mlp = True
predict_nbayes = True


#Multiclass classification, binary if falase
multiclass = False

In [36]:
print("Sample size: {}".format(sampleN))

if multiclass:
    print("Multi-Class Classification: Enabled")
else:
    print("Multi-Class Classification: Disabled")

if enable_grid_search:
    print("Grid Search: Enabled")
else:
    print("Grid Search: Disabled")

if feature_all:
    print("All Features: Enabled")
else:
    print("All Features: Disabled")
    
if enable_kmean:
    print("K-means: Enabled")
else:
    print("K-means: Disabled")

if enable_lr_l1:
    print("Logistic Regression: Enabled")
else:
    print("Logistic Regression: Disabled")
    
if enable_dt:
    print("Decision Tree: Enabled")
else:
    print("Decision Tree: Disabled")
    
if enable_svm:
    print("Support Vector Machines: Enabled")
else:
    print("Support Vector Machines: Disabled")

if  enable_knn:
    print("KNN: Enabled")
else:
    print("KNN: Disabled")
    
if enable_mlp:
    print("MLP: Enabled")
else:
    print("MLP: Disabled")


Sample size: 100000
Multi-Class Classification: Disabled
Grid Search: Disabled
All Features: Enabled
K-means: Enabled
Logistic Regression: Enabled
Decision Tree: Enabled
Support Vector Machines: Enabled
KNN: Enabled
MLP: Enabled


In [37]:
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))

Sun Dec  2 18:02:17 2018


In [38]:
#load data
df = pd.read_csv(datafile, engine = 'python')
print(df.head(2))

   C_YEAR  C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  \
0       1       1       1       2       1       4       4       1       4   
1       1       1       1       1       1       3       4       1       2   

   C_RALN  C_TRAF  V_TYPE  V_YEAR  P_SEX  P_AGE  P_PSN  P_SAFE  P_USER  P_ISEV  
0       3       7       1       3      0     25      1       2       1       2  
1       1       7       1       3      1     65      1       2       1       2  


In [39]:
print(df.isnull().sum().sum())

0


In [40]:
print(df[df.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())

0


In [41]:
df_cat = df.astype('category').copy()

In [42]:
totalRows = df_cat.index.size
print("Number of Rows in the dataset: {}".format(totalRows))

Number of Rows in the dataset: 4336558


In [43]:
print(df_cat.columns)

Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
       'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_TYPE', 'V_YEAR', 'P_SEX',
       'P_AGE', 'P_PSN', 'P_SAFE', 'P_USER', 'P_ISEV'],
      dtype='object')


In [44]:
#One-Hot-Encoding of categorical
#TBD

## Convert Class Variable to Binary if multi class disabled

In [45]:
## Convert Class Variable to Binary
### Merge Injury and Fatality as a single class
### we will compare the results.
if multiclass:
    #Undersample the majority for the 3 class evaluation
    
    df_class = df_cat.copy()
    
    # subset fatal class
    is_fatal =  df_class['P_ISEV']==3
    is_fatal_count = is_fatal.sum()
    print("Number of Fatal: {}".format(is_fatal_count))
    df_class_fatal = df_class[is_fatal]
    print(df_class_fatal.head(2))
    
    # subset injury class
    is_injury =  df_class['P_ISEV']==2
    is_injury_count = is_injury.sum()
    print("Number of Injury: {}".format(is_injury_count))
    df_class_injury = df_class[is_injury]
    print(df_class_injury.head(2))
    
    # subset non_injury class
    is_safe =  df_class['P_ISEV']==1
    is_safe_count = is_safe.sum()
    print("Number of Non-Injury: {}".format(is_safe_count))
    df_class_safe = df_class[is_safe]
    print(df_class_safe.head(2))
    
    # get the size of fatal datafram
    min_size = df_class_fatal.index.size
    print("Size of Fatal Subset: {}".format(min_size))
    
    # get size of injury
    print("Size of injury Subset: {}".format(df_class_injury.index.size))
    
    # size of non-fatal
    print("Size of non-fatal Subset: {}".format(df_class_safe.index.size))
    
    # randomly sample n number of injury and no injury and append to fatal
    df_class_injury_select = df_class_injury.sample(n=min_size)
    print("Shape of injury sampled dataframe: {}".format(df_class_injury_select.shape))
    df_class_safe_select = df_class_safe.sample(n=min_size)
    print("Shape of nom-injury sampled dataframe: {}".format(df_class_safe_select.shape))
    
    #concat the three dataframes
    df_underSample = pd.concat([df_class_fatal, df_class_injury_select, df_class_safe_select])
    print(df_underSample.shape)
    
    #TBD
    if sampleN < df_underSample.index.size:
        df_sample = df_underSample.sample(n=sampleN)
    else:
        df_sample = df_underSample.sample(n=df_underSample.index.size)
    
    #perform the conversion in two steps to avoid any unwanted side effects
    df_sample['P_ISEV'] = df_sample['P_ISEV'].map({1: 'safe', 2: 'injury', 3:'fatal'})
    df_sample['P_ISEV'] = df_sample['P_ISEV'].map({'safe': '0', 'injury': '1', 'fatal':'2'})
    print((df_sample['P_ISEV']=='0').sum())
    print((df_sample['P_ISEV']=='1').sum())
    print((df_sample['P_ISEV']=='2').sum())
    print(df_sample['P_ISEV'].unique())
else:
    df_class = df_cat.copy()

    #perform the conversion in two steps to avoid any unwanted side effects
    df_class['P_ISEV'] = df_class['P_ISEV'].map({1: 'safe', 2: 'injury', 3:'fatal'})
    df_class['P_ISEV'] = df_class['P_ISEV'].map({'safe': '0', 'injury': '1', 'fatal':'1'})
    print((df_class['P_ISEV']=='0').sum())
    print((df_class['P_ISEV']=='1').sum())
    print(df_class['P_ISEV'].unique())
    
    df_sample = df_class.sample(n=sampleN)

print("Size of dataframe for modeling: {}".format(df_sample.index.size))

1812162
2524396
['1' '0']
Size of dataframe for modeling: 100000


In [46]:
print(df_sample[df_sample.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())

0


In [47]:
# convert to the correct type
df['C_YEAR'] = df['C_YEAR'].astype(CategoricalDtype(ordered=True))
df['C_MNTH'] = df['C_MNTH'].astype(CategoricalDtype(ordered=True))
df['C_WDAY'] = df['C_WDAY'].astype(CategoricalDtype(ordered=True))
df['C_HOUR'] = df['C_HOUR'].astype(CategoricalDtype(ordered=True))
df['C_VEHS'] = df['C_VEHS'].astype('int')
df['V_YEAR'] = df['V_YEAR'].astype(CategoricalDtype(ordered=True))
df['P_PSN'] = df['P_PSN'].astype(CategoricalDtype(ordered=True))
df['P_AGE'] = df['P_AGE'].astype('int')
df_sample['P_ISEV'] = df_sample['P_ISEV'].astype('int')
print(df_sample.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 4035669 to 2943292
Data columns (total 19 columns):
C_YEAR    100000 non-null category
C_MNTH    100000 non-null category
C_WDAY    100000 non-null category
C_HOUR    100000 non-null category
C_VEHS    100000 non-null category
C_CONF    100000 non-null category
C_RCFG    100000 non-null category
C_WTHR    100000 non-null category
C_RSUR    100000 non-null category
C_RALN    100000 non-null category
C_TRAF    100000 non-null category
V_TYPE    100000 non-null category
V_YEAR    100000 non-null category
P_SEX     100000 non-null category
P_AGE     100000 non-null category
P_PSN     100000 non-null category
P_SAFE    100000 non-null category
P_USER    100000 non-null category
P_ISEV    100000 non-null int32
dtypes: category(18), int32(1)
memory usage: 2.9 MB
None


## Split Training and Testing for Binary class

In [48]:
#Split between data and class
Y = df_sample[df_sample.columns[-1]]
X = df_sample[df_sample.columns[0:df_sample.columns.size -1]]

# split data into X and y
#X = df_sample.iloc[:,0:16]
#Y = df_sample.iloc[:,-1]

In [49]:
print(Y.unique())

[1 0]


In [50]:
print(X.head(3))

        C_YEAR C_MNTH C_WDAY C_HOUR C_VEHS C_CONF C_RCFG C_WTHR C_RSUR C_RALN  \
4035669     17      8      2      1      2     35      2      1      1      1   
1374245      6      1      5      2      1      4      2      5      4      2   
1646812      7      2      1      1      1      6      2      3      2      1   

        C_TRAF V_TYPE V_YEAR P_SEX P_AGE P_PSN P_SAFE P_USER  
4035669      7      1      5     0    72     1      2      1  
1374245      7      1      4     0    19     1      2      1  
1646812      1      1      4     0    79     1      2      1  


#### Split Test(70%) and Train (30%) for Bianry class 

In [51]:
#sprint into train and test 70/30
#X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.3, random_state=0)
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.3)

### save the final train and test data for future model evaluation.

In [52]:
FullTrain = X_train.copy()
FullTrain['P_ISEV'] = Y_train.copy()
FullTrain.to_csv(file_final_train, encoding='utf-8', index=False)
print(FullTrain.head(10))

FullTest = X_test.copy()
FullTest['P_ISEV'] = Y_test.copy()
FullTest.to_csv(file_final_test, encoding='utf-8', index=False)
print(FullTest.head(10))

        C_YEAR C_MNTH C_WDAY C_HOUR C_VEHS C_CONF C_RCFG C_WTHR C_RSUR C_RALN  \
3947516     17      2      6      0      2     21      2      4      4      1   
4283057     18     10      2      1      2     21      4      1      1      1   
2821978     11     12      4      2      2     21      1      1      2      1   
870475       4      4      1      3      4     21      1      1      1      1   
2481189     10      6      4      2      2     21      1      1      1      3   
3059730     13      1      1      3      2     22      1      1      2      1   
3951136     17      3      1      2      2     22      1      1      1      1   
1564522      6     10      5      1      3     51      2      3      2      1   
2713785     11      7      1      1      2     36      2      1      1      1   
1429235      6      4      4      3      2      6      2      1      1      1   

        C_TRAF V_TYPE V_YEAR P_SEX P_AGE P_PSN P_SAFE P_USER  P_ISEV  
3947516      2      1      5     0   

## Clustering based on K-Means Clustering

In [53]:
if enable_kmean:
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    print("K-Means Clustering: Start")
    kmeans = KMeans(n_clusters=3, init='random', n_init=10, tol=1e-04, verbose= verbose_level, max_iter=model_max_iter)
    print(kmeans)
    
    print("K-Means Clustering: Build")
    ykm = kmeans.fit(X_train)
    
    if pyscript:
        print(ykm.cluster_centers_)
        print(ykm.labels_)
    else:
        display(ykm.cluster_centers_)
        display(ykm.labels_)
    
    # save model to file
    pickle.dump(ykm, open(file_kmean, "wb"))
    
    print("K-Means Clustering: End")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))


Sun Dec  2 18:04:42 2018
K-Means Clustering: Start
KMeans(algorithm='auto', copy_x=True, init='random', max_iter=1000,
    n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=3)
K-Means Clustering: Build
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 32964405.23405784
start iteration
done sorting
end inner loop
Iteration 1, inertia 30940639.987917688
start iteration
done sorting
end inner loop
Iteration 2, inertia 27718188.16631597
start iteration
done sorting
end inner loop
Iteration 3, inertia 23290693.974585395
start iteration
done sorting
end inner loop
Iteration 4, inertia 21475979.090534598
start iteration
done sorting
end inner loop
Iteration 5, inertia 21072001.62830391
start iteration
done sorting
end inner loop
Iteration 6, inertia 20895201.509040337
start iteration
done sorting
end inner loop
Iteration 7, inertia 20772379.50239139
start iteration
done sorting
end inner loop
It

Iteration 22, inertia 20584449.121372543
start iteration
done sorting
end inner loop
Iteration 23, inertia 20584393.384840738
center shift 4.405765e-02 within tolerance 3.292257e-03
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 21675337.480296824
start iteration
done sorting
end inner loop
Iteration 1, inertia 19527059.073517162
start iteration
done sorting
end inner loop
Iteration 2, inertia 19441721.614632636
start iteration
done sorting
end inner loop
Iteration 3, inertia 19428435.96533936
start iteration
done sorting
end inner loop
Iteration 4, inertia 19421594.104213234
start iteration
done sorting
end inner loop
Iteration 5, inertia 19420069.436835904
start iteration
done sorting
end inner loop
Iteration 6, inertia 19419795.788651336
start iteration
done sorting
end inner loop
Iteration 7, inertia 19419672.555205423
center shift 5.707385e-02 within tolerance 3.292257e-03
Initialization complete
start iteration
done sorting
end inner loop

### SVM GridSearch for Optimal Parms

In [54]:
#This operation is computationaly expensive.
if enable_grid_search:
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001]}
    grid = GridSearchCV(SVC(), param_grid, verbose=verbose_level)
    print(grid)
    grid.fit(X_train, Y_train)
    print(grid.best_params_)
    svm_c = grid.best_params_.get('C')
    svm_gamma = grid.best_params_.get('gamma')
    print(grid.best_estimator_)
    grid_predictions = grid.predict(X_test)
    cfn_matrix_grid = confusion_matrix(Y_test, grid_predictions)
    print(cfn_matrix_grid)
    print(classification_report(Y_test,grid_predictions))
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))

## Logistic Regression Model

In [55]:
if enable_lr:
    print("Logistic Regression: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    lr = LogisticRegression(C=1, random_state=0, solver='saga', multi_class='auto', 
                            verbose=verbose_level, n_jobs=10, max_iter=model_max_iter)
    print(lr)
    print("Logistic Regression: Fit")
    lr.fit(X_train, Y_train)
    
    # save model to file
    pickle.dump(lr, open(file_lr, "wb"))
    
    
if predict_lr:
    # load model from file
    loaded_model = pickle.load(open(file_lr, "rb"))
    print("Logistic Regression: Predict")
    y_pred = lr.predict(X_test)

    print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lr.score(X_test, Y_test)))

    # print the intercept (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
    print("Logistic Regression: Intercept")
    print(lr.intercept_)

    # print the coeficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
    print("Logistic Regression: Coefficients")
    print(lr.coef_)

    print("Logistic Regression: Confusion Matrix")
    cnf_matrix_lg = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_lg)
    
    print("Logistic Regression: Classification Report")
    print(classification_report(Y_test, y_pred))

    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("Logistic Regression: End")

Logistic Regression: Start
Sun Dec  2 18:04:47 2018
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='auto',
          n_jobs=10, penalty='l2', random_state=0, solver='saga',
          tol=0.0001, verbose=3, warm_start=False)
Logistic Regression: Fit


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 171 epochs took 5 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.64
Accuracy of logistic regression classifier on test set: 0.64
Logistic Regression: Intercept
[2.10892553]
Logistic Regression: Coefficients
[[ 9.96489692e-03 -9.70543762e-05 -5.86507754e-03 -7.54283526e-02
  -4.13065329e-01 -1.34949777e-03 -9.19852451e-02  1.43856304e-02
   4.90666864e-02  1.30873663e-01  3.83137071e-02  8.49061163e-02
  -2.01051559e-01 -7.29137075e-01  6.84988089e-03 -3.90707993e-01
  -1.55474309e-01  3.02406883e-01]]
Logistic Regression: Confusion Matrix


[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    5.0s finished


[[ 5649  7031]
 [ 3879 13441]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.59      0.45      0.51     12680
           1       0.66      0.78      0.71     17320

   micro avg       0.64      0.64      0.64     30000
   macro avg       0.62      0.61      0.61     30000
weighted avg       0.63      0.64      0.63     30000

Sun Dec  2 18:04:53 2018
Logistic Regression: End


### Logistic Regression with L1 Regularization

In [56]:
if (enable_lr_l1):
    # with L1 regularization
    print("Logistic Regression with L1 Regularization: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    lr = LogisticRegression(penalty='l1', C=1, solver='saga', multi_class='auto', 
                            verbose=verbose_level, n_jobs = 10, max_iter=model_max_iter)
    print(lr)
    print("Logistic Regression with L1 Regularization: Fit")
    lr.fit(X_train, Y_train)
    
    # save model to file
    pickle.dump(lr, open(file_lr_l1, "wb"))

if (predict_lr_l1):
    # load model from file
    loaded_model = pickle.load(open(file_lr_l1, "rb"))
    print("Logistic Regression with L1 Regularization: Predict")
    y_pred = lr.predict(X_test)
    
    print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lr.score(X_test, Y_test)))

    print("Logistic Regression with L1 Regularization: Confusion Matrix")
    cnf_matrix_lg_l1 = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_lg_l1)
    
    print(classification_report(Y_test,y_pred))
    print("Logistic Regression with L1 Regularization: Classification Report")

    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("Logistic Regression with L1 Regularization: End")

Logistic Regression with L1 Regularization: Start
Sun Dec  2 18:04:53 2018
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='auto',
          n_jobs=10, penalty='l1', random_state=None, solver='saga',
          tol=0.0001, verbose=3, warm_start=False)
Logistic Regression with L1 Regularization: Fit


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 168 epochs took 6 seconds
Logistic Regression with L1 Regularization: Predict
Accuracy of logistic regression classifier on train set: 0.64
Accuracy of logistic regression classifier on test set: 0.64
Logistic Regression with L1 Regularization: Confusion Matrix
[[ 5648  7032]
 [ 3878 13442]]


[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    6.1s finished


              precision    recall  f1-score   support

           0       0.59      0.45      0.51     12680
           1       0.66      0.78      0.71     17320

   micro avg       0.64      0.64      0.64     30000
   macro avg       0.62      0.61      0.61     30000
weighted avg       0.63      0.64      0.63     30000

Logistic Regression with L1 Regularization: Classification Report
Sun Dec  2 18:04:59 2018
Logistic Regression with L1 Regularization: End


### Naive Bayes

In [57]:
# Gaussian Naive Bayes Classification
if enable_nbayes:
    print("Naive Bayes: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    nbayes = GaussianNB()
    print(nbayes)
    print("Naive Bayes: Fit")
    nbayes.fit(X_train, Y_train)
    # save model to file
    pickle.dump(nbayes, open(file_nbayes, "wb"))

if predict_nbayes:
    # load model from file
    loaded_model = pickle.load(open(file_nbayes, "rb"))
    print("Naive Bayes: Predict")
    y_pred = nbayes.predict(X_test)
    print('Accuracy of Naive Bayes classifier on train set: {:.2f}'.format(nbayes.score(X_train, Y_train)))
    print('Accuracy of Naove Nayes classifier on test set: {:.2f}'.format(nbayes.score(X_test, Y_test)))
    
    cnf_matrix_dt = confusion_matrix(Y_test, y_pred)
    print("Naive Bayes: Confusion Matrix")
    print(cnf_matrix_dt)
    print("Naive Bayes: Classification Report")
    print(classification_report(Y_test,y_pred))
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    print("Naive Bayes: End")

Naive Bayes: Start
Sun Dec  2 18:04:59 2018
GaussianNB(priors=None, var_smoothing=1e-09)
Naive Bayes: Fit
Naive Bayes: Predict
Accuracy of Naive Bayes classifier on train set: 0.54
Accuracy of Naove Nayes classifier on test set: 0.55
Naive Bayes: Confusion Matrix
[[10076  2604]
 [10957  6363]]
Naive Bayes: Classification Report
              precision    recall  f1-score   support

           0       0.48      0.79      0.60     12680
           1       0.71      0.37      0.48     17320

   micro avg       0.55      0.55      0.55     30000
   macro avg       0.59      0.58      0.54     30000
weighted avg       0.61      0.55      0.53     30000

Sun Dec  2 18:04:59 2018
Naive Bayes: End


### Decision Tree

In [58]:
if enable_dt:
    print("Decision Tree: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    tree = DecisionTreeClassifier(criterion='entropy',max_depth=50)
    print(tree)
    print("Decision Tree: Fit")
    tree.fit(X_train, Y_train)
    # save model to file
    pickle.dump(tree, open(file_dt, "wb"))

if predict_dt:
    # load model from file
    loaded_model = pickle.load(open(file_dt, "rb"))
    print("Decision Tree: Predict")
    y_pred = tree.predict(X_test)
    print('Accuracy of Decision Tree classifier on train set: {:.2f}'.format(tree.score(X_train, Y_train)))
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(tree.score(X_test, Y_test)))
    
    cnf_matrix_dt = confusion_matrix(Y_test, y_pred)
    print("Decision Tree: Confusion Matrix")
    print(cnf_matrix_dt)
    print("Decision Tree: Classification Report")
    print(classification_report(Y_test,y_pred))
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    print("Decision Tree: End")

Decision Tree: Start
Sun Dec  2 18:04:59 2018
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=50,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Decision Tree: Fit
Decision Tree: Predict
Accuracy of Decision Tree classifier on train set: 1.00
Accuracy of Decision Tree classifier on test set: 0.61
Decision Tree: Confusion Matrix
[[ 6760  5920]
 [ 5891 11429]]
Decision Tree: Classification Report
              precision    recall  f1-score   support

           0       0.53      0.53      0.53     12680
           1       0.66      0.66      0.66     17320

   micro avg       0.61      0.61      0.61     30000
   macro avg       0.60      0.60      0.60     30000
weighted avg       0.61      0.61      0.61     30000

Sun Dec  2 18:05:00 201

### K-N-N

In [59]:
if enable_knn:
    print("KNN: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski', n_jobs = 10)
    print(knn)
    print("KNN: Fit")
    knn.fit(X_train, Y_train)

    # save model to file
    pickle.dump(knn, open(file_knn, "wb"))

if predict_knn:
    # load model from file
    loaded_model = pickle.load(open(file_knn, "rb"))
    
    print("KNN: Predict")
    y_pred = knn.predict(X_test)
    print('Accuracy of KNN classifier on train set: {:.2f}'.format(knn.score(X_train, Y_train)))
    print('Accuracy of KNN classifier on test set: {:.2f}'.format(knn.score(X_test, Y_test)))
    
    print("KNN: Confusion Matrix")
    cnf_matrix_knn = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_knn)

    print("KNN: Classification Report")
    print(classification_report(Y_test,y_pred))

    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))

    print("KNN: End")

KNN: Start
Sun Dec  2 18:05:00 2018
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=10, n_neighbors=5, p=2,
           weights='uniform')
KNN: Fit
KNN: Predict
Accuracy of KNN classifier on train set: 0.75
Accuracy of KNN classifier on test set: 0.61
KNN: Confusion Matrix
[[ 6098  6582]
 [ 5162 12158]]
KNN: Classification Report
              precision    recall  f1-score   support

           0       0.54      0.48      0.51     12680
           1       0.65      0.70      0.67     17320

   micro avg       0.61      0.61      0.61     30000
   macro avg       0.60      0.59      0.59     30000
weighted avg       0.60      0.61      0.60     30000

Sun Dec  2 18:05:05 2018
KNN: End


## ANN - Multilayer Perceptron

In [60]:
if enable_mlp:
    print("Multilayer Preceptron: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    
    #mlpc = MLPClassifier(alpha=1)
    #mlpc = MLPClassifier(hidden_layer_sizes=(12, 12, 12), max_iter=model_max_iter, verbose=verbose_level)
    mlpc = MLPClassifier(hidden_layer_sizes=(25, 25, 25), verbose=verbose_level, max_iter=model_max_iter)
    print(mlpc)
    #mlp = multilayer_perceptron(n_hidden =2, activation='logistic', algorithm='sgd', random_state=3)
    print("Multilayer Preceptron: fit")
    mlpc.fit(X_train, Y_train)
    
    # save model to file
    pickle.dump(mlpc, open(file_mlp, "wb"))
    
if predict_mlp:
    
    # load model from file
    loaded_model = pickle.load(open(file_mlp, "rb"))
    print("Multilayer Preceptron: Predict")
    y_pred = mlpc.predict(X_test)

    print('Accuracy of Multilayer Perceptron classifier on train set: {:.2f}'.format(mlpc.score(X_train, Y_train)))
    print('Accuracy of Multilayer Perceptron classifier on test set: {:.2f}'.format(mlpc.score(X_test, Y_test)))
    
    print("Multilayer Preceptron: Confusion Matrix")
    cnf_matrix_mlp = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_mlp)
    
    print("Multilayer Preceptron: Classificiation Report")
    print(classification_report(Y_test,y_pred))
    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
    print("Multilayer Preceptron: End")

Multilayer Preceptron: Start
Sun Dec  2 18:05:05 2018
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(25, 25, 25), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=3, warm_start=False)
Multilayer Preceptron: fit
Iteration 1, loss = 0.64593347
Iteration 2, loss = 0.61854109
Iteration 3, loss = 0.60661035
Iteration 4, loss = 0.60251818
Iteration 5, loss = 0.59998747
Iteration 6, loss = 0.59884675
Iteration 7, loss = 0.59768068
Iteration 8, loss = 0.59704098
Iteration 9, loss = 0.59627915
Iteration 10, loss = 0.59407944
Iteration 11, loss = 0.59440428
Iteration 12, loss = 0.59310135
Iteration 13, loss = 0.59289441
Iteration 14, loss = 0.59213067
Iteration 15, loss = 0

Iteration 237, loss = 0.56534169
Iteration 238, loss = 0.56530701
Iteration 239, loss = 0.56573264
Iteration 240, loss = 0.56556008
Iteration 241, loss = 0.56581817
Iteration 242, loss = 0.56538968
Iteration 243, loss = 0.56567726
Iteration 244, loss = 0.56522010
Iteration 245, loss = 0.56565549
Iteration 246, loss = 0.56511486
Iteration 247, loss = 0.56564569
Iteration 248, loss = 0.56476852
Iteration 249, loss = 0.56448517
Iteration 250, loss = 0.56501485
Iteration 251, loss = 0.56560523
Iteration 252, loss = 0.56507850
Iteration 253, loss = 0.56494024
Iteration 254, loss = 0.56535923
Iteration 255, loss = 0.56477252
Iteration 256, loss = 0.56444147
Iteration 257, loss = 0.56528696
Iteration 258, loss = 0.56436637
Iteration 259, loss = 0.56485252
Iteration 260, loss = 0.56436304
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Multilayer Preceptron: Predict
Accuracy of Multilayer Perceptron classifier on train set: 0.68
Accuracy of Multilayer 

### SVM

In [61]:
if enable_svm:
    print("SVM: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    #svm = SVC(C=1, random_state=0, kernel='sigmoid', verbose=True)
    #svm = SVC(C=1, random_state=0, kernel='linear', verbose=True, cache_size=200)
    #svm = SVC(C=svm_c, gamma=svm_gamma, verbose = verbose_level)
    #SVN prediction is taking for ever, limiting the max_iter to 100 instead of -1 (no limit)
    svm = SVC(C=1, gamma = 'auto', verbose = verbose_level, max_iter=model_max_iter)
    print(svm)
    print("SVM: Fit")
    svm.fit(X_train, Y_train)

    # save model to file
    pickle.dump(svm, open(file_svm, "wb"))
    
if predict_svm:
    # load model from file
    loaded_model = pickle.load(open(file_svm, "rb"))
    print("SVM: Predict")
    y_pred = svm.predict(X_test)
    
    print('Accuracy of SVM classifier on train set: {:.2f}'.format(svm.score(X_train, Y_train)))
    print('Accuracy of SVM classifier on test set: {:.2f}'.format(svm.score(X_test, Y_test)))
    
    print("SVM: Confusion Matrix")
    cnf_matrix_svm = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_svm)
    
    print("SVM: Classfication Report")
    print(classification_report(Y_test,y_pred))
    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
    print("SVM: End")

SVM: Start
Sun Dec  2 18:06:11 2018
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=3)
SVM: Fit
[LibSVM]



SVM: Predict
Accuracy of SVM classifier on train set: 0.43
Accuracy of SVM classifier on test set: 0.43
SVM: Confusion Matrix
[[11680  1000]
 [16186  1134]]
SVM: Classfication Report
              precision    recall  f1-score   support

           0       0.42      0.92      0.58     12680
           1       0.53      0.07      0.12     17320

   micro avg       0.43      0.43      0.43     30000
   macro avg       0.48      0.49      0.35     30000
weighted avg       0.48      0.43      0.31     30000

Sun Dec  2 18:06:22 2018
SVM: End


In [62]:
t_end =  time.time()
print(time.asctime( time.localtime(t_end) ))

Sun Dec  2 18:06:22 2018
