In [5]:
import numpy as np
#in case we need to repeat experiment
#np.random.seed(255)

import pandas as pd
pd.options.display.max_rows = 22

import matplotlib.pyplot as plt
plt.style.use('classic')

import seaborn as sns
sns.set()

#sklearn imports
from sklearn.cluster import KMeans
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from pandas.api.types import CategoricalDtype
from sklearn.naive_bayes import GaussianNB
import pickle # allows for model to be saved/load to file
import time

# Use print instead of display when run as a Python script
pyscript = True

# Classifier verbosity, where supported
verbose_level = 3

#sampleN = 4300000

# Multi-class classification when True, binary when False
multiclass = False
# Selects the "_O_" training file when True, the "_U_" one when False
# (presumably over-/under-sampled variants -- confirm against the data prep).
over_sample = False

#inputfile = 'CKME136X10_2018_Data_CTF.csv'
if multiclass:
    inputfile_train_O = 'CKME136X10_2018_Data_CTFB_M_O_Train.csv'
    inputfile_train_U = 'CKME136X10_2018_Data_CTFB_M_U_Train.csv'
    inputfile_test = 'CKME136X10_2018_Data_CTFB_M_Test.csv'
else:
    inputfile_train_O = 'CKME136X10_2018_Data_CTFB_B_O_Train.csv'
    inputfile_train_U = 'CKME136X10_2018_Data_CTFB_B_U_Train.csv'
    inputfile_test = 'CKME136X10_2018_Data_CTFB_B_Test.csv'

datafile_train = inputfile_train_O if over_sample else inputfile_train_U
datafile_test = inputfile_test

# Iteration cap handed to the estimators as max_iter
model_max_iter = 1000
# Run tag embedded in every output file name.
# NOTE(review): maintained by hand; it is not derived from multiclass,
# over_sample or model_max_iter, so keep it in sync when those change.
datestr = 'dec_04_binary_run_1000_BU'

# Model store: one pickle file per fitted estimator
file_lr = 'lr_' + datestr + '.model'
# Holds the L1-penalised model, so the prefix is 'lr_l1_' (was 'lr_l2_').
# Models pickled by earlier runs under the old 'lr_l2_' name must be renamed
# before they can be re-loaded.
file_lr_l1 = 'lr_l1_' + datestr + '.model'
file_dt = 'dt_' + datestr + '.model'
file_svm = 'svm_' + datestr + '.model'
file_knn = 'knn_' + datestr + '.model'
file_mlp = 'mlp_' + datestr + '.model'
file_kmean = 'kmean_' + datestr + '.model'
file_nbayes = 'nbayes_' + datestr + '.model'

file_final_train = 'final_train_' + datestr + '.csv'
file_final_test = 'final_test_' + datestr + '.csv'

# Enable optimization algorithms
enable_grid_search = False
# Starting values; the grid-search cell overwrites them when it runs
svm_c = 1
svm_gamma = 1
feature_all = True
defaultFeatures = ['P_AGE', 'V_YEAR', 'C_HOUR', 'C_YEAR', 'C_MNTH', 'C_CONF', 'C_WDAY', 'C_VEHS', 'P_USER', 'P_SEX']

enable_lr_l1 = True
predict_lr_l1 = True

# Enable algorithms (enable_* fits and pickles the model, predict_* scores it)
enable_lr = True
enable_dt = True
enable_svm = True
enable_knn = True
enable_mlp = True
enable_kmean = True
enable_nbayes = True

predict_lr = True
predict_dt = True
predict_svm = True
predict_knn = True
predict_mlp = True
predict_nbayes = True

In [6]:
#print("Sample size: {}".format(sampleN))

# Echo the run configuration so the log records which stages were switched on.
# Each entry is (label, flag); the line printed is "<label>: Enabled" or
# "<label>: Disabled".
run_flags = [
    ("Multi-Class Classification", multiclass),
    ("Grid Search", enable_grid_search),
    ("All Features", feature_all),
    ("K-means", enable_kmean),
    # Previously this line reported enable_lr_l1 under the plain
    # "Logistic Regression" label; each variant now reports its own flag.
    ("Logistic Regression", enable_lr),
    ("Logistic Regression with L1 Regularization", enable_lr_l1),
    ("Naive Bayes", enable_nbayes),
    ("Decision Tree", enable_dt),
    ("Support Vector Machines", enable_svm),
    ("KNN", enable_knn),
    ("MLP", enable_mlp),
]
for flag_label, flag_value in run_flags:
    print("{}: {}".format(flag_label, "Enabled" if flag_value else "Disabled"))


Multi-Class Classification: Disabled
Grid Search: Disabled
All Features: Enabled
K-means: Enabled
Logistic Regression: Enabled
Decision Tree: Enabled
Support Vector Machines: Enabled
KNN: Enabled
MLP: Enabled


In [7]:
# Stamp the moment the run begins and show it in human-readable local time.
t_start = time.time()
print(time.ctime(t_start))

Tue Dec  4 21:17:13 2018


In [8]:
# Read the test split first, then the selected training split, both with the
# pure-Python CSV parser.
df_test, df_train = (
    pd.read_csv(csv_path, engine='python')
    for csv_path in (datafile_test, datafile_train)
)
# Independent working copy of the training frame.
df = df_train.copy()

# Preview the first two rows of each frame (test, then train).
print(df_test.head(2), df_train.head(2), sep='\n')

   C_YEAR  C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  \
0      11       8       3       0       3      36       2       1       1   
1      17       3       5       2       3      21       2       1       1   

   C_RALN  C_TRAF  V_TYPE  V_YEAR  P_SEX  P_AGE  P_PSN  P_SAFE  P_USER  P_ISEV  
0       1       1       5       4      1     24      2       2       2       0  
1       1       1       7       5      1     27      1       2       1       0  
   C_YEAR  C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  \
0       5       4       7       4       1       6       1       3       2   
1       4       5       6       3       1       6       2       1       1   

   C_RALN  C_TRAF  V_TYPE  V_YEAR  P_SEX  P_AGE  P_PSN  P_SAFE  P_USER  P_ISEV  
0       1       7       1       4      0     10      2       2       2       0  
1       2       1       1       3      1     42      1       2       1       0  


In [9]:
# Total number of missing cells per frame; 0 means there is nothing to impute.
missing_test = df_test.isnull().values.sum()
missing_train = df_train.isnull().values.sum()
print(missing_test, missing_train, sep='\n')

0
0


In [10]:
def _count_non_digit_cells(frame):
    """Return how many cells of `frame` contain any character other than 0-9.

    Every column is rendered as text and matched against the regular
    expression '[^0-9]'; the per-column hit counts are then summed.
    """
    flagged = frame.apply(lambda column: column.astype(str).str.contains('[^0-9]'))
    return flagged.sum().sum()


print(_count_non_digit_cells(df_test))
print(_count_non_digit_cells(df_train))

0
0


In [11]:
# Blanket-cast every column to 'category'. astype() hands back a new frame,
# so df_test / df_train themselves are left as loaded.
df_test_cat, df_train_cat = (
    raw_frame.astype('category') for raw_frame in (df_test, df_train)
)

In [12]:
# Row counts of both splits, kept in variables for later reference.
total_test_Rows = len(df_test_cat)
total_train_Rows = len(df_train_cat)

print("Number of Rows in test data: {}".format(total_test_Rows))
print("Number of Rows in train data: {}".format(total_train_Rows))

Number of Rows in test data: 1300968
Number of Rows in train data: 2537026


In [13]:
# Column index followed by the per-column dtypes of the training frame.
print(df_train_cat.columns, df_train_cat.dtypes, sep='\n')

Index(['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG',
       'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_TYPE', 'V_YEAR', 'P_SEX',
       'P_AGE', 'P_PSN', 'P_SAFE', 'P_USER', 'P_ISEV'],
      dtype='object')
C_YEAR    category
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
V_TYPE    category
V_YEAR    category
P_SEX     category
P_AGE     category
P_PSN     category
P_SAFE    category
P_USER    category
P_ISEV    category
dtype: object


In [14]:
# TODO: one-hot encode the categorical columns.
# Not implemented yet; this cell is a placeholder.

In [15]:
#print(df_test_cat[df_test_cat.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())
#print(df_train_cat[df_train_cat.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())

### type cast train data

In [16]:
# Refine the blanket 'category' cast on the training frame:
#   * time-like / positional codes become ordered categoricals,
#   * counts, age and the class label become plain integers.
# Columns not listed keep the unordered 'category' dtype.
ordered_cols = ['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'V_YEAR', 'P_PSN']
integer_cols = ['C_VEHS', 'P_AGE', 'P_ISEV']

train_dtypes = {name: CategoricalDtype(ordered=True) for name in ordered_cols}
train_dtypes.update({name: 'int' for name in integer_cols})
df_train_cat = df_train_cat.astype(train_dtypes)

# info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df_train_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2537026 entries, 0 to 2537025
Data columns (total 19 columns):
C_YEAR    category
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    int64
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
V_TYPE    category
V_YEAR    category
P_SEX     category
P_AGE     int64
P_PSN     category
P_SAFE    category
P_USER    category
P_ISEV    int64
dtypes: category(16), int64(3)
memory usage: 96.8 MB
None


### type cast test data

In [17]:
# Apply to the test frame the same dtype refinement used for the training
# frame:
#   * time-like / positional codes become ordered categoricals,
#   * counts, age and the class label become plain integers.
# Columns not listed keep the unordered 'category' dtype.
ordered_cols = ['C_YEAR', 'C_MNTH', 'C_WDAY', 'C_HOUR', 'V_YEAR', 'P_PSN']
integer_cols = ['C_VEHS', 'P_AGE', 'P_ISEV']

test_dtypes = {name: CategoricalDtype(ordered=True) for name in ordered_cols}
test_dtypes.update({name: 'int' for name in integer_cols})
df_test_cat = df_test_cat.astype(test_dtypes)

# info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df_test_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300968 entries, 0 to 1300967
Data columns (total 19 columns):
C_YEAR    1300968 non-null category
C_MNTH    1300968 non-null category
C_WDAY    1300968 non-null category
C_HOUR    1300968 non-null category
C_VEHS    1300968 non-null int64
C_CONF    1300968 non-null category
C_RCFG    1300968 non-null category
C_WTHR    1300968 non-null category
C_RSUR    1300968 non-null category
C_RALN    1300968 non-null category
C_TRAF    1300968 non-null category
V_TYPE    1300968 non-null category
V_YEAR    1300968 non-null category
P_SEX     1300968 non-null category
P_AGE     1300968 non-null int64
P_PSN     1300968 non-null category
P_SAFE    1300968 non-null category
P_USER    1300968 non-null category
P_ISEV    1300968 non-null int64
dtypes: category(16), int64(3)
memory usage: 49.6 MB
None


## Split Training and Testing for Binary class

In [18]:
# The class label is the last column; every column before it is a predictor.
X_train, Y_train = df_train_cat.iloc[:, :-1], df_train_cat.iloc[:, -1]
X_test, Y_test = df_test_cat.iloc[:, :-1], df_test_cat.iloc[:, -1]

# split data into X and y
#X = df_sample.iloc[:,0:16]
#Y = df_sample.iloc[:,-1]

In [19]:
# Distinct class labels present in the training target.
print(pd.unique(Y_train))

[0 1]


In [20]:
# First three rows of the training predictors.
print(X_train.iloc[:3])

  C_YEAR C_MNTH C_WDAY C_HOUR  C_VEHS C_CONF C_RCFG C_WTHR C_RSUR C_RALN  \
0      5      4      7      4       1      6      1      3      2      1   
1      4      5      6      3       1      6      2      1      1      2   
2      1      2      6      0       3     21      1      4      4      1   

  C_TRAF V_TYPE V_YEAR P_SEX  P_AGE P_PSN P_SAFE P_USER  
0      7      1      4     0     10     2      2      2  
1      1      1      3     1     42     1      2      1  
2      7      6      3     0     17     1      2      2  


## Clustering based on K-Means Clustering

In [21]:
if enable_kmean:
    # Unsupervised K-Means with 3 clusters on the training predictors.
    # NOTE(review): no random_state is passed and init='random', so the
    # cluster centers are not reproducible between runs.
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    print("K-Means Clustering: Start")
    kmeans = KMeans(n_clusters=3, init='random', n_init=10, tol=1e-04,
                    verbose=verbose_level, max_iter=model_max_iter)
    print(kmeans)

    print("K-Means Clustering: Build")
    # fit() returns the estimator itself, so ykm and kmeans are the same object
    ykm = kmeans.fit(X_train)

    if pyscript:
        print(ykm.cluster_centers_)
        print(ykm.labels_)
    else:
        # display() is only available inside a notebook session
        display(ykm.cluster_centers_)
        display(ykm.labels_)

    # Save model to file; the context manager closes the handle so the
    # pickle is fully flushed to disk.
    with open(file_kmean, "wb") as model_file:
        pickle.dump(ykm, model_file)

    print("K-Means Clustering: End")
    # Keep the start time intact and record the finish separately
    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))


Tue Dec  4 21:19:27 2018
K-Means Clustering: Start
KMeans(algorithm='auto', copy_x=True, init='random', max_iter=1000,
    n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=3)
K-Means Clustering: Build
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 826189331.6912264
start iteration
done sorting
end inner loop
Iteration 1, inertia 743728607.1481513
start iteration
done sorting
end inner loop
Iteration 2, inertia 720988191.1436778
start iteration
done sorting
end inner loop
Iteration 3, inertia 714287617.1276181
start iteration
done sorting
end inner loop
Iteration 4, inertia 711386270.1052762
start iteration
done sorting
end inner loop
Iteration 5, inertia 709767360.6394216
start iteration
done sorting
end inner loop
Iteration 6, inertia 708417377.7990619
start iteration
done sorting
end inner loop
Iteration 7, inertia 706500403.3477348
start iteration
done sorting
end inner loop
Iterat

Iteration 9, inertia 727007853.6168746
start iteration
done sorting
end inner loop
Iteration 10, inertia 726947380.4141393
start iteration
done sorting
end inner loop
Iteration 11, inertia 726909680.457874
start iteration
done sorting
end inner loop
Iteration 12, inertia 726875424.7844552
start iteration
done sorting
end inner loop
Iteration 13, inertia 726804666.0011026
start iteration
done sorting
end inner loop
Iteration 14, inertia 726669101.1980345
start iteration
done sorting
end inner loop
Iteration 15, inertia 726601960.4146353
start iteration
done sorting
end inner loop
Iteration 16, inertia 726599079.9213473
center shift 3.970430e-02 within tolerance 3.247793e-03
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 1050254717.7857907
start iteration
done sorting
end inner loop
Iteration 1, inertia 828176607.4427689
start iteration
done sorting
end inner loop
Iteration 2, inertia 728200069.5304596
start iteration
done sorting
end inner loop


Iteration 45, inertia 692746876.3378167
start iteration
done sorting
end inner loop
Iteration 46, inertia 692733438.6193198
start iteration
done sorting
end inner loop
Iteration 47, inertia 692729117.9775976
start iteration
done sorting
end inner loop
Iteration 48, inertia 692728335.4856639
center shift 2.166065e-02 within tolerance 3.247793e-03
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 1097449095.2082121
start iteration
done sorting
end inner loop
Iteration 1, inertia 930383413.2612684
start iteration
done sorting
end inner loop
Iteration 2, inertia 785505561.7838489
start iteration
done sorting
end inner loop
Iteration 3, inertia 721959291.277948
start iteration
done sorting
end inner loop
Iteration 4, inertia 708138757.9574724
start iteration
done sorting
end inner loop
Iteration 5, inertia 705450438.1440082
start iteration
done sorting
end inner loop
Iteration 6, inertia 704597046.2563963
start iteration
done sorting
end inner loop
Ite

### SVM GridSearch for Optimal Parameters

In [22]:
# Exhaustive search over 5 x 5 = 25 (C, gamma) combinations for SVC.
# Computationally expensive, hence gated behind enable_grid_search.
if enable_grid_search:
    t_start = time.time()
    print(time.ctime(t_start))

    param_grid = {
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    }
    grid = GridSearchCV(SVC(), param_grid, verbose=verbose_level, n_jobs=10)
    print(grid)
    grid.fit(X_train, Y_train)

    # Report the winning combination and keep it in the shared settings
    print(grid.best_params_)
    svm_c = grid.best_params_['C']
    svm_gamma = grid.best_params_['gamma']
    print(grid.best_estimator_)

    # Score the refitted best estimator on the held-out test split
    grid_predictions = grid.predict(X_test)
    cfn_matrix_grid = confusion_matrix(Y_test, grid_predictions)
    print(cfn_matrix_grid)
    print(classification_report(Y_test, grid_predictions))

    t_end = time.time()
    print(time.ctime(t_end))

## Logistic Regression Model

In [None]:
if enable_lr:
    # Fit a logistic regression with the estimator's default penalty and
    # persist it.
    print("Logistic Regression: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    lr = LogisticRegression(C=1, random_state=0, solver='saga', multi_class='ovr',
                            verbose=verbose_level, n_jobs=10, max_iter=model_max_iter)
    print(lr)
    print("Logistic Regression: Fit")
    lr.fit(X_train, Y_train)

    # Save model to file; the context manager closes the handle
    with open(file_lr, "wb") as model_file:
        pickle.dump(lr, model_file)


if predict_lr:
    # Load the persisted model and evaluate *that* object. The previous code
    # loaded it but kept using `lr`, which raised NameError whenever
    # enable_lr was False.
    # NOTE: pickle.load executes arbitrary code; only load files written by
    # this notebook.
    with open(file_lr, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("Logistic Regression: Predict")
    y_pred = loaded_model.predict(X_test)

    print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    # print the intercept (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
    print("Logistic Regression: Intercept")
    print(loaded_model.intercept_)

    # print the coefficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
    print("Logistic Regression: Coefficients")
    print(loaded_model.coef_)

    print("Logistic Regression: Confusion Matrix")
    cnf_matrix_lg = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_lg)

    print("Logistic Regression: Classification Report")
    print(classification_report(Y_test, y_pred))

    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

print("Logistic Regression: End")

Logistic Regression: Start
Tue Dec  4 21:22:45 2018
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=10,
          penalty='l2', random_state=0, solver='saga', tol=0.0001,
          verbose=3, warm_start=False)
Logistic Regression: Fit


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 13 epochs took 24 seconds
Logistic Regression: Predict


[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   24.5s finished


Accuracy of logistic regression classifier on train set: 0.63
Accuracy of logistic regression classifier on test set: 0.63
Logistic Regression: Intercept
[1.81115337]
Logistic Regression: Coefficients
[[ 0.01441726  0.00090733 -0.00517628 -0.06828971 -0.42899233 -0.00110551
  -0.08845583  0.01004259  0.0638277   0.14032783  0.03962497  0.08363148
  -0.23448005 -0.71741377  0.00600312 -0.38268175 -0.146203    0.29418861]]
Logistic Regression: Confusion Matrix
[[354150 189499]
 [295223 462096]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.55      0.65      0.59    543649
           1       0.71      0.61      0.66    757319

   micro avg       0.63      0.63      0.63   1300968
   macro avg       0.63      0.63      0.62   1300968
weighted avg       0.64      0.63      0.63   1300968

Tue Dec  4 21:23:19 2018
Logistic Regression: End


### Logistic Regression with L1 Regularization

In [None]:
if enable_lr_l1:
    # Fit a logistic regression with L1 regularization and persist it.
    print("Logistic Regression with L1 Regularization: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    lr = LogisticRegression(penalty='l1', C=1, solver='saga', multi_class='ovr',
                            verbose=verbose_level, n_jobs=10, max_iter=model_max_iter)
    print(lr)
    print("Logistic Regression with L1 Regularization: Fit")
    lr.fit(X_train, Y_train)

    # Save model to file; the context manager closes the handle
    with open(file_lr_l1, "wb") as model_file:
        pickle.dump(lr, model_file)

if predict_lr_l1:
    # Load the persisted model and evaluate *that* object. The previous code
    # loaded it but kept using `lr`, which raised NameError whenever
    # enable_lr_l1 was False.
    # NOTE: pickle.load executes arbitrary code; only load files written by
    # this notebook.
    with open(file_lr_l1, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("Logistic Regression with L1 Regularization: Predict")
    y_pred = loaded_model.predict(X_test)

    print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    print("Logistic Regression with L1 Regularization: Confusion Matrix")
    cnf_matrix_lg_l1 = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_lg_l1)

    # The heading is printed before the report (it used to come after it)
    print("Logistic Regression with L1 Regularization: Classification Report")
    print(classification_report(Y_test, y_pred))

    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

print("Logistic Regression with L1 Regularization: End")

Logistic Regression with L1 Regularization: Start
Tue Dec  4 21:23:19 2018
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=10,
          penalty='l1', random_state=None, solver='saga', tol=0.0001,
          verbose=3, warm_start=False)
Logistic Regression with L1 Regularization: Fit


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 13 epochs took 26 seconds
Logistic Regression with L1 Regularization: Predict


[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   26.2s finished


Accuracy of logistic regression classifier on train set: 0.63
Accuracy of logistic regression classifier on test set: 0.63
Logistic Regression with L1 Regularization: Confusion Matrix
[[353894 189755]
 [294878 462441]]
              precision    recall  f1-score   support

           0       0.55      0.65      0.59    543649
           1       0.71      0.61      0.66    757319

   micro avg       0.63      0.63      0.63   1300968
   macro avg       0.63      0.63      0.62   1300968
weighted avg       0.64      0.63      0.63   1300968

Logistic Regression with L1 Regularization: Classification Report
Tue Dec  4 21:23:54 2018
Logistic Regression with L1 Regularization: End


### Naive Bayes

In [None]:
# Gaussian Naive Bayes Classification
if enable_nbayes:
    print("Naive Bayes: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    nbayes = GaussianNB()
    print(nbayes)
    print("Naive Bayes: Fit")
    nbayes.fit(X_train, Y_train)
    # Save model to file; the context manager closes the handle
    with open(file_nbayes, "wb") as model_file:
        pickle.dump(nbayes, model_file)

if predict_nbayes:
    # Load the persisted model and evaluate *that* object. The previous code
    # loaded it but kept using `nbayes`, which raised NameError whenever
    # enable_nbayes was False.
    # NOTE: pickle.load executes arbitrary code; only load files written by
    # this notebook.
    with open(file_nbayes, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("Naive Bayes: Predict")
    y_pred = loaded_model.predict(X_test)
    print('Accuracy of Naive Bayes classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    # Message typo fixed (was "Naove Nayes")
    print('Accuracy of Naive Bayes classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    # Stored under its own name; it used to be written to the decision-tree
    # variable cnf_matrix_dt.
    cnf_matrix_nbayes = confusion_matrix(Y_test, y_pred)
    print("Naive Bayes: Confusion Matrix")
    print(cnf_matrix_nbayes)
    print("Naive Bayes: Classification Report")
    print(classification_report(Y_test, y_pred))
    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))
    print("Naive Bayes: End")

Naive Bayes: Start
Tue Dec  4 21:23:54 2018
GaussianNB(priors=None, var_smoothing=1e-09)
Naive Bayes: Fit
Naive Bayes: Predict
Accuracy of Naive Bayes classifier on train set: 0.58
Accuracy of Naove Nayes classifier on test set: 0.53
Naive Bayes: Confusion Matrix
[[450207  93442]
 [512227 245092]]
Naive Bayes: Classification Report
              precision    recall  f1-score   support

           0       0.47      0.83      0.60    543649
           1       0.72      0.32      0.45    757319

   micro avg       0.53      0.53      0.53   1300968
   macro avg       0.60      0.58      0.52   1300968
weighted avg       0.62      0.53      0.51   1300968

Tue Dec  4 21:24:05 2018
Naive Bayes: End


### Decision Tree

In [None]:
if enable_dt:
    # Fit an entropy-criterion decision tree (depth capped at 50) and
    # persist it.
    print("Decision Tree: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=50)
    print(tree)
    print("Decision Tree: Fit")
    tree.fit(X_train, Y_train)
    # Save model to file; the context manager closes the handle
    with open(file_dt, "wb") as model_file:
        pickle.dump(tree, model_file)

if predict_dt:
    # Load the persisted model and evaluate *that* object. The previous code
    # loaded it but kept using `tree`, which raised NameError whenever
    # enable_dt was False.
    # NOTE: pickle.load executes arbitrary code; only load files written by
    # this notebook.
    with open(file_dt, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("Decision Tree: Predict")
    y_pred = loaded_model.predict(X_test)
    print('Accuracy of Decision Tree classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    cnf_matrix_dt = confusion_matrix(Y_test, y_pred)
    print("Decision Tree: Confusion Matrix")
    print(cnf_matrix_dt)
    print("Decision Tree: Classification Report")
    print(classification_report(Y_test, y_pred))
    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))
    print("Decision Tree: End")

Decision Tree: Start
Tue Dec  4 21:24:05 2018
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=50,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Decision Tree: Fit
Decision Tree: Predict
Accuracy of Decision Tree classifier on train set: 0.99
Accuracy of Decision Tree classifier on test set: 0.61
Decision Tree: Confusion Matrix
[[336030 207619]
 [293608 463711]]
Decision Tree: Classification Report
              precision    recall  f1-score   support

           0       0.53      0.62      0.57    543649
           1       0.69      0.61      0.65    757319

   micro avg       0.61      0.61      0.61   1300968
   macro avg       0.61      0.62      0.61   1300968
weighted avg       0.63      0.61      0.62   1300968

Tue Dec  4 21:25:04

### K-N-N

In [None]:
if enable_knn:
    # Fit a 5-nearest-neighbours classifier (Minkowski metric with p=2) on
    # all available cores and persist it.
    print("KNN: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski', n_jobs=-1)
    print(knn)
    print("KNN: Fit")
    knn.fit(X_train, Y_train)

    # Save model to file; the context manager closes the handle
    with open(file_knn, "wb") as model_file:
        pickle.dump(knn, model_file)

if predict_knn:
    # Load the persisted model and evaluate *that* object. The previous code
    # loaded it but kept using `knn`, which raised NameError whenever
    # enable_knn was False.
    # NOTE: pickle.load executes arbitrary code; only load files written by
    # this notebook.
    with open(file_knn, "rb") as model_file:
        loaded_model = pickle.load(model_file)

    print("KNN: Predict")
    y_pred = loaded_model.predict(X_test)
    print('Accuracy of KNN classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of KNN classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    print("KNN: Confusion Matrix")
    cnf_matrix_knn = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_knn)

    print("KNN: Classification Report")
    print(classification_report(Y_test, y_pred))

    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

    print("KNN: End")

KNN: Start
Tue Dec  4 21:25:04 2018
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
           weights='uniform')
KNN: Fit
KNN: Predict
Accuracy of KNN classifier on train set: 0.76
Accuracy of KNN classifier on test set: 0.62
KNN: Confusion Matrix
[[353776 189873]
 [302672 454647]]
KNN: Classification Report
              precision    recall  f1-score   support

           0       0.54      0.65      0.59    543649
           1       0.71      0.60      0.65    757319

   micro avg       0.62      0.62      0.62   1300968
   macro avg       0.62      0.63      0.62   1300968
weighted avg       0.64      0.62      0.62   1300968

Tue Dec  4 22:36:25 2018
KNN: End


## ANN - Multilayer Perceptron

In [None]:
if enable_mlp:
    # Fit a multilayer perceptron with three hidden layers of 25 units and
    # persist it.
    print("Multilayer Preceptron: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))

    # Earlier configurations, kept for reference:
    #mlpc = MLPClassifier(alpha=1)
    #mlpc = MLPClassifier(hidden_layer_sizes=(12, 12, 12), max_iter=model_max_iter, verbose=verbose_level)
    mlpc = MLPClassifier(hidden_layer_sizes=(25, 25, 25), verbose=verbose_level, max_iter=model_max_iter)
    print(mlpc)
    #mlp = multilayer_perceptron(n_hidden =2, activation='logistic', algorithm='sgd', random_state=3)
    print("Multilayer Preceptron: fit")
    mlpc.fit(X_train, Y_train)

    # Save model to file; the context manager closes the handle
    with open(file_mlp, "wb") as model_file:
        pickle.dump(mlpc, model_file)

if predict_mlp:
    # Load the persisted model and evaluate *that* object. The previous code
    # loaded it but kept using `mlpc`, which raised NameError whenever
    # enable_mlp was False.
    # NOTE: pickle.load executes arbitrary code; only load files written by
    # this notebook.
    with open(file_mlp, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("Multilayer Preceptron: Predict")
    y_pred = loaded_model.predict(X_test)

    print('Accuracy of Multilayer Perceptron classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of Multilayer Perceptron classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    print("Multilayer Preceptron: Confusion Matrix")
    cnf_matrix_mlp = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_mlp)

    print("Multilayer Preceptron: Classificiation Report")
    print(classification_report(Y_test, y_pred))

    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

    print("Multilayer Preceptron: End")

Multilayer Preceptron: Start
Tue Dec  4 22:36:25 2018
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(25, 25, 25), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=3, warm_start=False)
Multilayer Preceptron: fit
Iteration 1, loss = 0.60980575
Iteration 2, loss = 0.59258136
Iteration 3, loss = 0.58562434
Iteration 4, loss = 0.58262255
Iteration 5, loss = 0.58135142
Iteration 6, loss = 0.58017606
Iteration 7, loss = 0.57940530
Iteration 8, loss = 0.57907593
Iteration 9, loss = 0.57853862
Iteration 10, loss = 0.57820478
Iteration 11, loss = 0.57784142
Iteration 12, loss = 0.57758402
Iteration 13, loss = 0.57734173
Iteration 14, loss = 0.57701423
Iteration 15, loss = 0

### SVM

In [None]:
if enable_svm:
    # Fit a support vector classifier and persist it.
    print("SVM: Start")
    t_start = time.time()
    print(time.asctime(time.localtime(t_start)))
    # Earlier configurations, kept for reference:
    #svm = SVC(C=1, random_state=0, kernel='sigmoid', verbose=True)
    #svm = SVC(C=1, random_state=0, kernel='linear', verbose=True, cache_size=200)
    #svm = SVC(C=svm_c, gamma=svm_gamma, verbose = verbose_level)
    # SVM was taking forever, so max_iter is capped at model_max_iter
    # instead of -1 (no limit).
    # NOTE(review): C and gamma are hard-coded here, so the values found by
    # the grid-search cell (svm_c, svm_gamma) are not used.
    svm = SVC(C=1, gamma='auto', verbose=verbose_level, max_iter=model_max_iter)
    print(svm)
    print("SVM: Fit")
    svm.fit(X_train, Y_train)

    # Save model to file; the context manager closes the handle
    with open(file_svm, "wb") as model_file:
        pickle.dump(svm, model_file)

if predict_svm:
    # Load the persisted model and evaluate *that* object. The previous code
    # loaded it but kept using `svm`, which raised NameError whenever
    # enable_svm was False.
    # NOTE: pickle.load executes arbitrary code; only load files written by
    # this notebook.
    with open(file_svm, "rb") as model_file:
        loaded_model = pickle.load(model_file)
    print("SVM: Predict")
    y_pred = loaded_model.predict(X_test)

    print('Accuracy of SVM classifier on train set: {:.2f}'.format(loaded_model.score(X_train, Y_train)))
    print('Accuracy of SVM classifier on test set: {:.2f}'.format(loaded_model.score(X_test, Y_test)))

    print("SVM: Confusion Matrix")
    cnf_matrix_svm = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_svm)

    print("SVM: Classfication Report")
    print(classification_report(Y_test, y_pred))

    t_end = time.time()
    print(time.asctime(time.localtime(t_end)))

    print("SVM: End")

SVM: Start
Tue Dec  4 22:49:44 2018
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=3)
SVM: Fit
[LibSVM]



SVM: Predict


In [None]:
# Stamp the moment the run finishes and show it in human-readable local time.
t_end = time.time()
print(time.ctime(t_end))