In [1]:
import numpy as np
#in case we need to repeat experiment
#np.random.seed(255)

import pandas as pd
pd.options.display.max_rows = 22

import matplotlib.pyplot as plt
plt.style.use('classic')

import seaborn as sns
sns.set()

#sklearn imports
from sklearn.cluster import KMeans
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from pandas.api.types import CategoricalDtype
from sklearn.naive_bayes import GaussianNB
import pickle # allows for model to be saved/load to file
import time

#Use print instead of display when run as python script
pyscript = True

#Classifier verborsity where supported
verbose_level=3

#sampleN = 4300000

#Multiclass classification, binary if falase
multiclass = True
over_sample = True
balanced = False

if multiclass:
    labels=[2, 1, 0]
else:
    labels=[1, 0]

#inputfile = 'CKME136X10_2018_Data_CTF.csv'
if balanced:
    if multiclass:
        inputfile_train_O = 'CKME136X10_2018_Data_CTFB_M_O_Train.csv'
        inputfile_train_U = 'CKME136X10_2018_Data_CTFB_M_U_Train.csv'
        inputfile_test = 'CKME136X10_2018_Data_CTFB_M_Test.csv'
    else:
        inputfile_train_O = 'CKME136X10_2018_Data_CTFB_B_O_Train.csv'
        inputfile_train_U = 'CKME136X10_2018_Data_CTFB_B_U_Train.csv'
        inputfile_test = 'CKME136X10_2018_Data_CTFB_B_Test.csv'

    if over_sample:
        datafile_train = inputfile_train_O
    else:
        datafile_train = inputfile_train_U
else:
    inputfile_test = 'CKME136X10_2018_Data_Cleaned_Transformed.csv'
    df_unbalanced = 'CKME136X10_2018_Data_Cleaned_Transformed.csv'

datafile_test = inputfile_test
    
cluster = False
if (cluster):    
    datafile_train = 'CKME136X10_2018_Cluster1.csv'

cluster1_outputfile = 'CKME136X10_2018_Cluster1.csv'
cluster2_outputfile = 'CKME136X10_2018_Cluster2.csv'
cluster3_outputfile = 'CKME136X10_2018_Cluster3.csv'


    
    
model_max_iter = 100
#datestr = 'dec_08_binary_run_100_BUD'
datestr = 'dec_08_binary_run_100_balanced_over'

#Model Store
file_lr = 'lr_' + datestr + '.model'
file_lr_l1 = 'lr_l2_' + datestr + '.model'
file_dt = 'dt_' + datestr + '.model'
file_svm = 'svm_' + datestr + '.model'
file_knn = 'knn_' + datestr + '.model'
file_mlp = 'mlp_' + datestr + '.model'
file_kmean = 'kmean_' + datestr + '.model'
file_nbayes = 'nbayes_' + datestr + '.model'

file_final_train = 'final_train_' + datestr + '.csv'
file_final_test = 'final_test_' + datestr + '.csv'

#Enable Optimization Algorithms
enable_grid_search = False
svm_c = 1
svm_gamma = 1
feature_all = True
num_clusters = 2
defaultFeatures = ['P_AGE', 'V_YEAR', 'C_HOUR', 'C_YEAR', 'C_MNTH', 'C_CONF', 'C_WDAY', 'C_VEHS', 'P_USER', 'P_SEX']

enable_lr_l1 = False
predict_lr_l1 = False

# Enable Algorithms
enable_lr = True
enable_dt = True
enable_svm = True
enable_knn = True
enable_mlp = True
enable_kmean = False
enable_nbayes = True

predict_lr = True
predict_dt = True
predict_svm = True
predict_knn = True
predict_mlp = True
predict_nbayes = True

  from numpy.core.umath_tests import inner1d


In [2]:
# this function converts the data frame to the appropriate data type
def convert_type(data):
    data = data.astype('category')
    data['C_MNTH'] = data['C_MNTH'].astype(CategoricalDtype(ordered=True))
    data['C_WDAY'] = data['C_WDAY'].astype(CategoricalDtype(ordered=True))
    data['C_HOUR'] = data['C_HOUR'].astype(CategoricalDtype(ordered=True))
    data['C_VEHS'] = data['C_VEHS'].astype(CategoricalDtype(ordered=True))
    data['P_AGE'] = data['P_AGE'].astype(CategoricalDtype(ordered=True))
    data['P_PSN'] = data['P_PSN'].astype(CategoricalDtype(ordered=True))
    data['P_ISEV'] = data['P_ISEV'].astype('int')
    return data

In [3]:
#print("Sample size: {}".format(sampleN))

if multiclass:
    print("Multi-Class Classification: Enabled")
else:
    print("Multi-Class Classification: Disabled")

if enable_grid_search:
    print("Grid Search: Enabled")
else:
    print("Grid Search: Disabled")

if feature_all:
    print("All Features: Enabled")
else:
    print("All Features: Disabled")
    
if enable_kmean:
    print("K-means: Enabled")
else:
    print("K-means: Disabled")

if enable_lr_l1:
    print("Logistic Regression: Enabled")
else:
    print("Logistic Regression: Disabled")
    
if enable_dt:
    print("Decision Tree: Enabled")
else:
    print("Decision Tree: Disabled")
    
if enable_svm:
    print("Support Vector Machines: Enabled")
else:
    print("Support Vector Machines: Disabled")

if  enable_knn:
    print("KNN: Enabled")
else:
    print("KNN: Disabled")
    
if enable_mlp:
    print("MLP: Enabled")
else:
    print("MLP: Disabled")


Multi-Class Classification: Enabled
Grid Search: Disabled
All Features: Enabled
K-means: Disabled
Logistic Regression: Disabled
Decision Tree: Enabled
Support Vector Machines: Enabled
KNN: Enabled
MLP: Enabled


In [4]:
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))

Sun Dec  9 09:03:05 2018


In [5]:
#load data
if balanced: 
    df_test = pd.read_csv(datafile_test, engine = 'python')
    df_train = pd.read_csv(datafile_train, engine = 'python')
    df = df_train.copy()

    print(df_test.head(2))
    print(df_train.head(2))
else:
    df_unbalanced = pd.read_csv(datafile_test, engine = 'python')


In [6]:
df_tmp1 = df_unbalanced.copy()

In [7]:
# not the best approch, needs to be rewritten.  As the data is split before balanced, we do not need this
# step for balanced dataset

if (not balanced):

    ## Split Training and Test set 70/30 split, so we don't bleed information to test set
    #Split between data and class
    ubY = df_unbalanced[df_unbalanced.columns[-1]].copy()
    if (not multiclass):
        ubY.replace(to_replace = 1, value = 0, inplace = True)
        ubY.replace(to_replace = 2, value = 1, inplace = True)
        ubY.replace(to_replace = 3, value = 1, inplace = True)
    else:
        ubY.replace(to_replace = 1, value = 0, inplace = True)
        ubY.replace(to_replace = 2, value = 1, inplace = True)
        ubY.replace(to_replace = 3, value = 2, inplace = True)

    ubX = df_unbalanced[df_unbalanced.columns[0:df_unbalanced.columns.size -1]].copy()

    ubX_train, ubX_test, ubY_train, ubY_test = model_selection.train_test_split(ubX, ubY, test_size=0.3, stratify=ubY)
    
    df_test = ubX_test
    df_test['P_ISEV'] = ubY_test
    df_train = ubX_train
    df_train['P_ISEV'] = ubY_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
display(df_test)
display(df_train)

Unnamed: 0,C_MNTH,C_WDAY,C_HOUR,C_VEHS,C_CONF,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,P_SEX,P_AGE,P_PSN,P_USER,P_ISEV
359271,3,4,3,2,36,2,1,1,1,1,1,5,1,2,1
2606506,10,5,5,4,21,1,3,2,1,7,2,5,1,2,0
1825687,1,1,4,2,36,2,1,2,1,7,2,2,1,1,0
3518317,7,2,4,2,21,1,1,1,1,7,2,5,1,1,0
2364682,11,7,1,1,6,1,1,1,1,7,2,2,1,1,1
525095,9,6,3,2,31,1,1,1,3,7,2,4,1,1,1
40296,2,6,5,1,35,2,2,2,1,1,2,3,1,1,0
175369,8,3,1,1,2,1,2,2,1,7,2,4,1,1,1
4766983,10,5,2,2,35,3,4,2,3,7,1,4,1,1,0
3909883,2,3,5,1,4,1,1,1,3,7,2,5,1,1,1


Unnamed: 0,C_MNTH,C_WDAY,C_HOUR,C_VEHS,C_CONF,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,P_SEX,P_AGE,P_PSN,P_USER,P_ISEV
1568449,2,3,1,1,51,1,1,2,1,7,1,4,1,1,1
4613477,2,6,2,2,22,1,1,4,2,7,1,4,1,1,0
2409065,1,6,1,1,6,1,1,1,2,7,2,4,3,2,1
2609411,10,6,1,1,51,2,1,1,1,2,2,3,2,2,1
3279541,7,4,2,2,25,4,1,1,1,7,1,4,1,1,0
246546,10,7,3,3,21,2,1,1,1,1,2,4,1,1,0
1554953,1,6,2,2,21,2,1,4,1,7,1,5,1,2,1
773512,7,5,3,1,3,1,1,1,1,7,1,4,1,1,1
4765725,10,5,2,2,23,1,1,1,1,7,2,1,2,2,0
897886,12,3,3,2,35,2,1,1,1,1,2,4,1,1,0


In [9]:
print(df_test.isnull().sum().sum())
print(df_train.isnull().sum().sum())

0
0


In [10]:
print(df_test[df_test.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())
print(df_train[df_train.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())

0
0


In [11]:
df_test_cat = df_test.astype('category').copy()
df_train_cat = df_train.astype('category').copy()

In [12]:
total_test_Rows = df_test_cat.index.size
print("Number of Rows in test data: {}".format(total_test_Rows))

total_train_Rows = df_train_cat.index.size
print("Number of Rows in train data: {}".format(total_train_Rows))

Number of Rows in test data: 1444846
Number of Rows in train data: 3371307


In [13]:
print(df_train_cat.columns)
print(df_train_cat.dtypes)

Index(['C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG', 'C_WTHR',
       'C_RSUR', 'C_RALN', 'C_TRAF', 'P_SEX', 'P_AGE', 'P_PSN', 'P_USER',
       'P_ISEV'],
      dtype='object')
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX     category
P_AGE     category
P_PSN     category
P_USER    category
P_ISEV    category
dtype: object


In [14]:
#One-Hot-Encoding of categorical
#TBD


In [15]:
#print(df_test_cat[df_test_cat.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())
#print(df_train_cat[df_train_cat.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())

### type cast train data

In [16]:
# convert to the correct type
df_train_cat = convert_type(df_train_cat)
print(df_train_cat.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3371307 entries, 1568449 to 798915
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX     category
P_AGE     category
P_PSN     category
P_USER    category
P_ISEV    int32
dtypes: category(14), int32(1)
memory usage: 83.6 MB
None


### type cast test data

In [17]:
# convert to the correct type
df_test_cat = convert_type(df_test_cat)
print(df_test_cat.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1444846 entries, 359271 to 1248637
Data columns (total 15 columns):
C_MNTH    1444846 non-null category
C_WDAY    1444846 non-null category
C_HOUR    1444846 non-null category
C_VEHS    1444846 non-null category
C_CONF    1444846 non-null category
C_RCFG    1444846 non-null category
C_WTHR    1444846 non-null category
C_RSUR    1444846 non-null category
C_RALN    1444846 non-null category
C_TRAF    1444846 non-null category
P_SEX     1444846 non-null category
P_AGE     1444846 non-null category
P_PSN     1444846 non-null category
P_USER    1444846 non-null category
P_ISEV    1444846 non-null int32
dtypes: category(14), int32(1)
memory usage: 35.8 MB
None


## Split Training and Testing for Binary class

In [18]:
#Split between data and class for training
Y_train = df_train_cat[df_train_cat.columns[-1]]
X_train = df_train_cat[df_train_cat.columns[0:df_train_cat.columns.size -1]]

Y_test = df_test_cat[df_test_cat.columns[-1]]
X_test = df_test_cat[df_test_cat.columns[0:df_test_cat.columns.size -1]]

# split data into X and y
#X = df_sample.iloc[:,0:16]
#Y = df_sample.iloc[:,-1]

In [19]:
print(Y_train.unique())
print(Y_train.groupby(Y_train).size())
print()
print(Y_test.unique())
print(Y_test.groupby(Y_test).size())

[1 0 2]
P_ISEV
0    1432039
1    1913673
2      25595
Name: P_ISEV, dtype: int64

[1 0 2]
P_ISEV
0    613731
1    820145
2     10970
Name: P_ISEV, dtype: int64


In [20]:
print(X_train.head(3))

        C_MNTH C_WDAY C_HOUR C_VEHS C_CONF C_RCFG C_WTHR C_RSUR C_RALN C_TRAF  \
1568449      2      3      1      1     51      1      1      2      1      7   
4613477      2      6      2      2     22      1      1      4      2      7   
2409065      1      6      1      1      6      1      1      1      2      7   

        P_SEX P_AGE P_PSN P_USER  
1568449     1     4     1      1  
4613477     1     4     1      1  
2409065     2     4     3      2  


In [21]:
dummies = False
if (dummies):
#one hot encode train and test
    X_train = pd.get_dummies(X_train)
    X_test = pd.get_dummies(X_test)
    display(X_train)
    display(X_test)
    print(X_train.shape)

## Clustering based on K-Means Clustering

In [22]:
if enable_kmean:
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    print("K-Means Clustering: Start")
    kmeans = KMeans(n_clusters=num_clusters, init='random', n_init=10, tol=1e-04, verbose= verbose_level, max_iter=model_max_iter)
    print(kmeans)
    
    print("K-Means Clustering: Build")
    ykm = kmeans.fit(X_train)
    
    if pyscript:
        print(ykm.cluster_centers_)
        print(ykm.labels_)
    else:
        display(ykm.cluster_centers_)
        display(ykm.labels_)
    
    # save model to file
    pickle.dump(ykm, open(file_kmean, "wb"))
    
    print("K-Means Clustering: End")
    t_start =  time.time()
    
    XY_train = X_train.copy()
    XY_train['P_ISEV'] = Y_train.copy()
 
    cluster1 = XY_train[loaded_model.labels_ == 0]
    print("cluster1 Shape {}".format(cluster1.shape))
    cluster1.to_csv(cluster1_outputfile, encoding='utf-8', index=False)
    
    cluster2 = XY_train[loaded_model.labels_ == 1]
    print("cluster2 Shape {}".format(cluster2.shape))
    cluster2.to_csv(cluster2_outputfile, encoding='utf-8', index=False)

    if num_clusters == 3:
        cluster3 = XY_train[loaded_model.labels_ == 2]
        print("cluster3 Shape {}".format(cluster3.shape))
        cluster3.to_csv(cluster3_outputfile, encoding='utf-8', index=False)
        
    print(time.asctime( time.localtime(t_start) ))

    

### SVM GridSearch for Optimal Parms

In [23]:
#This operation is computationaly expensive.
if enable_grid_search:
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001]}
    grid = GridSearchCV(SVC(), param_grid, verbose=verbose_level, n_jobs = 10)
    print(grid)
    grid.fit(X_train, Y_train)
    print(grid.best_params_)
    svm_c = grid.best_params_.get('C')
    svm_gamma = grid.best_params_.get('gamma')
    print(grid.best_estimator_)
    grid_predictions = grid.predict(X_test)
    cfn_matrix_grid = confusion_matrix(Y_test, grid_predictions)
    print(cfn_matrix_grid)
    print(classification_report(Y_test,grid_predictions))
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))

## Logistic Regression Model

In [24]:
if enable_lr:
    print("Logistic Regression: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    lr = LogisticRegression(C=1, random_state=0, solver='saga', multi_class='ovr', 
                            verbose=verbose_level, n_jobs=10, max_iter=model_max_iter)
    print(lr)
    print("Logistic Regression: Fit")
    lr.fit(X_train, Y_train)
    
    # save model to file
    pickle.dump(lr, open(file_lr, "wb"))
    
    
if predict_lr:
    # load model from file
    loaded_model = pickle.load(open(file_lr, "rb"))
    print("Logistic Regression: Predict")
    y_pred = lr.predict(X_test)

    print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lr.score(X_test, Y_test)))

    # print the intercept (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
    print("Logistic Regression: Intercept")
    print(lr.intercept_)

    # print the coeficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
    print("Logistic Regression: Coefficients")
    print(lr.coef_)
    print()
    print("Logistic Regression: Confusion Matrix")
    cnf_matrix_lg = confusion_matrix(Y_test, y_pred, labels=labels)
    print(cnf_matrix_lg)
    print()
    print("Logistic Regression: Classification Report")
    print(classification_report(Y_test, y_pred, labels=labels))
    print()
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("Logistic Regression: End")

Logistic Regression: Start
Sun Dec  9 09:05:13 2018
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=10,
          penalty='l2', random_state=0, solver='saga', tol=0.0001,
          verbose=3, warm_start=False)
Logistic Regression: Fit
convergence after 15 epochs took 125 seconds
convergence after 15 epochs took 125 seconds
convergence after 42 epochs took 181 seconds
Logistic Regression: Predict


[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:  3.0min finished


Accuracy of logistic regression classifier on train set: 0.63
Accuracy of logistic regression classifier on test set: 0.63
Logistic Regression: Intercept
[-0.90377635  1.00330754 -8.4680837 ]
Logistic Regression: Coefficients
[[ 5.73325925e-04  1.41352316e-02  7.27514856e-02  4.63551602e-01
   2.63957986e-03  6.99996136e-02 -2.36587600e-02 -3.74653293e-02
  -1.53468587e-01 -3.98761328e-02  6.00597156e-01 -1.48990460e-01
   1.77097566e-01 -6.80036007e-01]
 [-1.08001743e-03 -1.53619874e-02 -6.81538481e-02 -4.46188215e-01
  -2.61552141e-03 -5.93022542e-02  2.34346586e-02  3.82289907e-02
   1.32455666e-01  3.49869619e-02 -6.12682974e-01  1.32818259e-01
  -1.97136251e-01  6.47019758e-01]
 [ 1.43465069e-02  4.13972244e-02 -1.06022324e-01 -4.07705052e-01
   5.56167279e-03 -2.36979619e-01  9.92879779e-03 -3.31419766e-02
   3.42046807e-01  1.87919156e-01  5.46507624e-01  3.27902033e-01
   3.11427401e-01  4.29558689e-01]]

Logistic Regression: Confusion Matrix
[[     0   9344   1626]
 [     0 61

  'precision', 'predicted', average, warn_for)


### Logistic Regression with L1 Regularization

In [25]:
if (enable_lr_l1):
    # with L1 regularization
    print("Logistic Regression with L1 Regularization: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    lr = LogisticRegression(penalty='l1', C=1, solver='saga', multi_class='ovr', 
                            verbose=verbose_level, n_jobs = 10, max_iter=model_max_iter)
    print(lr)
    print("Logistic Regression with L1 Regularization: Fit")
    lr.fit(X_train, Y_train)
    
    # save model to file
    pickle.dump(lr, open(file_lr_l1, "wb"))

if (predict_lr_l1):
    # load model from file
    loaded_model = pickle.load(open(file_lr_l1, "rb"))
    print("Logistic Regression with L1 Regularization: Predict")
    y_pred = lr.predict(X_test)
    
    print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lr.score(X_test, Y_test)))

    print("Logistic Regression with L1 Regularization: Confusion Matrix")
    cnf_matrix_lg_l1 = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_lg_l1)
    
    print(classification_report(Y_test,y_pred))
    print("Logistic Regression with L1 Regularization: Classification Report")

    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("Logistic Regression with L1 Regularization: End")

Logistic Regression with L1 Regularization: End


### Naive Bayes

In [26]:
# Gaussian Naive Bayes Classification
if enable_nbayes:
    print("Naive Bayes: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    nbayes = GaussianNB()
    print(nbayes)
    print("Naive Bayes: Fit")
    nbayes.fit(X_train, Y_train)
    # save model to file
    pickle.dump(nbayes, open(file_nbayes, "wb"))

if predict_nbayes:
    # load model from file
    loaded_model = pickle.load(open(file_nbayes, "rb"))
    print("Naive Bayes: Predict")
    y_pred = nbayes.predict(X_test)
    print('Accuracy of Naive Bayes classifier on train set: {:.2f}'.format(nbayes.score(X_train, Y_train)))
    print('Accuracy of Naove Nayes classifier on test set: {:.2f}'.format(nbayes.score(X_test, Y_test)))
    
    cnf_matrix_dt = confusion_matrix(Y_test, y_pred, labels=labels)
    print("Naive Bayes: Confusion Matrix")
    print(cnf_matrix_dt)
    print("Naive Bayes: Classification Report")
    print(classification_report(Y_test,y_pred, labels=labels))
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    print("Naive Bayes: End")

Naive Bayes: Start
Sun Dec  9 09:08:21 2018
GaussianNB(priors=None)
Naive Bayes: Fit
Naive Bayes: Predict
Accuracy of Naive Bayes classifier on train set: 0.58
Accuracy of Naove Nayes classifier on test set: 0.58
Naive Bayes: Confusion Matrix
[[   502   7695   2773]
 [  5993 426491 387661]
 [   601 205176 407954]]
Naive Bayes: Classification Report
             precision    recall  f1-score   support

          2       0.07      0.05      0.06     10970
          1       0.67      0.52      0.58    820145
          0       0.51      0.66      0.58    613731

avg / total       0.60      0.58      0.58   1444846

Sun Dec  9 09:08:31 2018
Naive Bayes: End


### Decision Tree

In [27]:
if enable_dt:
    print("Decision Tree: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    tree = DecisionTreeClassifier(criterion='entropy',max_depth=50)
    print(tree)
    print("Decision Tree: Fit")
    tree.fit(X_train, Y_train)
    # save model to file
    pickle.dump(tree, open(file_dt, "wb"))

if predict_dt:
    # load model from file
    loaded_model = pickle.load(open(file_dt, "rb"))
    print("Decision Tree: Predict")
    y_pred = tree.predict(X_test)
    print('Accuracy of Decision Tree classifier on train set: {:.2f}'.format(tree.score(X_train, Y_train)))
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(tree.score(X_test, Y_test)))
    
    print()
    cnf_matrix_dt = confusion_matrix(Y_test, y_pred, labels=labels)
    print("Decision Tree: Confusion Matrix")
    print(cnf_matrix_dt)
    print()
    print("Decision Tree: Classification Report")
    print()
    print(classification_report(Y_test,y_pred, labels=labels))
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    print("Decision Tree: End")

Decision Tree: Start
Sun Dec  9 09:08:31 2018
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=50,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Decision Tree: Fit
Decision Tree: Predict
Accuracy of Decision Tree classifier on train set: 0.84
Accuracy of Decision Tree classifier on test set: 0.61

Decision Tree: Confusion Matrix
[[   557   7687   2726]
 [  6564 505369 308212]
 [  1913 241854 369964]]

Decision Tree: Classification Report

             precision    recall  f1-score   support

          2       0.06      0.05      0.06     10970
          1       0.67      0.62      0.64    820145
          0       0.54      0.60      0.57    613731

avg / total       0.61      0.61      0.61   1444846

Sun Dec  9 09:09:14 2018
Decision Tre

## ANN - Multilayer Perceptron

In [28]:
if enable_mlp:
    print("Multilayer Preceptron: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    
    #mlpc = MLPClassifier(alpha=1)
    #mlpc = MLPClassifier(hidden_layer_sizes=(12, 12, 12), max_iter=model_max_iter, verbose=verbose_level)
    #mlpc = MLPClassifier(hidden_layer_sizes=(25, 25, 25), verbose=verbose_level, max_iter=model_max_iter)
    mlpc = MLPClassifier(hidden_layer_sizes=(12, 12, 12), verbose=verbose_level, max_iter=model_max_iter, tol = 0.0001)
    print(mlpc)
    #mlp = multilayer_perceptron(n_hidden =2, activation='logistic', algorithm='sgd', random_state=3)
    print("Multilayer Preceptron: fit")
    mlpc.fit(X_train, Y_train)
    
    # save model to file
    pickle.dump(mlpc, open(file_mlp, "wb"))
    
if predict_mlp:
    
    # load model from file
    loaded_model = pickle.load(open(file_mlp, "rb"))
    print("Multilayer Preceptron: Predict")
    y_pred = mlpc.predict(X_test)

    print('Accuracy of Multilayer Perceptron classifier on train set: {:.2f}'.format(mlpc.score(X_train, Y_train)))
    print('Accuracy of Multilayer Perceptron classifier on test set: {:.2f}'.format(mlpc.score(X_test, Y_test)))
    print()
    print("Multilayer Preceptron: Confusion Matrix")
    cnf_matrix_mlp = confusion_matrix(Y_test, y_pred, labels=labels)
    print(cnf_matrix_mlp)
    print()
    print("Multilayer Preceptron: Classificiation Report")
    print(classification_report(Y_test,y_pred, labels=labels))
    (print)
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
    print("Multilayer Preceptron: End")

Multilayer Preceptron: Start
Sun Dec  9 09:09:14 2018
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(12, 12, 12), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=3, warm_start=False)
Multilayer Preceptron: fit
Iteration 1, loss = 0.65963028
Iteration 2, loss = 0.63809190
Iteration 3, loss = 0.63419921
Iteration 4, loss = 0.63201876
Iteration 5, loss = 0.63094726
Iteration 6, loss = 0.63026313
Iteration 7, loss = 0.62991358
Iteration 8, loss = 0.62971347
Iteration 9, loss = 0.62945203
Iteration 10, loss = 0.62931291
Iteration 11, loss = 0.62910065
Iteration 12, loss = 0.62885852
Iteration 13, loss = 0.62874139
Iteration 14, loss = 0.62865998
Iteration 15, loss = 0.62857503
Iteration 16

  'precision', 'predicted', average, warn_for)


### SVM

In [29]:
if enable_svm:
    print("SVM: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    #svm = SVC(C=1, random_state=0, kernel='sigmoid', verbose=True)
    #svm = SVC(C=1, random_state=0, kernel='linear', verbose=True, cache_size=200)
    #svm = SVC(C=svm_c, gamma=svm_gamma, verbose = verbose_level)
    #SVN prediction is taking for ever, limiting the max_iter to 100 instead of -1 (no limit)
    svm = SVC(C=1, gamma = 'auto', verbose = verbose_level, max_iter=model_max_iter)
    print(svm)
    print("SVM: Fit")
    svm.fit(X_train, Y_train)

    # save model to file
    pickle.dump(svm, open(file_svm, "wb"))
    
if predict_svm:
    # load model from file
    loaded_model = pickle.load(open(file_svm, "rb"))
    print("SVM: Predict")
    y_pred = svm.predict(X_test)
    
    print('Accuracy of SVM classifier on train set: {:.2f}'.format(svm.score(X_train, Y_train)))
    print('Accuracy of SVM classifier on test set: {:.2f}'.format(svm.score(X_test, Y_test)))
    
    print("SVM: Confusion Matrix")
    cnf_matrix_svm = confusion_matrix(Y_test, y_pred, labels=labels)
    print(cnf_matrix_svm)
    
    print("SVM: Classfication Report")
    print(classification_report(Y_test,y_pred, labels=labels))
    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
    print("SVM: End")

SVM: Start
Sun Dec  9 09:13:29 2018
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=100, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=3)
SVM: Fit
[LibSVM]



SVM: Predict
Accuracy of SVM classifier on train set: 0.42
Accuracy of SVM classifier on test set: 0.42
SVM: Confusion Matrix
[[     0      9  10961]
 [     0  10884 809261]
 [     0  11626 602105]]
SVM: Classfication Report
             precision    recall  f1-score   support

          2       0.00      0.00      0.00     10970
          1       0.48      0.01      0.03    820145
          0       0.42      0.98      0.59    613731

avg / total       0.45      0.42      0.27   1444846

Sun Dec  9 09:15:42 2018
SVM: End


  'precision', 'predicted', average, warn_for)


### K-N-N

In [30]:
if enable_knn:
    print("KNN: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski', n_jobs = -1)
    print(knn)
    print("KNN: Fit")
    knn.fit(X_train, Y_train)

    # save model to file
    pickle.dump(knn, open(file_knn, "wb"))

if predict_knn:
    # load model from file
    loaded_model = pickle.load(open(file_knn, "rb"))
    
    print("KNN: Predict")
    y_pred = knn.predict(X_test)
    print('Accuracy of KNN classifier on train set: {:.2f}'.format(knn.score(X_train, Y_train)))
    print('Accuracy of KNN classifier on test set: {:.2f}'.format(knn.score(X_test, Y_test)))
    print()
    print("KNN: Confusion Matrix")
    cnf_matrix_knn = confusion_matrix(Y_test, y_pred, labels=labels)
    print(cnf_matrix_knn)
    print()
    print("KNN: Classification Report")
    print(classification_report(Y_test,y_pred, labels=labels))
    print()
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))

    print("KNN: End")

KNN: Start
Sun Dec  9 09:15:42 2018
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
           weights='uniform')
KNN: Fit
KNN: Predict
Accuracy of KNN classifier on train set: 0.73
Accuracy of KNN classifier on test set: 0.62

KNN: Confusion Matrix
[[    57   8810   2103]
 [   233 559178 260734]
 [    44 274581 339106]]

KNN: Classification Report
             precision    recall  f1-score   support

          2       0.17      0.01      0.01     10970
          1       0.66      0.68      0.67    820145
          0       0.56      0.55      0.56    613731

avg / total       0.62      0.62      0.62   1444846


Sun Dec  9 12:04:03 2018
KNN: End


In [31]:
t_end =  time.time()
print(time.asctime( time.localtime(t_end) ))

Sun Dec  9 12:04:03 2018
