In [57]:
import numpy as np
#in case we need to repeat experiment
#np.random.seed(255)

import pandas as pd
pd.options.display.max_rows = 22

import matplotlib.pyplot as plt
plt.style.use('classic')

import seaborn as sns
sns.set()

#sklearn imports
from sklearn.cluster import KMeans
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from pandas.api.types import CategoricalDtype
from sklearn.naive_bayes import GaussianNB
import pickle # allows for model to be saved/load to file
import time

#Use print instead of display when run as python script
pyscript = True

#Classifier verborsity where supported
verbose_level=3

#sampleN = 4300000

#Multiclass classification, binary if falase
multiclass = False
over_sample = True
balanced = True

if multiclass:
    labels=[2, 1, 0]
else:
    labels=[1, 0]

#inputfile = 'CKME136X10_2018_Data_CTF.csv'
if balanced:
    if multiclass:
        inputfile_train_O = 'CKME136X10_2018_Data_CTFB_M_O_Train.csv'
        inputfile_train_U = 'CKME136X10_2018_Data_CTFB_M_U_Train.csv'
        inputfile_test = 'CKME136X10_2018_Data_CTFB_M_Test.csv'
    else:
        inputfile_train_O = 'CKME136X10_2018_Data_CTFB_B_O_Train.csv'
        inputfile_train_U = 'CKME136X10_2018_Data_CTFB_B_U_Train.csv'
        inputfile_test = 'CKME136X10_2018_Data_CTFB_B_Test.csv'

    if over_sample:
        datafile_train = inputfile_train_O
    else:
        datafile_train = inputfile_train_U
else:
    inputfile_test = 'CKME136X10_2018_Data_Cleaned_Transformed.csv'

datafile_test = inputfile_test
    
model_max_iter = 100
#datestr = 'dec_08_binary_run_100_BUD'
datestr = 'dec_08_binary_run_100_balanced'

#Model Store
file_lr = 'lr_' + datestr + '.model'
file_lr_l1 = 'lr_l2_' + datestr + '.model'
file_dt = 'dt_' + datestr + '.model'
file_svm = 'svm_' + datestr + '.model'
file_knn = 'knn_' + datestr + '.model'
file_mlp = 'mlp_' + datestr + '.model'
file_kmean = 'kmean_' + datestr + '.model'
file_nbayes = 'nbayes_' + datestr + '.model'

file_final_train = 'final_train_' + datestr + '.csv'
file_final_test = 'final_test_' + datestr + '.csv'

#Enable Optimization Algorithms
enable_grid_search = False
svm_c = 1
svm_gamma = 1
feature_all = True
defaultFeatures = ['P_AGE', 'V_YEAR', 'C_HOUR', 'C_YEAR', 'C_MNTH', 'C_CONF', 'C_WDAY', 'C_VEHS', 'P_USER', 'P_SEX']

enable_lr_l1 = False
predict_lr_l1 = False

# Enable Algorithms
enable_lr = True
enable_dt = True
enable_svm = True
enable_knn = False
enable_mlp = True
enable_kmean = False
enable_nbayes = True

predict_lr = True
predict_dt = True
predict_svm = True
predict_knn = False
predict_mlp = True
predict_nbayes = True

In [27]:
# this function converts the data frame to the appropriate data type
def convert_type(data):
    data = data.astype('category')
    data['C_MNTH'] = data['C_MNTH'].astype(CategoricalDtype(ordered=True))
    data['C_WDAY'] = data['C_WDAY'].astype(CategoricalDtype(ordered=True))
    data['C_HOUR'] = data['C_HOUR'].astype(CategoricalDtype(ordered=True))
    data['C_VEHS'] = data['C_VEHS'].astype(CategoricalDtype(ordered=True))
    data['P_AGE'] = data['P_AGE'].astype(CategoricalDtype(ordered=True))
    data['P_PSN'] = data['P_PSN'].astype(CategoricalDtype(ordered=True))
    data['P_ISEV'] = data['P_ISEV'].astype('int')
    return data

In [28]:
#print("Sample size: {}".format(sampleN))

if multiclass:
    print("Multi-Class Classification: Enabled")
else:
    print("Multi-Class Classification: Disabled")

if enable_grid_search:
    print("Grid Search: Enabled")
else:
    print("Grid Search: Disabled")

if feature_all:
    print("All Features: Enabled")
else:
    print("All Features: Disabled")
    
if enable_kmean:
    print("K-means: Enabled")
else:
    print("K-means: Disabled")

if enable_lr_l1:
    print("Logistic Regression: Enabled")
else:
    print("Logistic Regression: Disabled")
    
if enable_dt:
    print("Decision Tree: Enabled")
else:
    print("Decision Tree: Disabled")
    
if enable_svm:
    print("Support Vector Machines: Enabled")
else:
    print("Support Vector Machines: Disabled")

if  enable_knn:
    print("KNN: Enabled")
else:
    print("KNN: Disabled")
    
if enable_mlp:
    print("MLP: Enabled")
else:
    print("MLP: Disabled")


Multi-Class Classification: Disabled
Grid Search: Disabled
All Features: Enabled
K-means: Disabled
Logistic Regression: Disabled
Decision Tree: Enabled
Support Vector Machines: Enabled
KNN: Disabled
MLP: Enabled


In [29]:
t_start =  time.time()
print(time.asctime( time.localtime(t_start) ))

Sat Dec  8 19:07:25 2018


In [30]:
#load data
if balanced: 
    df_test = pd.read_csv(datafile_test, engine = 'python')
    df_train = pd.read_csv(datafile_train, engine = 'python')
    df = df_train.copy()

    print(df_test.head(2))
    print(df_train.head(2))
else:
    df_unbalanced = pd.read_csv(datafile_test, engine = 'python')


In [31]:
df_tmp1 = df_unbalanced.copy()

In [32]:
# not the best approch, needs to be rewritten.  As the data is split before balanced, we do not need this
# step for balanced dataset

if (not balanced):

    ## Split Training and Test set 70/30 split, so we don't bleed information to test set
    #Split between data and class
    ubY = df_unbalanced[df_unbalanced.columns[-1]].copy()
    if (not multiclass):
        ubY.replace(to_replace = 1, value = 0, inplace = True)
        ubY.replace(to_replace = 2, value = 1, inplace = True)
        ubY.replace(to_replace = 3, value = 1, inplace = True)
    else:
        ubY.replace(to_replace = 1, value = 0, inplace = True)
        ubY.replace(to_replace = 2, value = 1, inplace = True)
        ubY.replace(to_replace = 3, value = 2, inplace = True)

    ubX = df_unbalanced[df_unbalanced.columns[0:df_unbalanced.columns.size -1]].copy()

    ubX_train, ubX_test, ubY_train, ubY_test = model_selection.train_test_split(ubX, ubY, test_size=0.3, stratify=ubY)
    
    df_test = ubX_test
    df_test['P_ISEV'] = ubY_test
    df_train = ubX_train
    df_train['P_ISEV'] = ubY_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [33]:
display(df_test)
display(df_train)

Unnamed: 0,C_MNTH,C_WDAY,C_HOUR,C_VEHS,C_CONF,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,P_SEX,P_AGE,P_PSN,P_USER,P_ISEV
2360994,11,6,2,2,23,1,2,1,1,7,2,4,1,1,1
18631,1,6,2,2,34,2,1,1,1,1,1,3,1,2,1
3417479,1,5,5,2,32,1,4,6,1,7,2,4,1,1,0
929582,1,4,1,1,6,1,1,4,2,7,2,5,1,1,1
528501,9,6,5,3,21,1,1,1,2,1,1,1,1,2,0
2590487,9,7,1,1,6,2,1,1,1,1,2,2,2,2,0
125378,6,4,4,4,21,1,1,1,1,7,1,4,1,1,1
1800723,12,1,3,2,21,2,2,2,1,7,1,3,1,1,1
4052039,9,6,4,2,21,2,2,1,1,8,2,2,1,1,0
1053029,6,6,4,2,36,2,1,1,1,1,1,3,1,2,1


Unnamed: 0,C_MNTH,C_WDAY,C_HOUR,C_VEHS,C_CONF,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,P_SEX,P_AGE,P_PSN,P_USER,P_ISEV
800176,8,5,4,2,21,1,1,1,1,7,2,4,1,1,1
978255,3,5,3,2,21,1,1,1,1,7,2,1,2,2,0
4248866,8,2,5,2,36,2,1,1,1,2,1,1,1,2,1
2374678,12,3,3,4,35,2,1,1,1,1,2,4,1,1,1
1346038,5,7,4,1,3,1,2,1,3,7,2,5,1,1,1
2634938,11,6,3,3,21,3,1,1,1,7,2,5,1,1,0
1623904,5,2,4,2,21,2,1,1,1,7,2,4,1,1,1
3394812,12,5,4,2,51,1,2,4,1,7,2,1,2,2,0
3834312,10,5,3,2,36,2,3,2,1,1,1,5,1,1,1
935215,1,5,4,1,6,1,1,1,1,7,2,4,1,1,0


In [34]:
print(df_test.isnull().sum().sum())
print(df_train.isnull().sum().sum())

0
0


In [35]:
print(df_test[df_test.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())
print(df_train[df_train.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())

0
0


In [36]:
df_test_cat = df_test.astype('category').copy()
df_train_cat = df_train.astype('category').copy()

In [37]:
total_test_Rows = df_test_cat.index.size
print("Number of Rows in test data: {}".format(total_test_Rows))

total_train_Rows = df_train_cat.index.size
print("Number of Rows in train data: {}".format(total_train_Rows))

Number of Rows in test data: 1444846
Number of Rows in train data: 3371307


In [38]:
print(df_train_cat.columns)
print(df_train_cat.dtypes)

Index(['C_MNTH', 'C_WDAY', 'C_HOUR', 'C_VEHS', 'C_CONF', 'C_RCFG', 'C_WTHR',
       'C_RSUR', 'C_RALN', 'C_TRAF', 'P_SEX', 'P_AGE', 'P_PSN', 'P_USER',
       'P_ISEV'],
      dtype='object')
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX     category
P_AGE     category
P_PSN     category
P_USER    category
P_ISEV    category
dtype: object


In [39]:
#One-Hot-Encoding of categorical
#TBD


In [40]:
#print(df_test_cat[df_test_cat.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())
#print(df_train_cat[df_train_cat.columns].apply(lambda x: x.astype(str).str.contains('[^0-9]')).sum().sum())

### type cast train data

In [41]:
# convert to the correct type
df_train_cat = convert_type(df_train_cat)
print(df_train_cat.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3371307 entries, 800176 to 3881042
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
P_SEX     category
P_AGE     category
P_PSN     category
P_USER    category
P_ISEV    int32
dtypes: category(14), int32(1)
memory usage: 83.6 MB
None


### type cast test data

In [42]:
# convert to the correct type
df_test_cat = convert_type(df_test_cat)
print(df_test_cat.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1444846 entries, 2360994 to 3982201
Data columns (total 15 columns):
C_MNTH    1444846 non-null category
C_WDAY    1444846 non-null category
C_HOUR    1444846 non-null category
C_VEHS    1444846 non-null category
C_CONF    1444846 non-null category
C_RCFG    1444846 non-null category
C_WTHR    1444846 non-null category
C_RSUR    1444846 non-null category
C_RALN    1444846 non-null category
C_TRAF    1444846 non-null category
P_SEX     1444846 non-null category
P_AGE     1444846 non-null category
P_PSN     1444846 non-null category
P_USER    1444846 non-null category
P_ISEV    1444846 non-null int32
dtypes: category(14), int32(1)
memory usage: 35.8 MB
None


## Split Training and Testing for Binary class

In [43]:
#Split between data and class for training
Y_train = df_train_cat[df_train_cat.columns[-1]]
X_train = df_train_cat[df_train_cat.columns[0:df_train_cat.columns.size -1]]

Y_test = df_test_cat[df_test_cat.columns[-1]]
X_test = df_test_cat[df_test_cat.columns[0:df_test_cat.columns.size -1]]

# split data into X and y
#X = df_sample.iloc[:,0:16]
#Y = df_sample.iloc[:,-1]

In [44]:
print(Y_train.unique())
print(Y_train.groupby(Y_train).size())
print()
print(Y_test.unique())
print(Y_test.groupby(Y_test).size())

[1 0]
P_ISEV
0    1432039
1    1939268
Name: P_ISEV, dtype: int64

[1 0]
P_ISEV
0    613731
1    831115
Name: P_ISEV, dtype: int64


In [45]:
print(X_train.head(3))

        C_MNTH C_WDAY C_HOUR C_VEHS C_CONF C_RCFG C_WTHR C_RSUR C_RALN C_TRAF  \
800176       8      5      4      2     21      1      1      1      1      7   
978255       3      5      3      2     21      1      1      1      1      7   
4248866      8      2      5      2     36      2      1      1      1      2   

        P_SEX P_AGE P_PSN P_USER  
800176      2     4     1      1  
978255      2     1     2      2  
4248866     1     1     1      2  


In [46]:
dummies = False
if (dummies):
#one hot encode train and test
    X_train = pd.get_dummies(X_train)
    X_test = pd.get_dummies(X_test)
    display(X_train)
    display(X_test)
    print(X_train.shape)

## Clustering based on K-Means Clustering

In [47]:
if enable_kmean:
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    print("K-Means Clustering: Start")
    kmeans = KMeans(n_clusters=3, init='random', n_init=10, tol=1e-04, verbose= verbose_level, max_iter=model_max_iter)
    print(kmeans)
    
    print("K-Means Clustering: Build")
    ykm = kmeans.fit(X_train)
    
    if pyscript:
        print(ykm.cluster_centers_)
        print(ykm.labels_)
    else:
        display(ykm.cluster_centers_)
        display(ykm.labels_)
    
    # save model to file
    pickle.dump(ykm, open(file_kmean, "wb"))
    
    print("K-Means Clustering: End")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))


### SVM GridSearch for Optimal Parms

In [48]:
#This operation is computationaly expensive.
if enable_grid_search:
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    param_grid = {'C':[0.1, 1, 10, 100, 1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001]}
    grid = GridSearchCV(SVC(), param_grid, verbose=verbose_level, n_jobs = 10)
    print(grid)
    grid.fit(X_train, Y_train)
    print(grid.best_params_)
    svm_c = grid.best_params_.get('C')
    svm_gamma = grid.best_params_.get('gamma')
    print(grid.best_estimator_)
    grid_predictions = grid.predict(X_test)
    cfn_matrix_grid = confusion_matrix(Y_test, grid_predictions)
    print(cfn_matrix_grid)
    print(classification_report(Y_test,grid_predictions))
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))

## Logistic Regression Model

In [58]:
if enable_lr:
    print("Logistic Regression: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    lr = LogisticRegression(C=1, random_state=0, solver='saga', multi_class='ovr', 
                            verbose=verbose_level, n_jobs=10, max_iter=model_max_iter)
    print(lr)
    print("Logistic Regression: Fit")
    lr.fit(X_train, Y_train)
    
    # save model to file
    pickle.dump(lr, open(file_lr, "wb"))
    
    
if predict_lr:
    # load model from file
    loaded_model = pickle.load(open(file_lr, "rb"))
    print("Logistic Regression: Predict")
    y_pred = lr.predict(X_test)

    print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lr.score(X_test, Y_test)))

    # print the intercept (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
    print("Logistic Regression: Intercept")
    print(lr.intercept_)

    # print the coeficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
    print("Logistic Regression: Coefficients")
    print(lr.coef_)
    print()
    print("Logistic Regression: Confusion Matrix")
    cnf_matrix_lg = confusion_matrix(Y_test, y_pred, labels=labels)
    print(cnf_matrix_lg)
    print()
    print("Logistic Regression: Classification Report")
    print(classification_report(Y_test, y_pred, labels=labels))
    print()
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("Logistic Regression: End")

Logistic Regression: Start
Sat Dec  8 20:21:56 2018
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=10,
          penalty='l2', random_state=0, solver='saga', tol=0.0001,
          verbose=3, warm_start=False)
Logistic Regression: Fit
convergence after 16 epochs took 37 seconds
Logistic Regression: Predict


[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   36.1s finished


Accuracy of logistic regression classifier on train set: 0.63
Accuracy of logistic regression classifier on test set: 0.63
Logistic Regression: Intercept
[0.90328395]
Logistic Regression: Coefficients
[[-6.16874281e-04 -1.47012507e-02 -7.15894763e-02 -4.62987910e-01
  -2.75370483e-03 -7.10724115e-02  2.28640828e-02  3.75069538e-02
   1.56705904e-01  3.92911489e-02 -6.00777550e-01  1.48877722e-01
  -1.73084986e-01  6.78680336e-01]]

Logistic Regression: Confusion Matrix
[[629402 201713]
 [327173 286558]]

Logistic Regression: Classification Report
             precision    recall  f1-score   support

          1       0.66      0.76      0.70    831115
          0       0.59      0.47      0.52    613731

avg / total       0.63      0.63      0.63   1444846


Sat Dec  8 20:22:40 2018
Logistic Regression: End


### Logistic Regression with L1 Regularization

In [50]:
if (enable_lr_l1):
    # with L1 regularization
    print("Logistic Regression with L1 Regularization: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    lr = LogisticRegression(penalty='l1', C=1, solver='saga', multi_class='ovr', 
                            verbose=verbose_level, n_jobs = 10, max_iter=model_max_iter)
    print(lr)
    print("Logistic Regression with L1 Regularization: Fit")
    lr.fit(X_train, Y_train)
    
    # save model to file
    pickle.dump(lr, open(file_lr_l1, "wb"))

if (predict_lr_l1):
    # load model from file
    loaded_model = pickle.load(open(file_lr_l1, "rb"))
    print("Logistic Regression with L1 Regularization: Predict")
    y_pred = lr.predict(X_test)
    
    print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(lr.score(X_train, Y_train)))
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(lr.score(X_test, Y_test)))

    print("Logistic Regression with L1 Regularization: Confusion Matrix")
    cnf_matrix_lg_l1 = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_lg_l1)
    
    print(classification_report(Y_test,y_pred))
    print("Logistic Regression with L1 Regularization: Classification Report")

    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("Logistic Regression with L1 Regularization: End")

Logistic Regression with L1 Regularization: End


### Naive Bayes

In [59]:
# Gaussian Naive Bayes Classification
if enable_nbayes:
    print("Naive Bayes: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    nbayes = GaussianNB()
    print(nbayes)
    print("Naive Bayes: Fit")
    nbayes.fit(X_train, Y_train)
    # save model to file
    pickle.dump(nbayes, open(file_nbayes, "wb"))

if predict_nbayes:
    # load model from file
    loaded_model = pickle.load(open(file_nbayes, "rb"))
    print("Naive Bayes: Predict")
    y_pred = nbayes.predict(X_test)
    print('Accuracy of Naive Bayes classifier on train set: {:.2f}'.format(nbayes.score(X_train, Y_train)))
    print('Accuracy of Naove Nayes classifier on test set: {:.2f}'.format(nbayes.score(X_test, Y_test)))
    
    cnf_matrix_dt = confusion_matrix(Y_test, y_pred, labels=labels)
    print("Naive Bayes: Confusion Matrix")
    print(cnf_matrix_dt)
    print("Naive Bayes: Classification Report")
    print(classification_report(Y_test,y_pred, labels=labels))
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    print("Naive Bayes: End")

Naive Bayes: Start
Sat Dec  8 20:23:47 2018
GaussianNB(priors=None)
Naive Bayes: Fit
Naive Bayes: Predict
Accuracy of Naive Bayes classifier on train set: 0.59
Accuracy of Naove Nayes classifier on test set: 0.59
Naive Bayes: Confusion Matrix
[[439434 391681]
 [206769 406962]]
Naive Bayes: Classification Report
             precision    recall  f1-score   support

          1       0.68      0.53      0.59    831115
          0       0.51      0.66      0.58    613731

avg / total       0.61      0.59      0.59   1444846

Sat Dec  8 20:23:57 2018
Naive Bayes: End


### Decision Tree

In [60]:
if enable_dt:
    print("Decision Tree: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    tree = DecisionTreeClassifier(criterion='entropy',max_depth=50)
    print(tree)
    print("Decision Tree: Fit")
    tree.fit(X_train, Y_train)
    # save model to file
    pickle.dump(tree, open(file_dt, "wb"))

if predict_dt:
    # load model from file
    loaded_model = pickle.load(open(file_dt, "rb"))
    print("Decision Tree: Predict")
    y_pred = tree.predict(X_test)
    print('Accuracy of Decision Tree classifier on train set: {:.2f}'.format(tree.score(X_train, Y_train)))
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(tree.score(X_test, Y_test)))
    
    print()
    cnf_matrix_dt = confusion_matrix(Y_test, y_pred, labels=labels)
    print("Decision Tree: Confusion Matrix")
    print(cnf_matrix_dt)
    print()
    print("Decision Tree: Classification Report")
    print()
    print(classification_report(Y_test,y_pred, labels=labels))
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    print("Decision Tree: End")

Decision Tree: Start
Sat Dec  8 20:23:57 2018
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=50,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Decision Tree: Fit
Decision Tree: Predict
Accuracy of Decision Tree classifier on train set: 0.85
Accuracy of Decision Tree classifier on test set: 0.62

Decision Tree: Confusion Matrix
[[520860 310255]
 [244228 369503]]

Decision Tree: Classification Report

             precision    recall  f1-score   support

          1       0.68      0.63      0.65    831115
          0       0.54      0.60      0.57    613731

avg / total       0.62      0.62      0.62   1444846

Sat Dec  8 20:24:47 2018
Decision Tree: End


### K-N-N

In [53]:
if enable_knn:
    print("KNN: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski', n_jobs = -1)
    print(knn)
    print("KNN: Fit")
    knn.fit(X_train, Y_train)

    # save model to file
    pickle.dump(knn, open(file_knn, "wb"))

if predict_knn:
    # load model from file
    loaded_model = pickle.load(open(file_knn, "rb"))
    
    print("KNN: Predict")
    y_pred = knn.predict(X_test)
    print('Accuracy of KNN classifier on train set: {:.2f}'.format(knn.score(X_train, Y_train)))
    print('Accuracy of KNN classifier on test set: {:.2f}'.format(knn.score(X_test, Y_test)))
    print()
    print("KNN: Confusion Matrix")
    cnf_matrix_knn = confusion_matrix(Y_test, y_pred, labels=labels)
    print(cnf_matrix_knn)
    print()
    print("KNN: Classification Report")
    print(classification_report(Y_test,y_pred, labels=labels))
    print()
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))

    print("KNN: End")

## ANN - Multilayer Perceptron

In [62]:
if enable_mlp:
    print("Multilayer Preceptron: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    
    #mlpc = MLPClassifier(alpha=1)
    #mlpc = MLPClassifier(hidden_layer_sizes=(12, 12, 12), max_iter=model_max_iter, verbose=verbose_level)
    #mlpc = MLPClassifier(hidden_layer_sizes=(25, 25, 25), verbose=verbose_level, max_iter=model_max_iter)
    mlpc = MLPClassifier(hidden_layer_sizes=(12, 12, 12), verbose=verbose_level, max_iter=model_max_iter, tol = 0.0001)
    print(mlpc)
    #mlp = multilayer_perceptron(n_hidden =2, activation='logistic', algorithm='sgd', random_state=3)
    print("Multilayer Preceptron: fit")
    mlpc.fit(X_train, Y_train)
    
    # save model to file
    pickle.dump(mlpc, open(file_mlp, "wb"))
    
if predict_mlp:
    
    # load model from file
    loaded_model = pickle.load(open(file_mlp, "rb"))
    print("Multilayer Preceptron: Predict")
    y_pred = mlpc.predict(X_test)

    print('Accuracy of Multilayer Perceptron classifier on train set: {:.2f}'.format(mlpc.score(X_train, Y_train)))
    print('Accuracy of Multilayer Perceptron classifier on test set: {:.2f}'.format(mlpc.score(X_test, Y_test)))
    print()
    print("Multilayer Preceptron: Confusion Matrix")
    cnf_matrix_mlp = confusion_matrix(Y_test, y_pred, labels=labels)
    print(cnf_matrix_mlp)
    print()
    print("Multilayer Preceptron: Classificiation Report")
    print(classification_report(Y_test,y_pred, labels=labels))
    (print)
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
    print("Multilayer Preceptron: End")

Multilayer Preceptron: Start
Sat Dec  8 20:37:59 2018
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(12, 12, 12), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=3, warm_start=False)
Multilayer Preceptron: fit
Iteration 1, loss = 0.60843744
Iteration 2, loss = 0.59878281
Iteration 3, loss = 0.59625361
Iteration 4, loss = 0.59523437
Iteration 5, loss = 0.59470495
Iteration 6, loss = 0.59434857
Iteration 7, loss = 0.59400352
Iteration 8, loss = 0.59380144
Iteration 9, loss = 0.59358945
Iteration 10, loss = 0.59337852
Iteration 11, loss = 0.59306868
Iteration 12, loss = 0.59289636
Iteration 13, loss = 0.59278677
Iteration 14, loss = 0.59270350
Iteration 15, loss = 0.59264548
Iteration 16

### SVM

In [63]:
if enable_svm:
    print("SVM: Start")
    t_start =  time.time()
    print(time.asctime( time.localtime(t_start) ))
    #svm = SVC(C=1, random_state=0, kernel='sigmoid', verbose=True)
    #svm = SVC(C=1, random_state=0, kernel='linear', verbose=True, cache_size=200)
    #svm = SVC(C=svm_c, gamma=svm_gamma, verbose = verbose_level)
    #SVN prediction is taking for ever, limiting the max_iter to 100 instead of -1 (no limit)
    svm = SVC(C=1, gamma = 'auto', verbose = verbose_level, max_iter=model_max_iter)
    print(svm)
    print("SVM: Fit")
    svm.fit(X_train, Y_train)

    # save model to file
    pickle.dump(svm, open(file_svm, "wb"))
    
if predict_svm:
    # load model from file
    loaded_model = pickle.load(open(file_svm, "rb"))
    print("SVM: Predict")
    y_pred = svm.predict(X_test)
    
    print('Accuracy of SVM classifier on train set: {:.2f}'.format(svm.score(X_train, Y_train)))
    print('Accuracy of SVM classifier on test set: {:.2f}'.format(svm.score(X_test, Y_test)))
    
    print("SVM: Confusion Matrix")
    cnf_matrix_svm = confusion_matrix(Y_test, y_pred, labels=labels)
    print(cnf_matrix_svm)
    
    print("SVM: Classfication Report")
    print(classification_report(Y_test,y_pred, labels=labels))
    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
    print("SVM: End")

SVM: Start
Sat Dec  8 20:41:34 2018
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=100, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=3)
SVM: Fit
[LibSVM]



SVM: Predict
Accuracy of SVM classifier on train set: 0.43
Accuracy of SVM classifier on test set: 0.43
SVM: Confusion Matrix
[[ 23335 807780]
 [ 18856 594875]]
SVM: Classfication Report
             precision    recall  f1-score   support

          1       0.55      0.03      0.05    831115
          0       0.42      0.97      0.59    613731

avg / total       0.50      0.43      0.28   1444846

Sat Dec  8 20:42:32 2018
SVM: End


In [56]:
t_end =  time.time()
print(time.asctime( time.localtime(t_end) ))

Sat Dec  8 19:17:55 2018
