In [1]:
import numpy as np
#in case we need to repeat experiment
#np.random.seed(255)

import pandas as pd
pd.options.display.max_rows = 22

import matplotlib.pyplot as plt
plt.style.use('classic')

import seaborn as sns
sns.set()

#sklearn imports
from sklearn.cluster import KMeans
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from pandas.api.types import CategoricalDtype
from sklearn.naive_bayes import GaussianNB
import pickle # allows for model to be saved/load to file
import time

#Use print instead of display when run as python script
pyscript = True

#Classifier verborsity where supported
verbose_level=3

#Multiclass classification, binary if falase
multiclass = False
over_sample = True

#inputfile = 'CKME136X10_2018_Data_CTF.csv'
if multiclass:
    inputfile_train_O = 'CKME136X10_2018_Data_CTFB_M_O_Train.csv'
    inputfile_train_U = 'CKME136X10_2018_Data_CTFB_M_U_Train.csv'
    inputfile_test = 'CKME136X10_2018_Data_CTFB_M_Test.csv'
else:
    inputfile_train_O = 'CKME136X10_2018_Data_CTFB_B_O_Train.csv'
    inputfile_train_U = 'CKME136X10_2018_Data_CTFB_B_U_Train.csv'
    inputfile_test = 'CKME136X10_2018_Data_CTFB_B_Test.csv'

if over_sample:
    datafile_train = inputfile_train_O
else:
    datafile_train = inputfile_train_U

datafile_test = inputfile_test
    
model_max_iter = 1000
datestr = 'dec_05_binary_run_1000_KBO'

#Model Store
file_lr = 'lr_' + datestr + '.model'
file_lr_l1 = 'lr_l2_' + datestr + '.model'
file_dt = 'dt_' + datestr + '.model'
file_svm = 'svm_' + datestr + '.model'
file_knn = 'knn_' + datestr + '.model'
file_mlp = 'mlp_' + datestr + '.model'
file_kmean = 'kmean_' + datestr + '.model'
file_nbayes = 'nbayes_' + datestr + '.model'

file_final_train = 'final_train_' + datestr + '.csv'
file_final_test = 'final_test_' + datestr + '.csv'

#Enable Optimization Algorithms
enable_grid_search = False
svm_c = 1
svm_gamma = 1
nfold = 10

enable_lr_l1 = True
predict_lr_l1 = True

# Enable Algorithms
enable_lr = True
enable_dt = True
enable_svm = True
enable_knn = True
enable_mlp = True
enable_nbayes = True

predict_lr = False
predict_dt = True
predict_svm = True
predict_knn = True
predict_mlp = True
predict_nbayes = True

In [2]:
#print("Sample size: {}".format(sampleN))

if multiclass:
    print("Multi-Class Classification: Enabled")
else:
    print("Multi-Class Classification: Disabled")

if enable_lr:
    print("Logistic Regression: Enabled")
else:
    print("Logistic Regression: Disabled")
    
if enable_dt:
    print("Decision Tree: Enabled")
else:
    print("Decision Tree: Disabled")
    
if enable_svm:
    print("Support Vector Machines: Enabled")
else:
    print("Support Vector Machines: Disabled")

if  enable_knn:
    print("KNN: Enabled")
else:
    print("KNN: Disabled")
    
if enable_nbayes:
    print("Naive Bayes: Enabled")
else:
    print("Naive Bayes: Disabled")
    
if enable_mlp:
    print("MLP: Enabled")
else:
    print("MLP: Disabled")

Multi-Class Classification: Disabled
Logistic Regression: Enabled
Decision Tree: Enabled
Support Vector Machines: Enabled
KNN: Enabled
Naive Bayes: Enabled
MLP: Enabled


In [3]:
#load data
df_test = pd.read_csv(datafile_test, engine = 'python')
df_train = pd.read_csv(datafile_train, engine = 'python')
df = df_train.copy()

print(df_test.head(2))
print(df_train.head(2))

   C_YEAR  C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  \
0      11       8       3       0       3      36       2       1       1   
1      17       3       5       2       3      21       2       1       1   

   C_RALN  C_TRAF  V_TYPE  V_YEAR  P_SEX  P_AGE  P_PSN  P_SAFE  P_USER  P_ISEV  
0       1       1       5       4      1     24      2       2       2       0  
1       1       1       7       5      1     27      1       2       1       0  
   C_YEAR  C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  \
0       5       4       7       4       1       6       1       3       2   
1       5       1       5       1       2      21       2       1       6   

   C_RALN  C_TRAF  V_TYPE  V_YEAR  P_SEX  P_AGE  P_PSN  P_SAFE  P_USER  P_ISEV  
0       1       7       1       4      0     10      2       2       2       0  
1       1       1       1       3      0     33      1       2       1       1  


In [4]:
df_test_cat = df_test.astype('category').copy()
df_train_cat = df_train.astype('category').copy()

In [5]:
# convert to the correct type
df_train_cat['C_YEAR'] = df_train_cat['C_YEAR'].astype(CategoricalDtype(ordered=True))
df_train_cat['C_MNTH'] = df_train_cat['C_MNTH'].astype(CategoricalDtype(ordered=True))
df_train_cat['C_WDAY'] = df_train_cat['C_WDAY'].astype(CategoricalDtype(ordered=True))
df_train_cat['C_HOUR'] = df_train_cat['C_HOUR'].astype(CategoricalDtype(ordered=True))
df_train_cat['C_VEHS'] = df_train_cat['C_VEHS'].astype('int')
df_train_cat['V_YEAR'] = df_train_cat['V_YEAR'].astype(CategoricalDtype(ordered=True))
df_train_cat['P_PSN'] = df_train_cat['P_PSN'].astype(CategoricalDtype(ordered=True))
df_train_cat['P_AGE'] = df_train_cat['P_AGE'].astype('int')
df_train_cat['P_ISEV'] = df_train_cat['P_ISEV'].astype('int')

In [6]:
# convert to the correct type
df_test_cat['C_YEAR'] = df_test_cat['C_YEAR'].astype(CategoricalDtype(ordered=True))
df_test_cat['C_MNTH'] = df_test_cat['C_MNTH'].astype(CategoricalDtype(ordered=True))
df_test_cat['C_WDAY'] = df_test_cat['C_WDAY'].astype(CategoricalDtype(ordered=True))
df_test_cat['C_HOUR'] = df_test_cat['C_HOUR'].astype(CategoricalDtype(ordered=True))
df_test_cat['C_VEHS'] = df_test_cat['C_VEHS'].astype('int')
df_test_cat['V_YEAR'] = df_test_cat['V_YEAR'].astype(CategoricalDtype(ordered=True))
df_test_cat['P_PSN'] = df_test_cat['P_PSN'].astype(CategoricalDtype(ordered=True))
df_test_cat['P_AGE'] = df_test_cat['P_AGE'].astype('int')
df_test_cat['P_ISEV'] = df_test_cat['P_ISEV'].astype('int')
print(df_test_cat.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1300968 entries, 0 to 1300967
Data columns (total 19 columns):
C_YEAR    1300968 non-null category
C_MNTH    1300968 non-null category
C_WDAY    1300968 non-null category
C_HOUR    1300968 non-null category
C_VEHS    1300968 non-null int64
C_CONF    1300968 non-null category
C_RCFG    1300968 non-null category
C_WTHR    1300968 non-null category
C_RSUR    1300968 non-null category
C_RALN    1300968 non-null category
C_TRAF    1300968 non-null category
V_TYPE    1300968 non-null category
V_YEAR    1300968 non-null category
P_SEX     1300968 non-null category
P_AGE     1300968 non-null int64
P_PSN     1300968 non-null category
P_SAFE    1300968 non-null category
P_USER    1300968 non-null category
P_ISEV    1300968 non-null int64
dtypes: category(16), int64(3)
memory usage: 49.6 MB
None


In [7]:
total_test_Rows = df_test_cat.index.size
print("Number of Rows in test data: {}".format(total_test_Rows))

total_train_Rows = df_train_cat.index.size
print("Number of Rows in train data: {}".format(total_train_Rows))

Number of Rows in test data: 1300968
Number of Rows in train data: 3534154


In [8]:
#Split between data and class for training
Y = df_train_cat[df_train_cat.columns[-1]]
X = df_train_cat[df_train_cat.columns[0:df_train_cat.columns.size -1]]

Y_test = df_test_cat[df_test_cat.columns[-1]]
X_test = df_test_cat[df_test_cat.columns[0:df_test_cat.columns.size -1]]

In [10]:
print(Y.unique())
print(Y.groupby(Y).size())
print(Y_test.unique())
print(Y_test.groupby(Y_test).size())

[0 1]
P_ISEV
0    1767077
1    1767077
Name: P_ISEV, dtype: int64
[0 1]
P_ISEV
0    543649
1    757319
Name: P_ISEV, dtype: int64


In [19]:
if (False):    
    seed = 101
    print("Logistic Regression: Start")
    t_start =  time.time()
    print()
    print("Model: Logistic Regression")
    print(time.asctime( time.localtime(t_start) ))
    print()
    print("Using Kfold: {}".format(nfold))
    #kfold = model_selection.StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
    kfold = model_selection.KFold(n_splits=nfold, random_state=seed)
    
    models = []
    model_scores_train = []
    model_scores_val = []
    
    X_kfold = np.array(X)
    Y_kfold = np.array(Y)
    
    i = 0
    for train_index, val_index in kfold.split(X_kfold):
        i = i+1
        print("Fold: {}".format(i))
        
        X_train, X_val = X_kfold[train_index], X_kfold[val_index]
        y_train, y_val = Y_kfold[train_index], Y_kfold[val_index]
        
        lr = LogisticRegression(C=1, random_state=0, solver='saga', multi_class='ovr', 
                            verbose=verbose_level, n_jobs=10, max_iter=model_max_iter)
        print()
        print("Model: Logistic Regression")
        print(lr)
        print()
        print("Logistic Regression: Fit")
        lr.fit(X_train, y_train)

        #add model for list
        models.append(lr)

        print("Logistic Regression: Predict")
        y_pred = lr.predict(X_val)

        mst = lr.score(X_train, y_train)
        model_scores_train.append(mst)
        
        msv = lr.score(X_val, y_val)
        model_scores_val.append(msv)
        
        print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(mst))
        print('Accuracy of logistic regression classifier on validation set: {:.2f}'.format(msv))

        # print the intercept (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
        print("Logistic Regression: Intercept")
        print(lr.intercept_)

        # print the coeficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
        print("Logistic Regression: Coefficients")
        print(lr.coef_)

        print("Logistic Regression: Confusion Matrix")
        cnf_matrix_lg = confusion_matrix(y_val, y_pred)
        print(cnf_matrix_lg)
    
        print("Logistic Regression: Classification Report")
        print(classification_report(y_val, y_pred))

    print('AVG Accuracy of logistic regression classifier on train set: {:.2f}'.format(np.mean(model_scores_train)))
    print('AVG Accuracy of logistic regression classifier on validation set: {:.2f}'.format(np.mean(model_scores_val)))
    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
    # save model to file
    pickle.dump(models, open(file_lr, "wb"))
    print("Logistic Regression: End")

Logistic Regression: Start

Model: Logistic Regression
Tue Dec  4 22:22:45 2018

Using Stratified Kfold: 10
Fold: 1

Model: Logistic Regression
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=10,
          penalty='l2', random_state=0, solver='saga', tol=0.0001,
          verbose=3, warm_start=False)

Logistic Regression: Fit


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 975 epochs took 1 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.66
Accuracy of logistic regression classifier on validation set: 0.60
Logistic Regression: Intercept
[0.36997666]
Logistic Regression: Coefficients
[[ 0.02991854 -0.00648926  0.02842     0.06431164 -0.31380016 -0.00377841
   0.02341607  0.19222788  0.04470878  0.25094103  0.02906342  0.09359306
  -0.20325479 -0.75084684  0.01033195 -0.33811546 -0.07302622  0.28381225]]
Logistic Regression: Confusion Matrix
[[14 25]
 [15 46]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.48      0.36      0.41        39
           1       0.65      0.75      0.70        61

   micro avg       0.60      0.60      0.60       100
   macro avg       0.57      0.56      0.55       100
weighted avg       0.58      0.60      0.59       100

Fold: 2

Model: Logistic Regression
LogisticRegression(C=1, class_wei

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.7s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


max_iter reached after 1 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.65
Accuracy of logistic regression classifier on validation set: 0.62
Logistic Regression: Intercept
[0.432285]
Logistic Regression: Coefficients
[[ 0.01732206 -0.00434748  0.00090096  0.03168285 -0.39724951 -0.00126389
  -0.01465608  0.11509176  0.09779617  0.19379823  0.03955796  0.10495565
  -0.07688629 -0.81476591  0.0116579  -0.34548646 -0.07837543  0.28451716]]
Logistic Regression: Confusion Matrix
[[20 23]
 [15 42]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.57      0.47      0.51        43
           1       0.65      0.74      0.69        57

   micro avg       0.62      0.62      0.62       100
   macro avg       0.61      0.60      0.60       100
weighted avg       0.61      0.62      0.61       100

Fold: 3

Model: Logistic Regression
LogisticRegression(C=1, class_weight=None, dua

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.5s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


max_iter reached after 2 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.66
Accuracy of logistic regression classifier on validation set: 0.64
Logistic Regression: Intercept
[0.3515762]
Logistic Regression: Coefficients
[[ 0.03264259 -0.01394384  0.01142831  0.04958763 -0.30458855 -0.00471532
   0.01532691  0.13866902  0.06788217  0.25718318  0.01297172  0.08553083
  -0.15958632 -0.75234523  0.01308548 -0.37594709 -0.09970992  0.39182431]]
Logistic Regression: Confusion Matrix
[[20 21]
 [15 44]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.57      0.49      0.53        41
           1       0.68      0.75      0.71        59

   micro avg       0.64      0.64      0.64       100
   macro avg       0.62      0.62      0.62       100
weighted avg       0.63      0.64      0.63       100

Fold: 4

Model: Logistic Regression
LogisticRegression(C=1, class_weight=None, du

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.9s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 936 epochs took 1 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.67
Accuracy of logistic regression classifier on validation set: 0.58
Logistic Regression: Intercept
[0.25611814]
Logistic Regression: Coefficients
[[ 0.02467214  0.00786681  0.00737996  0.05087882 -0.38757633  0.00126098
  -0.02382315  0.08640305  0.13483321  0.32501798  0.02429862  0.09381254
  -0.15922401 -0.79766457  0.01500165 -0.38669952 -0.07034517  0.29120771]]
Logistic Regression: Confusion Matrix
[[21 26]
 [16 37]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.57      0.45      0.50        47
           1       0.59      0.70      0.64        53

   micro avg       0.58      0.58      0.58       100
   macro avg       0.58      0.57      0.57       100
weighted avg       0.58      0.58      0.57       100

Fold: 5

Model: Logistic Regression
LogisticRegression(C=1, class_wei

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.3s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 863 epochs took 2 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.65
Accuracy of logistic regression classifier on validation set: 0.65
Logistic Regression: Intercept
[0.14426047]
Logistic Regression: Coefficients
[[ 2.06225947e-02 -8.96618499e-04  1.24014634e-02  6.42784235e-02
  -3.59518050e-01 -1.80227009e-04 -2.90397782e-02  1.66467127e-01
   6.75205281e-02  2.36225712e-01  3.40999903e-02  1.00930347e-01
  -9.03476511e-02 -7.72733461e-01  1.20404884e-02 -3.25826161e-01
  -9.45452406e-02  2.80868152e-01]]
Logistic Regression: Confusion Matrix
[[23 19]
 [16 42]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.59      0.55      0.57        42
           1       0.69      0.72      0.71        58

   micro avg       0.65      0.65      0.65       100
   macro avg       0.64      0.64      0.64       100
weighted avg       0.65      0.65      0.65     

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.2s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


max_iter reached after 1 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.66
Accuracy of logistic regression classifier on validation set: 0.65
Logistic Regression: Intercept
[0.33183074]
Logistic Regression: Coefficients
[[ 0.02664171 -0.00387027  0.00600924  0.05328419 -0.37885621  0.00390018
  -0.05816996  0.07586503  0.08964424  0.27474556  0.04049263  0.08520034
  -0.1660582  -0.73117219  0.01221123 -0.46587699 -0.04474201  0.41251254]]
Logistic Regression: Confusion Matrix
[[22 20]
 [15 43]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.59      0.52      0.56        42
           1       0.68      0.74      0.71        58

   micro avg       0.65      0.65      0.65       100
   macro avg       0.64      0.63      0.63       100
weighted avg       0.65      0.65      0.65       100

Fold: 7

Model: Logistic Regression
LogisticRegression(C=1, class_weight=None, d

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.5s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 948 epochs took 2 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.66
Accuracy of logistic regression classifier on validation set: 0.64
Logistic Regression: Intercept
[0.30628116]
Logistic Regression: Coefficients
[[ 0.02713421  0.00372541  0.01727455  0.03745634 -0.3354919  -0.00086874
   0.03578313  0.14584621  0.07585343  0.32100064  0.02416763  0.10189091
  -0.19436181 -0.7063982   0.01071075 -0.30595434 -0.07320319  0.24935919]]
Logistic Regression: Confusion Matrix
[[20 28]
 [ 8 44]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.71      0.42      0.53        48
           1       0.61      0.85      0.71        52

   micro avg       0.64      0.64      0.64       100
   macro avg       0.66      0.63      0.62       100
weighted avg       0.66      0.64      0.62       100

Fold: 8

Model: Logistic Regression
LogisticRegression(C=1, class_wei

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.8s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 935 epochs took 2 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.65
Accuracy of logistic regression classifier on validation set: 0.63
Logistic Regression: Intercept
[0.16450401]
Logistic Regression: Coefficients
[[ 0.03112169  0.02288548  0.02297927  0.05750409 -0.33335224  0.00188815
  -0.07293399  0.18411444  0.08305029  0.32358592  0.01340494  0.09530547
  -0.18634955 -0.74240829  0.00956534 -0.42338064 -0.04849229  0.30834278]]
Logistic Regression: Confusion Matrix
[[22 20]
 [17 41]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.56      0.52      0.54        42
           1       0.67      0.71      0.69        58

   micro avg       0.63      0.63      0.63       100
   macro avg       0.62      0.62      0.62       100
weighted avg       0.63      0.63      0.63       100

Fold: 9

Model: Logistic Regression
LogisticRegression(C=1, class_wei

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.3s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 949 epochs took 2 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.66
Accuracy of logistic regression classifier on validation set: 0.67
Logistic Regression: Intercept
[0.24531387]
Logistic Regression: Coefficients
[[ 0.03426961  0.00934676  0.03704784  0.07756577 -0.35752106  0.00098216
   0.01566866  0.10382548  0.09186648  0.20637084  0.02981184  0.08322785
  -0.23054514 -0.73100724  0.01232253 -0.35523082 -0.0965091   0.35016256]]
Logistic Regression: Confusion Matrix
[[17 15]
 [18 50]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.49      0.53      0.51        32
           1       0.77      0.74      0.75        68

   micro avg       0.67      0.67      0.67       100
   macro avg       0.63      0.63      0.63       100
weighted avg       0.68      0.67      0.67       100

Fold: 10

Model: Logistic Regression
LogisticRegression(C=1, class_we

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    2.0s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 972 epochs took 1 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.66
Accuracy of logistic regression classifier on validation set: 0.66
Logistic Regression: Intercept
[0.39969484]
Logistic Regression: Coefficients
[[ 0.02395756  0.00262835  0.00537847  0.05263413 -0.33179558 -0.0021787
   0.0400291   0.13961199  0.14017955  0.13649398  0.02710014  0.09731551
  -0.15565006 -0.73035572  0.01053861 -0.36417491 -0.06029987  0.26730208]]
Logistic Regression: Confusion Matrix
[[26 21]
 [13 40]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.67      0.55      0.60        47
           1       0.66      0.75      0.70        53

   micro avg       0.66      0.66      0.66       100
   macro avg       0.66      0.65      0.65       100
weighted avg       0.66      0.66      0.66       100

AVG Accuracy of logistic regression classifier on train set: 0.66
AVG 

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.3s finished


In [23]:
if (True):    
    seed = 101
    print("Logistic Regression: Start")
    t_start =  time.time()
    print()
    print("Model: Logistic Regression")
    print(time.asctime( time.localtime(t_start) ))
    print()
    print("Using Stratified Kfold: {}".format(nfold))
    
    models = []
    model_scores_train = []
    model_scores_val = []
    
    X_kfold = np.array(X)
    Y_kfold = np.array(Y)
    
    skf = model_selection.StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)    
    i = 0
    for train_index, val_index in skf.split(X_kfold, Y_kfold):
        i = i+1
        print("Fold: {}".format(i))
        
        X_train, X_val = X_kfold[train_index], X_kfold[val_index]
        y_train, y_val = Y_kfold[train_index], Y_kfold[val_index]
        
        lr = LogisticRegression(C=1, random_state=0, solver='saga', multi_class='ovr', 
                            verbose=verbose_level, n_jobs=10, max_iter=model_max_iter)
        print()
        print("Model: Logistic Regression")
        print(lr)
        print()
        print("Logistic Regression: Fit")
        lr.fit(X_train, y_train)

        #add model for list
        models.append(lr)

        print("Logistic Regression: Predict")
        y_pred = lr.predict(X_val)

        mst = lr.score(X_train, y_train)
        model_scores_train.append(mst)
        
        msv = lr.score(X_val, y_val)
        model_scores_val.append(msv)
        
        print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(mst))
        print('Accuracy of logistic regression classifier on validation set: {:.2f}'.format(msv))

        # print the intercept (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
        print("Logistic Regression: Intercept")
        print(lr.intercept_)

        # print the coeficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
        print("Logistic Regression: Coefficients")
        print(lr.coef_)

        print("Logistic Regression: Confusion Matrix")
        cnf_matrix_lg = confusion_matrix(y_val, y_pred)
        print(cnf_matrix_lg)
    
        print("Logistic Regression: Classification Report")
        print(classification_report(y_val, y_pred))

    print('AVG Accuracy of logistic regression classifier on train set: {:.2f}'.format(np.mean(model_scores_train)))
    print('AVG Accuracy of logistic regression classifier on validation set: {:.2f}'.format(np.mean(model_scores_val)))
    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
    # save model to file
    pickle.dump(models, open(file_lr, "wb"))
    print("Logistic Regression: End")

Logistic Regression: Start

Model: Logistic Regression
Tue Dec  4 22:32:47 2018

Using Stratified Kfold: 10
Fold: 1

Model: Logistic Regression
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=10,
          penalty='l2', random_state=0, solver='saga', tol=0.0001,
          verbose=3, warm_start=False)

Logistic Regression: Fit


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 988 epochs took 1 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.65
Accuracy of logistic regression classifier on validation set: 0.65
Logistic Regression: Intercept
[0.34341687]
Logistic Regression: Coefficients
[[ 3.04507271e-02  4.77931376e-04  1.44437028e-02  5.49833092e-02
  -3.75685183e-01  7.37424515e-03 -2.46837714e-02  1.63831394e-01
   3.16348085e-02  3.44472120e-01  1.31472056e-02  1.01884380e-01
  -1.90883423e-01 -6.98523417e-01  1.18557394e-02 -3.56454663e-01
  -1.28987443e-01  2.91112513e-01]]
Logistic Regression: Confusion Matrix
[[22 21]
 [14 44]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.61      0.51      0.56        43
           1       0.68      0.76      0.72        58

   micro avg       0.65      0.65      0.65       101
   macro avg       0.64      0.64      0.64       101
weighted avg       0.65      0.65      0.65     

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.2s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 941 epochs took 1 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.66
Accuracy of logistic regression classifier on validation set: 0.64
Logistic Regression: Intercept
[0.20800668]
Logistic Regression: Coefficients
[[ 0.02444879  0.01549295 -0.02209847  0.08025617 -0.33123298 -0.00494305
   0.03185173  0.16361672  0.11711366  0.210368    0.0267246   0.1040551
  -0.13242584 -0.71618404  0.01171287 -0.35102271 -0.07362223  0.26427623]]
Logistic Regression: Confusion Matrix
[[19 24]
 [12 46]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.61      0.44      0.51        43
           1       0.66      0.79      0.72        58

   micro avg       0.64      0.64      0.64       101
   macro avg       0.64      0.62      0.62       101
weighted avg       0.64      0.64      0.63       101

Fold: 3

Model: Logistic Regression
LogisticRegression(C=1, class_weig

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.1s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 853 epochs took 1 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.66
Accuracy of logistic regression classifier on validation set: 0.59
Logistic Regression: Intercept
[0.21804965]
Logistic Regression: Coefficients
[[ 1.95372071e-02 -7.57754049e-03  1.21822496e-02  4.96291355e-02
  -3.76587794e-01  3.84512285e-04  3.27302886e-03  1.21464100e-01
   9.82414858e-02  2.67955137e-01  3.14027262e-02  1.12409014e-01
  -6.53625779e-02 -7.74491839e-01  1.04123393e-02 -3.23772171e-01
  -6.72199319e-02  1.71225689e-01]]
Logistic Regression: Confusion Matrix
[[22 21]
 [20 38]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.52      0.51      0.52        43
           1       0.64      0.66      0.65        58

   micro avg       0.59      0.59      0.59       101
   macro avg       0.58      0.58      0.58       101
weighted avg       0.59      0.59      0.59     

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.1s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 972 epochs took 2 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.66
Accuracy of logistic regression classifier on validation set: 0.61
Logistic Regression: Intercept
[0.21649908]
Logistic Regression: Coefficients
[[ 2.46447700e-02 -2.92142771e-03  9.69494202e-03  2.12521577e-02
  -3.54866058e-01  6.02819238e-04  3.37514479e-04  1.14852609e-01
   6.36933184e-02  3.18291257e-01  3.39338173e-02  9.77111772e-02
  -1.60386815e-01 -7.60363272e-01  1.22050657e-02 -3.90165419e-01
  -6.83715374e-02  4.07184443e-01]]
Logistic Regression: Confusion Matrix
[[20 22]
 [17 41]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.54      0.48      0.51        42
           1       0.65      0.71      0.68        58

   micro avg       0.61      0.61      0.61       100
   macro avg       0.60      0.59      0.59       100
weighted avg       0.60      0.61      0.61     

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.6s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


max_iter reached after 2 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.65
Accuracy of logistic regression classifier on validation set: 0.71
Logistic Regression: Intercept
[0.34487919]
Logistic Regression: Coefficients
[[ 0.03496585  0.00689144  0.0234258   0.06104537 -0.32708572 -0.00362836
   0.0332625   0.13129417  0.09959308  0.23212216  0.03993626  0.07699245
  -0.24073289 -0.69763153  0.0121592  -0.39278097 -0.08147054  0.33468039]]
Logistic Regression: Confusion Matrix
[[26 16]
 [13 45]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.67      0.62      0.64        42
           1       0.74      0.78      0.76        58

   micro avg       0.71      0.71      0.71       100
   macro avg       0.70      0.70      0.70       100
weighted avg       0.71      0.71      0.71       100

Fold: 6

Model: Logistic Regression
LogisticRegression(C=1, class_weight=None, d

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.6s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


max_iter reached after 1 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.66
Accuracy of logistic regression classifier on validation set: 0.64
Logistic Regression: Intercept
[0.44009192]
Logistic Regression: Coefficients
[[ 3.42359162e-02  4.99758858e-03  2.05312979e-02  7.33060113e-02
  -3.53542621e-01  5.60204220e-04 -2.83233979e-02  1.40385425e-01
   9.87329324e-02  1.81058041e-01  2.18359198e-02  8.33532464e-02
  -2.27139641e-01 -7.53797095e-01  1.18462898e-02 -3.50967504e-01
  -8.79264209e-02  3.77951161e-01]]
Logistic Regression: Confusion Matrix
[[21 21]
 [15 43]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.58      0.50      0.54        42
           1       0.67      0.74      0.70        58

   micro avg       0.64      0.64      0.64       100
   macro avg       0.63      0.62      0.62       100
weighted avg       0.63      0.64      0.64       100

Fold

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.2s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


max_iter reached after 1 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.66
Accuracy of logistic regression classifier on validation set: 0.56
Logistic Regression: Intercept
[0.31614624]
Logistic Regression: Coefficients
[[ 0.02864942 -0.00438289  0.03122795  0.04576609 -0.38045972 -0.00148946
  -0.09687265  0.12956447  0.07662324  0.2363463   0.01245744  0.10264136
  -0.11119072 -0.75965612  0.01424793 -0.42891522 -0.05907092  0.36773568]]
Logistic Regression: Confusion Matrix
[[13 29]
 [15 43]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.46      0.31      0.37        42
           1       0.60      0.74      0.66        58

   micro avg       0.56      0.56      0.56       100
   macro avg       0.53      0.53      0.52       100
weighted avg       0.54      0.56      0.54       100

Fold: 8

Model: Logistic Regression
LogisticRegression(C=1, class_weight=None, d

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.4s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


max_iter reached after 2 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.64
Accuracy of logistic regression classifier on validation set: 0.73
Logistic Regression: Intercept
[0.37015295]
Logistic Regression: Coefficients
[[ 0.02457678  0.00400268  0.03474012  0.05004642 -0.34819386 -0.00217912
  -0.00277333  0.0994498   0.10929849  0.2448037   0.01597365  0.09041155
  -0.13425499 -0.73909412  0.00990408 -0.40487913 -0.05381657  0.26981436]]
Logistic Regression: Confusion Matrix
[[26 16]
 [11 46]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.70      0.62      0.66        42
           1       0.74      0.81      0.77        57

   micro avg       0.73      0.73      0.73        99
   macro avg       0.72      0.71      0.72        99
weighted avg       0.73      0.73      0.72        99

Fold: 9

Model: Logistic Regression
LogisticRegression(C=1, class_weight=None, d

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.6s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 879 epochs took 2 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.66
Accuracy of logistic regression classifier on validation set: 0.59
Logistic Regression: Intercept
[0.27797278]
Logistic Regression: Coefficients
[[ 2.49212801e-02 -4.08569833e-03  1.07374123e-02  8.34145208e-02
  -3.45875280e-01  6.03675859e-04  1.30420133e-02  1.70422806e-01
   5.07463977e-02  2.69205470e-01  4.33745636e-02  8.43902656e-02
  -1.91629691e-01 -8.48456101e-01  1.16033371e-02 -3.26864364e-01
  -5.96377973e-02  2.88032122e-01]]
Logistic Regression: Confusion Matrix
[[16 26]
 [15 42]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.52      0.38      0.44        42
           1       0.62      0.74      0.67        57

   micro avg       0.59      0.59      0.59        99
   macro avg       0.57      0.56      0.56        99
weighted avg       0.57      0.59      0.57     

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.6s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.


convergence after 920 epochs took 2 seconds
Logistic Regression: Predict
Accuracy of logistic regression classifier on train set: 0.65
Accuracy of logistic regression classifier on validation set: 0.63
Logistic Regression: Intercept
[0.26145341]
Logistic Regression: Coefficients
[[ 0.02156369  0.00648553  0.01330936  0.02093796 -0.30848298 -0.00170111
   0.00477071  0.11375678  0.14586845  0.22057157  0.03769669  0.08825325
  -0.17129987 -0.79547705  0.0116668  -0.36129382 -0.0583636   0.33107285]]
Logistic Regression: Confusion Matrix
[[20 22]
 [15 42]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.57      0.48      0.52        42
           1       0.66      0.74      0.69        57

   micro avg       0.63      0.63      0.63        99
   macro avg       0.61      0.61      0.61        99
weighted avg       0.62      0.63      0.62        99

AVG Accuracy of logistic regression classifier on train set: 0.66
AVG

[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:    1.9s finished


In [24]:
## for each model generated, lets predict the on test set
## note, this is the first time any of these observations are seen by the model
# load model from file
loaded_model = pickle.load(open(file_lr, "rb"))
print("Logistic Regression: Predict on test set")

i = 0
for model in loaded_model:
    i = i + 1
    y_pred = model.predict(X_test)

    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(model.score(X_test, Y_test)))

    # print the intercept (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
    print("Logistic Regression: Intercept")
    print(model.intercept_)

    # print the coeficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
    print("Logistic Regression: Coefficients")
    print(model.coef_)

    print("Logistic Regression: Confusion Matrix")
    cnf_matrix_lg = confusion_matrix(Y_test, y_pred)
    print(cnf_matrix_lg)
    
    print("Logistic Regression: Classification Report")
    print(classification_report(Y_test, y_pred))

Logistic Regression: Predict on test set
Accuracy of logistic regression classifier on test set: 0.62
Logistic Regression: Intercept
[0.34341687]
Logistic Regression: Coefficients
[[ 3.04507271e-02  4.77931376e-04  1.44437028e-02  5.49833092e-02
  -3.75685183e-01  7.37424515e-03 -2.46837714e-02  1.63831394e-01
   3.16348085e-02  3.44472120e-01  1.31472056e-02  1.01884380e-01
  -1.90883423e-01 -6.98523417e-01  1.18557394e-02 -3.56454663e-01
  -1.28987443e-01  2.91112513e-01]]
Logistic Regression: Confusion Matrix
[[257791 285858]
 [202149 555170]]
Logistic Regression: Classification Report
              precision    recall  f1-score   support

           0       0.56      0.47      0.51    543649
           1       0.66      0.73      0.69    757319

   micro avg       0.62      0.62      0.62   1300968
   macro avg       0.61      0.60      0.60   1300968
weighted avg       0.62      0.62      0.62   1300968

Accuracy of logistic regression classifier on test set: 0.62
Logistic Regress

In [26]:
if enable_lr:
    seed = 101
    print("Logistic Regression: Start")
    t_start =  time.time()
    print()
    print("Model: Logistic Regression")
    print(time.asctime( time.localtime(t_start) ))
    print()
    print("Using Kfold: {}".format(nfold))
    kfold = model_selection.KFold(n_splits=nfold, random_state=seed)
    
    lr = LogisticRegression(C=1, random_state=0, solver='saga', multi_class='ovr', 
                            verbose=verbose_level, n_jobs=10, max_iter=model_max_iter)
    print()
    print("Model: Logistic Regression")
    print(lr)
    print()
    print("Logistic Regression: Fit")
    results = model_selection.cross_val_score(lr, X, Y, cv=kfold)
    #lr.fit(X_train, Y_train)
    
    print(results.mean())
    print(results)
    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("Logistic Regression: End")

Logistic Regression: Start

Model: Logistic Regression
Tue Dec  4 22:45:06 2018

Using Kfold: 10

Model: Logistic Regression
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=10,
          penalty='l2', random_state=0, solver='saga', tol=0.0001,
          verbose=3, warm_start=False)

Logistic Regression: Fit


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   39.4s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   46.7s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   43.2s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   41.6s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   42.3s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out of   1 | elapsed:   41.3s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 out

0.6131342778357511
[0.62299952 0.62310988 0.62280146 0.62307592 0.6237115  0.62411612
 0.62535546 0.62504704 0.62345967 0.5176662 ]
Tue Dec  4 22:52:19 2018
Logistic Regression: End
