In [37]:
import numpy as np
#in case we need to repeat experiment
#np.random.seed(255)

import pandas as pd
pd.options.display.max_rows = 22

import matplotlib.pyplot as plt
plt.style.use('classic')

import seaborn as sns
sns.set()

#sklearn imports
from sklearn.cluster import KMeans
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from pandas.api.types import CategoricalDtype
from sklearn.naive_bayes import GaussianNB
import pickle # allows for model to be saved/load to file
import time

#Use print instead of display when run as python script
pyscript = True

#Classifier verborsity where supported
verbose_level=3

#Multiclass classification, binary if falase
multiclass = True
over_sample = True

#inputfile = 'CKME136X10_2018_Data_CTF.csv'
if multiclass:
    inputfile_train_O = 'CKME136X10_2018_Data_CTFB_M_O_Train.csv'
    inputfile_train_U = 'CKME136X10_2018_Data_CTFB_M_U_Train.csv'
    inputfile_test = 'CKME136X10_2018_Data_CTFB_M_Test.csv'
else:
    inputfile_train_O = 'CKME136X10_2018_Data_CTFB_B_O_Train.csv'
    inputfile_train_U = 'CKME136X10_2018_Data_CTFB_B_U_Train.csv'
    inputfile_test = 'CKME136X10_2018_Data_CTFB_B_Test.csv'

if over_sample:
    datafile_train = inputfile_train_O
else:
    datafile_train = inputfile_train_U

datafile_test = inputfile_test
    
model_max_iter = 100
datestr = 'dec_09_binary_run_1000_KBO'

#Model Store
file_lr = 'lr_' + datestr + '.model'
file_lr_l1 = 'lr_l2_' + datestr + '.model'
file_dt = 'dt_' + datestr + '.model'
file_svm = 'svm_' + datestr + '.model'
file_knn = 'knn_' + datestr + '.model'
file_mlp = 'mlp_' + datestr + '.model'
file_kmean = 'kmean_' + datestr + '.model'
file_nbayes = 'nbayes_' + datestr + '.model'

file_final_train = 'final_train_' + datestr + '.csv'
file_final_test = 'final_test_' + datestr + '.csv'

#Enable Optimization Algorithms
enable_grid_search = False
svm_c = 1
svm_gamma = 1
nfold = 10

enable_lr_l1 = False
predict_lr_l1 = False

# Enable Algorithms
enable_lr = True
enable_dt = True
enable_svm = True
enable_knn = False
enable_mlp = True
enable_nbayes = True

predict_lr = True
predict_dt = True
predict_svm = True
predict_knn = False
predict_mlp = True
predict_nbayes = True

In [38]:
# this function converts the data frame to the appropriate data type
def convert_type(data):
    data = data.astype('category')
    data['C_MNTH'] = data['C_MNTH'].astype(CategoricalDtype(ordered=True))
    data['C_WDAY'] = data['C_WDAY'].astype(CategoricalDtype(ordered=True))
    data['C_HOUR'] = data['C_HOUR'].astype(CategoricalDtype(ordered=True))
    data['C_VEHS'] = data['C_VEHS'].astype(CategoricalDtype(ordered=True))
    data['P_AGE'] = data['P_AGE'].astype(CategoricalDtype(ordered=True))
    data['P_PSN'] = data['P_PSN'].astype(CategoricalDtype(ordered=True))
    data['P_ISEV'] = data['P_ISEV'].astype('int')
    return data

In [39]:
#print("Sample size: {}".format(sampleN))

if multiclass:
    print("Multi-Class Classification: Enabled")
else:
    print("Multi-Class Classification: Disabled")

if enable_lr:
    print("Logistic Regression: Enabled")
else:
    print("Logistic Regression: Disabled")
    
if enable_dt:
    print("Decision Tree: Enabled")
else:
    print("Decision Tree: Disabled")
    
if enable_svm:
    print("Support Vector Machines: Enabled")
else:
    print("Support Vector Machines: Disabled")

if  enable_knn:
    print("KNN: Enabled")
else:
    print("KNN: Disabled")
    
if enable_nbayes:
    print("Naive Bayes: Enabled")
else:
    print("Naive Bayes: Disabled")
    
if enable_mlp:
    print("MLP: Enabled")
else:
    print("MLP: Disabled")

Multi-Class Classification: Enabled
Logistic Regression: Enabled
Decision Tree: Enabled
Support Vector Machines: Enabled
KNN: Disabled
Naive Bayes: Enabled
MLP: Enabled


In [40]:
#load data
df_test = pd.read_csv(datafile_test, engine = 'python')
df_train = pd.read_csv(datafile_train, engine = 'python')
df = df_train.copy()

print(df_test.head(2))
print(df_train.head(2))

   C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  C_RALN  \
0       5       6       1       2      35       2       1       1       1   
1       8       4       3       2      21       2       2       1       1   

   C_TRAF  P_SEX  P_AGE  P_PSN  P_USER  P_ISEV  
0       1      2      3      1       1       1  
1       5      2      4      1       1       0  
   C_MNTH  C_WDAY  C_HOUR  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  C_RALN  \
0       4       7       5       3      22       1       1       2       3   
1      11       4       5       1       6       1       1       1       1   

   C_TRAF  P_SEX  P_AGE  P_PSN  P_USER  P_ISEV  
0       7      1      1      2       2       0  
1       7      2      4      1       1       1  


In [41]:
df_test_cat = convert_type(df_test)
print(df_test_cat.info())
df_train_cat = convert_type(df_train)
print(df_train_cat.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1444846 entries, 0 to 1444845
Data columns (total 15 columns):
C_MNTH    1444846 non-null category
C_WDAY    1444846 non-null category
C_HOUR    1444846 non-null category
C_VEHS    1444846 non-null category
C_CONF    1444846 non-null category
C_RCFG    1444846 non-null category
C_WTHR    1444846 non-null category
C_RSUR    1444846 non-null category
C_RALN    1444846 non-null category
C_TRAF    1444846 non-null category
P_SEX     1444846 non-null category
P_AGE     1444846 non-null category
P_PSN     1444846 non-null category
P_USER    1444846 non-null category
P_ISEV    1444846 non-null int32
dtypes: category(14), int32(1)
memory usage: 24.8 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5259385 entries, 0 to 5259384
Data columns (total 15 columns):
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_VEHS    category
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF  

In [42]:
total_test_Rows = df_test_cat.index.size
print("Number of Rows in test data: {}".format(total_test_Rows))

total_train_Rows = df_train_cat.index.size
print("Number of Rows in train data: {}".format(total_train_Rows))

Number of Rows in test data: 1444846
Number of Rows in train data: 5259385


In [43]:
#Split between data and class for training
Y = df_train_cat[df_train_cat.columns[-1]]
X = df_train_cat[df_train_cat.columns[0:df_train_cat.columns.size -1]]

Y_test = df_test_cat[df_test_cat.columns[-1]]
X_test = df_test_cat[df_test_cat.columns[0:df_test_cat.columns.size -1]]

In [44]:
print(Y.unique())
print(Y.groupby(Y).size())
print(Y_test.unique())
print(Y_test.groupby(Y_test).size())

[0 1 2]
P_ISEV
0    1432039
1    1913673
2    1913673
Name: P_ISEV, dtype: int64
[1 0 2]
P_ISEV
0    613731
1    820145
2     10970
Name: P_ISEV, dtype: int64


   ### Logistic Regression Model Evaluation with cross fold

In [45]:
if enable_lr:
    print("Logistic Regression: Start")
    t_start =  time.time()
    print()
    print("Model: Logistic Regression")
    print(time.asctime( time.localtime(t_start) ))
    print()
    print("Using Kfold: {}".format(nfold))
    #kfold = model_selection.KFold(n_splits=nfold)
    kfold = model_selection.StratifiedKFold(n_splits=nfold, shuffle=True)  
    
    lr = LogisticRegression(C=1, random_state=0, solver='saga', multi_class='ovr', 
                            verbose=verbose_level, n_jobs=10, max_iter=model_max_iter)
    print()
    print("Model: Logistic Regression")
    print(lr)
    print()
    print("Logistic Regression: Fit")
    results = model_selection.cross_val_score(lr, X, Y, cv=kfold)
    
    print('K-fold results: {}'.format(results))
    print('K-fold mean: {}'.format(results.mean()))

    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("Logistic Regression: End")

Logistic Regression: Start

Model: Logistic Regression
Sun Dec  9 15:33:34 2018

Using Kfold: 10

Model: Logistic Regression
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=10,
          penalty='l2', random_state=0, solver='saga', tol=0.0001,
          verbose=3, warm_start=False)

Logistic Regression: Fit
convergence after 14 epochs took 171 seconds
convergence after 14 epochs took 173 seconds
convergence after 16 epochs took 179 seconds


[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:  3.0min finished


convergence after 14 epochs took 173 seconds
convergence after 14 epochs took 174 seconds
convergence after 16 epochs took 181 seconds


[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:  3.0min finished


convergence after 13 epochs took 161 seconds
convergence after 15 epochs took 171 seconds
convergence after 16 epochs took 175 seconds


[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:  2.9min finished


convergence after 14 epochs took 174 seconds
convergence after 14 epochs took 174 seconds
convergence after 18 epochs took 187 seconds


[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:  3.1min finished


convergence after 14 epochs took 170 seconds
convergence after 14 epochs took 174 seconds
convergence after 16 epochs took 178 seconds


[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:  3.0min finished


convergence after 13 epochs took 159 seconds
convergence after 14 epochs took 163 seconds
convergence after 15 epochs took 168 seconds


[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:  2.8min finished


convergence after 14 epochs took 174 seconds
convergence after 14 epochs took 175 seconds
convergence after 15 epochs took 177 seconds


[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:  3.0min finished


convergence after 14 epochs took 169 seconds
convergence after 15 epochs took 177 seconds
convergence after 15 epochs took 178 seconds


[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:  3.0min finished


convergence after 14 epochs took 170 seconds
convergence after 14 epochs took 172 seconds
convergence after 15 epochs took 175 seconds


[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:  2.9min finished


convergence after 13 epochs took 161 seconds
convergence after 14 epochs took 167 seconds
convergence after 16 epochs took 174 seconds


[Parallel(n_jobs=10)]: Done   3 out of   3 | elapsed:  2.9min finished


K-fold results: [0.52025516 0.52244362 0.52082177 0.52150824 0.52055565 0.52068114
 0.51968673 0.52066593 0.52190372 0.52083424]
K-fold mean: 0.5209356224104622
Sun Dec  9 16:03:50 2018
Logistic Regression: End


### Naive Bayes Model Evaluation with cross fold

In [46]:
if enable_nbayes:
    print("Naive Bayes: Start")
    t_start =  time.time()
    print()
    print("Model: Naive Bayes")
    print(time.asctime( time.localtime(t_start) ))
    print()
    print("Using Stratified Kfold: {}".format(nfold))
    kfold = model_selection.StratifiedKFold(n_splits=nfold, shuffle=True)  
    nbayes = GaussianNB()
    print()
    print("Model: Naive Bayes")
    print(nbayes)
    print()
    print("Naive Bayes: Fit")
    results = model_selection.cross_val_score(nbayes, X, Y, cv=kfold)
    #lr.fit(X_train, Y_train)
    
    print('K-fold results: {}'.format(results))
    print('K-fold mean: {}'.format(results.mean()))

    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("Naive Bayes: End")

Naive Bayes: Start

Model: Naive Bayes
Sun Dec  9 16:03:50 2018

Using Stratified Kfold: 10

Model: Naive Bayes
GaussianNB(priors=None)

Naive Bayes: Fit
K-fold results: [0.47117732 0.46979503 0.47048903 0.47146051 0.47057258 0.47079884
 0.47162023 0.47013336 0.47105552 0.46967793]
K-fold mean: 0.470678035577418
Sun Dec  9 16:04:49 2018
Naive Bayes: End


### Decision Tree Model Evaluation with cross fold

In [47]:
if predict_dt:
    print("Decision Tree: Start")
    t_start =  time.time()
    print()
    print("Model: Decision Tree")
    print(time.asctime( time.localtime(t_start) ))
    print()
    print("Using Stratified Kfold: {}".format(nfold))
    kfold = model_selection.StratifiedKFold(n_splits=nfold, shuffle=True)  
    tree = DecisionTreeClassifier(criterion='entropy',max_depth=50)
    print()
    print("Model: Decision Tree")
    print(tree)
    print()
    print("Decision Tree: Fit")
    results = model_selection.cross_val_score(tree, X, Y, cv=kfold)
    #lr.fit(X_train, Y_train)
    
    print('K-fold results: {}'.format(results))
    print('K-fold mean: {}'.format(results.mean()))

    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("Decision Tree: End")

Decision Tree: Start

Model: Decision Tree
Sun Dec  9 16:04:49 2018

Using Stratified Kfold: 10

Model: Decision Tree
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=50,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Decision Tree: Fit
K-fold results: [0.72722554 0.72708864 0.72724645 0.72727964 0.7272188  0.72731577
 0.7268062  0.72702866 0.72629664 0.72781911]
K-fold mean: 0.7271325450334152
Sun Dec  9 16:15:01 2018
Decision Tree: End


### ANN - Multilayer Perceptron Model evaluation with cross fold

In [48]:
if predict_mlp:
    print("Multilayer Perceptron: Start")
    t_start =  time.time()
    print()
    print("Model: Multilayer Perceptron")
    print(time.asctime( time.localtime(t_start) ))
    print()
    print("Using Stratified Kfold: {}".format(nfold))
    kfold = model_selection.StratifiedKFold(n_splits=nfold, shuffle=True)  
    mlpc = MLPClassifier(hidden_layer_sizes=(12, 12, 12), verbose=verbose_level, max_iter=model_max_iter)
    print()
    print("Model: Multilayer Perceptron")
    print(mlpc)
    print()
    print("Multilayer Perceptron: Fit")
    results = model_selection.cross_val_score(mlpc, X, Y, cv=kfold)
    #lr.fit(X_train, Y_train)
    
    print('K-fold results: {}'.format(results))
    print('K-fold mean: {}'.format(results.mean()))

    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("Multilayer Perceptron: End")

Multilayer Perceptron: Start

Model: Multilayer Perceptron
Sun Dec  9 16:15:01 2018

Using Stratified Kfold: 10

Model: Multilayer Perceptron
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(12, 12, 12), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=3, warm_start=False)

Multilayer Perceptron: Fit
Iteration 1, loss = 0.89839276
Iteration 2, loss = 0.86490088
Iteration 3, loss = 0.85889796
Iteration 4, loss = 0.85605729
Iteration 5, loss = 0.85340451
Iteration 6, loss = 0.85171759
Iteration 7, loss = 0.85072361
Iteration 8, loss = 0.85006409
Iteration 9, loss = 0.84904910
Iteration 10, loss = 0.84842696
Iteration 11, loss = 0.84779716
Iteration 12, loss = 0.84724909
Iteration 13, loss 

Iteration 30, loss = 0.83212541
Iteration 31, loss = 0.83214392
Iteration 32, loss = 0.83190121
Iteration 33, loss = 0.83189668
Iteration 34, loss = 0.83164697
Iteration 35, loss = 0.83154879
Iteration 36, loss = 0.83149812
Iteration 37, loss = 0.83149829
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
Iteration 1, loss = 0.90038735
Iteration 2, loss = 0.86349982
Iteration 3, loss = 0.85634717
Iteration 4, loss = 0.84954394
Iteration 5, loss = 0.84603006
Iteration 6, loss = 0.84437768
Iteration 7, loss = 0.84343667
Iteration 8, loss = 0.84252144
Iteration 9, loss = 0.84175350
Iteration 10, loss = 0.84105159
Iteration 11, loss = 0.84057992
Iteration 12, loss = 0.84034315
Iteration 13, loss = 0.83996196
Iteration 14, loss = 0.83945631
Iteration 15, loss = 0.83920385
Iteration 16, loss = 0.83899730
Iteration 17, loss = 0.83891561
Iteration 18, loss = 0.83875150
Iteration 19, loss = 0.83867495
Iteration 20, loss = 0.83847478
Iteration 21, loss = 0

### SVM Model Evaluation with SVM

In [49]:
if predict_svm:
    print("SVM: Start")
    t_start =  time.time()
    print()
    print("Model: SVM")
    print(time.asctime( time.localtime(t_start) ))
    print()
    print("Using Stratified Kfold: {}".format(nfold))
    kfold = model_selection.StratifiedKFold(n_splits=nfold, shuffle=True)  
    svm = SVC(C=1, gamma = 'auto', verbose = verbose_level, max_iter=model_max_iter)
    print()
    print("Model: SVM")
    print(svm)
    print()
    print("SVM: Fit")
    results = model_selection.cross_val_score(svm, X, Y, cv=kfold)
    #lr.fit(X_train, Y_train)
    
    print('K-fold results: {}'.format(results))
    print('K-fold mean: {}'.format(results.mean()))

    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("SVM: End")

SVM: Start

Model: SVM
Sun Dec  9 18:15:05 2018

Using Stratified Kfold: 10

Model: SVM
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=100, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=3)

SVM: Fit
[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



K-fold results: [0.27603529 0.27223448 0.27247785 0.2734695  0.27369956 0.27265571
 0.27377181 0.27370717 0.27470348 0.27271898]
K-fold mean: 0.2735473822434865
Sun Dec  9 18:28:12 2018
SVM: End


### K-N-N Model Evaluation with cross fold

In [50]:
if predict_knn:
    print("KNN: Start")
    t_start =  time.time()
    print()
    print("Model: KNN")
    print(time.asctime( time.localtime(t_start) ))
    print()
    print("Using Stratified Kfold: {}".format(nfold))
    kfold = model_selection.StratifiedKFold(n_splits=nfold, shuffle=True)  
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski', n_jobs = -1)
    print()
    print("Model: KNN")
    print(knn)
    print()
    print("KNN: Fit")
    results = model_selection.cross_val_score(knn, X, Y, cv=kfold)
    #lr.fit(X_train, Y_train)
    
    print('K-fold results: {}'.format(results))
    print('K-fold mean: {}'.format(results.mean()))

    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
print("KNN: End")

KNN: End


In [51]:
## extra code and play

In [52]:
if (False):    
    seed = 101
    print("Logistic Regression: Start")
    t_start =  time.time()
    print()
    print("Model: Logistic Regression")
    print(time.asctime( time.localtime(t_start) ))
    print()
    print("Using Kfold: {}".format(nfold))
    #kfold = model_selection.StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
    kfold = model_selection.KFold(n_splits=nfold, random_state=seed)
    
    models = []
    model_scores_train = []
    model_scores_val = []
    
    X_kfold = np.array(X)
    Y_kfold = np.array(Y)
    
    i = 0
    for train_index, val_index in kfold.split(X_kfold):
        i = i+1
        print("Fold: {}".format(i))
        
        X_train, X_val = X_kfold[train_index], X_kfold[val_index]
        y_train, y_val = Y_kfold[train_index], Y_kfold[val_index]
        
        lr = LogisticRegression(C=1, random_state=0, solver='saga', multi_class='ovr', 
                            verbose=verbose_level, n_jobs=10, max_iter=model_max_iter)
        print()
        print("Model: Logistic Regression")
        print(lr)
        print()
        print("Logistic Regression: Fit")
        lr.fit(X_train, y_train)

        #add model for list
        models.append(lr)

        print("Logistic Regression: Predict")
        y_pred = lr.predict(X_val)

        mst = lr.score(X_train, y_train)
        model_scores_train.append(mst)
        
        msv = lr.score(X_val, y_val)
        model_scores_val.append(msv)
        
        print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(mst))
        print('Accuracy of logistic regression classifier on validation set: {:.2f}'.format(msv))

        # print the intercept (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
        print("Logistic Regression: Intercept")
        print(lr.intercept_)

        # print the coeficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
        print("Logistic Regression: Coefficients")
        print(lr.coef_)

        print("Logistic Regression: Confusion Matrix")
        cnf_matrix_lg = confusion_matrix(y_val, y_pred)
        print(cnf_matrix_lg)
    
        print("Logistic Regression: Classification Report")
        print(classification_report(y_val, y_pred))

    print('AVG Accuracy of logistic regression classifier on train set: {:.2f}'.format(np.mean(model_scores_train)))
    print('AVG Accuracy of logistic regression classifier on validation set: {:.2f}'.format(np.mean(model_scores_val)))
    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
    # save model to file
    pickle.dump(models, open(file_lr, "wb"))
    print("Logistic Regression: End")

In [53]:
if (False):    
    seed = 101
    print("Logistic Regression: Start")
    t_start =  time.time()
    print()
    print("Model: Logistic Regression")
    print(time.asctime( time.localtime(t_start) ))
    print()
    print("Using Stratified Kfold: {}".format(nfold))
    
    models = []
    model_scores_train = []
    model_scores_val = []
    
    X_kfold = np.array(X)
    Y_kfold = np.array(Y)
    
    skf = model_selection.StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)    
    i = 0
    for train_index, val_index in skf.split(X_kfold, Y_kfold):
        i = i+1
        print("Fold: {}".format(i))
        
        X_train, X_val = X_kfold[train_index], X_kfold[val_index]
        y_train, y_val = Y_kfold[train_index], Y_kfold[val_index]
        
        lr = LogisticRegression(C=1, random_state=0, solver='saga', multi_class='ovr', 
                            verbose=verbose_level, n_jobs=10, max_iter=model_max_iter)
        print()
        print("Model: Logistic Regression")
        print(lr)
        print()
        print("Logistic Regression: Fit")
        lr.fit(X_train, y_train)

        #add model for list
        models.append(lr)

        print("Logistic Regression: Predict")
        y_pred = lr.predict(X_val)

        mst = lr.score(X_train, y_train)
        model_scores_train.append(mst)
        
        msv = lr.score(X_val, y_val)
        model_scores_val.append(msv)
        
        print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(mst))
        print('Accuracy of logistic regression classifier on validation set: {:.2f}'.format(msv))

        # print the intercept (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
        print("Logistic Regression: Intercept")
        print(lr.intercept_)

        # print the coeficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
        print("Logistic Regression: Coefficients")
        print(lr.coef_)

        print("Logistic Regression: Confusion Matrix")
        cnf_matrix_lg = confusion_matrix(y_val, y_pred)
        print(cnf_matrix_lg)
    
        print("Logistic Regression: Classification Report")
        print(classification_report(y_val, y_pred))

    print('AVG Accuracy of logistic regression classifier on train set: {:.2f}'.format(np.mean(model_scores_train)))
    print('AVG Accuracy of logistic regression classifier on validation set: {:.2f}'.format(np.mean(model_scores_val)))
    
    t_end =  time.time()
    print(time.asctime( time.localtime(t_end) ))
    
    # save model to file
    pickle.dump(models, open(file_lr, "wb"))
    print("Logistic Regression: End")

In [54]:
if (False)
    ## for each model generated, lets predict the on test set
    ## note, this is the first time any of these observations are seen by the model
    # load model from file
    loaded_model = pickle.load(open(file_lr, "rb"))
    print("Logistic Regression: Predict on test set")

    i = 0
    for model in loaded_model:
        i = i + 1
        y_pred = model.predict(X_test)

        print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(model.score(X_test, Y_test)))

        # print the intercept (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
        print("Logistic Regression: Intercept")
        print(model.intercept_)

        # print the coeficients (Note: one vs rest => 1 vs 2and3, 2 vs 1and3, 3 vs 1and2)
        print("Logistic Regression: Coefficients")
        print(model.coef_)

        print("Logistic Regression: Confusion Matrix")
        cnf_matrix_lg = confusion_matrix(Y_test, y_pred)
        print(cnf_matrix_lg)
    
        print("Logistic Regression: Classification Report")
        print(classification_report(Y_test, y_pred))

SyntaxError: invalid syntax (<ipython-input-54-c5448911bff4>, line 1)