In [64]:
import pandas as pd
import os
import glob
from os.path import join
import numpy as np
import random
from pycaret.classification import *
from pycaret.datasets import get_data

from create_datasets import create_train_dataset_supervised, \
    create_test_dataset_balanced, \
    create_test_dataset_unbalanced
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, auc

In [53]:
# One seed for every global RNG this notebook seeds.
SEED = 42
random.seed(SEED)
# NumPy keeps its own global RNG state that random.seed() does not touch, so
# seed it as well. NOTE(review): whether create_datasets draws from either
# global RNG is not visible from this notebook.
np.random.seed(SEED)

# os.path.dirname('.') evaluates to '' (an empty string), which is not a usable
# directory path on its own. Anchor everything to the current working directory.
WORK_DIR = os.getcwd()
FM_DIR = os.path.join(WORK_DIR, 'feature_maps')
BENCHMARK_DIR = os.path.join(WORK_DIR, 'benchmark')
DATA_DIR = os.path.join(WORK_DIR, 'data')
OUTPUTS_DIR = os.path.join(WORK_DIR, 'outputs')

In [73]:
data_set_idx = 1001


def _with_labels(features, labels):
    """Wrap ``features`` in a DataFrame and store ``labels`` in a 'Labels' column."""
    frame = pd.DataFrame(features)
    frame['Labels'] = labels
    return frame


# Each factory is unpacked into (features, labels, sizes) for the chosen index.
# NOTE(review): the meaning of the third "sizes" value is not visible here.
X_train_sup, y_train_sup, sizes_train = create_train_dataset_supervised(data_set_idx)
X_test_b, y_test_b, sizes_balanced = create_test_dataset_balanced(data_set_idx)
X_test_ub, y_test_ub, sizes_unbalanced = create_test_dataset_unbalanced(data_set_idx)

# One labelled frame per split; 'Labels' is the target column name used later.
data_train_df = _with_labels(X_train_sup, y_train_sup)
data_test_b_df = _with_labels(X_test_b, y_test_b)
data_test_ub_df = _with_labels(X_test_ub, y_test_ub)

In [74]:
# Sanity check of the balanced test frame: (rows, columns), where the columns
# are the features plus the appended 'Labels' column.
data_test_b_df.shape

(7098, 513)

# Check balanced dataset: PyCaret results vs. a manual scikit-learn fit

In [131]:
# Initialise the PyCaret experiment with an explicit hold-out: data_train_df is
# the training set and the balanced test frame is passed as test_data.
setup(
    data=data_train_df,
    test_data=data_test_b_df,
    target='Labels',
    index=False,
    session_id=42,
    n_jobs=1,
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Labels
2,Target type,Binary
3,Original data shape,"(23660, 513)"
4,Transformed data shape,"(23660, 513)"
5,Transformed train set shape,"(16562, 513)"
6,Transformed test set shape,"(7098, 513)"
7,Numeric features,512
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x203076b9860>

In [132]:
# Fit the three candidates without cross-validation and keep the fitted
# estimators. The cells below index this list positionally (0 -> KNN, 1 -> LR).
# NOTE(review): n_select=4 is larger than the three ids in `include`; confirm
# against the PyCaret docs that this simply returns every fitted candidate.
models = compare_models(
    include=['lr', 'knn', 'dummy'],
    cross_validation=False,
    n_select=4,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9868,0.9962,0.9944,0.9795,0.9869,0.9735,0.9736,0.18
lr,Logistic Regression,0.981,0.9976,0.9797,0.9822,0.981,0.962,0.962,0.15
dummy,Dummy Classifier,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.19


In [133]:
# Index 1 is the logistic regression only because KNN ranks first in the
# leaderboard; fail fast if a re-run ever changes that order.
assert isinstance(models[1], LogisticRegression), type(models[1])

# Score the PyCaret-fitted model, then grab the metrics grid it just displayed.
# (The former `df_results = pd.DataFrame()` was overwritten before any use.)
predict_model(models[1])
df_results = pull()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.981,0.9976,0.9797,0.9822,0.981,0.962,0.962


In [134]:
# Display the estimator PyCaret fitted so its hyper-parameters can be compared
# with the hand-built scikit-learn model below.
models[1]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [135]:
# Plain scikit-learn logistic regression using the max_iter / random_state
# values shown in the repr of models[1]; everything else is left at defaults.
lr = LogisticRegression(
    random_state=42,
    max_iter=1000,
)
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [136]:
# Train on the same supervised arrays that back data_train_df.
lr.fit(X=X_train_sup, y=y_train_sup)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [137]:
# Probabilities (column 1 is used as the ROC score in the next cell) and hard
# class predictions on the balanced test set.
y_scores = lr.predict_proba(X_test_b)
y_pred = lr.predict(X_test_b)

In [138]:
# Metrics for the manually fitted logistic regression on the balanced test set,
# to be compared with the PyCaret figures above.
accuracy = accuracy_score(y_test_b, y_pred)
print(f'accuracy: {accuracy}')
# Stored as `roc_auc` rather than `auc`: the import cell pulls the `auc`
# function from sklearn.metrics, and assigning a float to that name would
# overwrite it for the rest of the session.
roc_auc = roc_auc_score(y_test_b, y_scores[:, 1])
print(f'auc: {roc_auc}')
f1 = f1_score(y_test_b, y_pred)
print(f'f1: {f1}')

accuracy: 0.9809805579036348
auc: 0.997648347996225
f1: 0.9809564113415151


In [139]:
# Index 0 is KNN only because it tops the leaderboard; fail fast if a re-run
# ever changes that order.
assert isinstance(models[0], KNeighborsClassifier), type(models[0])

# Score the PyCaret-fitted model, then grab the metrics grid it just displayed.
# (The former `df_results = pd.DataFrame()` was overwritten before any use.)
predict_model(models[0])
df_results = pull()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.9868,0.9962,0.9944,0.9795,0.9869,0.9735,0.9736


In [140]:
# Display the KNN estimator PyCaret fitted so its hyper-parameters can be
# compared with the hand-built scikit-learn model below.
models[0]

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=1, n_neighbors=5, p=2,
                     weights='uniform')

In [141]:
# Plain scikit-learn KNN; n_jobs=1 matches the value shown in the repr of
# models[0], all other settings stay at their defaults.
knn = KNeighborsClassifier(
    n_jobs=1,
)
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=1, n_neighbors=5, p=2,
                     weights='uniform')

In [142]:
# Train on the same supervised arrays that back data_train_df.
knn.fit(X=X_train_sup, y=y_train_sup)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=1, n_neighbors=5, p=2,
                     weights='uniform')

In [143]:
# Probabilities (column 1 is used as the ROC score in the next cell) and hard
# class predictions on the balanced test set.
y_scores = knn.predict_proba(X_test_b)
y_pred = knn.predict(X_test_b)

In [144]:
# Metrics for the manually fitted KNN on the balanced test set, to be compared
# with the PyCaret figures above.
accuracy = accuracy_score(y_test_b, y_pred)
print(f'accuracy: {accuracy}')
# Stored as `roc_auc` rather than `auc`: the import cell pulls the `auc`
# function from sklearn.metrics, and assigning a float to that name would
# overwrite it for the rest of the session.
roc_auc = roc_auc_score(y_test_b, y_scores[:, 1])
print(f'auc: {roc_auc}')
f1 = f1_score(y_test_b, y_pred)
print(f'f1: {f1}')

accuracy: 0.9867568329106791
auc: 0.9961682839633292
f1: 0.9868568232662193


# Check unbalanced dataset: PyCaret results vs. a manual scikit-learn fit

In [145]:
# Same training frame as before, but the hold-out is now the unbalanced test frame.
setup(data_train_df, test_data=data_test_ub_df, index=False, target='Labels', session_id=42, n_jobs=1)
models = compare_models(cross_validation=False, include=['lr', 'knn', 'dummy'], n_select=4)

# Index 1 is the logistic regression only because KNN ranks first in the
# leaderboard; fail fast if a re-run ever changes that order.
assert isinstance(models[1], LogisticRegression), type(models[1])

# Score the PyCaret-fitted model, then grab the metrics grid it just displayed.
# (The former `df_results = pd.DataFrame()` was overwritten before any use.)
predict_model(models[1])
df_results = pull()

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Labels
2,Target type,Binary
3,Original data shape,"(23896, 513)"
4,Transformed data shape,"(23896, 513)"
5,Transformed train set shape,"(16562, 513)"
6,Transformed test set shape,"(7334, 513)"
7,Numeric features,512
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9853,0.9961,0.9938,0.9675,0.9805,0.9687,0.9689,0.2
lr,Logistic Regression,0.9798,0.9973,0.9762,0.9698,0.973,0.9569,0.9569,0.16
dummy,Dummy Classifier,0.6278,0.5,0.0,0.0,0.0,0.0,0.0,0.15


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.9798,0.9973,0.9762,0.9698,0.973,0.9569,0.9569


In [146]:
# Display the logistic regression fitted by PyCaret in the unbalanced run so
# its hyper-parameters can be compared with the hand-built model below.
models[1]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [147]:
# Fresh scikit-learn logistic regression using the max_iter / random_state
# values shown in the repr of models[1]; everything else is left at defaults.
lr = LogisticRegression(
    random_state=42,
    max_iter=1000,
)
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [148]:
# Train on the same supervised arrays that back data_train_df.
lr.fit(X=X_train_sup, y=y_train_sup)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [149]:
# Probabilities (column 1 is used as the ROC score in the next cell) and hard
# class predictions on the unbalanced test set.
y_scores = lr.predict_proba(X_test_ub)
y_pred = lr.predict(X_test_ub)

In [150]:
# Metrics for the manually fitted logistic regression on the unbalanced test
# set, to be compared with the PyCaret figures above.
accuracy = accuracy_score(y_test_ub, y_pred)
print(f'accuracy: {accuracy}')
# Stored as `roc_auc` rather than `auc`: the import cell pulls the `auc`
# function from sklearn.metrics, and assigning a float to that name would
# overwrite it for the rest of the session.
roc_auc = roc_auc_score(y_test_ub, y_scores[:, 1])
print(f'auc: {roc_auc}')
f1 = f1_score(y_test_ub, y_pred)
print(f'f1: {f1}')

accuracy: 0.9798200163621489
auc: 0.9972944373900066
f1: 0.97298284045272


In [151]:
# Display the KNN estimator fitted by PyCaret in the unbalanced run so its
# hyper-parameters can be compared with the hand-built model below.
models[0]

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=1, n_neighbors=5, p=2,
                     weights='uniform')

In [152]:
# Fresh scikit-learn KNN; n_jobs=1 matches the value shown in the repr of
# models[0], all other settings stay at their defaults.
knn = KNeighborsClassifier(
    n_jobs=1,
)
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=1, n_neighbors=5, p=2,
                     weights='uniform')

In [153]:
# Train on the same supervised arrays that back data_train_df.
knn.fit(X=X_train_sup, y=y_train_sup)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=1, n_neighbors=5, p=2,
                     weights='uniform')

In [154]:
# Probabilities (column 1 is used as the ROC score in the next cell) and hard
# class predictions on the unbalanced test set.
y_scores = knn.predict_proba(X_test_ub)
y_pred = knn.predict(X_test_ub)

In [155]:
# Metrics for the manually fitted KNN on the unbalanced test set, to be
# compared with the PyCaret leaderboard row for KNN above.
accuracy = accuracy_score(y_test_ub, y_pred)
print(f'accuracy: {accuracy}')
# Stored as `roc_auc` rather than `auc`: the import cell pulls the `auc`
# function from sklearn.metrics, and assigning a float to that name would
# overwrite it for the rest of the session.
roc_auc = roc_auc_score(y_test_ub, y_scores[:, 1])
print(f'auc: {roc_auc}')
f1 = f1_score(y_test_ub, y_pred)
print(f'f1: {f1}')

accuracy: 0.9852740659940006
auc: 0.996134791215156
f1: 0.9804842790025299
