 ## Classification Model Analysis
 The goal of this notebook is to build several classification models on the datasets produced
 by both PCA algorithms (standard and contrastive). The model results will be used as another
 comparison point between the datasets.

In [1]:
import warnings
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from sklearn.model_selection import GridSearchCV
#from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from contrastive import CPCA
from src.models.encoders.levelmulti import LevelMulti
from src.models.pipelines.pipelines import pca_pipeline

# Matplotlib renamed its bundled seaborn styles to 'seaborn-v0_8-*' in 3.6 and
# dropped the old names in 3.8, so use whichever spelling this install provides
# and fall back to the default style if neither is available.
_NOTEBOOK_STYLE = next(
    (style_name for style_name in ('seaborn-v0_8-notebook', 'seaborn-notebook')
     if style_name in plt.style.available),
    'default',
)
mpl.style.use(_NOTEBOOK_STYLE)
# Default figure size (in inches) for plots created after this cell.
plt.rcParams["figure.figsize"] = (12, 9)
# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)


In [2]:
def load_data(filename):
    """Read a processed dataset CSV and split it into targets and features.

    The first CSV column is used as the index. The ``level`` column is
    returned as the target array; the ``level`` and ``blexp`` columns are
    removed and every remaining column is returned as the feature matrix.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        Tuple ``(ylevel, X)`` of independent NumPy array copies: the values
        of the ``level`` column and the remaining feature columns.

    Raises:
        KeyError: If the ``level`` or ``blexp`` column is missing.
    """
    frame = pd.read_csv(filename, index_col=0)

    targets = frame['level'].values.copy()
    features = frame.drop(columns=['level', 'blexp'])

    # Copy so the returned matrix never shares memory with the frame.
    return targets, features.values.copy()

def pca_data(dsver, dsname, dstype='train'):
    """Load one dataset split and project it with the PCA pipeline.

    Reads ``./data/processed/ds<dsver, 4 digits>-<dsname>-<dstype>.csv``,
    transforms the raw ``level`` values with
    ``LevelMulti(targetmin=0.2, targetmax=0.8)`` and reduces the features
    with ``pca_pipeline(ncomponents=2)``.

    A new pipeline is fitted on every call.
    NOTE(review): the 'train' and 'test' splits are therefore projected by
    independently fitted pipelines -- confirm this is intended.

    Args:
        dsver: Integer dataset version, zero-padded to four digits in the path.
        dsname: Dataset name used in the file name.
        dstype: Split suffix used in the file name (default ``'train'``).

    Returns:
        Tuple ``(ylabels, Xpca)``: the transformed labels and the output of
        the pipeline's ``fit_transform``.
    """
    csv_path = './data/processed/ds{0:04d}-{1}-{2}.csv'.format(dsver, dsname, dstype)
    levels, features = load_data(csv_path)

    # presumably maps each level to a class label -- verify against LevelMulti
    encoder = LevelMulti(targetmin=0.2, targetmax=0.8)
    ylabels = encoder.transform(levels.copy())

    projected = pca_pipeline(ncomponents=2).fit_transform(features)
    return ylabels, projected

def cpca_data(dsver, dsname, alpha, dstype='train', bgname='nosignal'):
    """Load one dataset split and project it with contrastive PCA.

    Reads the background set from
    ``./data/processed/ds<dsver, 4 digits>-<dsname>-background-<bgname>.csv``
    and the requested split from
    ``./data/processed/ds<dsver, 4 digits>-<dsname>-<dstype>.csv``. The raw
    ``level`` values of the split are transformed with
    ``LevelMulti(targetmin=0.2, targetmax=0.8)``; the background labels are
    discarded.

    A new ``CPCA`` instance is fitted on every call.
    NOTE(review): the 'train' and 'test' splits are therefore projected by
    independently fitted models -- confirm this is intended.

    Args:
        dsver: Integer dataset version, zero-padded to four digits in the path.
        dsname: Dataset name used in both file names.
        alpha: Value forwarded to ``CPCA.fit_transform`` as ``alpha_value``
            with ``alpha_selection='manual'``.
        dstype: Split suffix used in the file name (default ``'train'``).
        bgname: Background suffix used in the file name
            (default ``'nosignal'``).

    Returns:
        Tuple ``(ylabels, Xpca)``: the transformed labels and the output of
        ``CPCA().fit_transform``.
    """
    background_path = './data/processed/ds{0:04d}-{1}-background-{2}.csv'.format(dsver, dsname, bgname)
    background = load_data(background_path)[1]

    split_path = './data/processed/ds{0:04d}-{1}-{2}.csv'.format(dsver, dsname, dstype)
    levels, foreground = load_data(split_path)

    # presumably maps each level to a class label -- verify against LevelMulti
    encoder = LevelMulti(targetmin=0.2, targetmax=0.8)
    ylabels = encoder.transform(levels.copy())

    projected = CPCA().fit_transform(
        foreground,
        background,
        alpha_selection='manual',
        alpha_value=alpha,
    )
    return ylabels, projected


In [3]:
def fit_model(name, estimator, ytrain, Xtrain, ytest, Xtest):
    """Fit an estimator, predict on the test split and print its metrics.

    Prints, in order: the confusion matrix, the classification report and
    the macro-averaged F1 score.

    Args:
        name: Label used as a prefix in the printed headings.
        estimator: Object exposing ``fit(X, y)`` and ``predict(X)``.
        ytrain: Training labels.
        Xtrain: Training features; a copy is handed to ``fit``.
        ytest: True labels for the test split.
        Xtest: Test features; a copy is handed to ``predict``.

    Returns:
        None. Results are only printed.
    """
    # assumes exactly three classes, ordered low/norm/high -- TODO confirm
    class_names = ['low', 'norm', 'high']

    estimator.fit(Xtrain.copy(), ytrain)
    predicted = estimator.predict(Xtest.copy())

    matrix = confusion_matrix(ytest, predicted)
    print('{0} CF Matrix:'.format(name))
    print(matrix)

    report = classification_report(ytest, predicted, target_names=class_names)
    print(report)

    macro_f1 = f1_score(ytest, predicted, average='macro')
    print('{0} F1 Score: {1:.4f}\n'.format(name, macro_f1))

def run_models(ytrain, Xtrain, ytest, Xtest):
    """Evaluate every classifier of the comparison on one train/test pair.

    Two models are run through ``fit_model``, in this order:

    * ``'KNN'``: ``KNeighborsClassifier`` with ``n_neighbors=5``.
    * ``'SVC'``: an RBF-kernel ``SVC`` wrapped in ``GridSearchCV`` that
      searches ``C`` and ``gamma`` with ``cv=10`` and
      ``scoring='accuracy'``.

    Args:
        ytrain: Training labels.
        Xtrain: Training features.
        ytest: Test labels.
        Xtest: Test features.

    Returns:
        None. ``fit_model`` prints the metrics of each model.
    """
    svc_grid = {'C': [1, 10, 100], 'gamma': [0.1, 0.01, 0.001]}

    # A decision-tree model was tried here earlier and is currently disabled.
    candidates = [
        ('KNN', KNeighborsClassifier(n_neighbors=5)),
        ('SVC', GridSearchCV(SVC(kernel='rbf'), param_grid=svc_grid, cv=10, scoring='accuracy')),
    ]

    for label, model in candidates:
        fit_model(label, model, ytrain, Xtrain, ytest, Xtest)


 ## PCA Model Analysis
 TBD.

In [4]:
# PCA pipeline on the 'baseline' dataset (version 2): project each split, then score the classifiers.
ytrain, Xtrain = pca_data(dsver=2, dsname='baseline', dstype='train')
ytest, Xtest = pca_data(dsver=2, dsname='baseline', dstype='test')
run_models(ytrain=ytrain, Xtrain=Xtrain, ytest=ytest, Xtest=Xtest)


KNN CF Matrix:
[[16  0  0]
 [ 1 64  2]
 [ 0  0 17]]
              precision    recall  f1-score   support

         low       0.94      1.00      0.97        16
        norm       1.00      0.96      0.98        67
        high       0.89      1.00      0.94        17

   micro avg       0.97      0.97      0.97       100
   macro avg       0.95      0.99      0.96       100
weighted avg       0.97      0.97      0.97       100

KNN F1 Score: 0.9637

SVC CF Matrix:
[[16  0  0]
 [ 2 62  3]
 [ 0  0 17]]
              precision    recall  f1-score   support

         low       0.89      1.00      0.94        16
        norm       1.00      0.93      0.96        67
        high       0.85      1.00      0.92        17

   micro avg       0.95      0.95      0.95       100
   macro avg       0.91      0.98      0.94       100
weighted avg       0.96      0.95      0.95       100

SVC F1 Score: 0.9404



In [5]:
# PCA pipeline on the 'filtered' dataset (version 2): project each split, then score the classifiers.
ytrain, Xtrain = pca_data(dsver=2, dsname='filtered', dstype='train')
ytest, Xtest = pca_data(dsver=2, dsname='filtered', dstype='test')
run_models(ytrain=ytrain, Xtrain=Xtrain, ytest=ytest, Xtest=Xtest)


KNN CF Matrix:
[[12  4  0]
 [ 1 62  4]
 [ 0  4 13]]
              precision    recall  f1-score   support

         low       0.92      0.75      0.83        16
        norm       0.89      0.93      0.91        67
        high       0.76      0.76      0.76        17

   micro avg       0.87      0.87      0.87       100
   macro avg       0.86      0.81      0.83       100
weighted avg       0.87      0.87      0.87       100

KNN F1 Score: 0.8325

SVC CF Matrix:
[[14  2  0]
 [ 2 62  3]
 [ 0  4 13]]
              precision    recall  f1-score   support

         low       0.88      0.88      0.88        16
        norm       0.91      0.93      0.92        67
        high       0.81      0.76      0.79        17

   micro avg       0.89      0.89      0.89       100
   macro avg       0.87      0.86      0.86       100
weighted avg       0.89      0.89      0.89       100

SVC F1 Score: 0.8605



 ## Contrastive PCA Model Analysis
 TBD.

In [6]:
# Contrastive PCA on the 'filtered' dataset (version 2) with a manually set alpha.
# NOTE(review): how 151.18 was chosen is not shown in this notebook -- TODO document.
alpha = 151.18
ytrain, Xtrain = cpca_data(dsver=2, dsname='filtered', alpha=alpha, dstype='train')
ytest, Xtest = cpca_data(dsver=2, dsname='filtered', alpha=alpha, dstype='test')
run_models(ytrain=ytrain, Xtrain=Xtrain, ytest=ytest, Xtest=Xtest)


KNN CF Matrix:
[[12  4  0]
 [ 3 60  4]
 [ 0  3 14]]
              precision    recall  f1-score   support

         low       0.80      0.75      0.77        16
        norm       0.90      0.90      0.90        67
        high       0.78      0.82      0.80        17

   micro avg       0.86      0.86      0.86       100
   macro avg       0.82      0.82      0.82       100
weighted avg       0.86      0.86      0.86       100

KNN F1 Score: 0.8232

SVC CF Matrix:
[[12  4  0]
 [ 1 63  3]
 [ 0  3 14]]
              precision    recall  f1-score   support

         low       0.92      0.75      0.83        16
        norm       0.90      0.94      0.92        67
        high       0.82      0.82      0.82        17

   micro avg       0.89      0.89      0.89       100
   macro avg       0.88      0.84      0.86       100
weighted avg       0.89      0.89      0.89       100

SVC F1 Score: 0.8569



In [7]:
# Contrastive PCA on the 'filtered' dataset (version 2) with a second manually set alpha.
# NOTE(review): how 46.42 was chosen is not shown in this notebook -- TODO document.
alpha = 46.42
ytrain, Xtrain = cpca_data(dsver=2, dsname='filtered', alpha=alpha, dstype='train')
ytest, Xtest = cpca_data(dsver=2, dsname='filtered', alpha=alpha, dstype='test')
run_models(ytrain=ytrain, Xtrain=Xtrain, ytest=ytest, Xtest=Xtest)


KNN CF Matrix:
[[12  4  0]
 [ 3 60  4]
 [ 0  3 14]]
              precision    recall  f1-score   support

         low       0.80      0.75      0.77        16
        norm       0.90      0.90      0.90        67
        high       0.78      0.82      0.80        17

   micro avg       0.86      0.86      0.86       100
   macro avg       0.82      0.82      0.82       100
weighted avg       0.86      0.86      0.86       100

KNN F1 Score: 0.8232

SVC CF Matrix:
[[12  4  0]
 [ 3 61  3]
 [ 0  3 14]]
              precision    recall  f1-score   support

         low       0.80      0.75      0.77        16
        norm       0.90      0.91      0.90        67
        high       0.82      0.82      0.82        17

   micro avg       0.87      0.87      0.87       100
   macro avg       0.84      0.83      0.83       100
weighted avg       0.87      0.87      0.87       100

SVC F1 Score: 0.8338



In [8]:
# Contrastive PCA on the 'baseline' dataset (version 2) with a manually set alpha.
# NOTE(review): how 58.78 was chosen is not shown in this notebook -- TODO document.
alpha = 58.78
ytrain, Xtrain = cpca_data(dsver=2, dsname='baseline', alpha=alpha, dstype='train')
ytest, Xtest = cpca_data(dsver=2, dsname='baseline', alpha=alpha, dstype='test')
run_models(ytrain=ytrain, Xtrain=Xtrain, ytest=ytest, Xtest=Xtest)


KNN CF Matrix:
[[16  0  0]
 [ 2 63  2]
 [ 0  1 16]]
              precision    recall  f1-score   support

         low       0.89      1.00      0.94        16
        norm       0.98      0.94      0.96        67
        high       0.89      0.94      0.91        17

   micro avg       0.95      0.95      0.95       100
   macro avg       0.92      0.96      0.94       100
weighted avg       0.95      0.95      0.95       100

KNN F1 Score: 0.9391

SVC CF Matrix:
[[16  0  0]
 [ 2 64  1]
 [ 0  1 16]]
              precision    recall  f1-score   support

         low       0.89      1.00      0.94        16
        norm       0.98      0.96      0.97        67
        high       0.94      0.94      0.94        17

   micro avg       0.96      0.96      0.96       100
   macro avg       0.94      0.97      0.95       100
weighted avg       0.96      0.96      0.96       100

SVC F1 Score: 0.9507



 ## Classification Model Results
 TBD.