# Intro

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from collections import namedtuple

# Seed NumPy's global generator so later random draws are repeatable.
np.random.seed(37)

# Container pairing a feature table (X) with its target column (y).
Data = namedtuple('Data', ['X', 'y'])

tr_df = pd.read_csv('./faces/data-train.csv')
va_df = pd.read_csv('./faces/data-valid.csv')

# Every column except 'y' is treated as a feature.
tr_features = [col for col in tr_df.columns if col != 'y']
va_features = [col for col in va_df.columns if col != 'y']

T = Data(X=tr_df[tr_features], y=tr_df['y'])
V = Data(X=va_df[va_features], y=va_df['y'])

# Classification performance

In [2]:
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score

# Per-class evaluation record: the class label, the four confusion-matrix
# counts, then sensitivity, specificity, accuracy, F1 and ROC AUC.
Metric = namedtuple(
    'Metric',
    ['clazz', 'tn', 'fp', 'fn', 'tp', 'sen', 'spe', 'acc', 'f1', 'auc'],
)

def get_classification_metrics(model, T, V):
    """Fit ``model`` on ``T`` and compute one-vs-rest metrics per class on ``V``.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    T : object with ``X`` and ``y`` attributes, used for fitting.
    V : object with ``X`` and ``y`` attributes, used for evaluation.

    Returns
    -------
    list of Metric
        One entry per class label found in ``T.y``, in ascending label order.
    """
    def ratio(numerator, denominator):
        # A ratio with an empty denominator is undefined; report NaN
        # explicitly rather than relying on a divide warning.
        return numerator / denominator if denominator else float('nan')

    def get_metrics(clazz, cmatrix):
        # cmatrix is the 2x2 one-vs-rest matrix laid out as [[tn, fp], [fn, tp]].
        tn, fp, fn, tp = cmatrix[0][0], cmatrix[0][1], cmatrix[1][0], cmatrix[1][1]
        sen = ratio(tp, tp + fn)
        spe = ratio(tn, tn + fp)
        acc = ratio(tp + tn, tp + fp + fn + tn)
        f1 = ratio(2.0 * tp, 2 * tp + fp + fn)
        return clazz, tn, fp, fn, tp, sen, spe, acc, f1

    # T.y may be a pandas Series or a plain ndarray. Only the former has
    # value_counts(); ndarray labels are cast to int for reporting.
    try:
        label_counts = T.y.value_counts()
    except AttributeError:
        clazzes = np.unique(T.y).astype(int)
    else:
        clazzes = sorted(label_counts.index)

    model.fit(T.X, T.y)
    y_label = model.predict(V.X)
    # Passing labels= pins the order of the per-class matrices to clazzes,
    # even when a class is absent from the validation targets/predictions.
    cmatrices = multilabel_confusion_matrix(V.y, y_label, labels=clazzes)
    y_score = model.predict_proba(V.X)

    metrics = []
    # Index by position, not by label value, so labels need not be 0..K-1.
    # NOTE(review): relies on predict_proba columns following the sorted
    # label order (scikit-learn's ``classes_``) — confirm for custom models.
    for position, clazz in enumerate(clazzes):
        per_class = get_metrics(clazz, cmatrices[position])
        y_true = [1 if y == clazz else 0 for y in V.y]
        auc = roc_auc_score(y_true, y_score[:, position])
        metrics.append(Metric(*per_class, auc))
    return metrics
        
def print_classification_metrics(metrics):
    """Print one summary line per metric record.

    Each record must expose ``clazz``, ``sen``, ``spe``, ``acc``, ``f1`` and
    ``auc``; the numeric fields are shown with five decimal places.
    """
    template = '{}: sen = {:.5f}, spe = {:.5f}, acc = {:.5f}, f1 = {:.5f}, auc = {:.5f}'
    for record in metrics:
        values = (record.clazz, record.sen, record.spe, record.acc, record.f1, record.auc)
        print(template.format(*values))

# Logistic regression

In [3]:
from sklearn.linear_model import LogisticRegression

# Fit a one-vs-rest logistic regression on T and report its metrics on V.
lr_model = LogisticRegression(random_state=37, multi_class='ovr', solver='newton-cg')
lr_metrics = get_classification_metrics(lr_model, T, V)
print_classification_metrics(lr_metrics)

0: sen = 1.00000, spe = 0.82667, acc = 0.87000, f1 = 0.79365, auc = 1.00000
1: sen = 1.00000, spe = 0.74667, acc = 0.81000, f1 = 0.72464, auc = 1.00000
2: sen = 0.72000, spe = 1.00000, acc = 0.93000, f1 = 0.83721, auc = 1.00000
3: sen = 0.00000, spe = 1.00000, acc = 0.75000, f1 = 0.00000, auc = 0.66347


# Random forest

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on T and report its metrics on V.
rf_model = RandomForestClassifier(n_estimators=100, random_state=37)
rf_metrics = get_classification_metrics(rf_model, T, V)
print_classification_metrics(rf_metrics)

0: sen = 1.00000, spe = 0.98667, acc = 0.99000, f1 = 0.98039, auc = 1.00000
1: sen = 1.00000, spe = 1.00000, acc = 1.00000, f1 = 1.00000, auc = 1.00000
2: sen = 0.88000, spe = 0.93333, acc = 0.92000, f1 = 0.84615, auc = 0.98107
3: sen = 0.76000, spe = 0.96000, acc = 0.91000, f1 = 0.80851, auc = 0.96720


# Sample data to counter data imbalance

In [5]:
from scipy.stats import multivariate_normal

def sample(mvn, N=5000):
    """Draw ``N`` labelled rows from the Gaussian described by ``mvn``.

    Parameters
    ----------
    mvn : object with ``clazz``, ``mean`` and ``cov`` attributes.
    N : int
        Number of rows to draw.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(N, d + 1)`` where ``d`` is the length of
        ``mvn.mean``: the drawn features followed by one column holding
        ``mvn.clazz``. Stacking with the float features promotes the label
        column to the features' dtype.
    """
    dim = np.atleast_1d(mvn.mean).size
    # A single call with size=N replaces N separate rvs() calls, so the
    # parameters are processed once instead of once per row.
    draws = multivariate_normal.rvs(mean=mvn.mean, cov=mvn.cov, size=N)
    # rvs() drops singleton axes (N == 1 or d == 1); restore the (N, d)
    # layout so the label column can always be appended.
    X = np.asarray(draws).reshape(N, dim)
    y = np.full((N, 1), mvn.clazz, dtype=np.int32)
    return np.hstack([X, y])

# Gaussian summary of one class: its label, feature mean vector and
# feature covariance matrix.
Mvn = namedtuple('Mvn', ['clazz', 'mean', 'cov'])

X_cols = [col for col in tr_df.columns if col != 'y']


def _fit_mvn(clazz):
    """Estimate mean and covariance of the training features of one class."""
    rows = tr_df[tr_df['y'] == clazz][X_cols]
    return Mvn(clazz, rows.mean().values, rows.cov().values)


# One fitted Gaussian per class label, keyed in ascending label order.
mvns = {clazz: _fit_mvn(clazz) for clazz in sorted(tr_df['y'].value_counts().index)}

# sample() is called with its default size for every class, then stacked.
stacked = np.vstack([sample(mvn) for mvn in mvns.values()])
print(stacked.shape)

# The last column of each sampled row is the class label.
X = stacked[:, :-1]
y = stacked[:, -1]

S = Data(X, y)

(20000, 19)


# Logistic regression applied to sampled data

In [6]:
# Same one-vs-rest logistic regression, now fitted on the sampled data S
# and still scored on V.
lr_sampled_model = LogisticRegression(random_state=37, multi_class='ovr', solver='newton-cg')
lr_sampled_metrics = get_classification_metrics(lr_sampled_model, S, V)
print_classification_metrics(lr_sampled_metrics)

0: sen = 1.00000, spe = 1.00000, acc = 1.00000, f1 = 1.00000, auc = 1.00000
1: sen = 1.00000, spe = 1.00000, acc = 1.00000, f1 = 1.00000, auc = 1.00000
2: sen = 0.88000, spe = 0.97333, acc = 0.95000, f1 = 0.89796, auc = 0.99413
3: sen = 0.92000, spe = 0.96000, acc = 0.95000, f1 = 0.90196, auc = 0.99093


# Random forest applied to sampled data

In [7]:
# Same 100-tree random forest, now fitted on the sampled data S and still
# scored on V.
rf_sampled_model = RandomForestClassifier(n_estimators=100, random_state=37)
rf_sampled_metrics = get_classification_metrics(rf_sampled_model, S, V)
print_classification_metrics(rf_sampled_metrics)

0: sen = 1.00000, spe = 1.00000, acc = 1.00000, f1 = 1.00000, auc = 1.00000
1: sen = 1.00000, spe = 1.00000, acc = 1.00000, f1 = 1.00000, auc = 1.00000
2: sen = 1.00000, spe = 0.97333, acc = 0.98000, f1 = 0.96154, auc = 0.99920
3: sen = 0.92000, spe = 1.00000, acc = 0.98000, f1 = 0.95833, auc = 0.99840
