# Lab Practice 1
### A. Boyko, E. Gladin

#### Train LogisticRegression, DecisionTreeClassifier, KNeighborsClassifier and SVC on each pair of classes from the BreastCancer, Iris, Wine and Digits datasets. Check the accuracy and AUC-ROC metrics on the control (test) set. Is any classifier the best with respect to AUC-ROC and accuracy?

In [1]:
import numpy as np
import matplotlib.pylab as plt
import pandas as pd
import seaborn as sns
from itertools import combinations
import scipy.stats as st
from permute.core import two_sample
from arch.bootstrap import IIDBootstrap
from sklearn.metrics import accuracy_score
from statsmodels.stats.diagnostic import kstest_normal
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from statsmodels.stats.multitest import multipletests

#### Import and split datasets. We also scale the features:

In [2]:
from sklearn.preprocessing import scale

from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_wine
from sklearn.datasets import load_digits
from sklearn.datasets import load_iris

# Collect one [train_data, train_target, test_data, test_target] entry per dataset.
# NOTE(review): features are standardised on the whole dataset before the
# split, so the test half contributes to the scaling statistics.
# NOTE(review): no random_state is passed to train_test_split here, so this
# code does not pin the split itself.
all_data = []

for load in (load_breast_cancer, load_wine, load_digits, load_iris):
    bunch = load()
    features = scale(bunch.data)
    halves = train_test_split(features, bunch.target, test_size=0.5)
    train_x, test_x, train_y, test_y = halves
    all_data.append([train_x, train_y, test_x, test_y])

#### Define the evaluation function: for each method and each dataset, we train the method on each pair of classes from that dataset

In [3]:
def _binary_subset(data, target, negative, positive):
    """Keep only samples labelled ``negative`` or ``positive``; relabel them 0/1."""
    target = np.asarray(target)
    mask = (target == negative) | (target == positive)
    # Labels are derived from the untouched original values in one step, so a
    # sample can never be relabelled twice (which the in-place two-step
    # rewrite could do when ``positive`` happened to equal 0).
    return data[mask], (target[mask] == positive).astype(int)


def evaluate(train_data, train_target, test_data, test_target, model):
    """Fit ``model`` on every pair of classes and score it on the test split.

    For each pair of distinct labels found in ``train_target`` the train and
    test sets are restricted to the samples of those two classes, the first
    label of the pair is mapped to 0 and the second to 1, ``model`` is
    refitted on the restricted training set and evaluated on the restricted
    test set.

    Parameters
    ----------
    train_data, test_data : array-like indexable by a boolean mask
        Feature matrices, one row per sample.
    train_target, test_target : array-like
        Class labels aligned with the rows of the corresponding data.
    model : estimator
        Object providing ``fit``, ``predict`` and ``predict_proba``; it is
        refitted in place once per class pair.

    Returns
    -------
    AUC : list
        ``roc_auc_score`` for each class pair, in ``combinations`` order.
    acc : list
        ``accuracy_score`` for each class pair, in the same order.
    """
    unique_targets = np.unique(train_target)

    AUC = []
    acc = []

    for negative, positive in combinations(unique_targets, 2):
        train_data_, train_target_ = _binary_subset(
            train_data, train_target, negative, positive)
        test_data_, test_target_ = _binary_subset(
            test_data, test_target, negative, positive)

        model.fit(train_data_, train_target_)

        target_proba = model.predict_proba(test_data_)
        target_pred = model.predict(test_data_)

        # Column 1 of predict_proba is used as the score for label 1.
        AUC.append(roc_auc_score(test_target_, target_proba[:, 1]))
        acc.append(accuracy_score(test_target_, target_pred))

    return AUC, acc

#### importing models:

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Classifiers under comparison. SVC is built with probability=True; the other
# three use their default constructor arguments.
model_list = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LogisticRegression(),
    SVC(probability=True),
]
# Per-model score accumulators, keyed by the 1-based position in model_list.
AUCs = {position: [] for position in range(1, 5)}
accs = {position: [] for position in range(1, 5)}

In [5]:
# Score every model on every dataset and append the per-pair results to the
# accumulator stored under the model's 1-based position in model_list.
for split in all_data:
    for position, model in enumerate(model_list, start=1):
        AUC, acc = evaluate(*split, model)
        AUCs[position].extend(AUC)
        accs[position].extend(acc)





In [6]:
# Arrange the scores as matrices with one row per class pair and one column
# per model (columns follow accumulator keys 1..4).
auc = np.array([AUCs[key] for key in (1, 2, 3, 4)]).T
acc = np.array([accs[key] for key in (1, 2, 3, 4)]).T

#### Now we use a permutation test (which requires no additional distributional assumptions about the data) to compare each model with every other model. The null hypothesis is that the mean metric is the same for both models; the alternative is that the first model's mean metric is less than the second model's (i.e. the first model is worse). We then apply Holm's correction for multiple testing to the resulting p-values. If none of the null hypotheses is rejected for a given model, that model is considered best (it is not significantly worse than any other model).

In [7]:
def multiple_two_sample_test(metric_matrix):
    """Flag the models that are not significantly worse than any other model.

    Each column of ``metric_matrix`` holds the scores of one model. Every
    model ``i`` is compared with every other model ``j`` using ``two_sample``
    with ``alternative='less'`` and 10000 repetitions (H0: i == j, H1: i < j).
    The p-values obtained for model ``i`` are then corrected together with
    Holm's method via ``multipletests``.

    Parameters
    ----------
    metric_matrix : 2-D array-like
        Scores with one column per model. The number of models is taken from
        the number of columns.

    Returns
    -------
    list of bool
        One entry per model: ``True`` when none of its null hypotheses is
        rejected (the model is "best"), ``False`` otherwise.
    """
    metric_matrix = np.asarray(metric_matrix)
    n_models = metric_matrix.shape[1]

    # With fewer than two models there is nothing to compare against, so no
    # hypothesis can be rejected.
    if n_models < 2:
        return [True] * n_models

    p_vals = np.zeros(shape=[n_models, n_models - 1])

    for i in range(n_models):
        rivals = [j for j in range(n_models) if j != i]
        for column, j in enumerate(rivals):
            # The first element of two_sample's result is stored as the p-value.
            p_vals[i, column] = two_sample(
                metric_matrix[:, i], metric_matrix[:, j],
                alternative='less', reps=10000)[0]

    # The first element of multipletests' result is read as the rejection
    # flags; a model is "best" when none of its flags is set.
    return [not multipletests(row, method='holm')[0].any() for row in p_vals]
# best = nothing is rejected

In [12]:
# Report, for each model, whether it is flagged as best on accuracy.
for model, answer in zip(model_list, multiple_two_sample_test(acc)):
    print(str(model), "is best", answer)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform') is best True
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') is best False
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False) is best True
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_

In [13]:
# Report, for each model, whether it is flagged as best on AUC-ROC.
for model, answer in zip(model_list, multiple_two_sample_test(auc)):
    print(str(model), "is best", answer)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform') is best True
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') is best False
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False) is best True
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_

### Conclusion: we cannot reliably tell whether there is a single best model, but we can reliably tell that the best model is not DecisionTreeClassifier.