# Assignment 3

Author: [Lucas David](http://github.com/lucasdavid)  
This notebook can be downloaded at https://github.com/lucasdavid/mo850/

In [1]:
import os
from itertools import combinations

import numpy as np
import pandas as pd
import scipy
from scipy import stats
import statsmodels.stats.contingency_tables
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.sandbox.stats.multicomp import multipletests

from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import matplotlib
from matplotlib import pyplot

%matplotlib inline

## Unpaired Data

In [67]:
data_dir = '../data/3/'

datasets = [letter + '.csv' for letter in 'abcde']
print('The following datasets will be loaded:', *datasets)

datasets = [pd.read_csv(os.path.join(data_dir, d), header=None, names=['measure'])
            for d in datasets]

for index, d in enumerate(datasets):
    d['group'] = index

print('Sample of a loaded dataset:',
      datasets[0].head(),
      sep='\n')

The following datasets will be loaded: a.csv b.csv c.csv d.csv e.csv
Sample of a loaded dataset:
    measure  group
0  4.249030      0
1  5.542826      0
2  5.161981      0
3  2.267553      0
4  4.155343      0


In [42]:
s, p = stats.f_oneway(*(d['measure'] for d in datasets))
print('p-value for anova test:', p)

p-value for anova test: 3.3595478270572274e-12


In [43]:
merged_datasets = pd.concat(datasets)
r = pairwise_tukeyhsd(merged_datasets['measure'],
                      groups=merged_datasets['group'])
print(r)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
  0      1    -0.5373   -1.514 0.4395 False 
  0      2     1.831    0.8718 2.7901  True 
  0      3     1.908    1.0037 2.8123  True 
  0      4     1.7006   0.7414 2.6597  True 
  1      2     2.3682   1.3405 3.396   True 
  1      3     2.4453   1.4685 3.422   True 
  1      4     2.2379   1.2101 3.2656  True 
  2      3     0.077   -0.8821 1.0362 False 
  2      4    -0.1304  -1.1414 0.8807 False 
  3      4    -0.2074  -1.1665 0.7518 False 
--------------------------------------------


In [44]:
s, p = stats.kruskal(*(d['measure'] for d in datasets))
print('p-value for Kruskal-Wallis:', p)

p-value for Kruskal-Wallis: 2.275373540934311e-09


In [60]:
pairs_indices = list(combinations('abcde', 2))
pairs_data = combinations([d['measure'] for d in datasets], 2)

ps = [stats.ranksums(x, y)[1] for x, y in pairs_data]
ps_holm = multipletests(ps, method='holm')[1]
ps_bonferroni = multipletests(ps, method='bonferroni')[1]

ps = pd.DataFrame({
    'pairs': pairs_indices,
    'p': ps,
    'p holm': ps_holm,
    'p bonferroni': ps_bonferroni
}).set_index('pairs')[['p', 'p holm', 'p bonferroni']]


print(ps)

                   p    p holm  p bonferroni
pairs                                       
(a, b)  2.433450e-01  0.973380      1.000000
(a, c)  1.332963e-04  0.000800      0.001333
(a, d)  5.508804e-06  0.000050      0.000055
(a, e)  1.515892e-04  0.000800      0.001516
(b, c)  6.598469e-06  0.000053      0.000066
(b, d)  9.583666e-07  0.000010      0.000010
(b, e)  2.789325e-05  0.000195      0.000279
(c, d)  5.452595e-01  1.000000      1.000000
(c, e)  8.505281e-01  1.000000      1.000000
(d, e)  5.883647e-01  1.000000      1.000000


## Paired Data

In [75]:
dataset = 'multi.csv'
print('The following datasets will be loaded:', dataset)

dataset = pd.read_csv(os.path.join(data_dir, dataset), header=None)
print(dataset.head())

The following datasets will be loaded: multi.csv
           0          1          2           3           4
0  34.381581  38.230745  52.236630   59.027814   28.288420
1  78.475309  74.647168  82.374582   95.970556   28.983943
2   5.676301   8.919621  16.492051   28.646045   29.585645
3  90.357392  90.869337  98.987833  112.835174  118.070135
4  72.198253  71.068576  79.561990   92.640627   72.266390


In [1]:
s, p = stats.friedmanchisquare(*(dataset[c] for c in dataset.columns))
print('p-value:', p)

NameError: name 'stats' is not defined

## Extra Homework for ML Students

In [3]:
from keras.datasets.cifar10 import load_data

samples_used = 3000

(x, y), _ = load_data()
x = x.astype(float)
x /= 127.
x -= 1.

# Gray-scale, 1-rank tensors.
x = x.mean(axis=-1).reshape(x.shape[0], -1)
y = y.ravel()

p = np.random.permutation(x.shape[0])[:samples_used]
x, y = x[p], y[p]

print(x.shape, x[:3])
print(y.shape, y[:3])

print(' data statistics:', x.mean(), x.std())
print('label statistics:', dict(zip(*np.unique(y, return_counts=True))))

(3000, 1024) [[ 0.59055118  0.58267717  0.5984252  ...  0.09448819  0.05249344
  -0.0183727 ]
 [ 0.43307087  0.44619423  0.46981627 ... -0.96587927 -0.99475066
  -1.        ]
 [-0.09711286 -0.09973753 -0.02099738 ...  0.09448819  0.28346457
   0.31496063]]
(3000,) [4 8 2]
 data statistics: -0.04883058135799428 0.48773118463802934
label statistics: {0: 298, 1: 324, 2: 283, 3: 306, 4: 294, 5: 294, 6: 287, 7: 316, 8: 300, 9: 298}


In [4]:
def build_model(estimator):
    if estimator == 'a':
        return SVC()
    elif estimator == 'b':
        return RandomForestClassifier()
    else:
        raise ValueError('unknown estimator of type `%s`' % estimator)

In [5]:
r = np.random.RandomState(42)

experiments = 5
scores = []

for experiment in range(experiments):
    print('Executing experiment', experiment)
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=r)
    
    for train_indices, test_indices in skf.split(x, y):
        x_train, y_train = (d[train_indices] for d in (x, y))
        x_test, y_test = (d[test_indices] for d in (x, y))

        for estimator in 'ab':
            e = build_model(estimator)
            e.fit(x_train, y_train)
            scores += [e.score(x_test, y_test)]

Executing experiment 0
Executing experiment 1
Executing experiment 2
Executing experiment 3
Executing experiment 4


In [14]:
scores = np.asarray(scores)
scores_a = scores[::2]
scores_b = scores[1::2]

print('model a\'s average score:', scores_a.mean())
print('model b\'s average score:', scores_b.mean())

print('On average, model',
      'a' if scores_a.mean() > scores_b.mean() else 'b',
      'performed better.')

model a's average score: 0.3078667146074287
model b's average score: 0.2315298362354828
On average, model a performed better.


In [11]:
s, p = stats.wilcoxon(scores_a, scores_b)
print('p-value from wilconxon test:', p)

model a is averagely better, with a p-value of 0.005062032126267864


In [18]:
# rc_table = pd.crosstab(scores_a, scores_b)
# print(rc_table)
# p = statsmodels.stats.contingency_tables.mcnemar(rc_table).pvalue
# print('p-value from mcnemar test:', p)