## Comparison of measures of the magnitude of difference between two groups of data in univariate and multivariate models

In [1]:
# load libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn import model_selection
from scipy import stats
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.graphics.factorplots import interaction_plot
from sklearn.cross_validation import LeaveOneOut
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score



In [2]:
# Simulated two-class data set
# Example 1: 20 examples per class, i.e. 40 separable points in total
n = 20     # number of examples drawn for each class
feat = 2   # number of features per example
np.random.seed(0)  # fixed seed so every run draws the same points

# Each class is standard-normal noise shifted by a per-feature offset;
# alter the offsets to change the effect size.
class1 = np.random.randn(n, feat) - np.array([2, 2])
class2 = np.random.randn(n, feat) + np.array([1, .75])

# Rows of class1 come first, then class2; labels follow the same order.
X = np.concatenate((class1, class2), axis=0)
Y = n * [0] + n * [1]

In [3]:
# perform univariate test
# Per-feature summary statistics for each class (axis=0 -> one value per feature).
# NOTE: np.std is called with its default ddof=0, so these are population
# (not sample) standard deviations, and the effect size below inherits that.
avg_class1 = np.mean(class1, axis=0)
avg_class2 = np.mean(class2, axis=0)
std_class1 = np.std(class1, axis=0)
std_class2 = np.std(class2, axis=0)

# Effect size per feature: absolute difference of the class means divided by
# the square root of the average of the two class variances.
cohens_d = abs(avg_class1 - avg_class2) / (np.sqrt((std_class1 ** 2 + std_class2 ** 2) / 2))

# All prints use the single-argument call form, which emits the same text
# under Python 2 and is valid syntax under Python 3.
print("Class 1 Mean")
print(avg_class1)
print("Class 2 Mean")
print(avg_class2)

print("Class 1 std")
print(std_class1)
print("Class 2 std")
print(std_class2)

# Two-sample t-tests, one per feature; equal_var=False selects the variant
# that does not assume equal class variances.
t1, p1 = stats.ttest_ind(class1[:, 0], class2[:, 0], equal_var=False)
print("Feature 1 two sample t-test")
print(t1)
print(p1)
t2, p2 = stats.ttest_ind(class1[:, 1], class2[:, 1], equal_var=False)
print("Feature 2 two sample t-test")
print(t2)
print(p2)
print("Effect Size")
print(cohens_d)

Class 1 Mean
[-1.47323693 -1.90167812]
Class 2 Mean
[ 0.45546893  0.54727699]
Class 1 std
[ 1.08198183  1.00157252]
Class 2 std
[ 0.75688435  0.82547162]
Feature 1 two sample t-test
-6.36684867105
2.886325064e-07
Feature 2 two sample t-test
-8.22461456719
7.60253874059e-10
Effect Size
[ 2.06567848  2.66841733]


In [4]:
# perform multivariate classification

# fit a linear-kernel support vector classifier on all simulated points
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# get support vectors (their row indices in X are available as clf.support_)
print("support vectors")
print(clf.support_vectors_)
# get number of support vectors for each class
print("number of support vectors")
print(clf.n_support_)

# cross-validated accuracy for 10 and then 5 folds.
# cross_val_score is taken from sklearn.model_selection (imported at the top
# of the notebook) because sklearn.cross_validation was deprecated and later
# removed from scikit-learn. The reported spread is two standard deviations
# of the per-fold scores.
for folds in (10, 5):
    scores = model_selection.cross_val_score(clf, X, Y, cv=folds)
    print("CV %d fold Accuracy: %0.2f (+/- %0.2f)" % (folds, scores.mean(), scores.std() * 2))

support vectors
[[-0.23594765 -1.59984279]
 [-1.02126202  0.2408932 ]
 [-0.46722079 -0.53064123]
 [-0.76970932 -0.79762015]
 [-0.04855297 -0.67001794]
 [-0.61389785  0.53725972]
 [ 0.18685372 -0.9762826 ]
 [-0.63019835  1.21278226]]
number of support vectors
[4 4]
CV 10 fold Accuracy: 0.97 (+/- 0.15)
CV 5 fold Accuracy: 0.97 (+/- 0.10)


In [5]:
# Makes figures for 2 values of C: for each one, fit a linear SVM on (X, Y) and
# draw the decision regions, the separating line, both margin lines, the data
# points and the support vectors.

# prints the docstring of the interactive namespace (kept so the cell's text
# output is unchanged)
print(__doc__)
# Code source: Gaël Varoquaux
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

# figure number
fignum = 1

# plotting window (identical for every figure, so defined once)
x_min = -4.8
x_max = 4.2
y_min = -6
y_max = 6

# fit the model
for name, penalty in (('unreg', 1), ('reg', .05)):

    clf = svm.SVC(kernel='linear', C=penalty)
    clf.fit(X, Y)

    # get the separating hyperplane w[0]*x + w[1]*y + intercept = 0,
    # rewritten as y = a*x - intercept/w[1]
    w = clf.coef_[0]
    a = -w[0] / w[1]
    xx = np.linspace(-5, 5)
    yy = a * xx - (clf.intercept_[0]) / w[1]

    # plot the parallels to the separating hyperplane that pass through the
    # support vectors. `margin` is the perpendicular distance 1/||w||; for a
    # line of slope `a` the matching *vertical* shift is sqrt(1 + a^2) * margin
    # (shifting by a * margin would put the lines in the wrong place).
    margin = 1 / np.sqrt(np.sum(clf.coef_ ** 2))
    vertical_shift = np.sqrt(1 + a ** 2) * margin
    yy_down = yy - vertical_shift
    yy_up = yy + vertical_shift

    # plot the line, the points, and the nearest vectors to the plane
    plt.figure(fignum, figsize=(4, 3))
    plt.clf()
    plt.plot(xx, yy, 'k-')
    plt.plot(xx, yy_down, 'k--')
    plt.plot(xx, yy_up, 'k--')

    # hollow rings around the support vectors; an explicit edge colour is
    # required because with facecolors='none' the edge otherwise follows the
    # (empty) face colour on matplotlib >= 2.0 and the rings are not drawn
    plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80,
                facecolors='none', edgecolors='k', zorder=10)
    plt.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, cmap=plt.cm.Paired)

    plt.axis('tight')

    # predict the class on a 200 x 200 grid covering the plotting window
    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Z = clf.predict(np.c_[XX.ravel(), YY.ravel()])

    # Put the result into a color plot (drawn on the figure opened above;
    # the scatter points stay visible because of their zorder)
    Z = Z.reshape(XX.shape)
    plt.pcolormesh(XX, YY, Z, cmap=plt.cm.Paired)

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)

    plt.xticks(())
    plt.yticks(())
    # label each figure with its setting so the two plots can be told apart
    plt.title('%s (C=%g)' % (name, penalty))
    fignum = fignum + 1

plt.show()

Automatically created module for IPython interactive environment
