In [50]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import pandas as pd
from scipy.stats import ttest_ind, pearsonr
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline

In [80]:
X = np.random.randn(50, 5)
y = np.repeat([0, 1], repeats=X.shape[0]/2)
X[y == 1] += 0.5
c = np.roll(y, 20)
X[c == 1] += 0.5
print(pearsonr(c, y))

(-0.59999999999999998, 4.1202159309776657e-06)


In [81]:
# univar
for i in range(X.shape[1]):
    pred = sm.add_constant(y)
    pred = np.hstack((pred, c[:, np.newaxis]))
    this_data = X[:, i]
    model = sm.OLS(this_data, pred)
    results = model.fit()
    print(results.pvalues)

[ 0.92523859  0.23257924  0.04401089]
[ 0.1411764   0.96097113  0.51815233]
[ 0.58194605  0.01980688  0.01009266]
[ 0.37343209  0.48200282  0.38436122]
[ 0.16177135  0.84086278  0.43929472]


In [39]:
def do_classification(X, y, pipeline, n_splits, permute=False):
    
    skf = StratifiedKFold(n_splits=10)
    scores = np.zeros(10)
    for i, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        
        if permute:
            np.random.shuffle(y_train)
        
        pipeline.fit(X_train, y_train)
        scores[i] = pipeline.score(X_test, y_test)
    return scores

In [40]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='linear', C=0.01))
])

observed_score = do_classification(X, y, pipe, n_splits=10, permute=False).mean()
print(observed_score)

perm_scores = [do_classification(X, y, pipe, n_splits=10, permute=True).mean() for i in range(1000)]

0.616666666667


In [41]:
(observed_score > perm_scores).sum() / 10000

0.095200000000000007