In [None]:
%pylab inline
%load_ext autoreload
%autoreload 2

# Run this notebook outside of main module tree
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

# ignore warnings!
import warnings
warnings.filterwarnings('ignore')


In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from surveys.personality import *
from utils.factors import *
from factor_rotation._analytic_rotation import target_rotation
from factor_rotation._gpa_rotation import orthomax_objective, GPA, rotateA as rotate
from factor_rotation._wrappers import rotate_factors

In [11]:
# Load the survey data from the ../data directory. read_surveys is not
# defined in this notebook; it presumably arrives through one of the star
# imports above (surveys.personality or utils.factors) -- TODO confirm.
surveys = read_surveys('../data')


In [839]:
def get_scores(model, X, y, name, cv=None):
    """Cross-validate ``model`` on ``(X, y)`` and summarise the fold scores.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    X, y : features and target handed to ``cross_val_score``.
    name : label stored in the ``model`` index of the returned frame.
    cv : cross-validation strategy forwarded to ``cross_val_score``. When
        omitted (``None``), the notebook-level ``cv`` is looked up at call
        time; if no such global exists, ``None`` is forwarded and
        ``cross_val_score`` applies its own default.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``name`` with the columns ``mean`` and ``std`` of
        the per-fold scores.
    """
    if cv is None:
        # Late binding: a ``cv=cv`` default is evaluated when the ``def``
        # statement runs, which raises NameError if the splitter has not been
        # created yet and keeps a stale splitter if ``cv`` is redefined later.
        cv = globals().get('cv')
    scores = cross_val_score(model, X, y, cv=cv)
    return pd.DataFrame([{'model': name, 'mean': scores.mean(), 'std': scores.std()}]).set_index('model')

In [None]:
# ShuffleSplit is used below, so import it explicitly instead of relying on
# it leaking in through one of the star imports.
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold, ShuffleSplit

# Feature matrix built from the loaded surveys by the project helper prep_X.
X = prep_X(surveys)
# Boolean target: True for rows whose group is 'c'.
y = surveys.group == 'c'
# 100 random train/test splits holding out 20% of the rows each, with a
# fixed seed so the splits are the same on every run.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=1)

In [840]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Fit LDA on the full data set, keep the first discriminant axis as a
# per-row factor, then report the cross-validated scores.
model = LinearDiscriminantAnalysis(store_covariance=True)
model.fit(X, y)
ld_factor = model.transform(X)[:, 0]
get_scores(model, X, y, 'LDA')

           mean       std
model                    
LDA    0.555263  0.096296

In [841]:
from sklearn.linear_model import LogisticRegression

# Fit on all rows; the factor is each row of X projected onto the first
# (here: only used) row of fitted coefficients.
model = LogisticRegression(C=.5).fit(X, y)
logistic_factor = X.dot(model.coef_[0])
get_scores(model, X, y, 'Logistic')

              mean      std
model                      
Logistic  0.595263  0.09347

In [843]:
from sklearn.svm import LinearSVC, SVC

# random_state pins the solver's internal random number generator so the
# fitted coefficients (and therefore svc_factor and the CV scores) can be
# reproduced on a re-run; the seed matches the one given to ShuffleSplit.
model = LinearSVC(C=1., random_state=1)
model.fit(X, y)
# Project every row of X onto the first row of fitted coefficients.
svc_factor = X.dot(model.coef_[0,:])
get_scores(model, X, y, 'SVM')

       mean       std
model                
SVM    0.59  0.097417

In [844]:
from sklearn.decomposition import PCA

# Scores of every row of X on the first five principal components.
princ_comps = PCA(n_components=5).fit_transform(X)
# First 65 columns of X -- presumably the Big Five questionnaire items;
# TODO confirm against the column layout produced by prep_X.
bf_survey = X.iloc[:, 0:65]
# Key file for the Big Five questionnaire, read as ISO-8859-1 text.
bigfive_key = pd.read_csv("../data/educatalyst/Auxil/q1_key_bigfive.csv", encoding='ISO-8859-1')
# get_big_five_comps and big_five_projection are project helpers not shown
# in this notebook; judging by the names they derive the five components
# from the key and project the item responses onto them -- TODO confirm.
bf_comps = get_big_five_comps(bigfive_key)
bigfive = big_five_projection(bigfive_key, bf_survey)
# Columns of X whose label starts with "3" / "4" (regex "^3" / "^4");
# presumably the TEIQue and Grit questionnaires -- TODO confirm.
teique = X.filter(regex="^3")
grit = X.filter(regex="^4")

In [845]:
model = LogisticRegression(C=.5)

# (feature block, row label) pairs; each pair is scored with the same
# classifier settings and the one-row results are stacked into one table.
li = [
    (grit, "Logistic: Grit"),
    (teique, "Logistic: Teique"),
    (bigfive, "Logistic: Big Five"),
    (princ_comps, "Logistic: PCA on all Surveys"),
]

pd.concat([get_scores(model, features, y, label) for features, label in li])

                                  mean       std
model                                           
Logistic: Grit                0.561053  0.097679
Logistic: Teique              0.596842  0.101025
Logistic: Big Five            0.538947  0.111430
Logistic: PCA on all Surveys  0.497895  0.109752

In [694]:
# Big Five scores as a plain NumPy array. DataFrame.as_matrix() was removed
# in pandas 1.0; np.asarray works on every pandas version.
m = np.asarray(bigfive)
def get_corr(bf, m):
    """Correlate every column of ``bf`` with the vector ``m``.

    Parameters
    ----------
    bf : 2-D array-like (``DataFrame`` or ``ndarray``), one variable per
        column.
    m : 1-D array-like with one value per row of ``bf``.

    Returns
    -------
    pandas.DataFrame
        A single column named ``0``; row ``i`` holds the correlation
        coefficient between column ``i`` of ``bf`` and ``m``.
    """
    # np.asarray accepts DataFrames and arrays alike. The previous
    # try/except around DataFrame.as_matrix() (removed in pandas 1.0) hid the
    # AttributeError and then iterated over labels instead of data columns.
    columns = np.asarray(bf).T
    return pd.DataFrame([np.corrcoef(col, m)[0, 1] for col in columns])
     
# Concatenating the bare get_corr frames yields four columns that are all
# named 0, so the table cannot be read or indexed by column. Take the single
# column of each result as a Series and label it with what it compares.
pd.concat(
    [
        get_corr(bigfive, ld_factor)[0],
        get_corr(bigfive, logistic_factor)[0],
        get_corr(princ_comps, logistic_factor)[0],
        get_corr(princ_comps, ld_factor)[0],
    ],
    axis=1,
    keys=['Big Five vs LDA', 'Big Five vs Logistic', 'PCA vs Logistic', 'PCA vs LDA'],
)

          0         0         0         0
0  0.047290  0.045018  0.067458  0.041342
1  0.011719  0.015875  0.016532  0.030283
2 -0.111763 -0.153912 -0.195607 -0.148115
3 -0.143229 -0.190179 -0.090641 -0.060674
4 -0.010267 -0.029111 -0.185912 -0.115046

In [846]:
# PLSSVD is used below, so import it explicitly instead of relying on it
# leaking in through one of the star imports.
from sklearn.cross_decomposition import PLSSVD
from sklearn.linear_model import LogisticRegression

# NOTE(review): X1 and X2 are not defined in the cells shown here; they must
# already exist in the kernel before this cell runs -- TODO confirm where
# they are built.
model = PLSSVD(n_components=5)
model.fit(X1, X2)

def make_transformed_training(A,B):
    """Stack ``A`` on top of ``B`` and label every row by its origin.

    Returns a tuple ``(stacked, groups)``: ``stacked`` is ``A`` followed by
    ``B`` along the first axis, and ``groups`` holds 1 for each row that came
    from ``A`` and 0 for each row that came from ``B``.
    """
    n_a, n_b = A.shape[0], B.shape[0]
    # A single repeat call emits n_a ones followed by n_b zeros.
    groups = np.repeat([1, 0], [n_a, n_b])
    stacked = np.concatenate([A, B])
    return stacked, groups
    
# The x_scores_ / y_scores_ attributes of PLSSVD were deprecated in
# scikit-learn 0.24 and later removed; transform() on the training data
# returns the same pair of score matrices on old and new releases.
# NOTE(review): this rebinds the notebook-level X and y to the PLS scores and
# their origin labels, so re-run the data-prep cell before re-running any of
# the earlier model cells.
X, y = make_transformed_training(*model.transform(X1, X2))
model = LogisticRegression()
cross_val_score(model, X, y)

array([ 0.53571429,  0.46428571,  0.38461538])