In [None]:
# %pylab inline star-imports numpy and matplotlib.pyplot names into the
# interactive namespace and renders figures inline.
%pylab inline
# autoreload mode 2: re-import edited modules before each cell runs.
%load_ext autoreload
%autoreload 2

# The notebook lives outside the main module tree, so make the parent of the
# current working directory importable.
import os
import sys
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

# Suppress every warning for the remainder of the session.
import warnings
warnings.filterwarnings(action='ignore')

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from surveys.personality import *
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold
from sklearn.model_selection import ShuffleSplit

In [873]:
def get_scores(model, X, y, name, cv=None):
    """Cross-validate `model` and summarise its F1 scores as a one-row table.

    Parameters
    ----------
    model : estimator accepted by ``cross_val_score``.
    X, y : features and labels, passed straight to ``cross_val_score``.
    name : label stored in the ``model`` index of the returned frame.
    cv : cross-validation splitter, optional
        When omitted (or ``None``), a ``ShuffleSplit(n_splits=100,
        test_size=0.2, random_state=1)`` is built, which is the same
        configuration as the notebook-level ``cv``.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``model`` with columns ``mean`` and ``std`` of the
        per-split F1 scores.
    """
    # Resolve the default at call time. Writing `cv=cv` in the signature needs
    # a global `cv` to exist when this cell is executed, which fails on a
    # fresh kernel because `cv` is only created in a later cell.
    if cv is None:
        cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=1)
    scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
    summary = {'model': name, 'mean': scores.mean(), 'std': scores.std()}
    return pd.DataFrame([summary]).set_index('model')

In [870]:
# Load Data
# read_surveys and prep_X are not defined in this notebook; presumably they
# come from the `surveys.personality` star import — verify.
surveys = read_surveys('../data')
X = prep_X(surveys)
# Boolean target: True where the `group` value equals 'c'.
y = surveys.group == 'c'
# 100 random train/test splits, each holding out 20% of the rows for testing;
# the fixed random_state makes the splits repeatable.
cv = ShuffleSplit(n_splits=100, test_size = 0.2, random_state=1)

In [871]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Fit LDA on the full data set (no hold-out) and keep the first column of the
# transformed data as a one-dimensional "LDA factor" score per row.
ld_model = LinearDiscriminantAnalysis(store_covariance=True)
ld_factor = ld_model.fit_transform(X, y)[:,0]

In [852]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the full data set; C is scikit-learn's
# inverse regularisation strength.
logistic_model = LogisticRegression(C=.5)
logistic_model.fit(X, y)
# Project every row of X onto the fitted coefficient vector (the intercept is
# not included) to obtain a one-dimensional "logistic factor".
logistic_factor = X.dot(logistic_model.coef_[0,:])

In [868]:
from sklearn.svm import LinearSVC, SVC

# Linear support vector classifier fitted on the full data set.
svm_model = LinearSVC(C=1.)
svm_model.fit(X, y)
# Projection of X onto the SVM weight vector (intercept not included).
svc_factor = X.dot(svm_model.coef_[0,:])

In [854]:
from sklearn.decomposition import PCA

# Scores of the first five principal components of the full feature matrix.
princ_comps = PCA(n_components=5).fit_transform(X)
# First 65 columns of X; presumably the Big Five questionnaire items — TODO confirm.
bf_survey = X.iloc[:, 0:65]
bigfive_key = pd.read_csv("../data/educatalyst/Auxil/q1_key_bigfive.csv", encoding='ISO-8859-1')
# get_big_five_comps and big_five_projection are project helpers not shown
# here; presumably they come from the `surveys.personality` star import.
bf_comps = get_big_five_comps(bigfive_key)
bigfive = big_five_projection(bigfive_key, bf_survey)
# Entries of X whose labels start with "3" / "4"; presumably the TEIQue and
# Grit questionnaire items respectively — TODO confirm the numbering scheme.
teique = X.filter(regex="^3")
grit = X.filter(regex="^4")

## Testing for Predictive Power of Survey Questions in Determining Group Membership

Here we take the raw survey data and apply 3 linear classification algorithms that work well with the dimensionality of the data that we have (more features than observations). We choose linear algorithms because we are assuming that our data has a linear factor structure, and as such a linear algorithm will point to a single factor. LDA works explicitly on factors of the data, Linear Support Vector Machines are a natural choice for a classification problem in high dimensions, and regularized Logistic Regression is likewise a simple choice for a linear classifier.

We measure the out-of-sample "F1" score, the harmonic mean of precision and recall, as a pure accuracy score would be misleading given that our classes (groups) are not perfectly balanced. We do this on a randomized split that holds out 20% of the data for testing and trains on the remaining 80% (a 1:4 test/training ratio), which is repeated 100 times, and from which we pull a mean and standard deviation of the scores to see if any classifier can predict significantly better than random given the survey data. They all fail.

In [874]:
# Estimators built on the full survey matrix, paired with their table labels.
models = [
    (ld_model, "LDA"),
    (svm_model, "SVM"),
    (logistic_model, "Logistic: All Survey Data"),
]

# One cross-validated F1 summary row per estimator, stacked into one table.
pd.concat([get_scores(estimator, X, y, label) for estimator, label in models])

                               mean       std
model                                        
LDA                        0.585174  0.117171
SVM                        0.638187  0.097601
Logistic: All Survey Data  0.650731  0.090990

## Testing the Predictive Power of Individual Surveys

Next we look at individual surveys. For the Big Five we take the 5 factors themselves; for the other surveys we just use the individual questions, as there are very few. We also take a look at a basic PCA decomposition of the entire set of 104 survey questions together. All of these features are tried with the same regularized Logistic Regression model, and nothing is shown to have any predictive power over group membership.

In [867]:
# The same regularised logistic model is scored on every feature subset.
model = LogisticRegression(C=.5)

# (feature matrix, table label) pairs, one per survey-derived feature set.
li = [
    (grit, "Logistic: Grit"),
    (teique, "Logistic: Teique"),
    (bigfive, "Logistic: Big Five"),
    (princ_comps, "Logistic: PCA on all Surveys"),
]

pd.concat([get_scores(model, features, y, label) for features, label in li])

                                  mean       std
model                                           
Logistic: Grit                0.561053  0.097679
Logistic: Teique              0.596842  0.101025
Logistic: Big Five            0.538947  0.111430
Logistic: PCA on all Surveys  0.497895  0.109752

In [877]:
# Column labels of the Big Five projection returned by big_five_projection.
bigfive.columns

Index(['Energy / Extraversion', 'Agreeableness', 'Conscientiousness',
       'Emotional Instability', 'Intellect / Openness'],
      dtype='object')

# THE REST IS NOT REALLY AS INTERESTING!

But here we see how the factors we find with direct prediction compare to the Big Five traits. We see that, if anything, some factor that is most closely correlated with Conscientiousness and Emotional Instability has some power in predicting group membership in our cohort.

In [880]:
# Plain ndarray of the Big Five scores. `.values` replaces
# DataFrame.as_matrix(), which was removed in pandas 1.0.
m = bigfive.values
def get_corr(bf, m):
    """Correlate every column of `bf` with the vector `m`.

    Parameters
    ----------
    bf : pandas.DataFrame or 2-D array-like
        One variable per column.
    m : 1-D array-like
        Values to correlate each column of `bf` against.

    Returns
    -------
    pandas.DataFrame
        A single column (named 0) holding one Pearson correlation per column
        of `bf`, in column order.
    """
    # np.asarray handles DataFrames and ndarrays alike. The previous
    # try/except around DataFrame.as_matrix() (removed in pandas 1.0) silently
    # kept the DataFrame, and iterating its transpose yields labels, not data.
    columns = np.asarray(bf).T
    return pd.DataFrame([np.corrcoef(column, m)[0, 1] for column in columns])
     
# Correlate each Big Five trait, and each of the five principal components,
# with the LDA and logistic factors.
df = pd.concat(
    [
        get_corr(bigfive, ld_factor),
        get_corr(bigfive, logistic_factor),
        get_corr(princ_comps, logistic_factor),
        get_corr(princ_comps, ld_factor),
    ],
    axis=1,
)
# Every get_corr() result has a single column named 0; name them so the four
# columns can be told apart and selected individually.
df.columns = [
    'Big Five vs LDA factor',
    'Big Five vs logistic factor',
    'PC1-5 vs logistic factor',
    'PC1-5 vs LDA factor',
]
# NOTE: the trait labels describe only the two "Big Five" columns. In the two
# "PC1-5" columns, row i holds principal component i+1, not a trait.
df.index = bigfive.columns
df

                              0         0         0         0
Energy / Extraversion  0.047290  0.045018  0.067458  0.041342
Agreeableness          0.011719  0.015875  0.016532  0.030283
Conscientiousness     -0.111763 -0.153912 -0.195607 -0.148115
Emotional Instability -0.143229 -0.190179 -0.090641 -0.060674
Intellect / Openness  -0.010267 -0.029111 -0.185912 -0.115046

In [846]:
from sklearn.linear_model import LogisticRegression
# PLSSVD is not imported explicitly anywhere in the visible notebook.
from sklearn.cross_decomposition import PLSSVD

# NOTE(review): X1 and X2 are not defined in any visible cell; they must be
# created before this cell for it to run on a fresh kernel.
model = PLSSVD(n_components=5)
model.fit(X1, X2)

def make_transformed_training(A,B):
    """Stack two sample blocks and label which block each row came from.

    Returns a tuple ``(stacked, groups)``: `A` followed by `B` concatenated
    along the first axis, and an integer vector holding 1 for every row of
    `A` followed by 0 for every row of `B`.
    """
    n_a, n_b = A.shape[0], B.shape[0]
    # One call builds n_a ones followed by n_b zeros.
    labels = np.repeat([1, 0], [n_a, n_b])
    stacked = np.concatenate([A, B])
    return stacked, labels
    
# Project both blocks onto the fitted PLS directions. transform() is used
# instead of the x_scores_/y_scores_ attributes, which scikit-learn
# deprecated in 0.24 and removed from PLSSVD in 1.1.
x_scores, y_scores = model.transform(X1, X2)
# NOTE(review): this rebinds the notebook-wide X and y (survey features and
# group labels); re-run the data-loading cell before re-running earlier cells.
X, y = make_transformed_training(x_scores, y_scores)
model = LogisticRegression()
cross_val_score(model, X, y)

array([ 0.53571429,  0.46428571,  0.38461538])