In [1]:
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA, SparsePCA
from sklearn.cross_validation import KFold, train_test_split
from sklearn.linear_model import Lasso, Ridge, SGDRegressor
from scipy.stats import spearmanr
import sklearn.svm as svm
import scipy.sparse
%matplotlib inline

In [3]:
essay_df = pd.read_csv("datasets/training_set_rel3.tsv", delimiter="\t")
print essay_df.shape
essay_df.head(2)

(12976, 28)


Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,


In [10]:
# Raw essay texts, one entry per row of essay_df.
corpus = essay_df["essay"].values

# TF-IDF over single words only (ngram_range=(1, 1)): English stop words are
# removed, terms seen in fewer than 4 documents are dropped, and bytes that
# cannot be decoded are ignored rather than raising.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=4,
    stop_words="english",
    decode_error="ignore"
)

# Learn the vocabulary and produce the sparse document-term matrix in one pass.
sparse_essay_vector = vectorizer.fit_transform(corpus)

In [32]:
# Sanity check: dimensions and concrete type of the TF-IDF matrix.
(sparse_essay_vector.shape, type(sparse_essay_vector))

((12976, 10017), scipy.sparse.csr.csr_matrix)

In [28]:
# len() of every essay string, stored as a new column on essay_df.
essay_df["essay_length"] = essay_df["essay"].apply(len)

# Wrap each 1-D column in a sparse matrix and transpose it so it can be
# stacked next to the TF-IDF matrix as an extra feature column.
sparse_essay_length = scipy.sparse.csc_matrix(essay_df["essay_length"]).T
sparse_essay_set = scipy.sparse.csc_matrix(essay_df["essay_set"]).T

In [66]:
# Regression target: the domain-1 score column.
y = essay_df["domain1_score"]

# Feature matrix in CSR format: the TF-IDF columns followed by the essay
# length column and the essay set column, in that order.
X = scipy.sparse.hstack(
    [sparse_essay_vector, sparse_essay_length, sparse_essay_set],
    format="csr"
)

In [54]:
# Hold-out split of the features and target using train_test_split's default
# test size. The seed is pinned so re-running the notebook reproduces the
# same partition instead of drawing a new random one each time.
# NOTE: the Lasso cross-validation cell further down rebinds X_train, X_test,
# y_train and y_test inside its fold loop, so this split is overwritten there.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [67]:
lasso_alpha_list = [.0001, .001, .01 , 1]
lasso_r2_value = []
lasso_spearman_value = []

for alpha in lasso_alpha_list: 
    lasso_r2_temp = []
    lasso_spearman_temp = []
    
    print "Testing alpha = {0}".format(alpha)
    
    # gives us the different folds of our data to test against
    folds = KFold(X.shape[0], n_folds=2, shuffle=True)

    for train_indices, test_indices in folds:
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]

        lasso = Lasso(alpha=alpha)
        
        # fit on the training data 
        lasso.fit(X_train, y_train)
        
        # calculate r^2
        lasso_r2_temp.append(lasso.score(X_test, y_test))
        spearman_r = scipy.stats.spearmanr(lasso.predict(X_test), y_test)
        print "Spearman_r:", spearman_r
        lasso_spearman_temp.append(spearman_r)
    
    lasso_r2_value.append(np.mean(lasso_r2_temp))
    lasso_spearman_value.append(np.mean(lasso_spearman_temp))

Testing alpha = 0.0001
Spearman_r: (0.80811197085814368, 0.0)
Spearman_r: (0.80904335400306338, 0.0)
Testing alpha = 0.001
Spearman_r: (0.81800140336762706, 0.0)
Spearman_r: (0.80294195825878867, 0.0)
Testing alpha = 0.01
Spearman_r: (0.78938578805999826, 0.0)
Spearman_r: (0.77984045641339084, 0.0)
Testing alpha = 1
Spearman_r: (0.66821869836598513, 0.0)
Spearman_r: (0.69725780694485318, 0.0)
