In [12]:
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA, SparsePCA
from sklearn.cross_validation import KFold, train_test_split
from sklearn.linear_model import Lasso, Ridge, SGDRegressor
from sklearn.decomposition import TruncatedSVD
from scipy.stats import spearmanr
import sklearn.svm as svm
import scipy.sparse
%matplotlib inline

In [2]:
essay_df = pd.read_csv("datasets/training_set_rel3.tsv", delimiter="\t")
print essay_df.shape
essay_df.head(2)

(12976, 28)


Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,


In [3]:
prompts_df = pd.read_csv("prompts.csv")
prompts_df.rename(columns={"prompt":"essay_set", "text":"prompt_text"}, inplace=True)
prompts_df

Unnamed: 0,essay_set,prompt_text
0,1,"More and more people use computers, but not ev..."
1,2,All of us can think of a book that we hope non...
2,3,Write a response that explains how the feature...
3,4,"When they come back, Saeng vowed silently to h..."
4,5,Describe the mood created by the author in the...
5,6,"Based on the excerpt, describe the obstacles t..."
6,7,Write about patience. Being patient means that...
7,8,"""If you want a place in the sun, you will have..."


In [4]:
essay_df = essay_df.join(prompts_df.set_index("essay_set"), on="essay_set", how="inner")

In [5]:
vectorizer = TfidfVectorizer(stop_words="english", min_df=4, decode_error="ignore", ngram_range=(1, 1))
corpus = essay_df["essay"].values + essay_df["prompt_text"].values
sparse_essay_vector = vectorizer.fit_transform(corpus)

In [6]:
sparse_essay_vector.shape, type(sparse_essay_vector)

((12976, 10100), scipy.sparse.csr.csr_matrix)

In [7]:
essay_df["essay_length"] = map(len, essay_df["essay"])
sparse_essay_length = scipy.sparse.csc_matrix(essay_df["essay_length"]).transpose()
sparse_essay_set = scipy.sparse.csc_matrix(essay_df["essay_set"]).transpose()

In [8]:
X = scipy.sparse.hstack([sparse_essay_vector, sparse_essay_length, sparse_essay_set], format="csr")
y = essay_df["domain1_score"]

In [9]:
lasso_alpha_list = [.0001, .001, .01 , 1]
lasso_r2_value = []
lasso_spearman_value = []

for alpha in lasso_alpha_list: 
    lasso_r2_temp = []
    lasso_spearman_temp = []
    
    print "Testing alpha = {0}".format(alpha)
    
    # gives us the different folds of our data to test against
    folds = KFold(X.shape[0], n_folds=2, shuffle=True)

    for train_indices, test_indices in folds:
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]

        lasso = Lasso(alpha=alpha)
        
        # fit on the training data 
        lasso.fit(X_train, y_train)
        
        # calculate r^2
        lasso_r2_temp.append(lasso.score(X_test, y_test))
        spearman_r = scipy.stats.spearmanr(lasso.predict(X_test), y_test)
        print "Spearman_r:", spearman_r
        lasso_spearman_temp.append(spearman_r)
    
    lasso_r2_value.append(np.mean(lasso_r2_temp))
    lasso_spearman_value.append(np.mean(lasso_spearman_temp))

Testing alpha = 0.0001




Spearman_r: SpearmanrResult(correlation=0.87863148191095586, pvalue=0.0)
Spearman_r: SpearmanrResult(correlation=0.87058364137816246, pvalue=0.0)
Testing alpha = 0.001
Spearman_r: SpearmanrResult(correlation=0.87911528457118593, pvalue=0.0)
Spearman_r: SpearmanrResult(correlation=0.88204723282229069, pvalue=0.0)
Testing alpha = 0.01
Spearman_r: SpearmanrResult(correlation=0.84515980332243024, pvalue=0.0)
Spearman_r: SpearmanrResult(correlation=0.85100166680382117, pvalue=0.0)
Testing alpha = 1
Spearman_r: SpearmanrResult(correlation=0.68342485407723919, pvalue=0.0)
Spearman_r: SpearmanrResult(correlation=0.68231220062270526, pvalue=0.0)


In [16]:
lsa = TruncatedSVD(n_components = 100)
lsa = lsa.fit_transform(X)

array([[  1.87499838e+03,   2.86902956e+00,   8.10354954e-02, ...,
         -2.41169457e-02,   4.65674030e-02,   1.60427051e-02],
       [  2.28799756e+03,  -2.75691780e-01,   5.25138051e-02, ...,
          2.20408186e-02,   7.93554556e-03,   8.77970755e-03],
       [  1.54099904e+03,   2.80318560e+00,   1.79456141e-01, ...,
          5.78922471e-03,   1.91167474e-03,   1.29736736e-03],
       ..., 
       [  4.59700682e+03,  -5.69673289e+00,  -3.78473821e-03, ...,
          1.91415051e-02,   1.67909001e-02,   8.60195339e-03],
       [  3.08500977e+03,  -6.27749510e+00,   2.06566430e-01, ...,
         -5.86594122e-03,   6.49595864e-03,  -2.93683986e-02],
       [  2.52601088e+03,  -4.67795080e+00,  -1.79155250e-01, ...,
         -5.38256431e-03,   4.88140024e-03,  -6.77888514e-03]])

(12976, 100)