In [1]:
%pylab inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Populating the interactive namespace from numpy and matplotlib


In [2]:
from utils import *

In [3]:
from nltk.corpus import stopwords
import nltk.data
# Load NLTK's pickled English Punkt tokenizer; parse_into_sents calls its
# tokenize() method to split each essay into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [4]:
class essayGrader(object):
    """
    Builds hand-crafted features (readability score, length, ...)
    from a dataframe of essays. The features are meant to be fed
    to a model that predicts the grade of each essay.
    """

    def __init__(self,dataframe):
        """
        Store the essay dataframe.

        dataframe : pandas DataFrame holding the essays, one row
                    per essay, together with any default features
                    that are already present.
        """
        self.df = dataframe

    def __str__(self):
        """
        Return the ``DataFrame.info()`` summary of the essay
        dataframe as a string.
        """
        # DataFrame.info() writes its report to a buffer (stdout by
        # default) and returns None, so the text has to be captured
        # explicitly; str(self.df.info()) always evaluates to 'None'.
        try:
            from StringIO import StringIO   # Python 2
        except ImportError:
            from io import StringIO         # Python 3
        buf = StringIO()
        self.df.info(buf=buf)
        return buf.getvalue()

    def spell_check(self):
        """
        Placeholder for the spell-check feature.

        Nothing is computed here: the method only prints a reminder
        to run SpellCheck.py (presumably the script that produces
        the length-normalised spelling scores -- confirm).
        """
        print('Run SpellCheck.py')

    def get_complexity(self,col):
        """
        Compute a readability score for every text in column `col`.

        For each text three counts are derived:
          * sentences -- non-empty fragments between '.' characters
                         (at least 1),
          * words     -- whitespace-separated tokens once punctuation
                         has been replaced by spaces,
          * syllables -- approximated by the number of vowels
                         (a, e, i, o, u, upper or lower case).
        The counts are passed to `Flesch_reading_ease`, a helper that
        is defined outside this class (its formula is not visible
        here).

        Returns a list with one score per row, in row order.
        """
        # Local import: the module-level namespace only gets `re`
        # indirectly (star import), which is easy to break.
        import re

        vowels = set('aeiou')
        punctuation = "["+'!"#$%&\'()*+.,-/:;<=>?@[\\]^_`{|}~'+"]"
        grade_level = []
        for text in self.df[col]:
            # A trailing '.' or an ellipsis produces empty fragments;
            # counting them would inflate the number of sentences.
            sent_cnt = max(1, sum(1 for part in text.split('.') if part.strip()))
            words = re.sub(punctuation, " ", text).split()
            # Lower-case first so that capital vowels are counted too.
            syl_count = sum(1 for word in words
                            for letter in word.lower() if letter in vowels)
            grade_level.append(Flesch_reading_ease(total_sentences=sent_cnt,
                                                   total_words=len(words),
                                                   total_sylabls=syl_count))
        return grade_level

    def get_length(self,col):
        """
        Return a pandas Series (default integer index) holding the
        number of whitespace-separated tokens of every text in `col`.
        """
        return pd.Series([len(text.split()) for text in self.df[col]])

In [13]:
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

def train_model(sentences,num_features=300,min_word_count=40,num_workers=4,context=10,downsampling=1e-3):
    """
    Train a word2vec model on `sentences`, save it to disk and return it.

    sentences      : iterable of tokenised sentences (lists of words).
    num_features   : word vector dimensionality.
    min_word_count : words seen fewer times than this are ignored.
    num_workers    : number of threads to run in parallel.
    context        : context window size.
    downsampling   : downsample setting for frequent words.

    The model is saved in the working directory under the name
    "<num_features>features_<min_word_count>minwords_<context>context"
    (i.e. "300features_40minwords_10context" for the defaults) and can
    be loaded later using Word2Vec.load().
    """
    # The keyword arguments are used as given; re-assigning them to fixed
    # values here would silently discard whatever the caller passed in.

    # Initialize and train the model (this will take some time).
    # `size` and `init_sims` belong to the same gensim API generation as
    # the `model.wv.index2word` attribute used elsewhere in this notebook.
    from gensim.models import word2vec
    print("Training model...")
    model = word2vec.Word2Vec(sentences, workers=num_workers,
                              size=num_features, min_count=min_word_count,
                              window=context, sample=downsampling)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    # Derive the file name from the hyper-parameters so that models
    # trained with different settings do not overwrite each other.
    model_name = "{}features_{}minwords_{}context".format(
        num_features, min_word_count, context)
    model.save(model_name)
    return model

In [5]:
# Load the tab-separated training set and report its (rows, columns) shape.
df_train = pd.read_csv('Data/training_set_rel3.tsv', sep='\t')
print(df_train.shape)

(12976, 28)


In [6]:
# Wrap the training frame in the feature builder and show its summary.
essay = essayGrader(dataframe=df_train)
print(str(essay))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12976 entries, 0 to 12975
Data columns (total 28 columns):
essay_id          12976 non-null int64
essay_set         12976 non-null int64
essay             12976 non-null object
rater1_domain1    12976 non-null int64
rater2_domain1    12976 non-null int64
rater3_domain1    128 non-null float64
domain1_score     12976 non-null int64
rater1_domain2    1800 non-null float64
rater2_domain2    1800 non-null float64
domain2_score     1800 non-null float64
rater1_trait1     2292 non-null float64
rater1_trait2     2292 non-null float64
rater1_trait3     2292 non-null float64
rater1_trait4     2292 non-null float64
rater1_trait5     723 non-null float64
rater1_trait6     723 non-null float64
rater2_trait1     2292 non-null float64
rater2_trait2     2292 non-null float64
rater2_trait3     2292 non-null float64
rater2_trait4     2292 non-null float64
rater2_trait5     723 non-null float64
rater2_trait6     723 non-null float64
rater3_trait1     128

In [7]:
# Name of the column that holds the raw essay text.
essay_col = 'essay'
print('Getting complexity of the essay...')
# One readability score per essay; the printed length should match the row count.
complexity = essay.get_complexity(col=essay_col)
print(len(complexity))

Getting complexity of the essay...
12976


In [8]:
def convert_to_wordlist(sentence,remove_stopwords=True):
    """
    Turn one sentence into a list of lower-case words.

    Every character that is not an ASCII letter is replaced by a space
    before splitting on whitespace. When `remove_stopwords` is true,
    words found in NLTK's English stop-word list are dropped.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", sentence)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    stop_set = set(stopwords.words("english"))
    return [token for token in tokens if token not in stop_set]

In [9]:
def parse_into_sents(text,remove_stopwords=True):
    """
    Split an essay into sentences and convert each sentence to a word list.

    text             : the essay, either as a byte string or as
                       already-decoded text.
    remove_stopwords : forwarded to convert_to_wordlist.

    Returns a list with one word list per non-empty sentence found by
    the module-level `tokenizer`.
    """
    # Reduce the input to ASCII-only text, silently dropping every other
    # byte/character. Byte strings are decoded; text that is already
    # decoded is filtered the same way instead of raising, and no second
    # decode is attempted on the result.
    if isinstance(text, bytes):
        text = text.decode('ascii', 'ignore')
    else:
        text = text.encode('ascii', 'ignore').decode('ascii')

    parsed_essay = []
    for sent in tokenizer.tokenize(text.strip()):
        if len(sent) > 0:
            parsed_essay.append(
                convert_to_wordlist(sent, remove_stopwords=remove_stopwords))
    return parsed_essay

In [10]:
print('Parsing sentences from training data....')
# Flat list of tokenised sentences collected over every training essay.
sentences = []
for text in df_train['essay']:
    sentences.extend(parse_into_sents(text))

Parsing sentences from training data....


In [11]:
def create_feat_vector(words,model,num_features):
    """
    Average the word vectors of the in-vocabulary words of one text.

    words        : iterable of tokens.
    model        : trained word2vec model; `model.wv.index2word` supplies
                   the vocabulary and `model.wv[word]` the vectors.
    num_features : dimensionality of the word vectors.

    Returns a numpy array of shape (num_features,). When none of the
    words is in the vocabulary (or `words` is empty) the zero vector is
    returned, so the caller never receives a NaN vector from a 0/0
    division.
    """
    featVector = np.zeros((num_features,))
    vocab = set(model.wv.index2word)
    known_words = [word for word in words if word in vocab]
    if not known_words:
        return featVector
    for word in known_words:
        # Vectors are read through model.wv, the same attribute that
        # provides the vocabulary above.
        featVector += model.wv[word]
    return featVector/len(known_words)

In [12]:
def create_feat_matrix(df,col,model,num_features):
    """
    Build the (n_rows, num_features) matrix of averaged word vectors.

    df           : dataframe holding the texts.
    col          : name of the text column.
    model        : trained word2vec model, forwarded to create_feat_vector.
    num_features : dimensionality of the word vectors.

    Each text is reduced to ASCII, stripped, lower-cased and split on
    whitespace before being handed to create_feat_vector. Progress is
    printed every 1000 rows.

    NOTE(review): tokens here keep punctuation and stop words, unlike
    convert_to_wordlist which prepares the sentences the model is trained
    on -- confirm this mismatch is intended.
    """
    texts = df[col]
    featMatrix = np.zeros((len(texts), num_features))
    for row, essay in enumerate(texts):
        # ASCII-only text, dropping anything else. Byte strings are
        # decoded; already-decoded text is filtered instead of raising.
        if isinstance(essay, bytes):
            essay = essay.decode('ascii', 'ignore')
        else:
            essay = essay.encode('ascii', 'ignore').decode('ascii')
        featMatrix[row] = create_feat_vector(essay.strip().lower().split(),
                                             model, num_features)
        done = row + 1
        if done % 1000 == 0:
            print('{} reviews parsed...'.format(done))
    return featMatrix

In [14]:
# Train the word2vec model on the tokenised training sentences, using
# train_model's default hyper-parameters; train_model also saves the
# model to disk before returning it.
model = train_model(sentences)

Training model...


In [15]:
featMatrix = create_feat_matrix(df_train,'essay',model,num_features)
# An essay without a single in-vocabulary token can come back as a
# non-finite (0/0) row. Zero every such row, wherever it occurs, instead
# of patching one hard-coded row index with a vector built from the
# characters of the raw essay string.
featMatrix[~np.isfinite(featMatrix).all(axis=1)] = 0.0

1000 reviews parsed...
2000 reviews parsed...
3000 reviews parsed...
4000 reviews parsed...
5000 reviews parsed...
6000 reviews parsed...
7000 reviews parsed...
8000 reviews parsed...
9000 reviews parsed...
10000 reviews parsed...
11000 reviews parsed...
12000 reviews parsed...


In [None]:
# Assemble the model input: averaged word vectors plus hand-crafted features.
df_new = pd.DataFrame(
    featMatrix,
    columns=['feature%d' % i for i in range(num_features)],
)
df_new['complexity'] = complexity
# NOTE(review): the whole frame read from spellCheck.csv is assigned to one
# column -- presumably the file has a single column, one row per essay; confirm.
df_new['spell_check'] = pd.read_csv('spellCheck.csv', header=None)
df_new['length'] = essay.get_length(col=essay_col)

### Applying Models

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Use RandomForestRegressor, the estimator this notebook actually imports;
# RandomForestClassifier is never imported here. A regressor also matches
# the squared-error evaluation applied to the predictions later on.
# The variable keeps the name `rfc` because the following cells refer to it.
rfc = RandomForestRegressor(n_estimators=100)
# Target: the domain-1 score given by the first rater.
y = df_train.rater1_domain1

In [None]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module is only kept as a fallback for
# scikit-learn releases that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Hold out 10% of the essays for evaluation; the fixed seed keeps the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(df_new,y,random_state=7,test_size=0.1)
rfc.fit(X_train,y_train)

In [None]:
# The division must happen before formatting: dividing the formatted
# string by an integer raises TypeError.
# NOTE(review): sqrt(sum of squared errors) / n is not the usual RMSE,
# which would be sqrt(mean of squared errors) -- confirm the intended metric.
print('Error: {}'.format(
    np.sqrt(np.sum(np.square(rfc.predict(X_test) - y_test))) / X_test.shape[0]))