In [6]:
import pandas as pd
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

%matplotlib inline

In [46]:
train = pd.read_csv("data/train.tsv", sep= '\t')
test = pd.read_csv("data/test.tsv", sep= '\t')

In [47]:
def clean_phrase(phrase):
    letters_only = re.sub("[^a-zA-Z]", " ", phrase)
    lower_case = letters_only.lower()
    
    words = lower_case.split()
    stops = set(stopwords.words("english")) 
    meaningful_words = [w for w in words if not w in stops]
    return(" ".join( meaningful_words))   

def num_words(phrase):
    return len(phrase.split())

def length_phrase(phrase):
    return len(phrase)

def avg_word_length(phrase):
    if(phrase != ''):
        return sum(map(len, phrase.split()))/len(phrase.split())
    else:
        return 0

most_positive = ['remarkable', 'brilliant', 'terrific', 'excellent', 'finest', 'extraordinary', 'masterful', 
                 'hilarious', 'beautiful', 'wonderful', 'breathtaking', 'powerful', 'wonderfully', 'delightful', 
                 'masterfully', 'fantastic', 'dazzling', 'funniest', 'interference', 'refreshing']
most_negative = ['worst', 'failure', 'lacks', 'waste', 'bore', 'depressing', 'lacking', 'stupid', 'disappointment', 
                 'unfunny', 'lame', 'devoid', 'trash', 'lousy', 'junk', 'poorly', 'mess', 'sleep', 'unappealing', 'fails']

def contains_positive(phrase):
    for word in phrase.split():
        if word in most_positive:
            return 1 
    return 0
        
def contains_negative(phrase):
    for word in phrase.split():
        if word in most_negative:
            return 1
    return 0

In [48]:
def apply_transform(data):
    data['CleanPhrase'] = data['Phrase'].apply(clean_phrase)
    data['NumWords'] = data['CleanPhrase'].apply(num_words)
    data['LengthPhrase'] = data['CleanPhrase'].apply(length_phrase)
    data['AvgWordLength'] = data['CleanPhrase'].apply(avg_word_length)
    data['ContainPositive'] = data['CleanPhrase'].apply(contains_positive)
    data['ContainNegative'] = data['CleanPhrase'].apply(contains_negative)

In [49]:
apply_transform(train)
apply_transform(test)

In [54]:
predictors = ["ContainPositive", "ContainNegative", "NumWords", "LengthPhrase", "AvgWordLength"]
# predictors = ["ContainPositive", "ContainNegative"]
# alg = LogisticRegression(random_state=1)
alg = RandomForestClassifier(random_state=1, n_estimators=1000, min_samples_split=8, min_samples_leaf=4)
cross_validation.cross_val_score(alg, train[predictors], train["Sentiment"], cv=3).mean()


0.51919124553222373

We need to come up with better predictors because they would only give around 50 percent

In [50]:
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([('vect', CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,
                             max_features = 5000)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', OneVsOneClassifier(LinearSVC())),
                     ])
# pipeline = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', MultinomialNB()),
#                      ])




In [55]:
predictors = "CleanPhrase"
# cross_validation.cross_val_score(pipeline, train[predictors], train["Sentiment"], cv=3).mean()
pipeline = pipeline.fit(train.CleanPhrase, train.Sentiment)

Cross validation to check the initial score and fit to actually submit

In [62]:
# vectorizer = CountVectorizer(analyzer = "word",   \
#                              tokenizer = None,    \
#                              preprocessor = None, \
#                              stop_words = None,   \
#                              max_features = 5000) 
# train_data_features = vectorizer.fit_transform(train.CleanPhrase)
# train_data_features = train_data_features.toarray()

In [29]:
# alg = LinearRegression()
# cross_validation.cross_val_score(alg, train_data_features, train["Sentiment"], cv=3).mean()

It takes forever to apply the bag of words model that is in the tutorial for the rotten tomato

In [52]:
prediction = pipeline.predict(test.CleanPhrase)
print prediction

[3 3 2 ..., 2 2 1]


This prediction gives around 60 percent

In [53]:
output = pd.DataFrame( data={"PhraseId":test["PhraseId"], "Sentiment":prediction} )

# Use pandas to write the comma-separated output file
output.to_csv("new.csv", index=False, quoting=3 )