In [2]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

%matplotlib inline

In [12]:
train = pd.read_csv("data/train.tsv", sep= '\t')
test = pd.read_csv("data/test.tsv", sep= '\t')

In [42]:
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

negations = ['no', 'never', 'not']

def clean_phrase_porter(phrase):
    letters_only = re.sub("[^a-zA-Z]", " ", phrase)
    lower_case = letters_only.lower()
    
    words = lower_case.split()
    stops = set(stopwords.words("english")) 
    meaningful_words = [porter_stemmer.stem(w) for w in words if not w in stops]
    
    for i, word in enumerate(meaningful_words):
        if word in negations:
            try:
                meaningful_words[i+1] = "!" + meaningful_words[i+1]
            except:
                pass
            try:
                meaningful_words[i-1] = "!" + meaningful_words[i-1]
            except:
                pass
        
        
    return(" ".join( meaningful_words))   

def clean_phrase_lemmatizer(phrase):
    letters_only = re.sub("[^a-zA-Z]", " ", phrase)
    lower_case = letters_only.lower()
    
    words = lower_case.split()
    stops = set(stopwords.words("english")) 
    meaningful_words = [wordnet_lemmatizer.lemmatize(w) for w in words if not w in stops]
    return(" ".join( meaningful_words))   

In [43]:
def apply_transform(data):
    data['CleanPhrase'] = data['Phrase'].apply(clean_phrase_porter)

In [44]:
apply_transform(train)
apply_transform(test)

[u'one', u'cool', u'actor', u'never', u'seem', u'awar', u'cool']
[u'one', u'cool', u'actor', u'never', u'seem', u'awar', u'cool']
[u'one', u'cool', u'actor', u'never', u'seem', u'awar', u'cool']
[u'one', u'cool', u'actor', u'never', u'seem', u'awar', u'cool']
[u'cool', u'actor', u'never', u'seem', u'awar', u'cool']
[u'cool', u'actor', u'never', u'seem', u'awar', u'cool']
[u'never', u'seem', u'awar', u'cool']
[u'never', u'seem', u'awar', u'cool']
[u'never']
[u'never', u'play', u'dramat', u'even', u'dramat', u'thing', u'happen', u'peopl']
[u'never', u'play', u'dramat', u'even', u'dramat', u'thing', u'happen', u'peopl']
[u'though', u'film', u'never', u'veer', u'comic', u'cours', u'unintent', u'parallel', u'might', u'inadvert', u'evok', u'memori', u'emot', u'anyth', u'humor']
[u'though', u'film', u'never', u'veer', u'comic', u'cours']
[u'film', u'never', u'veer', u'comic', u'cours']
[u'never', u'veer', u'comic', u'cours']
[u'comput', u'anim', u'handsom', u'variou', u'amus', u'sidekick', u'

KeyboardInterrupt: 

In [39]:
pipelineOneVOne = Pipeline([('vect', CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000)),
                            ('tfidf', TfidfTransformer()),
                            ('clf', OneVsOneClassifier(LinearSVC())),])

In [38]:
pipelineBayes = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),])

In [41]:
predictors = "CleanPhrase"

bayes_mean = cross_validation.cross_val_score(pipelineBayes, train[predictors], train["Sentiment"], cv=3).mean()
onevone_mean = cross_validation.cross_val_score(pipelineOneVOne, train[predictors], train["Sentiment"], cv=3).mean()

print "Mean score for Bayes model: {}".format(bayes_mean)
print "Mean score for OnevOne Model: {}".format(onevone_mean)

Mean score for Bayes model: 0.552390068238
Mean score for OnevOne Model: 0.588433781287
