In [43]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

%matplotlib inline

In [44]:
# Load the training and test phrase tables; both files are tab-separated.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ("data/train.tsv", "data/test.tsv")
)

In [47]:
# Shared NLTK helpers, created once and reused by the clean_phrase_* functions.
porter_stemmer, wordnet_lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Negation words; not read by any active code visible in this notebook.
negations = [
    'no',
    'never',
    'not',
]

def clean_phrase_simple(phrase):
    """Return ``phrase`` lower-cased with everything but word characters removed.

    Runs of word characters (``\\w+``) are extracted and re-joined with single
    spaces, so punctuation and repeated whitespace collapse into one space.

    Parameters
    ----------
    phrase : str
        Raw text of a single phrase.

    Returns
    -------
    str
        The space-separated, lower-cased words; ``""`` when no word is found.
    """
    # Only re.UNICODE is passed: combining it with re.LOCALE is contradictory,
    # and Python 3 raises ValueError for re.LOCALE on a str pattern.
    words = re.findall(r'\w+', phrase, flags=re.UNICODE)
    return ' '.join(words).lower()

def clean_phrase_porter(phrase):
    """Return ``phrase`` as lower-cased, Porter-stemmed words.

    Runs of word characters (``\\w+``) are extracted, each one is passed through
    the module-level ``porter_stemmer``, and the stems are re-joined with single
    spaces before the whole string is lower-cased.

    Parameters
    ----------
    phrase : str
        Raw text of a single phrase.

    Returns
    -------
    str
        The space-separated, lower-cased stems; ``""`` when no word is found.

    Notes
    -----
    An experiment that prefixed the words next to a negation ('no', 'never',
    'not', or a word ending in "n't") with "!" was tried here and did not seem
    to offer any real improvement, so it is not part of this function.
    """
    # Only re.UNICODE is passed: combining it with re.LOCALE is contradictory,
    # and Python 3 raises ValueError for re.LOCALE on a str pattern.
    words = re.findall(r'\w+', phrase, flags=re.UNICODE)

    stemmed = [porter_stemmer.stem(word) for word in words]

    return ' '.join(stemmed).lower()

def clean_phrase_lemmatizer(phrase):
    """Return ``phrase`` as lower-cased, lemmatized words without English stop words.

    Every character outside ``a-z``/``A-Z`` is first replaced by a space, so
    digits and punctuation only act as word separators. The remaining words
    are lower-cased, filtered against NLTK's English stop-word list, lemmatized
    with the module-level ``wordnet_lemmatizer`` and joined with single spaces.
    """
    tokens = re.sub("[^a-zA-Z]", " ", phrase).lower().split()
    english_stops = set(stopwords.words("english"))

    lemmas = []
    for token in tokens:
        if token in english_stops:
            continue
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return " ".join(lemmas)

In [48]:
def apply_transform(data):
    """Add the cleaned-text columns to ``data`` in place.

    ``CleanPhrase`` is filled by ``clean_phrase_porter`` and
    ``CleanPhraseSimple`` by ``clean_phrase_simple``, both applied to the
    ``Phrase`` column. The frame passed in is modified; nothing is returned.
    """
    column_cleaners = (
        ('CleanPhrase', clean_phrase_porter),
        ('CleanPhraseSimple', clean_phrase_simple),
    )
    for column, cleaner in column_cleaners:
        data[column] = data['Phrase'].apply(cleaner)

In [49]:
# Add the cleaned-text columns to both data sets (apply_transform works in place).
for frame in (train, test):
    apply_transform(frame)

Multinomial Naive Bayes is apparently a good place to start, but we haven't had much success with it. Most sources recommend an SVM or a one-vs-one classifier instead, so that's what we used.

In [50]:
# Word counts -> TF-IDF weights -> one-vs-one linear SVM.
pipelineOneVOne = Pipeline(steps=[
    # Word-level bag of words without English stop words; max_features caps the vocabulary.
    ('vect', CountVectorizer(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words='english',
        max_features=5000,
    )),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsOneClassifier(LinearSVC())),
])

In [51]:
# Word counts -> TF-IDF weights -> multinomial Naive Bayes, all with default settings.
pipelineBayes = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [52]:
# Word counts -> TF-IDF weights -> a single linear SVM (L2 penalty, primal form).
pipelineSVM = Pipeline(steps=[
    # Word-level bag of words without English stop words; max_features caps the vocabulary.
    ('vect', CountVectorizer(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words='english',
        max_features=5000,
    )),
    ('tfidf', TfidfTransformer()),
    ('clf', svm.LinearSVC(penalty='l2', dual=False, tol=1e-3)),
])

In [53]:
# Cleaned-text columns to compare; each is scored on its own.
predictors = ["CleanPhraseSimple", "CleanPhrase"]

# Score every pipeline on every cleaned column with 3-fold cross-validation and
# report the mean score per (model, predictor) pair.
for predictor in predictors:
    # The Bayes pipeline is left out of the comparison; re-enable both Bayes lines to include it.
    # bayes_mean = cross_validation.cross_val_score(pipelineBayes, train[predictor], train["Sentiment"], cv=3).mean()
    onevone_mean = cross_validation.cross_val_score(pipelineOneVOne, train[predictor], train["Sentiment"], cv=3).mean()
    svc_mean = cross_validation.cross_val_score(pipelineSVM, train[predictor], train["Sentiment"], cv=3).mean()

    # Single-argument print(...) prints the same on Python 2 and parses on Python 3.
    # print("Mean score for Bayes model: {}".format(bayes_mean))
    print("Mean score for OnevOne Model: {}  |  Predictor: {}".format(onevone_mean, predictor))
    print("Mean score for SVM Model: {}  |  Predictor: {}".format(svc_mean, predictor))

Mean score for OnevOne Model: 0.580859829424  |  Predictor: CleanPhraseSimple
Mean score for SVM Model: 0.57742521947  |  Predictor: CleanPhraseSimple
Mean score for OnevOne Model: 0.586658821458  |  Predictor: CleanPhrase
Mean score for SVM Model: 0.580808524222  |  Predictor: CleanPhrase


In [23]:
# Fit and predict on the cleaned text that was cross-validated, not on the raw
# "Phrase" column: "CleanPhrase" gave the best mean score in the comparison, and
# apply_transform() has already added that column to both train and test.
final_predictor = "CleanPhrase"

pipelineBayes = pipelineBayes.fit(train[final_predictor], train.Sentiment)
pipelineOneVOne = pipelineOneVOne.fit(train[final_predictor], train.Sentiment)

# The test text must go through the same cleaning as the training text.
predictionBayes = pipelineBayes.predict(test[final_predictor])
predictionOnevOne = pipelineOneVOne.predict(test[final_predictor])

In [24]:
# Pair every test PhraseId with the one-vs-one model's predicted Sentiment.
output = pd.DataFrame(
    data={
        "PhraseId": test["PhraseId"],
        "Sentiment": predictionOnevOne,
    }
)

# Write the comma-separated submission file without the index column.
# quoting=3 is the value of csv.QUOTE_NONE, i.e. no field is ever quoted.
output.to_csv(path_or_buf="submission.csv", index=False, quoting=3)