In [1]:
import pandas as pd
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from nltk.corpus import stopwords
import nltk
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

%matplotlib inline



In [2]:
train = pd.read_csv("data/train.tsv", sep= '\t')
test = pd.read_csv("data/test.tsv", sep= '\t')

In [51]:
def clean_phrase(phrase):
    letters_only = re.sub("[^a-zA-Z]", " ", phrase)
    lower_case = letters_only.lower()
    
    words = lower_case.split()
    stops = set(stopwords.words("english")) 
    words = [w for w in words if not w in stops]
    return(" ".join(words))    

def num_words(phrase):
    return len(phrase.split())

def length_phrase(phrase):
    return len(phrase)

def avg_word_length(phrase):
    if(phrase != ''):
        return sum(map(len, phrase.split()))/len(phrase.split())
    else:
        return 0

most_positive = ['remarkable', 'brilliant', 'terrific', 'excellent', 'finest', 'extraordinary', 'masterful', 
                 'hilarious', 'beautiful', 'wonderful', 'breathtaking', 'powerful', 'wonderfully', 'delightful', 
                 'masterfully', 'fantastic', 'dazzling', 'funniest', 'interference', 'refreshing']
most_negative = ['worst', 'failure', 'lacks', 'waste', 'bore', 'depressing', 'lacking', 'stupid', 'disappointment', 
                 'unfunny', 'lame', 'devoid', 'trash', 'lousy', 'junk', 'poorly', 'mess', 'sleep', 'unappealing', 'fails']

def contains_positive(phrase):
    for word in phrase.split():
        if word in most_positive:
            return 1 
    return 0
        
def contains_negative(phrase):
    for word in phrase.split():
        if word in most_negative:
            return 1
    return 0

def getSentences(data):
    sentenceK = data['SentenceId'].drop_duplicates()
    sentences = data.iloc[sentenceK.keys()]
    return sentences

In [53]:
def apply_transform(data):
    data['CleanPhrase'] = data['Phrase'].apply(clean_phrase)
    data['NumWords'] = data['CleanPhrase'].apply(num_words)
    data['LengthPhrase'] = data['CleanPhrase'].apply(length_phrase)
    data['AvgWordLength'] = data['CleanPhrase'].apply(avg_word_length)
    data['ContainPositive'] = data['CleanPhrase'].apply(contains_positive)
    data['ContainNegative'] = data['CleanPhrase'].apply(contains_negative)

In [54]:
apply_transform(train)
apply_transform(test)

In [7]:
sentences = getSentences(train)

In [54]:
predictors = ["ContainPositive", "ContainNegative", "NumWords", "LengthPhrase", "AvgWordLength"]
# predictors = ["ContainPositive", "ContainNegative"]
# alg = LogisticRegression(random_state=1)
alg = RandomForestClassifier(random_state=1, n_estimators=1000, min_samples_split=8, min_samples_leaf=4)
cross_validation.cross_val_score(alg, train[predictors], train["Sentiment"], cv=3).mean()


0.51919124553222373

We need to come up with better predictors because they would only give around 50 percent

In [56]:
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([('vect', CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,
                             max_features = 5000)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', OneVsOneClassifier(LinearSVC())),
                     ])
# pipeline = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', MultinomialNB()),
#                      ])




In [80]:
predictors = "Phrase"
# cross_validation.cross_val_score(pipeline, train[predictors], train["Sentiment"], cv=3).mean()
pipeline = pipeline.fit(sentences.CleanPhrase, sentences.Sentiment)

Cross validation to check the initial score and fit to actually submit

In [81]:
prediction = pipeline.predict(test.CleanPhrase)
print prediction

[3 3 2 ..., 1 1 3]


This prediction gives around 60 percent

In [28]:
output = pd.DataFrame( data={"PhraseId":test["PhraseId"], "Sentiment":prediction} )

# Use pandas to write the comma-separated output file
output.to_csv("new.csv", index=False, quoting=3 )

NameError: name 'prediction' is not defined

### Word2Vec
From the bag of words Kaggle competition we found a tutorial about Google's Word2Vec and decided to implement it on our dataset [Tutorial](https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-2-word-vectors).

In [55]:
def review_to_wordlist(review):
    review_text = re.sub("[^a-zA-Z]"," ", review)
    words = review_text.lower().split()
    return words

Word2Vec requires lists of sentences so we had to group our data by sentenceId. We do not actually need a tokenizer to split the paragraphs into sentences as shown in the tutorial because our dataset is already split by sentences

In [56]:
sentencesWord2Vec = []  # Initialize an empty list of sentences

print "Parsing sentences from training set"
for sentence in sentences["Phrase"]:
    sentencesWord2Vec.append(review_to_wordlist(sentence))
    

Parsing sentences from training set


In [57]:
print sentencesWord2Vec[0]
print sentencesWord2Vec[1]

['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story']
['this', 'quiet', 'introspective', 'and', 'entertaining', 'independent', 'is', 'worth', 'seeking']


We checked the first couple of sentences to confirm we had created the correct data structure.

In [33]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print "Training model..."
model = word2vec.Word2Vec(sentencesWord2Vec, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

Training model...


In [35]:
model.doesnt_match("man woman child kitchen".split())

'man'

In [39]:
model.most_similar("man")

[('old', 0.9997593760490417),
 ('style', 0.999756395816803),
 ('predictable', 0.9997518062591553),
 ('action', 0.9997439384460449),
 ('feels', 0.9997289180755615),
 ('cinematic', 0.9997148513793945),
 ('often', 0.9997051954269409),
 ('american', 0.9997024536132812),
 ('family', 0.9996999502182007),
 ('premise', 0.9996915459632874)]

In [46]:
model.most_similar("good")

[('pretty', 0.9996630549430847),
 ('half', 0.9996534585952759),
 ('especially', 0.999642014503479),
 ('done', 0.9996370673179626),
 ('simply', 0.9996294975280762),
 ('day', 0.9996140003204346),
 ('last', 0.9996137619018555),
 ('show', 0.9996005892753601),
 ('actually', 0.9995996952056885),
 ('give', 0.9995940923690796)]

In [47]:
model.most_similar("bad")

[('still', 0.9997092485427856),
 ('going', 0.9996057152748108),
 ('how', 0.9996010065078735),
 ('because', 0.9995409250259399),
 ('mind', 0.9995290040969849),
 ('then', 0.9995182752609253),
 ('go', 0.9994921684265137),
 ('why', 0.9994859099388123),
 ('end', 0.9994835257530212),
 ('anyone', 0.9994621276855469)]

Training our model took a very short time and as we can see the was not very accurate at all. doesnt_match function and most_similar don't seem to be working very well. Word2Vec works better as the size of its training data grows. We were only able to provide the model around 8544 sentences which is very small. Word2Vec is also supposed to work with a lot of unlabeld text so we had to disregard the Sentiment data we had. which goes out to say that we weren't really using the best model for our dataset.