In [1]:
import pandas as pd
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from nltk.corpus import stopwords
import nltk
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

%matplotlib inline



In [2]:
train = pd.read_csv("data/train.tsv", sep= '\t')
test = pd.read_csv("data/test.tsv", sep= '\t')
unlabeled = train.drop('Sentiment', 1)

In [18]:
def clean_phrase(phrase, remove_stopwords=False):
    letters_only = re.sub("[^a-zA-Z]", " ", phrase)
    lower_case = letters_only.lower()
    
    words = lower_case.split()
    if remove_stopwords:
        stops = set(stopwords.words("english")) 
        words = [w for w in words if not w in stops]
    return(" ".join(words))    

def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words. 
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)

def num_words(phrase):
    return len(phrase.split())

def length_phrase(phrase):
    return len(phrase)

def avg_word_length(phrase):
    if(phrase != ''):
        return sum(map(len, phrase.split()))/len(phrase.split())
    else:
        return 0

most_positive = ['remarkable', 'brilliant', 'terrific', 'excellent', 'finest', 'extraordinary', 'masterful', 
                 'hilarious', 'beautiful', 'wonderful', 'breathtaking', 'powerful', 'wonderfully', 'delightful', 
                 'masterfully', 'fantastic', 'dazzling', 'funniest', 'interference', 'refreshing']
most_negative = ['worst', 'failure', 'lacks', 'waste', 'bore', 'depressing', 'lacking', 'stupid', 'disappointment', 
                 'unfunny', 'lame', 'devoid', 'trash', 'lousy', 'junk', 'poorly', 'mess', 'sleep', 'unappealing', 'fails']

def contains_positive(phrase):
    for word in phrase.split():
        if word in most_positive:
            return 1 
    return 0
        
def contains_negative(phrase):
    for word in phrase.split():
        if word in most_negative:
            return 1
    return 0

def getSentences(data):
    sentenceK = data['SentenceId'].drop_duplicates()
    sentences = data.iloc[sentenceK.keys()]
    return sentences

In [5]:
def apply_transform(data):
    data['CleanPhrase'] = data['Phrase'].apply(clean_phrase, remove_stopwords= True)
    data['NumWords'] = data['CleanPhrase'].apply(num_words)
    data['LengthPhrase'] = data['CleanPhrase'].apply(length_phrase)
    data['AvgWordLength'] = data['CleanPhrase'].apply(avg_word_length)
    data['ContainPositive'] = data['CleanPhrase'].apply(contains_positive)
    data['ContainNegative'] = data['CleanPhrase'].apply(contains_negative)

In [6]:
apply_transform(train)
apply_transform(test)
unlabeled['CleanPhrase'] = unlabeled['Phrase'].apply(clean_phrase, remove_stopwords = False)

In [7]:
sentences = getSentences(train)
sentencesWord = getSentences(unlabeled)

In [54]:
predictors = ["ContainPositive", "ContainNegative", "NumWords", "LengthPhrase", "AvgWordLength"]
# predictors = ["ContainPositive", "ContainNegative"]
# alg = LogisticRegression(random_state=1)
alg = RandomForestClassifier(random_state=1, n_estimators=1000, min_samples_split=8, min_samples_leaf=4)
cross_validation.cross_val_score(alg, train[predictors], train["Sentiment"], cv=3).mean()


0.51919124553222373

We need to come up with better predictors because they would only give around 50 percent

In [56]:
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([('vect', CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,
                             max_features = 5000)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', OneVsOneClassifier(LinearSVC())),
                     ])
# pipeline = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', MultinomialNB()),
#                      ])




In [80]:
predictors = "Phrase"
# cross_validation.cross_val_score(pipeline, train[predictors], train["Sentiment"], cv=3).mean()
pipeline = pipeline.fit(sentences.CleanPhrase, sentences.Sentiment)

Cross validation to check the initial score and fit to actually submit

In [81]:
prediction = pipeline.predict(test.CleanPhrase)
print prediction

[3 3 2 ..., 1 1 3]


This prediction gives around 60 percent

In [82]:
output = pd.DataFrame( data={"PhraseId":test["PhraseId"], "Sentiment":prediction} )

# Use pandas to write the comma-separated output file
output.to_csv("new.csv", index=False, quoting=3 )

In [22]:
sentencesWord2Vec = []  # Initialize an empty list of sentences

print "Parsing sentences from training set"
for sentence in sentencesWord["Phrase"]:
    sentencesWord2Vec.append(review_to_wordlist(sentence))
    

Parsing sentences from training set


In [23]:
print sentencesWord2Vec[0]

['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story']


In [24]:
# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print "Training model..."
model = word2vec.Word2Vec(sentencesWord2Vec, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

Training model...


In [27]:
model.doesnt_match("man woman child kitchen".split())

'man'