In [41]:
import pandas as pd
import re
import nltk.data
import sys
import logging
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from gensim.models import word2vec

# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
# site.py deletes sys.setdefaultencoding at start-up, so the module has to be
# reloaded to get it back. Python 3 already defaults to UTF-8 and has neither
# the builtin reload() nor sys.setdefaultencoding, hence the version guard.
if sys.version_info[0] == 2:
    # reload(sys) restores the pristine module, which also rebinds
    # stdin/stdout/stderr to the process-level streams; inside a notebook that
    # silently redirects print/logging output away from the cell. Save the
    # kernel's streams and put them back afterwards.
    _kernel_streams = (sys.stdin, sys.stdout, sys.stderr)
    reload(sys)
    sys.setdefaultencoding('utf8')
    sys.stdin, sys.stdout, sys.stderr = _kernel_streams

# Emit timestamped INFO-level log records through the root logger.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Bag of Words Features

First, we will use a bag-of-words (BoW) model as a simple baseline to gauge the difficulty of the problem.

In [2]:
# All three files share one layout: a header row, tab-separated fields, UTF-8
# text, and quoting=3 (the value of csv.QUOTE_NONE) so quote characters inside
# a review are read literally.
train = pd.read_csv(
    "Data/labeledTrainData.tsv",
    header=0, delimiter="\t", quoting=3, encoding='utf-8'
)
unlabeled_train = pd.read_csv(
    "Data/unlabeledTrainData.tsv",
    header=0, delimiter="\t", quoting=3, encoding='utf-8'
)
test = pd.read_csv(
    "Data/testData.tsv",
    header=0, delimiter="\t", quoting=3, encoding='utf-8'
)

Clean the data by removing HTML markup, replacing non-letter characters with spaces, and converting the text to lower case.

In [34]:
def review_to_words( raw_review ):
    """Normalize one raw review into a lower-case, letters-and-spaces string.

    Args:
        raw_review: Review text, possibly containing HTML markup.

    Returns:
        The review with markup stripped, every character outside a-z / A-Z
        replaced by a single space, and the remainder lower-cased. Runs of
        spaces are not collapsed.
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed (and so bs4
    #    does not warn about having to guess one).
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Replace every non-letter character with a space
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert to lower case
    return letters_only.lower()

Apply the cleaning function to every review in the training and test sets.

In [35]:
# Clean every review in the labeled training set and in the test set.
# Iterating over the Series values directly (instead of looking each row up by
# integer label) stays correct even when a frame's index is not 0..n-1, and
# avoids the Python-2-only xrange.
clean_train_reviews = [review_to_words(raw_review) for raw_review in train["review"]]
clean_test_reviews = [review_to_words(raw_review) for raw_review in test["review"]]

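Fit a bag-of-words vocabulary (at most 5,000 terms, with English stop words excluded) on the training reviews, then use it to vectorize both the training and test reviews.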

In [38]:
# Word-level counts, capped at 5000 vocabulary entries, English stop words dropped.
vectorizer = CountVectorizer(
    analyzer="word",
    max_features=5000,
    stop_words='english'
)

# The vocabulary is learned from the training reviews only; the test reviews
# are then encoded with that same vocabulary.
train_data_features = vectorizer.fit_transform(
    clean_train_reviews
)
test_data_features = vectorizer.transform(
    clean_test_reviews
)

Apply a basic logistic regression model.

In [39]:
# L1-penalized logistic regression on the bag-of-words counts.
# NOTE(review): this relies on the default solver supporting an L1 penalty;
# confirm against the installed scikit-learn version.
clf = LogisticRegression(penalty='l1')
clf.fit(train_data_features, train["sentiment"])

# Predicted sentiment label for each test review.
result = clf.predict(test_data_features)

In [40]:
# Pair each test id with its predicted sentiment in a two-column frame.
output = pd.DataFrame(
    data={
        "id": test["id"],
        "sentiment": result,
    }
)

# Write the frame without the index column; quoting=3 is csv.QUOTE_NONE.
output.to_csv(
    "Data/Bag_of_Words_model.csv",
    index=False,
    quoting=3
)

In [42]:
# Random forest of 100 trees on the same bag-of-words features.
# A fixed random_state makes the fitted forest, and therefore the submission
# file, reproducible from one run to the next; without it every run draws
# different bootstrap samples and feature subsets.
forest = RandomForestClassifier(n_estimators = 100, random_state = 0)
forest.fit( train_data_features, train["sentiment"] )
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and a "sentiment" column
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Write without the index column; quoting=3 is csv.QUOTE_NONE.
output.to_csv( "Data/Bag_of_Words_model_rf.csv", index=False, quoting=3 )

This model achieved a score of 0.85192.

# Word2Vec Features

The main goal of this project is to gain practical experience with Word2Vec implementations and to integrate them into my skill set.

In [3]:
# Load the English Punkt tokenizer resource from NLTK's data store; it is used
# below to split each review into sentences before word-level cleaning.
# NOTE(review): presumably requires the 'punkt' resource to be installed
# locally beforehand — confirm in the target environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [4]:
def review_to_wordlist( review ):
    """Convert a piece of review text into a list of lower-case words.

    Args:
        review: Text (a whole review or a single sentence), possibly
            containing HTML markup.

    Returns:
        A list of words: markup is stripped, every character outside
        a-z / A-Z is replaced by a space, the text is lower-cased and then
        split on whitespace. The list is empty if no letters remain.
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed (and so bs4
    #    does not warn about having to guess one).
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # 2. Replace every non-letter character with a space
    review_text = re.sub("[^a-zA-Z]"," ", review_text)

    # 3. Convert words to lower case and split them
    return review_text.lower().split()

def review_to_sentences( review, tokenizer ):
    """Split a review into sentences, each represented as a list of words.

    Args:
        review: Raw review text.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentences found in ``text``.

    Returns:
        A list with one entry per non-empty sentence, in order; each entry is
        the word list produced by ``review_to_wordlist`` for that sentence.
    """
    # Sentence-split the whitespace-trimmed review, drop empty sentences, and
    # convert every remaining sentence to its word list.
    return [
        review_to_wordlist( sentence_text )
        for sentence_text in tokenizer.tokenize(review.strip())
        if len(sentence_text) > 0
    ]

In [5]:
# Collect the parsed sentences of every review, labeled set first and then the
# unlabeled set, into one flat list of word lists.
sentences = []

for reviews in (train["review"], unlabeled_train["review"]):
    for review in reviews:
        sentences.extend(review_to_sentences(review, tokenizer))

In [13]:
# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model.
# print is written as a call so the cell parses on Python 3 as well; on
# Python 2 a parenthesized single string prints identically.
print("Training model...")
model = word2vec.Word2Vec(sentences,
                          workers=num_workers,
                          size=num_features,
                          min_count = min_word_count,
                          window = context, sample = downsampling)

# Finalize the vectors in place before saving.
# NOTE(review): per gensim's init_sims documentation, replace=True keeps only
# the normalized vectors, so the saved model cannot be trained further — confirm
# this is intended.
model.init_sims(replace=True)

# Derive the file name from the hyper-parameters so it cannot drift out of sync
# with them; with the values above this is "300features_40minwords_10context".
model_name = "%dfeatures_%dminwords_%dcontext" % (num_features, min_word_count, context)
model.save(model_name)