In [7]:
# Load the three review files (tab-separated, header in the first row).
# quoting=3 is csv.QUOTE_NONE: double quotes inside the reviews are kept as
# ordinary characters instead of being interpreted as field delimiters.
import pandas as pd

labeled_train_review = pd.read_csv("./data/movie_reviews/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test_reviews = pd.read_csv("./data/movie_reviews/testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train_reviews = pd.read_csv("./data/movie_reviews/unlabeledTrainData.tsv", header=0,
                                      delimiter="\t", quoting=3)

# Verify the number of reviews that were read (100,000 in total).
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print "..."` statement is a SyntaxError on Python 3.
print("labeled reviews: {}".format(labeled_train_review.shape))
print("test reviews: {}".format(test_reviews.shape))
print("unlabeled reviews: {}".format(unlabeled_train_reviews.shape))

print("\nlabeled reviews columns: {}".format(labeled_train_review.columns.values))


labeled reviews: (25000, 3)
test reviews: (25000, 2)
unlabeled reviews: (50000, 2)

labeled reviews columns: ['id' 'sentiment' 'review']


In [5]:
# Length of each averaged review vector; passed as `num_features` to
# makeFeatureVec / getAvgFeatureVecs, which allocate arrays of this size.
# NOTE(review): presumably must equal the vector size of the loaded word2vec
# model (its file name starts with "300features") -- confirm.
num_features = 300

In [10]:
# Review to clean text
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords=False):
    """Turn one raw review into a list of lower-case words.

    The review is parsed as HTML and reduced to its text, every character
    that is not an ASCII letter is replaced by a space, and the result is
    lower-cased and split on whitespace.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.
    remove_stopwords : bool, optional
        When true, words found in NLTK's English stop-word list are dropped.
        Defaults to False.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    # Strip the markup, keeping only the text content.
    plain_text = BeautifulSoup(review, 'html5lib').get_text()

    # Digits, punctuation and any non-ASCII characters all become spaces.
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)

    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens

    # A set makes each membership test constant-time.
    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]


In [2]:
# Load word vectors from a binary word2vec-format file into `model`.
# Once loaded, a single word's vector can be pulled out of the model to
# inspect its representation, e.g.:
# model['happy'].shape
import os
from gensim.models import KeyedVectors

# NOTE(review): the path is relative, so it presumably depends on the
# directory the notebook is started from -- confirm.
model_name = os.path.join("samples/movies_reviews_sentiment_analysis", "300features_40minwords_10context")
model = KeyedVectors.load_word2vec_format(model_name, binary=True)

In [25]:
import numpy as np  # Make sure that numpy is imported

def makeFeatureVec(words, model, num_features):
    """Average the word vectors of the in-vocabulary words of one review.

    Parameters
    ----------
    words : iterable of str
        The words of a single review.
    model : object
        Word-vector lookup that supports ``word in model`` and
        ``model[word]`` (for example a gensim ``KeyedVectors``). If the
        object has a ``wv`` attribute, the lookups go through that instead.
    num_features : int
        Length of each word vector, and of the returned average.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(num_features,)`` holding the mean of the
        vectors of all words found in the vocabulary. If none of the words
        is in the vocabulary (or ``words`` is empty) the result is all
        zeros rather than NaN.
    """
    # Pre-initialize the accumulator (for speed).
    featureVec = np.zeros((num_features,), dtype="float32")

    # Use the vectors' own membership test instead of rebuilding a set of the
    # whole vocabulary on every call: that rebuild costs O(vocabulary size)
    # per review and dominates the run time over tens of thousands of reviews.
    vectors = getattr(model, "wv", model)

    # Sum the vectors of every word that is in the model's vocabulary.
    nwords = 0
    for word in words:
        if word in vectors:
            featureVec += vectors[word]
            nwords += 1

    # Divide by the number of words to get the average. Skip the division
    # when nothing matched, so the result is zeros instead of 0/0 = NaN.
    if nwords:
        featureVec /= nwords

    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of average feature vectors for a set of reviews.

    Parameters
    ----------
    reviews : sequence of list of str
        One word list per review; must support ``len()``.
    model : object
        Word-vector model, passed through unchanged to ``makeFeatureVec``.
    num_features : int
        Length of each feature vector (the number of columns of the result).

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(reviews), num_features)``; row ``i`` is
        ``makeFeatureVec(reviews[i], model, num_features)``.
    """
    # Allocate the whole result up front, one row per review.
    n_reviews = len(reviews)
    averaged = np.zeros((n_reviews, num_features), dtype="float32")

    # Fill each row with the average vector of the matching review.
    for row, tokens in enumerate(reviews):
        averaged[row] = makeFeatureVec(tokens, model, num_features)

    return averaged


In [37]:
# ****************************************************************
# Calculate average feature vectors for the training and test sets, using
# review_to_wordlist and getAvgFeatureVecs. Stop words are removed here.

# One cleaned word list per labeled training review.
clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in labeled_train_review["review"]
]

trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

# print(...) with a single argument works on both Python 2 and Python 3.
print("Creating average feature vecs for test reviews")
clean_test_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in test_reviews["review"]
]

testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

KeyboardInterrupt: 

In [43]:
# Fit a random forest to the training data, using 100 trees
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100)

print("Fitting a random forest to labeled training data...")
# Replace any NaN entries (e.g. from a 0/0 average for a review with no
# in-vocabulary words) with 0 before fitting.
# np.isnan is spelled out because a bare `isnan` is not provided by any
# import in this notebook and raises NameError on a fresh kernel.
where_are_NaNs = np.isnan(trainDataVecs)
trainDataVecs[where_are_NaNs] = 0

forest = forest.fit(trainDataVecs, labeled_train_review["sentiment"])


Fitting a random forest to labeled training data...


In [46]:
# Test & extract results
# Replace any NaN entries in the test features with 0 before predicting.
# np.isnan is spelled out because a bare `isnan` is not provided by any
# import in this notebook and raises NameError on a fresh kernel.
where_are_NaNs = np.isnan(testDataVecs)
testDataVecs[where_are_NaNs] = 0
result = forest.predict(testDataVecs)

# Write the test results: one row per test review with its predicted
# sentiment. quoting=3 is csv.QUOTE_NONE, so no extra quotes are added
# around the values on output.
output = pd.DataFrame(data={"id": test_reviews["id"], "sentiment": result})
output.to_csv("./data/movie_reviews/Word2Vec_AverageVectors.csv", index=False, quoting=3)

