In [11]:
# The goal is to classify a review (text) as positive or negative, using reviews that are already labeled
# as positive / negative. We build a vocabulary from the reviews and turn each review into a vector that holds,
# for every word in the vocabulary, the number of times that word occurs in the review.

# Loading the reviews
import pandas as pd

# Tunable parameters used by the cells below.
MAX_VOC_WORDS = 5000             # vocabulary size cap, passed to CountVectorizer(max_features=...)
RANDOM_FOREST_ESTIMATORS = 100   # number of trees, passed to RandomForestClassifier(n_estimators=...)

# Load the labeled training reviews.
#   header=0        -> the first line of the file holds the column names
#   delimiter="\t"  -> the file is tab-separated
#   quoting=3       -> csv.QUOTE_NONE: double quotes are treated as ordinary characters
labeled_train_review = pd.read_csv("./data/movie_reviews/labeledTrainData.tsv", header=0, delimiter="\t",
                                   quoting=3)

# Sanity-check what was read: the labeled training set holds 25,000 reviews
# with the columns id, sentiment and review.
# (Single-argument print(...) behaves the same on Python 2 and Python 3.)
print("labeled reviews: {}".format(labeled_train_review.shape))
print("labeled reviews columns: {}".format(labeled_train_review.columns.values))

labeled reviews: (25000, 3)
labeled reviews columns: ['id' 'sentiment' 'review']


In [12]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_words(review, remove_stopwords=False):
    """Clean a raw review and return it as one normalized string.

    Processing steps:
      1. Strip HTML markup with BeautifulSoup (html5lib parser).
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. Optionally drop English stop words (NLTK stop word list).
      5. Join the remaining words with single spaces.

    Args:
        review: Raw review text, possibly containing HTML.
        remove_stopwords: When True, words found in NLTK's English stop word
            list are dropped. Defaults to False (stop words are kept).

    Returns:
        A single string of lower-case words separated by single spaces.
        Note that this is a string, not a list of words.
    """
    # 1. Remove HTML
    review_text = BeautifulSoup(review, "html5lib").get_text()

    # 2. Remove non-letters (digits and punctuation become spaces)
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert words to lower case and split them
    words = review_text.lower().split()

    # 4. Optionally remove stop words (off by default)
    if remove_stopwords:
        # A set gives constant-time membership tests for the filter below.
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # 5. Re-join the words into a single space-separated string
    return " ".join(words)


In [None]:
# Vectorizing reviews
from sklearn.feature_extraction.text import CountVectorizer

# Clean every review: strip HTML tags, replace non-alphabetical characters and lower-case.
# NOTE: review_to_words() is called with its default remove_stopwords=False, so stop words
# are kept here; pass remove_stopwords=True to drop them.
# Iterating over the column directly (instead of looking rows up by integer label) does not
# depend on the frame having a default 0..n-1 index.
num_reviews = labeled_train_review["review"].size
clean_train_reviews = [review_to_words(review) for review in labeled_train_review["review"]]

# CountVectorizer could do the pre-processing itself, but here the cleaning is done by
# review_to_words(), so its tokenizer / preprocessor / stop word options are left disabled.
# max_features limits the vocabulary to the MAX_VOC_WORDS most frequent words.
vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None,
                             max_features=MAX_VOC_WORDS)

# fit_transform() does two things: first, it fits the model and learns the vocabulary;
# second, it transforms the training data into feature vectors. Its input is a list of
# strings. The sparse result is converted to a dense numpy array in the same statement,
# so train_data_features only ever refers to one kind of object.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

# (Single-argument print(...) behaves the same on Python 2 and Python 3.)
print("words vec shape: {}".format(train_data_features.shape))

In [7]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so that re-running the notebook grows the same forest and therefore
# produces the same predictions.
RANDOM_FOREST_SEED = 42

# Initialize a Random Forest classifier with RANDOM_FOREST_ESTIMATORS trees
forest = RandomForestClassifier(n_estimators=RANDOM_FOREST_ESTIMATORS, random_state=RANDOM_FOREST_SEED)

# Fit the forest to the training set, using the bag of words as
# features and the sentiment labels as the response variable.
#
# This may take a few minutes to run.
# fit() returns the estimator itself; assigning it keeps the cell from echoing its repr.
forest = forest.fit(train_data_features, labeled_train_review["sentiment"])

In [10]:
# Now we'll test our model on test data
# (quoting=3 is csv.QUOTE_NONE: double quotes are treated as ordinary characters.)
test_reviews = pd.read_csv("./data/movie_reviews/testData.tsv", header=0, delimiter="\t", quoting=3)

# Clean the test reviews exactly like the training reviews. Iterating over the column
# directly does not depend on the frame having a default 0..n-1 index.
clean_test_reviews = [review_to_words(review) for review in test_reviews["review"]]

# Get a bag of words for the test set, and convert to a numpy array.
# transform() (not fit_transform()) reuses the vocabulary learned from the training reviews.
test_reviews_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_reviews_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame(data={"id": test_reviews["id"], "sentiment": result})

# Use pandas to write the comma-separated output file.
# index=False omits the row index; quoting=3 (csv.QUOTE_NONE) writes the values without
# adding any quote characters.
output.to_csv("./data/movie_reviews/output.csv", index=False, quoting=3)
