# Bag of Words Meets Bag of Popcorn

This notebook is a tutorial on using Word2Vec word vectors to get a deeper understanding of sentiment analysis.

Some of the libraries we will use are:

- Beautiful Soup
- nltk
- logging

Imports

In [22]:
#local packages
!pip install numpy pandas nltk gensim scikit-learn



In [24]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier

# Resources needed by sent_tokenize / word_tokenize and by stopwords.words().
nltk.download('punkt')
# Newer NLTK releases load the Punkt tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' package; fetch both so the notebook runs on either.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Load Data

In [25]:
# All three files share one layout: tab-separated, header on the first row,
# and quoting=3 (csv.QUOTE_NONE) so quote characters in reviews stay literal.
tsv_options = dict(header=0, delimiter="\t", quoting=3)

train = pd.read_csv("data/labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("data/testData.tsv", **tsv_options)
unlabeled_train = pd.read_csv("data/unlabeledTrainData.tsv", **tsv_options)

# Report how many reviews were read from each file.
summary = "Read {} labeled train reviews, {} labeled test reviews, and {} unlabeled reviews\n"
print(summary.format(
    train["review"].size,
    test["review"].size,
    unlabeled_train["review"].size,
))


Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



Preprocess the Data

In [26]:
def review_to_sentences(review, remove_stopwords=True):
    """Split one raw review into a list of tokenized sentences.

    HTML markup is stripped with BeautifulSoup, the remaining text is split
    into sentences, every non-letter character is replaced by a space, and each
    sentence is lower-cased and word-tokenized.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_stopwords: When true, NLTK's English stop words are dropped
            from every sentence.

    Returns:
        A list of sentences, each a non-empty list of lowercase word strings.
        Sentences left with no words after cleaning are omitted.
    """
    plain_text = BeautifulSoup(review, "html.parser").get_text()
    chunks = sent_tokenize(plain_text)
    # An empty exclusion set makes the filter below a no-op when stop words are kept.
    excluded = set(stopwords.words("english")) if remove_stopwords else set()

    tokenized = []
    for chunk in chunks:
        letters_only = re.sub("[^a-zA-Z]", " ", chunk)
        tokens = [tok for tok in word_tokenize(letters_only.lower()) if tok not in excluded]
        if not tokens:
            continue
        tokenized.append(tokens)
    return tokenized

# Collect the tokenized sentences of every labeled and unlabeled review
# into one flat list.
sentences = []

print("Parsing sentences from training set...")
sentences.extend(
    sentence
    for review in train["review"]
    for sentence in review_to_sentences(review)
)

print("Parsing sentences from unlabeled set...")
sentences.extend(
    sentence
    for review in unlabeled_train["review"]
    for sentence in review_to_sentences(review)
)

Parsing sentences from training set...
Parsing sentences from unlabeled set...


Train the Model

In [None]:
print("Training Word2Vec model...")
# 300-dimensional vectors, 10-word context window, words seen fewer than
# 40 times are dropped from the vocabulary, 4 worker threads.
model = Word2Vec(sentences, vector_size=300, window=10, min_count=40, workers=4)
# Word2Vec.init_sims() is deprecated in gensim 4. unit_normalize_all() is the
# supported way to do what init_sims(replace=True) did: scale every word vector
# to unit length in place. The model should not be trained further afterwards.
model.wv.unit_normalize_all()

In [None]:
def make_feature_vector(words, model, num_features):
    """Average the word vectors of all in-vocabulary words of one review.

    Args:
        words: Iterable of word strings (one tokenized review).
        model: Trained model exposing ``model.wv[word]`` and the
            ``model.wv.key_to_index`` vocabulary mapping (gensim 4 API).
        num_features: Length of the word vectors, and of the returned vector.

    Returns:
        A float32 array of shape ``(num_features,)`` holding the mean vector of
        the words found in the vocabulary, or all zeros when none are found.
    """
    feature_vec = np.zeros((num_features,), dtype="float32")
    # Membership test against the model's own word->index dict; building a
    # fresh set from index_to_key on every call would cost O(vocabulary) per review.
    vocab = model.wv.key_to_index
    n_words = 0

    for word in words:
        if word in vocab:
            n_words += 1
            feature_vec += model.wv[word]

    # Guard against division by zero for reviews with no known words.
    if n_words > 0:
        feature_vec /= n_words
    return feature_vec

def get_avg_feature_vectors(reviews, model, num_features):
    """Build the matrix of averaged word vectors, one row per review.

    Args:
        reviews: Sized sequence of tokenized reviews (each a list of words).
        model: Trained word-vector model passed through to make_feature_vector.
        num_features: Length of each word vector (number of matrix columns).

    Returns:
        A float32 array of shape ``(len(reviews), num_features)`` whose i-th
        row is ``make_feature_vector(reviews[i], model, num_features)``.
    """
    n_reviews = len(reviews)
    matrix = np.zeros((n_reviews, num_features), dtype="float32")
    # Iterating a 2-D array yields row views, so assigning into row[:] fills the matrix.
    for row, tokens in zip(matrix, reviews):
        row[:] = make_feature_vector(tokens, model, num_features)
    return matrix


def _review_to_words(review):
    """Strip HTML from one raw review and return its lowercase alphabetic tokens."""
    text = BeautifulSoup(review, "html.parser").get_text().lower()
    return word_tokenize(re.sub("[^a-zA-Z]", " ", text))


print("Tokenizing reviews...")
# The same cleaning is applied to both splits through one helper.
clean_train_reviews = [_review_to_words(review) for review in train["review"]]
clean_test_reviews = [_review_to_words(review) for review in test["review"]]

print("Creating average feature vectors...")
# Take the vector width from the trained model instead of repeating the
# literal 300, so the features stay in sync with the Word2Vec settings.
num_features = model.wv.vector_size
train_data_vecs = get_avg_feature_vectors(clean_train_reviews, model, num_features)
test_data_vecs = get_avg_feature_vectors(clean_test_reviews, model, num_features)

print("Training the classifier...")
# A fixed random_state removes the forest's own randomness between runs
# (the Word2Vec vectors it is fitted on may still vary from run to run).
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_vecs, train["sentiment"])

print("Predicting on test set...")
predictions = forest.predict(test_data_vecs)
# One row per test review: its id and the predicted sentiment label.
output = pd.DataFrame(data={"id": test["id"], "sentiment": predictions})
output.to_csv("submission.csv", index=False)
print("Submission file created!")

In [None]:
def tokenize_reviews(reviews):
    """Turn raw reviews into lists of lowercase, stop-word-free tokens.

    Args:
        reviews: Iterable of raw review strings, possibly containing HTML.

    Returns:
        A list with one entry per review: the review's word tokens after HTML
        stripping, lower-casing, replacing non-letters with spaces, and
        removing NLTK's English stop words. An entry may be an empty list.
    """
    stop_words = set(stopwords.words("english"))

    def _tokens(review):
        # Clean a single review; closes over the stop-word set built once above.
        text = BeautifulSoup(review, "html.parser").get_text()
        candidates = word_tokenize(re.sub("[^a-zA-Z]", " ", text.lower()))
        return [w for w in candidates if w not in stop_words]

    return [_tokens(review) for review in reviews]

# Tokenize the labeled training reviews, then the test reviews.
clean_train_reviews, clean_test_reviews = (
    tokenize_reviews(frame["review"]) for frame in (train, test)
)

In [None]:
# Average the 300-dimensional word vectors of each review, train split first.
train_data_vecs, test_data_vecs = (
    get_avg_feature_vectors(tokens, model, 300)
    for tokens in (clean_train_reviews, clean_test_reviews)
)

In [None]:
# Score the test features with the already-fitted forest and export the result.
predictions = forest.predict(test_data_vecs)
output = pd.DataFrame({"id": test["id"], "sentiment": predictions})
output.to_csv(path_or_buf="submission.csv", index=False)
print("Saved submission.csv!")