In [1]:
import pandas as pd
import numpy as np
# parses the HTML formatted reviews 
from bs4 import BeautifulSoup
# regular expressions
import re
# natural language toolkit
from nltk.corpus import stopwords
import nltk.data
#nltk.download()  

# natural language toolkit
from nltk.stem import WordNetLemmatizer
# bag of words
from sklearn.feature_extraction.text import CountVectorizer
# random forest
from sklearn.ensemble import RandomForestClassifier
# save and load models and data
import pickle

In [None]:
# Verbosity switch: the cells below only print their progress and summary
# messages when this is True.
OUTPUT = False

In [2]:
# Load the three tab-separated review files. quoting=3 (csv.QUOTE_NONE)
# switches quote handling off, so double quotes in the files are kept as
# ordinary characters.
train = pd.read_csv("data/labeledTrainData.tsv", sep="\t", header=0, quoting=3)
test = pd.read_csv("data/testData.tsv", sep="\t", header=0, quoting=3)
unlabeled_train = pd.read_csv("data/unlabeledTrainData.tsv", sep="\t", header=0, quoting=3)

# Optionally report how many reviews each file contributed
if OUTPUT:
    print(
        "Read {} labeled train reviews, {} labeled test reviews "
        "and {} unlabeled reviews.".format(
            *(frame["review"].size for frame in (train, test, unlabeled_train))
        )
    )

Read 25000 labeled train reviews, 25000 labeled test reviews and 50000 unlabeled reviews.


In [None]:
def review_to_wordlist(review, remove_stopwords=False):
    """
    Converts a piece of review text into a list of lower-case words.
    :param review: String (review text, may contain HTML markup)
    :param remove_stopwords: Boolean (if True, English stop words are dropped;
        False by default)
    :return: List of strings (the words in their original order)
    """
    # strip the HTML markup and keep only the text
    plain_text = BeautifulSoup(review, features="html.parser").get_text()
    # replace every character that is neither an ASCII letter nor a digit
    # with a space
    alphanumeric_text = re.sub("[^a-zA-Z0-9]", " ", plain_text)
    # lower-case the text and split it on whitespace
    tokens = alphanumeric_text.lower().split()
    if not remove_stopwords:
        return tokens
    # a set gives fast membership tests while filtering
    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

In [None]:
# could use stemming instead of lemmatization
# from nltk.stem.porter import PorterStemmer
# porter_stemmer = PorterStemmer()
# porter_stemmer.stem(w)

def review_to_words(raw_review):
    """
    Converts a raw IMDb review into a string of (meaningful) words.
    :param raw_review: String (raw movie review, may contain HTML markup)
    :return: String (lower-case, lemmatized words without English stop words,
        separated by single spaces)
    """
    # remove HTML tags; the parser is named explicitly so that the result does
    # not depend on which optional parsers happen to be installed and
    # BeautifulSoup does not warn about guessing one (review_to_wordlist uses
    # the same parser)
    review_text = BeautifulSoup(raw_review, features="html.parser").get_text()
    # replace everything but ASCII letters (esp. digits and punctuation)
    # with spaces via reg. expr.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # convert to lower case and split into words
    words = letters_only.lower().split()
    # sets are searched faster than lists
    stops = set(stopwords.words("english"))
    # remove stop words and reduce every remaining word to its WordNet lemma
    wordnet_lemmatizer = WordNetLemmatizer()
    meaningful_words = [wordnet_lemmatizer.lemmatize(w) for w in words
                        if w not in stops]
    # join the list of words into a space-separated string
    return " ".join(meaningful_words)

In [None]:
# Load the English punkt tokenizer resource through NLTK's data loader. The
# object is handed to review_to_sentences, which calls its tokenize() method
# to split a review into sentences.
# NOTE(review): presumably requires the punkt data to be installed locally
# (see the commented-out nltk.download() in the import cell) -- confirm.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """
    Splits a review into sentences and every sentence into a list of words.
    :param review: String (raw movie review)
    :param tokenizer: Object whose tokenize(text) method returns the
        sentences found in text
    :param remove_stopwords: Boolean (passed on to review_to_wordlist;
        False by default)
    :return: List of lists of strings (one word list per non-empty sentence)
    """
    # leading/trailing whitespace is stripped before the sentence split
    sentence_texts = tokenizer.tokenize(review.strip())
    # empty sentences are skipped; each remaining one becomes a word list
    return [
        review_to_wordlist(sentence_text, remove_stopwords)
        for sentence_text in sentence_texts
        if len(sentence_text) > 0
    ]

In [None]:
# Build (or reload) the word lists for every sentence of the labeled and
# unlabeled training reviews. The result is pickled so that later runs can
# load it instead of parsing all reviews again.
# NOTE: pickle.load can execute arbitrary code -- only load a cache file that
# this notebook wrote itself.
try:
    # "with" closes the file handle even when unpickling fails
    with open("./models/sentences.pickle", "rb") as cache_file:
        sentences = pickle.load(cache_file)
except (OSError, EOFError, pickle.UnpicklingError):
    # No cache yet, or a truncated/corrupt one (e.g. from an interrupted
    # dump): rebuild the sentences from the raw reviews and rewrite the cache.
    sentences = []  # Initialize an empty list of sentences

    if OUTPUT:
        print("Parsing sentences from training set")
    for review in train["review"]:
        sentences += review_to_sentences(review, tokenizer)

    if OUTPUT:
        print("Parsing sentences from unlabeled set")
    for review in unlabeled_train["review"]:
        sentences += review_to_sentences(review, tokenizer)

    # "with" flushes and closes the cache file as soon as the dump is done
    with open("./models/sentences.pickle", "wb") as cache_file:
        pickle.dump(sentences, cache_file)

In [None]:
# Attach the labels from test_sentiments.csv to the test reviews, matched
# on the review id.
test_sentiments = pd.read_csv("data/test_sentiments.csv", header=0, delimiter=",", quoting=3)
# Strip literal double quotes from the test ids so they can match the ids in
# test_sentiments (presumably left in because testData.tsv is read with
# quote handling disabled -- confirm).
test["id"] = test["id"].str.replace('"', "", regex=False)
# Drop label columns left over from an earlier run of this cell before
# joining; otherwise re-running the cell fails because join() refuses
# overlapping column names.
test = test.drop(
    columns=test_sentiments.columns.difference(["id"]), errors="ignore"
).join(test_sentiments.set_index("id"), on="id")