In [56]:
import pickle
import re
from pathlib import Path

import pandas as pd

## Load the data

In [42]:
# Read the raw reviews, then replace every missing cell with the string "none".
dfx = pd.read_csv('data/IMDB Dataset.csv')
dfx = dfx.fillna("none")
dfx.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [43]:
# Check that every review carries a known label.
# Comparing the lengths of two columns of the same frame can never fail, and the
# load step rewrites missing cells to "none", so inspect the label values instead.
assert dfx.sentiment.isin(["positive", "negative"]).all(), \
    "found reviews whose sentiment is neither 'positive' nor 'negative'"

## Data Cleaning

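Common data cleaning steps on all text are listed below; the `text_cleaner` function in this notebook only strips HTML markup and lower-cases the text: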

- Make text all lower case
- Remove punctuation
- Remove numerical values

In [44]:
# Remove noise (HTML markup) from text and lower-case it.

# Ordered (compiled pattern, replacement) pairs, built once instead of on every call.
# Order matters: each rule runs on the output of the previous one.
_TEXT_CLEANING_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'>\s+', u'>'),                                 # drop whitespace right after any '>'
        (r'\s+', u' '),                                  # collapse whitespace runs into one space
        (r'\s*<br\s*/?>\s*', u'\n'),                     # <br> becomes a newline
        (r'</(div)\s*>\s*', u'\n'),                      # </div> becomes a newline
        (r'</(p|h\d)\s*>\s*', u'\n\n'),                  # </p> and </h1>, </h2>, ... become a blank line
        (r'<head>.*<\s*(/head|body)[^>]*>', u''),        # drop <head> through </head> (or through <body ...>)
        # Non-greedy body: with a greedy '.*' two links in one text were merged
        # into the first URL and everything between them was lost.
        (r'<a\s+href="([^"]+)"[^>]*>.*?</a>', r'\1'),    # replace each link with its URL
        (r'[ \t]*<[^<]*?/?>', u''),                      # drop any remaining tags
        (r'^\s+', u''),                                  # drop leading whitespace
    )
)


def text_cleaner(text):
    """Strip HTML markup from ``text`` and return it lower-cased.

    The rules in ``_TEXT_CLEANING_RULES`` are applied in order; trailing
    whitespace is stripped after every rule.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML tags.

    Returns
    -------
    str
        The cleaned, lower-cased text. Line breaks produced from ``<br>``,
        ``</div>``, ``</p>`` and heading tags are kept as newline characters.
    """
    for pattern, replacement in _TEXT_CLEANING_RULES:
        text = pattern.sub(replacement, text).rstrip()
    return text.lower()

In [45]:
# Run every review through text_cleaner and store the result back in the column.
dfx["review"] = dfx["review"].apply(text_cleaner)

In [46]:
# Encode the label as a number: "positive" becomes 1, any other value becomes 0.
dfx["sentiment"] = dfx["sentiment"].apply(lambda label: int(label == "positive"))

In [48]:
# Persist the cleaned frame. Create the output directory first so the cell also
# works on a fresh checkout where 'pickles/' does not exist yet.
cleaned_data_path = Path('pickles/cleaned_data.pkl')
cleaned_data_path.parent.mkdir(parents=True, exist_ok=True)
dfx.to_pickle(cleaned_data_path)

## Data pre-processing

Common text-processing steps are:

- Tokenize
- Stop word removal
- Stemming / lemmatization
- Parts of speech tagging

In [53]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# English stop words held in a set; text_pre_processing tests each token against it.
stop_words = set(stopwords.words('english'))
# One Porter stemmer instance, reused by text_pre_processing for every token.
ps = PorterStemmer()

In [54]:
def text_pre_processing(text):
    """Tokenize ``text``, drop stop words and stem the remaining tokens.

    Uses the module-level ``stop_words`` collection and the ``ps`` stemmer.

    Returns a list with the stemmed form of every token that is not in
    ``stop_words``, in the order the tokens appear.
    """
    stemmed_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue  # stop words are dropped
        stemmed_tokens.append(ps.stem(token))
    return stemmed_tokens