## Applying Machine Learning To Sentiment Analysis

The IMDB movie review set can be downloaded from http://ai.stanford.edu/~amaas/data/sentiment/

In [None]:
import pandas as pd
import numpy as np

# Load the review data; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/movie_data.csv')
# Preview the first rows to check that the columns loaded as expected.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

Tasks

- Clean up the review text using the pre-processing techniques discussed in the `cleanSentence` example
- Vectorize the text using a `CountVectorizer(binary=True)`
- Prepare the train/test data 
- Train a decision tree model and evaluate the model
- Create a pipeline that combines the previous three steps (except the evaluation)
- Evaluate the pipeline
- Try using a random forest classifier instead as the estimator in the pipeline

In [None]:
# Prepare cleaning functions
import re, string
import nltk
from nltk.stem import SnowballStemmer

# Small hand-picked stop-word list; tokens found in it are dropped by lexiconProcess().
stop_words = ["a", "an", "the", "this", "that", "is", "it", "to", "and"]

# English Snowball stemmer from NLTK, passed into the cleaning functions below.
stemmer = SnowballStemmer('english')

def preProcessText(text):
    """Normalize a raw review string into lowercase, space-separated words.

    Steps, in order: lowercase, replace HTML tags with a space, replace every
    ``string.punctuation`` character with a space, then collapse whitespace
    runs and trim both ends.

    Args:
        text: Raw review text (str).

    Returns:
        The cleaned string, with no leading/trailing whitespace and exactly
        one space between tokens.
    """
    text = text.lower()

    # Replace tags with a space (not ''); otherwise words separated only by a
    # tag, such as "good<br />bad", are glued together into "goodbad".
    text = re.sub(r'<.*?>', ' ', text)

    # Punctuation becomes a space so "well-made" splits into two tokens.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    # Collapse whitespace and strip LAST: the substitutions above can create
    # new leading/trailing spaces (e.g. a review that ends with ".").
    # The raw string avoids the invalid "\s" escape in a normal literal.
    return re.sub(r'\s+', ' ', text).strip()

def lexiconProcess(text, stop_words, stemmer):
    """Drop stop words from ``text`` and stem the remaining words.

    Args:
        text: Whitespace-separated words (e.g. the output of preProcessText).
        stop_words: Collection of words to discard; each token is matched
            exactly against it.
        stemmer: Any object exposing ``stem(word) -> str``.

    Returns:
        The kept words, stemmed, joined by single spaces.
    """
    # Hash lookup per token instead of scanning the list for every word.
    stop_set = set(stop_words)

    # split() with no argument splits on any whitespace run and never yields
    # empty strings, so doubled/leading/trailing spaces do not turn into
    # empty "words" (split(" ") would, leaving stray spaces in the result).
    return " ".join(
        stemmer.stem(word) for word in text.split() if word not in stop_set
    )

def cleanSentence(text, stop_words, stemmer):
    """Run the full cleaning chain on one piece of text.

    The text is first normalized with preProcessText(); the result is then
    passed to lexiconProcess() together with ``stop_words`` and ``stemmer``.

    Returns:
        The string produced by lexiconProcess().
    """
    normalized = preProcessText(text)
    return lexiconProcess(normalized, stop_words, stemmer)

In [None]:
# Show the text of the first review (positional row 0 of the 'review' column)
df['review'].values[0]

In [None]:
# Clean Up Approach 1: Call the cleanSentence() function on each item from the review column
# df['review'] = [cleanSentence(item, stop_words, stemmer) for item in df['review'].values]

In [None]:
# Clean Up Approach 2: Apply the cleanSentence() function as a lambda function directly on the review column with apply()
# NOTE: this overwrites df['review'] in place, so re-running the cell cleans the
# already-cleaned text again; reload df first if the raw reviews are needed.
df['review'] = df['review'].apply(lambda x : cleanSentence(x, stop_words, stemmer))

In [None]:
# Re-inspect the first rows to see the effect of the cleaning step on 'review'
df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Or you can use a tf-idf vectorizer 
# tfidf = TfidfVectorizer()

# binary=True records presence/absence of each term rather than its count.
# NOTE(review): fit_transform here learns the vocabulary from every review in df;
# a vocabulary used for model evaluation should be learned from training rows only.
textvectorizer = CountVectorizer(binary=True)
tokens = textvectorizer.fit_transform(df['review'])
# (number of reviews, vocabulary size)
tokens.shape

In [None]:
# Prepare the data: use the first 40000 rows for training and the remaining rows for testing.
N_TRAIN = 40000

# Learn the vocabulary from the training reviews only, then reuse it to encode
# the test reviews. Slicing a matrix that was fit on the whole corpus would let
# words seen only in the test set leak into the feature space.
# (This re-fits `textvectorizer`, replacing whatever vocabulary it held before.)
X_train = textvectorizer.fit_transform(df['review'].iloc[:N_TRAIN])
# Label column is selected by position — presumably 'output'; confirm against df.columns.
y_train = df.iloc[:N_TRAIN, 1]

X_test = textvectorizer.transform(df['review'].iloc[N_TRAIN:])
y_test = df.iloc[N_TRAIN:, 1]

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
# Seed the tree so the fitted model (and the report below) is identical on every
# run; without a seed the split search permutes features randomly and results
# can vary between runs. 42 matches the seed used for train_test_split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_predict = dt.predict(X_test)
print(classification_report(y_test, y_predict))

In [None]:
# Let's create a pipeline using a random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('text_vect', CountVectorizer(binary=True)),
    # NOTE(review): the step is named 'dt' although it holds a random forest; the
    # name is kept because it is the key used by named_steps / set_params.
    # random_state pins the forest's random sampling so results are reproducible.
    ('dt', RandomForestClassifier(random_state=42))
])

In [None]:
# And use the train_test_split to prepare the train test sets
# The df dataframe has been cleaned. 

from sklearn.model_selection import train_test_split
# Hold out 10% of the rows; the fixed seed makes the shuffle repeatable.
train_data, test_data = train_test_split(
    df,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

# Features are the review text, labels are the 'output' column.
X_train, y_train = train_data['review'], train_data['output']
X_test, y_test = test_data['review'], test_data['output']


In [None]:
# Fit the pipeline: the vectorizer learns its vocabulary from X_train, then the
# classifier is trained on the resulting feature matrix.
pipeline.fit(X_train, y_train)

In [None]:
# Evaluate the pipeline on the test set
# (predict() encodes X_test with the already-fitted vectorizer before classifying)
y_predict = pipeline.predict(X_test)
print(classification_report(y_test, y_predict))

In [None]:
# Map the predicted class to a readable name (assumes 'output' is coded 0/1 — confirm).
label = {0: 'negative', 1: 'positive'}
example = ['The movie was not worthy of my time, and I was very disappointed. The story was so dull and the characters were lame. ']

# The pipeline was fit on reviews that had already been through cleanSentence(),
# so new text must be cleaned the same way; otherwise unstemmed words such as
# "movie" or "disappointed" may not match the stemmed vocabulary.
example_clean = [cleanSentence(text, stop_words, stemmer) for text in example]

print('Prediction:', label[pipeline.predict(example_clean)[0]])