In [2]:
# coding: utf-8

# # Predicting the Sentiment of Movie Reviews

# There are two goals for this analysis. The first is to accurately predict the sentiment of movie reviews, and the second is to develop my model in such a way that its outputs can be analyzed with TensorBoard. This is the first time that I am using TensorBoard, so I want a somewhat challenging task that does not require a huge dataset. There are 25,000 training reviews and 25,000 testing reviews, so this model can train for multiple iterations overnight on my MacBook Pro. The data is provided by a Kaggle competition from 2015 (https://www.kaggle.com/c/word2vec-nlp-tutorial). Although the competition has concluded, it is still an excellent learning opportunity. The sections of this analysis are:
# - Inspect the Data
# - Clean and Format the Data
# - Build and Train the Model
# - Make the Predictions
# - Summary

# In[1]:

import pandas as pd
import numpy as np
import tensorflow as tf
import nltk, re, time
from nltk.corpus import stopwords
from string import punctuation
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from collections import namedtuple


# In[2]:

# Read the tab-separated training and testing files from the working directory
train = pd.read_csv("labeledTrainData.tsv", sep="\t")
test = pd.read_csv("testData.tsv", sep="\t")


# # Inspect the Data

# In[3]:

# Preview the first rows of the training data.
# NOTE(review): the recorded output shows no table for this bare expression,
# presumably because it is not the last expression of the executed cell.
train.head()


# In[4]:

# Preview the first rows of the testing data (same display caveat as above).
test.head()


# In[5]:

# Print the (rows, columns) shape of each dataset.
print(train.shape)
print(test.shape)


# The reviews are rather long, so we won't be using all of the text to train our model. Using all of the text should make the predictions more accurate, but it would make training take longer than I want to spend on this project.

# In[6]:

# Show the first three raw reviews, each followed by a blank line.
# The loop variable `i` stays defined after the loop finishes.
for i in range(3):
    print(train.review[i], end="\n\n")


# In[7]:

# Count the missing values in every column of each dataset
print(train.isna().sum())
print(test.isna().sum())


# # Clean and Format the Data

# In[8]:

def clean_text(text, remove_stopwords=True):
    '''Normalize a raw review into lowercase words separated by single spaces.

    The text is lowercased, literal "<br />" tags are dropped, every
    character other than a-z (digits, punctuation, accented letters, ...)
    is replaced by a space, and whitespace is then collapsed.

    Args:
        text: Raw review text (a str).
        remove_stopwords: Accepted for backward compatibility only. It has
            no effect: stopwords are always kept.

    Returns:
        A single string (not a list of words) made only of the letters a-z,
        with words separated by exactly one space and no leading or
        trailing whitespace.
    '''

    # Lowercase and normalize whitespace first, so that a tag whose inner
    # whitespace is not a plain space (e.g. "<br\n/>") still matches below.
    text = " ".join(text.lower().split())

    # Drop HTML line breaks, then blank out everything that is not a-z.
    # This already removes all punctuation, so no separate punctuation
    # pass is needed afterwards.
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"[^a-z]", " ", text)

    # Collapse runs of spaces of ANY length and trim both ends. Fixed
    # 3-space / 2-space substitutions leave doubled spaces behind for
    # longer runs (five spaces would shrink to two, not one).
    return " ".join(text.split())


# Clean the training and testing reviews

# In[9]:

# Run every training review through clean_text, preserving order
train_clean = [clean_text(review) for review in train.review]


# In[10]:

# Run every testing review through clean_text, preserving order
test_clean = [clean_text(review) for review in test.review]


# In[11]:

# Inspect one cleaned review. The index is spelled out (2 is the value the
# earlier `for i in range(3)` loop leaves in `i`) so this line no longer
# depends on a loop variable leaking from another part of the notebook.
print(train_clean[2])

Using TensorFlow backend.


(25000, 3)
(25000, 2)
With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bi

In [4]:
# Fit a single tokenizer on the cleaned training AND testing reviews, then
# convert each cleaned review into a list of integer word indices.
all_reviews = train_clean + test_clean
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_reviews)
train_seq = tokenizer.texts_to_sequences(train_clean)
test_seq = tokenizer.texts_to_sequences(test_clean)


# In[14]:

# Keep a reference to the tokenizer's word -> integer index mapping.
# The number of unique tokens would be len(word_index); it is not computed here.
word_index = tokenizer.word_index


# In[15]:

# Inspect one review (index 1) after tokenization: a list of integer word indices
print(train_seq[1])

[1, 354, 296, 4, 1, 3574, 33, 4528, 10475, 6, 3, 53, 426, 19, 11, 523, 266, 5, 80, 766, 2, 8423, 5, 11006, 8657, 1893, 1141, 4618, 354, 267, 435, 10475, 2770, 8, 401, 36, 9, 2, 146, 35, 290, 26, 19, 17, 68, 2544, 1, 192, 11, 7, 13, 23, 1, 1248, 706, 344, 2579, 11, 270, 43, 171, 292, 739, 1141, 1, 3346, 316, 17, 815, 3743, 11, 67, 62, 1, 3899, 4335, 5, 1, 267, 523, 293, 287, 16, 277, 179, 8, 3, 15, 146, 35, 19918, 524, 14, 2405, 1391, 165, 62, 5, 6475, 264, 32, 50, 396, 951, 3, 15, 22, 52, 638, 12478, 38, 109, 2282, 61, 6, 134, 89, 79, 112, 1004, 17, 1, 1391, 70, 503, 1, 766, 435, 10475, 269, 83, 109, 2812, 5, 1893, 1141, 4618, 354, 662, 2, 70, 253, 7, 5, 29, 53, 426, 10, 90, 7, 750, 5, 5235, 47, 1, 1391, 11007, 5, 29, 91, 5622]


# Print the training DataFrame.
# NOTE(review): this looks like a leftover scratch cell with no recorded output;
# train.head() earlier in the notebook already previews this frame.
print(train)

In [6]:
# Token count of every review (training reviews first, then testing reviews),
# held in a single-column dataframe so the values can be inspected
lengths = pd.DataFrame(
    [len(seq) for seq in train_seq] + [len(seq) for seq in test_seq],
    columns=['counts'],
)

In [7]:
# Summary statistics of the review lengths
lengths.counts.describe()


# In[18]:

# Review length at the 80th, 85th, 90th and 95th percentiles, one per line
print(*np.percentile(lengths.counts, [80, 85, 90, 95]), sep="\n")

324.0
377.0
457.0
598.0


# NOTE: `train_pad` is first assigned by the pad_sequences call further down,
# so printing it at this point raised a NameError on a fresh
# "Restart Kernel -> Run All". The premature print has been removed;
# inspect `train_pad` only after it has been created.

In [8]:
# Force every review to the same number of tokens. With pad_sequences'
# default settings, shorter reviews are zero-padded at the front and longer
# ones are truncated from the front.
max_review_length = 200

train_pad = pad_sequences(train_seq, maxlen=max_review_length)
print("train_pad is complete.")

test_pad = pad_sequences(test_seq, maxlen=max_review_length)
print("test_pad is complete.")


train_pad is complete.
test_pad is complete.


In [9]:
# Hold out 15% of the labeled reviews for validation, with a fixed seed so
# the split is repeatable; the padded testing reviews are used unchanged.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_pad,
    train.sentiment,
    test_size=0.15,
    random_state=2,
)
x_test = test_pad

In [12]:
# Report the shape of each split (one per line), then the sentiment labels
print(x_train.shape, x_valid.shape, x_test.shape, sep="\n")
print(train.sentiment)

(21250, 200)
(3750, 200)
(25000, 200)
0        1
1        1
2        0
3        0
4        1
        ..
24995    0
24996    0
24997    0
24998    0
24999    1
Name: sentiment, Length: 25000, dtype: int64


In [11]:
# Print the padded training split; the recorded output shows an abbreviated array view.
print(x_train)

[[    0     0     0 ... 15719     8   410]
 [76056     5  3799 ...   399    43     4]
 [    0     0     0 ...    16  3721  1475]
 ...
 [  949    63    17 ...   125    23  4336]
 [    1  1566    12 ...   243   120   621]
 [    0     0     0 ...     1  6051  7540]]
