In [1]:
%mkdir ../data
!wget -O ../data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -zxf ../data/aclImdb_v1.tar.gz -C ../data

--2024-03-18 18:03:10--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘../data/aclImdb_v1.tar.gz’


2024-03-18 18:03:17 (11.7 MB/s) - ‘../data/aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [2]:
import os
import glob

def read_imdb_data(data_dir='../data/aclImdb'):
    """Load the raw IMDB reviews and their sentiment labels from disk.

    The files are looked up as ``<data_dir>/<train|test>/<pos|neg>/*.txt`` and
    each matching file is read in full as one review.

    Args:
        data_dir: Root folder of the extracted dataset.

    Returns:
        A ``(data, labels)`` tuple of nested dicts, both indexed as
        ``[data_type][sentiment]`` with ``data_type`` in ``('train', 'test')``
        and ``sentiment`` in ``('pos', 'neg')``. ``data`` holds lists of review
        strings; ``labels`` holds parallel lists of ints, where a positive
        review is represented by 1 and a negative review by 0.
    """
    data = {}
    labels = {}

    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}

        for sentiment in ['pos', 'neg']:
            # Here we represent a positive review by '1' and a negative review by '0'
            label = 1 if sentiment == 'pos' else 0

            pattern = os.path.join(data_dir, data_type, sentiment, '*.txt')
            # glob returns matches in arbitrary order; sort so that loading is
            # reproducible across runs and machines.
            files = sorted(glob.glob(pattern))

            reviews = []
            for path in files:
                # Read as UTF-8 explicitly instead of relying on the platform's
                # default encoding, which may not be able to decode the reviews.
                with open(path, encoding='utf-8') as review:
                    reviews.append(review.read())

            data[data_type][sentiment] = reviews
            # One label per review, so both lists always have the same length.
            labels[data_type][sentiment] = [label] * len(reviews)

    return data, labels

In [3]:
# Load every review, then report how many landed in each split/sentiment bucket.
data, labels = read_imdb_data()
review_counts = [len(data[split][sentiment])
                 for split in ('train', 'test')
                 for sentiment in ('pos', 'neg')]
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(*review_counts))


IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [4]:
from sklearn.utils import shuffle

def prepare_imdb_data(data, labels, random_state=None):
    """Merge the positive and negative reviews of each split and shuffle them.

    Args:
        data: Nested dict ``data[split][sentiment]`` of review lists, with
            ``split`` in ``('train', 'test')`` and ``sentiment`` in ``('pos', 'neg')``.
        labels: Nested dict with the same layout holding the matching labels.
        random_state: Optional seed (or random state object) forwarded to
            ``sklearn.utils.shuffle``. The default ``None`` keeps the original
            unseeded behaviour; pass an int to get the same ordering on every run.

    Returns:
        ``(data_train, data_test, labels_train, labels_test)``; within each
        split the reviews and labels are shuffled together so they stay aligned.
    """
    #Combine positive and negative reviews and labels
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']
    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']

    #Shuffle reviews and corresponding labels within training and test sets
    data_train, labels_train = shuffle(data_train, labels_train, random_state=random_state)
    data_test, labels_test = shuffle(data_test, labels_test, random_state=random_state)

    # Return a unified training data, test data, training labels, test labels
    return data_train, data_test, labels_train, labels_test

In [5]:
train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)
# Sanity check: the training reviews and their labels should have the same count.
print(len(train_X), len(train_y), sep="\n")

25000
25000


The first step in processing the reviews is to remove any HTML tags that appear in them. In addition, we wish to stem our input, so that words such as entertained and entertaining are considered the same with regard to sentiment analysis.

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
from bs4 import BeautifulSoup

# Cache for the stopword set, so the corpus is read once per kernel session
# instead of once per word.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use."""
    if "english" not in _STOPWORDS_CACHE:
        try:
            words = stopwords.words("english")
        except LookupError:
            # Corpus not installed yet: fetch it once, then retry.
            nltk.download("stopwords", quiet=True)
            words = stopwords.words("english")
        _STOPWORDS_CACHE["english"] = frozenset(words)
    return _STOPWORDS_CACHE["english"]


def review_to_words(review):
    """Convert one raw review string into a list of cleaned, stemmed words.

    Steps: strip HTML markup, lower-case, replace every character that is not
    a letter or digit with a space, split on whitespace, drop English
    stopwords, and apply the Porter stemmer to each remaining word.

    Args:
        review: The raw review text (may contain HTML tags).

    Returns:
        A list of stemmed word strings in their original order.
    """
    stop_words = _english_stopwords()
    stemmer = PorterStemmer()  # one stemmer reused for the whole review

    text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Lower-case, keep only letters and digits
    words = text.split() # Split string into words

    # Remove stopwords (set lookup), then stem what is left
    return [stemmer.stem(w) for w in words if w not in stop_words]

In [7]:
# Show one raw review, followed by its cleaned and stemmed words
# (the list is displayed because it is the cell's last expression).
print(train_X[100])
review_to_words(train_X[100])


This is one of quite a few cartoons with Bugs Bunny and Marvin the Martian - and a space dog called K-9 is included as well. This Looney Tunes episode is very funny, has reasonably good cartoon animation (Marvin's animation is very well done) and the plot is well done. The end is rather weird, so be prepared for it, it is slightly boring. <br /><br />In this episode, Marvin the Martian has been sent to earth to capture an earth creature and bring it back to Mars. With his trusty dog K-9, Marvin sets out and soon finds the tracks of no other rabbit but Bugs Bunny! He greets them with treats, thinking they are trick or treaters in their costumes. Little does he realise they are preparing themselves to take this rabbit to Mars...<br /><br />I recommend this episode to anyone who likes Bugs Bunny, Marvin the Martian and Looney Tunes in general. As far as the beginning and the middle of the episode are concerned, you are likely to like this. Enjoy "Hasty Hare"! :-)


['one',
 'quit',
 'cartoon',
 'bug',
 'bunni',
 'marvin',
 'martian',
 'space',
 'dog',
 'call',
 'k',
 '9',
 'includ',
 'well',
 'looney',
 'tune',
 'episod',
 'funni',
 'reason',
 'good',
 'cartoon',
 'anim',
 'marvin',
 'anim',
 'well',
 'done',
 'plot',
 'well',
 'done',
 'end',
 'rather',
 'weird',
 'prepar',
 'slightli',
 'bore',
 'episod',
 'marvin',
 'martian',
 'sent',
 'earth',
 'captur',
 'earth',
 'creatur',
 'bring',
 'back',
 'mar',
 'trusti',
 'dog',
 'k',
 '9',
 'marvin',
 'set',
 'soon',
 'find',
 'track',
 'rabbit',
 'bug',
 'bunni',
 'greet',
 'treat',
 'think',
 'trick',
 'treater',
 'costum',
 'littl',
 'realis',
 'prepar',
 'take',
 'rabbit',
 'mar',
 'recommend',
 'episod',
 'anyon',
 'like',
 'bug',
 'bunni',
 'marvin',
 'martian',
 'looney',
 'tune',
 'gener',
 'far',
 'begin',
 'middl',
 'episod',
 'concern',
 'like',
 'like',
 'enjoy',
 'hasti',
 'hare']

In [8]:
import pickle

# Folder for cached preprocessing results. preprocess_data (defined right after
# this) binds this value as the default of its `cache_dir` parameter, so it
# must exist before that function is defined.
cache_dir = os.path.join("../cache", "sentiment_analysis")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available.

    Args:
        data_train: Iterable of raw training review strings.
        data_test: Iterable of raw test review strings.
        labels_train: Labels matching ``data_train``.
        labels_test: Labels matching ``data_test``.
        cache_dir: Folder that holds the cache file.
        cache_file: Cache file name, or ``None`` to disable caching entirely.

    Returns:
        ``(words_train, words_test, labels_train, labels_test)``. When a cache
        file is read successfully, ALL four values come from the cache and the
        arguments passed in are ignored; this keeps the cached words and labels
        aligned with each other even if the caller's data was reshuffled.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_path is not None:
        try:
            # NOTE(security): unpickling can execute arbitrary code, so only
            # load cache files that this notebook wrote itself.
            with open(cache_path, "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except FileNotFoundError:
            pass  # no cache yet, which is the normal first-run case
        except Exception as err:
            # Corrupt or unreadable cache: fall back to recomputing, but say so.
            # (KeyboardInterrupt/SystemExit are deliberately not caught here.)
            cache_data = None
            print("Ignoring unreadable cache file {}: {!r}".format(cache_file, err))

    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Preprocess training and test data to obtain words for each review
        words_train = [review_to_words(review) for review in data_train]
        words_test = [review_to_words(review) for review in data_test]

        # Write to cache file for future runs
        if cache_path is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            with open(cache_path, "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])

    return words_train, words_test, labels_train, labels_test

In [9]:
# Rebinds train_X/test_X from raw review strings to lists of words. If a cache
# file already exists, preprocess_data returns the cached words AND labels and
# ignores the arguments passed here.
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)


  text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags


Wrote preprocessed data to cache file: preprocessed_data.pkl


Since we will be using a recurrent neural network, it will be convenient if the length of each review is the same. To do this, we will fix a size for our reviews and then pad short reviews with the category 'no word' (which we will label 0) and truncate long reviews. First, though, we build a word dictionary that maps each of the most frequent words to an integer: the more frequently a word occurs, the smaller its integer. The integers 0 and 1 are not assigned to actual words; they are reserved for 'no word' and for infrequent words respectively, so word indices start at 2.

In [10]:
import numpy as np

def build_dict(data, vocab_size = 5000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer.

    Args:
        data: A list of sentences, where each sentence is a list of words.
        vocab_size: Total number of integer labels, including the two reserved
            ones (0 for 'no word' and 1 for 'infrequent'). At most
            ``vocab_size - 2`` words are therefore kept.

    Returns:
        A dict mapping each kept word to an integer starting at 2; the most
        frequent word gets 2, the next 3, and so on. Words with equal counts
        keep their order of first appearance. An empty dict is returned when
        ``vocab_size`` leaves no room for real words (``vocab_size <= 2``).
    """
    # How often each word appears across all sentences
    word_count = {}
    for sentence in data:
        for word in sentence:
            word_count[word] = word_count.get(word, 0) + 1

    # sorted_words[0] is the most frequent word, sorted_words[-1] the least frequent
    sorted_words = sorted(word_count, key=word_count.get, reverse=True)

    # The -2 saves room for the 'no word' and 'infrequent' labels. Clamp at 0:
    # a negative bound would slice from the end and return almost every word.
    n_words = max(vocab_size - 2, 0)

    return {word: idx + 2 for idx, word in enumerate(sorted_words[:n_words])}

In [11]:
# Build the vocabulary from the training reviews only and peek at the top words
word_dict = build_dict(train_X)
print(list(word_dict.keys())[0:5])
data_dir = '../data/pytorch' # The folder we will use for storing data
os.makedirs(data_dir, exist_ok=True) # Create the folder if needed; no error if it already exists

# Save the word dictionary to disk
with open(os.path.join(data_dir, 'word_dict.pkl'), "wb") as f:
    pickle.dump(word_dict, f)

['movi', 'film', 'one', 'like', 'time']


Now that we have our word dictionary which allows us to transform the words appearing in the reviews into integers, it is time to make use of it and convert our reviews to their integer sequence representation, making sure to pad or truncate to a fixed length, which in our case is 500.

In [13]:
def convert_and_pad(word_dict, sentence, pad=500):
    """Encode one sentence as exactly `pad` integers.

    Words found in `word_dict` are replaced by their integer, all other words
    by 1 ('infrequent'). Sentences longer than `pad` are cut off; shorter ones
    are filled up with 0 ('no word').

    Returns:
        A tuple of (the encoded list of length `pad`, the number of real words
        kept, i.e. ``min(len(sentence), pad)``).
    """
    NOWORD = 0 # label for the 'no word' (padding) category
    INFREQ = 1 # label for words that do not appear in word_dict

    clipped = sentence[:pad]
    encoded = [word_dict.get(token, INFREQ) for token in clipped]

    # Fill the remaining positions with the padding label
    encoded.extend([NOWORD] * (pad - len(encoded)))

    return encoded, min(len(sentence), pad)

In [14]:
def convert_and_pad_data(word_dict, data, pad=500):
    """Encode every sentence in `data` with convert_and_pad.

    Returns:
        A tuple of two NumPy arrays: the encoded sentences (one row per
        sentence) and the matching lengths reported by convert_and_pad.
    """
    encoded_pairs = [convert_and_pad(word_dict, sentence, pad) for sentence in data]

    rows = [row for row, _ in encoded_pairs]
    sizes = [size for _, size in encoded_pairs]

    return np.array(rows), np.array(sizes)

In [15]:
# Encode every review as a fixed-length row of integers (pad defaults to 500)
# and keep the unpadded lengths. train_X/test_X are rebound from word lists to
# NumPy arrays here, so run this cell only once per kernel session: running it
# again would look up integers in word_dict and encode them all as 'infrequent'.
train_X, train_X_len = convert_and_pad_data(word_dict, train_X)
test_X, test_X_len = convert_and_pad_data(word_dict, test_X)


In [17]:
data_dir = '../data/pytorch' # The folder we will use for storing data
os.makedirs(data_dir, exist_ok=True) # Create the folder if needed; no error if it already exists

# Save the word dictionary to disk
with open(os.path.join(data_dir, 'word_dict.pkl'), "wb") as f:
    pickle.dump(word_dict, f)

import pandas as pd

# Write the training set without header or index. Column layout of each row:
# label, review length, then the padded sequence of word integers.
pd.concat([pd.DataFrame(train_y), pd.DataFrame(train_X_len), pd.DataFrame(train_X)], axis=1) \
        .to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [19]:
# Inspect the first encoded review and its length before padding.
print(train_X[0])
print(train_X_len[0])

[ 660 1278    1 1880  347  601    4  329  358  306 2617  682  184  347
 3507  841  445    6  814  213  197    1  101  797   20    4    1  284
  373 1881  163 3533    3  640  280    1  245  287  240    1    1   60
  410   49   52  338  181   15    5   11   79    3    1    4   54 3508
    1 1615    1    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [20]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    """Recurrent (LSTM) binary classifier used for Sentiment Analysis of
    IMDB reviews.

    The layers are: embedding -> LSTM -> linear layer with one output ->
    sigmoid, so the model produces one value in (0, 1) per review.

    Args:
        embedding_dim (int): dimensionality of the embedding layer
        hidden_dim (int): dimensionality of the LSTM hidden state
        vocab_size (int): number of distinct integer word labels (number of
            rows in the embedding table); label 0 is treated as padding

    Attributes:
        word_dict: placeholder initialised to None; it is not used by the
            layers or by forward().
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):

        super(LSTMClassifier, self).__init__()

        # padding_idx=0 marks label 0 ('no word') as the padding entry
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dense = nn.Linear(in_features=hidden_dim, out_features=1)
        self.sig = nn.Sigmoid()

        self.word_dict = None


    def forward(self, x):
        """Run the classifier on a batch of encoded reviews.

        Args:
            x: Integer tensor of shape (batch, 1 + seq_len). Column 0 holds
                each review's length; the remaining columns hold its word labels.

        Returns:
            A tensor of shape (batch,) with one sigmoid output per review.
        """
        # Transpose to (1 + seq_len, batch): the LSTM is not batch_first
        x = x.t()
        lengths = x[0,:]
        reviews = x[1:,:]
        embeds = self.embedding(reviews)
        lstm_out, _ = self.lstm(embeds)
        out = self.dense(lstm_out)
        # For every review pick the output at its last real word (lengths - 1),
        # giving shape (batch, 1).
        # NOTE(review): a length of 0 indexes position -1, i.e. the last time step.
        out = out[lengths - 1, range(len(lengths))]
        # Drop only the trailing feature dimension. A plain squeeze() would also
        # drop the batch dimension for a batch of one and return a 0-d tensor
        # that no longer matches a target of shape (1,).
        return self.sig(out.squeeze(-1))

In [27]:
import torch
import torch.utils.data

# Read the whole training CSV back in (no row limit is applied here)
train_sample = pd.read_csv(os.path.join(data_dir, 'train.csv'), header=None, names=None)

# Turn the input pandas dataframe into tensors:
# column 0 is the label; the remaining columns (review length, then the
# word integers) form the model input
train_sample_y = torch.from_numpy(train_sample[[0]].values).float().squeeze()
train_sample_X = torch.from_numpy(train_sample.drop([0], axis=1).values).long()

# Build the dataset
train_sample_ds = torch.utils.data.TensorDataset(train_sample_X, train_sample_y)
# Build the dataloader (batches of 100, default loader settings otherwise)
train_sample_dl = torch.utils.data.DataLoader(train_sample_ds, batch_size=100)

In [28]:
def train(model, train_loader, epochs, optimizer, loss_fn, device):
    """Train `model` in place and print the mean batch loss after every epoch.

    Args:
        model: The PyTorch model to train.
        train_loader: Iterable of ``(batch_X, batch_y)`` tensor pairs; it must
            support ``len()`` (used to average the loss per epoch).
        epochs: Number of passes over `train_loader`.
        optimizer: Optimizer that updates the model's parameters.
        loss_fn: Callable taking ``(output, batch_y)`` and returning a scalar loss.
        device: Device the batches are moved to before the forward pass.

    Returns:
        None. The model's parameters are updated as a side effect.
    """
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            # Call the module itself rather than .forward() so that any
            # registered forward hooks are run as well.
            output = model(batch_X)
            loss = loss_fn(output, batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print("Epoch: {}, Loss: {}".format(epoch, total_loss / len(train_loader)))

In [30]:
import torch.optim as optim
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Positional arguments are (embedding_dim, hidden_dim, vocab_size); 5000
# equals the default vocab_size of build_dict
model = LSTMClassifier(32, 100, 5000).to(device)
optimizer = optim.Adam(model.parameters())
# Binary cross-entropy on the model's sigmoid outputs
loss_fn = torch.nn.BCELoss()

# Train for 15 epochs
train(model, train_sample_dl, 15, optimizer, loss_fn, device)

Epoch: 1, Loss: 0.6284457104206085
Epoch: 2, Loss: 0.511864448428154
Epoch: 3, Loss: 0.3934649955034256
Epoch: 4, Loss: 0.33832233828306196
Epoch: 5, Loss: 0.302040314078331
Epoch: 6, Loss: 0.28056073033809664
Epoch: 7, Loss: 0.28352860754728315
Epoch: 8, Loss: 0.23945282605290413
Epoch: 9, Loss: 0.23098776400089263
Epoch: 10, Loss: 0.24441413468122483
Epoch: 11, Loss: 0.21906791719794275
Epoch: 12, Loss: 0.20859959319233895
Epoch: 13, Loss: 0.17155315992236136
Epoch: 14, Loss: 0.1639059560596943
Epoch: 15, Loss: 0.14581003014743327


In [36]:
# Folder for saved artifacts; created if missing, left untouched if present
data_dir = '../data/pytorch'
os.makedirs(data_dir, exist_ok=True)

# Specify the file path where you want to save the model within the data_dir
model_path = os.path.join(data_dir, 'trained_model.pth')

# Save the model state dictionary (the parameters only, not the model object
# itself, so loading requires constructing an LSTMClassifier first)
torch.save(model.state_dict(), model_path)

print("Model saved successfully at:", model_path)

Model saved successfully at: ../data/pytorch/trained_model.pth


In [31]:
# Write the test set without header or index. Column layout of each row:
# label, review length, then the padded sequence of word integers.
pd.concat([pd.DataFrame(test_y), pd.DataFrame(test_X_len), pd.DataFrame(test_X)], axis=1) \
        .to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

In [32]:
# Read the whole test CSV back in (no row limit is applied here)
test_sample = pd.read_csv(os.path.join(data_dir, 'test.csv'), header=None, names=None)

# Turn the input pandas dataframe into tensors:
# column 0 is the label; the remaining columns (review length, then the
# word integers) form the model input
test_sample_y = torch.from_numpy(test_sample[[0]].values).float().squeeze()
test_sample_X = torch.from_numpy(test_sample.drop([0], axis=1).values).long()

# Build the dataset
test_sample_ds = torch.utils.data.TensorDataset(test_sample_X, test_sample_y)
# Build the dataloader (batches of 100, default loader settings otherwise)
test_sample_dl = torch.utils.data.DataLoader(test_sample_ds, batch_size=100)

In [39]:
def evaluate(model, dataloader, loss_fn, device):
    """
    Score a model on a dataset without updating it.

    Args:
    - model: the PyTorch model to evaluate; its outputs are rounded to obtain
      the predicted label
    - dataloader: iterable of (inputs, labels) batches that supports len()
    - loss_fn: loss function applied to (outputs, labels) for each batch
    - device: device to run the evaluation on (e.g., "cpu" or "cuda")

    Returns:
    - accuracy: fraction of samples whose rounded output equals the label
    - loss: mean of the per-batch losses
    """

    # Evaluation mode, and no gradient tracking while scoring
    model.eval()

    n_correct, n_seen, loss_sum = 0, 0, 0.0

    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            batch_outputs = model(batch_inputs)
            loss_sum += loss_fn(batch_outputs, batch_labels).item()

            # Round each output to the nearest label and count the matches
            hits = torch.round(batch_outputs) == batch_labels
            n_correct += hits.sum().item()
            n_seen += batch_labels.size(0)

    return n_correct / n_seen, loss_sum / len(dataloader)


In [40]:
# Score the trained model on the test set and report accuracy and mean batch loss
accuracy, avg_loss = evaluate(model, test_sample_dl, loss_fn, device)
print(f"Accuracy on test set: {accuracy:.4f}")
print(f"Average loss on test set: {avg_loss:.4f}")


Accuracy on test set: 0.8629
Average loss on test set: 0.4438
