In [14]:
import torch

print(torch.cuda.is_available())

True


In [2]:
import os
import urllib.request
import zipfile
import tarfile

maximum_words = 500

def download_and_extract_if_not_exist(url, extract_path, zip_file_name, zip_file_contents_file_name):

    # If any of the files already exist, skip the download - it's already been done and extracted
    path = os.path.join(extract_path, zip_file_contents_file_name)
    if os.path.exists(path):
        print("Files already exist. Skipping download.")
        return

    # Check if the zip file already exists, if it does, skip the download to save time
    zip_file_path = os.path.join(extract_path, zip_file_name)
    if not os.path.exists(zip_file_path):
        print("Downloading zip file...")
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        # Download the zip file
        zip_file_path, _ = urllib.request.urlretrieve(url, zip_file_path)
    else:
        print("Zip file already exists. Skipping download.")

    # Extract the contents of the zip file to the given directory based on the file extension
    if zip_file_path.endswith(".zip"):
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    elif zip_file_path.endswith(".tar.gz") or zip_file_path.endswith(".tgz"):
        with tarfile.open(zip_file_path, 'r:gz') as tar_ref:
            tar_ref.extractall(extract_path)

    # Close and remove the zip file to free up space
    os.remove(zip_file_path)
    print(f"Files extracted to: {extract_path}")


In [3]:
# Download the word embeddings into the following folder:
folder_to_download = "./models"

# 6 Billion token model
embedding_url = "https://nlp.stanford.edu/data/glove.6B.zip"
file_name_to_download = "glove.6B.zip"

# 42 Billion token model
# embedding_url = "https://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip"
# file_name_to_download = "glove.42B.300d.zip"

# Download and extract the embedding file, then set a variable with the path to the embedding file
download_and_extract_if_not_exist(embedding_url, folder_to_download, file_name_to_download, "glove.6B.50d.txt")
word_embedding_file_path = os.path.join(folder_to_download, "glove.6B.50d.txt")

Files already exist. Skipping download.


In [4]:
# Download the dataset into the following folder:
folder_to_download = "./dataset"

# 6 Billion token model
dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
file_name_to_download = "aclImdb_v1.tar.gz"

# Download and extract, then set the path to the dataset
download_and_extract_if_not_exist(dataset_url, folder_to_download, file_name_to_download, "aclImdb/imdb.vocab")
dataset_folder= os.path.join(folder_to_download, "aclImdb")

Files already exist. Skipping download.


In [5]:
dataset_folder= os.path.join(folder_to_download, "aclImdb")
def read_imdb_data_from_folder(imdb_reviews_folder):
    texts = []
    labels = []
    review_folders = ["pos", "neg"] # there are 2 folders in each dataset folder: pos and neg
    for connotation_dir in review_folders:
        review_dir = os.path.join(imdb_reviews_folder, connotation_dir)
        for filename in os.listdir(review_dir):
            with open(os.path.join(review_dir, filename), "r", encoding="utf8") as f:
                texts.append(f.read())
            labels.append(0 if connotation_dir == "neg" else 1)
    return texts, labels

def get_negative_positive_from_label(label):
    return "negative" if label == 0 else "positive"

In [6]:
# Load the IMDB Movie Review dataset from the downloaded folder
test_texts, test_labels = read_imdb_data_from_folder(os.path.join(dataset_folder, "test"))
train_texts, train_labels = read_imdb_data_from_folder(os.path.join(dataset_folder, "train"))

In [7]:
for i in range(5):
    print(f"Review #{i}")
    print(f"Label: {train_labels[i+24000]}")
    print(f"Sentiment: {get_negative_positive_from_label(train_labels[i+24000])}")
    print(f"Text: {train_texts[i+24000]}")
    print()

Review #0
Label: 0
Sentiment: negative
Text: Christ, oh Christ... One watches stunned, incredulous, and possibly deranged, as this tawdry exercise in mirthless smut unfolds with all the wit and dexterity of a palsied Galapagos tortoise. Can such things be? Does this movie actually exist, or was I the unwitting guinea pig of some shadowy international drugs company, sipping my coffee unaware that it had been spiked with a dangerous hallucinogen? I've seen a lot of films, and a lot of bad films, but nothing prepared me for this; by the end of it I was a gibbering, snivelling wreck, tearing at the carpet with my teeth like a dog, clawing at the walls, howling till my lungs were sore. I pleaded desperately, frenziedly for mercy (to whom this appeal was made, I don't know), and longed with burning desire for the soothing balm of Ozu Yasujiro. Sweet Weeping Jesus, the memories... sometimes they come back to me. When I'm at my most vulnerable, when I'm least able to handle them. I shudder, I 

In [36]:
import torch

# Hyperparameters
state_dimensions = [20, 50, 100, 200, 500]
embedding_dim = 50  # You can choose the dimension of the word embeddings -- Note: this needs to match the model I pick from Glove!!
max_len = 170  # You can adjust this based on the average length of reviews

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [24]:
# If the texts are longer than 500 words, shorten them, so they have a max of 500 words.
def truncate_nested_arrays(array, max_length):
    new_array = [None] * len(array)
    for i in range(len(array)):
        new_array[i] = array[i][:max_length]
    return new_array


In [25]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import torchtext

# Tokenize and pad sequences
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
train_texts_tokenized = [tokenizer(review) for review in train_texts]
test_texts_tokenized = [tokenizer(review) for review in test_texts]

train_texts_tokenized = truncate_nested_arrays(train_texts_tokenized, max_len)
test_texts_tokenized = truncate_nested_arrays(test_texts_tokenized, max_len)


In [26]:
# Calculate the total length and count of nested arrays
total_length = 0
count = 0

for nested_array in test_texts_tokenized:
    total_length += len(nested_array)
    count += 1

# Calculate the average length
average_length = total_length / count
print(f"Torch test: {average_length}")

for nested_array in train_texts_tokenized:
    total_length += len(nested_array)
    count += 1

# Calculate the average length
average_length = total_length / count
print(f"Torch training: {average_length}")

Torch test: 152.4964
Torch training: 152.87782


In [27]:
# Build vocabulary
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

vocab = torchtext.vocab.build_vocab_from_iterator(train_texts_tokenized + test_texts_tokenized) # Note: This probably shouldn't include the test_texts, but deal with this later

train_sequences = [torch.tensor([vocab[token] for token in review], dtype=torch.long, device=device)for review in train_texts_tokenized]
test_sequences = [torch.tensor([vocab[token]for token in review], dtype=torch.long, device=device) for review in test_texts_tokenized] # Error thrown here since the test tokens may not exist in the vocabulary

train_padded = nn.utils.rnn.pad_sequence(train_sequences, batch_first=True, padding_value=0)
test_padded = nn.utils.rnn.pad_sequence(test_sequences, batch_first=True, padding_value=0)

In [28]:
print(vocab["i"])
print(vocab["found"])

print(train_texts_tokenized[0])

print(len(train_padded[4]))

print(len(test_padded[5]))

9
232
['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', 'teachers', '.', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', 'high', "'", 's', 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', 'teachers', '.', 'the', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', "'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', '.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'i', 'immediately', 'recalled', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'at', '.', '.', '.', '.', '.', '.', '.

In [29]:
# Download GloVe word embeddings
# You can download from https://nlp.stanford.edu/projects/glove
# I'll assume you've downloaded the 100-dimensional GloVe embeddings

glove_file = word_embedding_file_path
embedding_index = {}
with open(glove_file, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [30]:
print(embedding_index.get("123"))
print(embedding_index.get("volptuous"))

[ 0.0034133 -0.14757    1.2649    -0.21882    0.31136    0.094664
  0.21661   -0.56447    0.22246    0.090778  -0.93579   -0.76006
 -0.066726  -0.16403   -0.016907  -1.2052    -0.31426    0.25058
 -0.78974    0.33977   -0.086043   0.10269    0.71572   -0.26135
 -0.37595    0.23748    0.62505   -0.43565   -0.39561   -0.23105
  1.3102     0.22167    0.64756    0.86903    0.85008   -0.32513
  0.84591   -0.12774    0.11723    0.41916   -0.54855   -0.45053
  1.1328     0.30328   -1.0642     0.97721   -0.75989    0.18138
  0.22958   -0.31125  ]
None


In [31]:
# Create embedding matrix
embedding_matrix = np.zeros((len(vocab), embedding_dim))
for word, i in vocab.get_stoi().items():
    # print(word, i)
    embedding_vector = embedding_index.get(word)
    # print(embedding_vector)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_matrix_tensor = torch.tensor(embedding_matrix, device=device)

In [39]:
num_epochs = 5

# Build model
class LSTMTextClassifier(nn.Module):
    def __init__(self, embedding_dim, state_dim):
        super(LSTMTextClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix_tensor, freeze=True)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=state_dim, batch_first=True, device=device)
        self.fc = nn.Linear(state_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = x.long()
        x = self.embedding(x)
        x = x.float()
        output, _ = self.rnn(x)
        output = output[:, -1, :]
        output = self.fc(output)
        output = self.sigmoid(output)
        return output


print_things = False

# Build model
class RNNTextClassifier(nn.Module):
    def __init__(self, embedding_dim, state_dim):
        super(RNNTextClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix_tensor, freeze=True)
        self.rnn = nn.RNN(embedding_dim, state_dim, batch_first=True, device=device)
        self.fc = nn.Linear(state_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """
        Args:
            x: Input tensor containing a sequence of tokens
        Return: The output of the current RNN, a list out tensors (a list of binary classifications)
        """
        x = x.long()
        if print_things: print(f"Passed in: {x}")
        vector_embedding = self.embedding(x).float()
        if print_things: print(f"Embedding lookup float: {vector_embedding}")
        rnn_output, _ = self.rnn(vector_embedding)

        if print_things: print(f"Embedding lookup float: {rnn_output}")
        # Option 1: Pooling (e.g., max pooling)
        max_pooling_output, _ = torch.max(rnn_output, 1)
        output = self.fc(max_pooling_output)
        output = self.sigmoid(output)
        if print_things: print(f"Model output: {output}")
        return output

def model_training_loop(model, criterion, optimizer, training_data_loader):

    training_loop_print = False

    loss = None
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        for batch in training_data_loader:
            inputs, labels = batch[0], batch[1]
            inputs, labels = inputs.to(device), labels.to(device)
            # if training_loop_print: print(f"Training Inputs: {inputs}")
            if training_loop_print: print(f"Training Label: {labels}")
            optimizer.zero_grad()
            outputs = model(inputs)
            if training_loop_print: print(f"Training Output: {outputs}")
            labels = labels.unsqueeze(1)
            loss = criterion(outputs, labels)
            if training_loop_print: print(f"Training Loss: {loss}")
            if training_loop_print: print(f"----")
            loss.backward()
            optimizer.step()

def evaluation_loop(model, test_loader) -> float:
    # Evaluation
    model.eval()

    print_test_info = False
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            # if print_test_info: print(f"Input: {inputs}")
            if print_test_info: print(f"Input Labels: {labels}")
            outputs = model(inputs)
            if print_test_info: print(f"Model Output: {outputs}")
            predicted = (outputs.squeeze() > 0.5).float()
            if print_test_info: print(f"Predicted: {predicted}")
            if print_test_info: print("---")
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total * 100
    return accuracy


In [40]:
batch_size_training = 64
batch_size_testing = 64
learning_rate = 0.001

# Model training loop
results_rnn = []
# Convert padded to torch.float32
train_padded = train_padded.float()
test_padded = test_padded.float()

train_label_tensor = torch.tensor(train_labels, dtype=torch.float32, device=device)
test_label_tensor = torch.tensor(test_labels, dtype=torch.float32, device=device)

# Convert to tensor dataset
train_dataset = TensorDataset(train_padded, train_label_tensor)
test_dataset = TensorDataset(test_padded, test_label_tensor)

# Converty the tensor datasets into a dataloader
train_loader = DataLoader(train_dataset, batch_size=batch_size_training, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size_testing, shuffle=True)


for state_dim in state_dimensions:
    for model_type in ["RNN"]: #, "LSTM"]:

        model = None
        if model_type == "RNN":
          model = RNNTextClassifier(embedding_dim, state_dim).to(device)
        elif model_type == "LSTM":
          model = LSTMTextClassifier(embedding_dim, state_dim).to(device)

        # Loss and optimizer
        criterion = nn.BCELoss() # Different criterion commonly used for classification tasks
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        model_training_loop(model, criterion, optimizer, train_loader)
        accuracy = evaluation_loop(model, test_loader)
        results_rnn.append((model_type, state_dim, accuracy))

# Display results
print("Results:")
print("| Model | State Dimension | Accuracy | Max Words |")
print("|-------|-----------------|----------| --------- |")
for result in results_rnn:
    print(f"| {result[0]} | {result[1]} | {result[2]:.4f}% | {max_len}")

Results:
| Model | State Dimension | Accuracy | Max Words |
|-------|-----------------|----------| --------- |
| RNN | 20 | 80.1360% | 170
| RNN | 50 | 79.4720% | 170
| RNN | 100 | 82.0360% | 170
| RNN | 200 | 82.7840% | 170
| RNN | 500 | 82.4000% | 170


## LSTM Hyperparameter Evaluation

In [35]:
results_lstm = []

for state_dim in state_dimensions:

    model = LSTMTextClassifier(embedding_dim, state_dim).to(device)
    # Loss and optimizer
    criterion = nn.BCELoss() # Different criterion commonly used for classification tasks
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model_training_loop(model, criterion, optimizer, train_loader)
    accuracy = evaluation_loop(model, test_loader)
    results_lstm.append((model_type, state_dim, accuracy))

# Display results
print("Results:")
print("| Model | State Dimension | Accuracy | Max Words |")
print("|-------|-----------------|----------| --------- |")
for result in results_lstm:
    print(f"| {result[0]} | {result[1]} | {result[2]:.4f}% | {max_len}")

tensor(0.6732, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.7005, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6678, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6604, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.7093, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6942, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6911, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6931, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6802, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6909, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6722, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6639, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6809, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.6763, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor