# Assignment 3
## Download Dependencies - Dataset, Pre-Trained Models
* Download pre-trained GloVe word embedding vectors from https://nlp.stanford.edu/projects/glove; they are used to initialize the embedding layer of the sentiment classifiers
* Download the IMDB movie review dataset from https://ai.stanford.edu/~amaas/data/sentiment/

In [19]:
import os
import urllib.request
import zipfile
import tarfile

def download_and_extract_if_not_exist(url, extract_path, zip_file_name, zip_file_contents_file_name):
    """Download an archive and unpack it into ``extract_path`` unless that was already done.

    Args:
        url: Location of the archive to download.
        extract_path: Directory that receives the archive and its extracted contents.
        zip_file_name: File name used for the archive inside ``extract_path``. Its
            extension selects the extractor: ``.zip``, ``.tar.gz`` or ``.tgz``.
        zip_file_contents_file_name: Path, relative to ``extract_path``, of a file that
            exists once the archive has been extracted. It is the "already done" marker.

    Raises:
        ValueError: If ``zip_file_name`` does not have a supported archive extension.
            Nothing is downloaded or deleted in that case.
    """
    # If the marker file already exists, the archive was downloaded and extracted earlier
    marker_path = os.path.join(extract_path, zip_file_contents_file_name)
    if os.path.exists(marker_path):
        print("Files already exist. Skipping download.")
        return

    zip_file_path = os.path.join(extract_path, zip_file_name)

    # Decide how to extract before touching the network or the disk, so that an
    # unsupported archive is never downloaded, silently skipped, or deleted.
    is_zip = zip_file_path.endswith(".zip")
    is_tar_gz = zip_file_path.endswith((".tar.gz", ".tgz"))
    if not (is_zip or is_tar_gz):
        raise ValueError(
            f"Unsupported archive type for '{zip_file_name}': expected .zip, .tar.gz or .tgz"
        )

    # Reuse an archive that is already on disk to save time
    if not os.path.exists(zip_file_path):
        print("Downloading zip file...")
        os.makedirs(extract_path, exist_ok=True)
        # Download to a temporary name and rename on success, so that an interrupted
        # download cannot be mistaken for a complete archive on the next run.
        partial_path = zip_file_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, zip_file_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
    else:
        print("Zip file already exists. Skipping download.")

    # Extract the contents of the archive into the target directory
    if is_zip:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    else:
        with tarfile.open(zip_file_path, 'r:gz') as tar_ref:
            if hasattr(tarfile, "data_filter"):
                # Refuse members that would escape extract_path (absolute paths, "..", links)
                tar_ref.extractall(extract_path, filter="data")
            else:
                # Older Python without extraction filters
                tar_ref.extractall(extract_path)

    # Remove the archive to free up space
    os.remove(zip_file_path)
    print(f"Files extracted to: {extract_path}")

### Download Pre-Trained Word Embeddings

In [2]:
# Destination folder for the pre-trained word embeddings
folder_to_download = "./models"

# GloVe vectors trained on 6 billion tokens
embedding_url = "https://nlp.stanford.edu/data/glove.6B.zip"
file_name_to_download = "glove.6B.zip"

# Alternative: GloVe vectors trained on 42 billion tokens
# embedding_url = "https://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip"
# file_name_to_download = "glove.42B.300d.zip"

# Fetch and unpack the archive (skipped when the 50-dimensional file is already on disk),
# then record where that embedding file lives for the later cells
download_and_extract_if_not_exist(
    url=embedding_url,
    extract_path=folder_to_download,
    zip_file_name=file_name_to_download,
    zip_file_contents_file_name="glove.6B.50d.txt",
)
word_embedding_file_path = os.path.join(folder_to_download, "glove.6B.50d.txt")

Files already exist. Skipping download.


### Download Movie Review Dataset

In [22]:
# Destination folder for the movie review dataset
folder_to_download = "./dataset"

# IMDB movie review archive (aclImdb v1)
dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
file_name_to_download = "aclImdb_v1.tar.gz"

# Fetch and unpack the archive (skipped when aclImdb/imdb.vocab is already on disk),
# then record the root folder of the extracted dataset
download_and_extract_if_not_exist(
    url=dataset_url,
    extract_path=folder_to_download,
    zip_file_name=file_name_to_download,
    zip_file_contents_file_name="aclImdb/imdb.vocab",
)
dataset_folder = os.path.join(folder_to_download, "aclImdb")

Files already exist. Skipping download.


# Create the Dataset for training
From the downloaded files, create a dataset for training using torch.

In [35]:
# Root of the extracted IMDB dataset; relies on folder_to_download still naming the dataset folder
dataset_folder = os.path.join(folder_to_download, "aclImdb")
def read_imdb_data_from_folder(imdb_reviews_folder):
    """Read every review below ``imdb_reviews_folder`` together with its sentiment label.

    The folder is expected to contain two sub-folders, ``pos`` and ``neg``, each
    holding one UTF-8 text file per review.

    Args:
        imdb_reviews_folder: Path of a dataset split folder (e.g. the "train" folder).

    Returns:
        A ``(texts, labels)`` tuple of equally long lists. All ``pos`` reviews come
        first, then all ``neg`` reviews. Within each group the reviews are ordered
        by file name. Labels are 1 for ``pos`` and 0 for ``neg``.
    """
    texts = []
    labels = []
    # There are 2 folders in each dataset folder: pos (label 1) and neg (label 0)
    for connotation_dir, label in (("pos", 1), ("neg", 0)):
        review_dir = os.path.join(imdb_reviews_folder, connotation_dir)
        # os.listdir returns entries in arbitrary order; sort so that positional
        # indexing into the result is reproducible across machines and runs.
        for filename in sorted(os.listdir(review_dir)):
            review_path = os.path.join(review_dir, filename)
            # Skip sub-directories (e.g. editor checkpoint folders), which cannot be opened as text
            if not os.path.isfile(review_path):
                continue
            with open(review_path, "r", encoding="utf8") as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels

def get_negative_positive_from_label(label):
    """Translate a numeric sentiment label into text: 0 is "negative", anything else "positive"."""
    if label == 0:
        return "negative"
    return "positive"

In [34]:
# Read both splits of the IMDB Movie Review dataset from disk (test first, then train)
test_texts, test_labels = read_imdb_data_from_folder(
    imdb_reviews_folder=os.path.join(dataset_folder, "test")
)
train_texts, train_labels = read_imdb_data_from_folder(
    imdb_reviews_folder=os.path.join(dataset_folder, "train")
)

In [38]:
# Show five consecutive training reviews, starting at index 24000, with their labels
for i in range(5):
    print(f"Review #{i}")
    print(
        f"Label: {train_labels[i + 24000]}",
        f"Sentiment: {get_negative_positive_from_label(train_labels[i + 24000])}",
        sep="\n",
    )
    # end="\n\n" produces the blank separator line after each review
    print(f"Text: {train_texts[i + 24000]}", end="\n\n")


Review #0
Label: 0
Sentiment: negative
Text: Christ, oh Christ... One watches stunned, incredulous, and possibly deranged, as this tawdry exercise in mirthless smut unfolds with all the wit and dexterity of a palsied Galapagos tortoise. Can such things be? Does this movie actually exist, or was I the unwitting guinea pig of some shadowy international drugs company, sipping my coffee unaware that it had been spiked with a dangerous hallucinogen? I've seen a lot of films, and a lot of bad films, but nothing prepared me for this; by the end of it I was a gibbering, snivelling wreck, tearing at the carpet with my teeth like a dog, clawing at the walls, howling till my lungs were sore. I pleaded desperately, frenziedly for mercy (to whom this appeal was made, I don't know), and longed with burning desire for the soothing balm of Ozu Yasujiro. Sweet Weeping Jesus, the memories... sometimes they come back to me. When I'm at my most vulnerable, when I'm least able to handle them. I shudder, I 

In [None]:
# import spacy

# # Tokenize the sentences into words
# train_sentence_example = train_texts[i+24000]

# # Load the English Spacy Tokenizer Model
# nlp = spacy.load("en_core_web_sm")
# doc = nlp.tokenizer(train_sentence_example)

# tokenized_test_text_docs = [nlp.tokenizer(text) for text in test_texts]
# tokenized_train_text_docs = [nlp.tokenizer(text) for text in train_texts]

# print_doc = tokenized_test_text_docs[24001]
# print(print_doc)
# for token in print_doc:
#     print(token.text)





In [39]:
# --- Experiment configuration ---

# Hidden-state sizes to try
state_dimensions = [20, 50, 100, 200, 500]

# Width of the word vectors -- must match the dimension of the GloVe file that is loaded!
embedding_dim = 50

# Review length in tokens; adjust based on the average length of the reviews
max_len = 200

# Optimisation settings
batch_size = 64
learning_rate = 1e-3
num_epochs = 5


### Tokenize the dataset and pad the sequences

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import torchtext


# Tokenize every review with torchtext's "basic_english" tokenizer
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
train_texts_tokenized = [tokenizer(review) for review in train_texts]
test_texts_tokenized = [tokenizer(review) for review in test_texts]

# Build the vocabulary from the training reviews only, so no test text leaks into it.
# Two special tokens are reserved first:
#   "<pad>" -> index 0, used only for padding, so it can never collide with a real word
#   "<unk>" -> index 1, returned for any token that is not in the vocabulary
#              (this is what lets test reviews contain words never seen in training)
pad_token, unk_token = "<pad>", "<unk>"
vocab = torchtext.vocab.build_vocab_from_iterator(
    train_texts_tokenized, specials=[pad_token, unk_token]
)
vocab.set_default_index(vocab[unk_token])
pad_index = vocab[pad_token]


def _encode_reviews(tokenized_reviews):
    """Turn each tokenized review into a 1-D long tensor of vocabulary ids, keeping at most max_len tokens."""
    return [
        torch.tensor([vocab[token] for token in review[:max_len]], dtype=torch.long)
        for review in tokenized_reviews
    ]


def _left_pad(sequences):
    """Stack id sequences into a (number of reviews, max_len) long tensor, padding on the left with pad_index."""
    padded = torch.full((len(sequences), max_len), pad_index, dtype=torch.long)
    for row, sequence in enumerate(sequences):
        # An empty review stays all padding; slicing with -0 would address the whole row
        if len(sequence) > 0:
            padded[row, -len(sequence):] = sequence
    return padded


train_sequences = _encode_reviews(train_texts_tokenized)
test_sequences = _encode_reviews(test_texts_tokenized)

# Padding goes in front of each review because the classifier reads the recurrent
# output at the final time step; the real tokens must therefore end the sequence.
train_padded = _left_pad(train_sequences)
test_padded = _left_pad(test_sequences)

### Start the model training

In [None]:
# Load the pre-trained GloVe vectors (https://nlp.stanford.edu/projects/glove) from the
# file referenced by word_embedding_file_path into a word -> vector lookup.
# Every line of the file holds a word followed by its coefficients, separated by whitespace.
glove_file = word_embedding_file_path
embedding_index = {}
with open(glove_file, encoding="utf-8") as glove_handle:
    for row in glove_handle:
        fields = row.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
# Build the embedding matrix: one row per vocabulary entry, filled with the GloVe
# vector when the word has one and left as zeros otherwise
embedding_matrix = np.zeros((len(vocab), embedding_dim))
for token, row in vocab.get_stoi().items():
    if token in embedding_index:
        embedding_matrix[row] = embedding_index[token]

In [None]:
# Model training loop
results = []

# The pre-trained weights are shared by every model. Build them once as float32
# so the embedding output already has the dtype the recurrent layers expect.
embedding_weights = torch.tensor(embedding_matrix, dtype=torch.float32)


class TextClassifier(nn.Module):
    """Binary sentiment classifier: frozen pre-trained embeddings -> RNN or LSTM -> linear -> sigmoid.

    Args:
        embedding_dim: Width of the word vectors fed to the recurrent layer.
        state_dim: Size of the recurrent hidden state.
        model_type: "RNN" or "LSTM"; selects the recurrent layer.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """

    _RECURRENT_LAYERS = {"RNN": nn.RNN, "LSTM": nn.LSTM}

    def __init__(self, embedding_dim, state_dim, model_type="RNN"):
        super().__init__()
        if model_type not in self._RECURRENT_LAYERS:
            raise ValueError(f"Unknown model_type '{model_type}': expected 'RNN' or 'LSTM'")
        self.embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=True)
        self.rnn = self._RECURRENT_LAYERS[model_type](embedding_dim, state_dim, batch_first=True)
        self.fc = nn.Linear(state_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability per input row, shaped (batch, 1)."""
        # Token ids must be integers for the embedding lookup
        x = self.embedding(x.long())
        output, _ = self.rnn(x)
        # NOTE(review): only the output at the final time step is used. If the inputs
        # are padded at the end, that step is a padding position -- confirm the
        # padding side used when train_padded / test_padded are built.
        output = output[:, -1, :]
        return self.sigmoid(self.fc(output))


# The data does not depend on the model, so build the datasets and loaders once.
# Token ids stay integers; .long() is a no-op unless they were converted earlier.
train_dataset = TensorDataset(train_padded.long(), torch.tensor(train_labels, dtype=torch.float32))
test_dataset = TensorDataset(test_padded.long(), torch.tensor(test_labels, dtype=torch.float32))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

for state_dim in state_dimensions:
    for model_type in ["RNN", "LSTM"]:
        # Build model
        model = TextClassifier(embedding_dim, state_dim, model_type)

        # Loss and optimizer
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        # Training loop
        for epoch in range(num_epochs):
            model.train()
            for inputs, labels in train_loader:
                optimizer.zero_grad()
                outputs = model(inputs)
                # squeeze(1) drops only the feature axis, so a final batch of size 1
                # keeps its batch axis and still matches the shape of labels
                loss = criterion(outputs.squeeze(1), labels)
                loss.backward()
                optimizer.step()

        # Evaluation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                outputs = model(inputs)
                predicted = (outputs.squeeze(1) > 0.5).float()
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = correct / total
        results.append((model_type, state_dim, accuracy))

# Print the collected accuracies as a markdown-style table
print("Results:")
print("| Model | State Dimension | Accuracy |")
print("|-------|------------------|----------|")
for kind, hidden_size, acc in results:
    print(f"| {kind} | {hidden_size} | {acc:.4f} |")