## Imports

In [1]:
import torch
from torch import nn
from torch import Tensor
import torch.optim as optim
from torchvision import models, transforms, datasets
from tqdm import tqdm

torch.cuda.is_available()

True

In [2]:
from torchnlp.encoders.text import StaticTokenizerEncoder
from torchnlp.encoders import pad_tensor
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import Dataset, DataLoader
import cv2
import os

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device: {}".format(device))

Using device: cuda


## Visualize the data

## Prepare data

In [4]:
def read_glove_file(path: str = 'glove') -> dict[str, np.ndarray]:
    """Load word vectors from a whitespace-separated, GloVe-style text file.

    Every non-blank line is split on whitespace: the first field is taken as
    the token and the remaining fields as the components of its vector.

    Args:
        path: Location of the embeddings file; it is read as UTF-8.

    Returns:
        Mapping from each token to its vector as a ``float32`` array. When a
        token appears on several lines, the last line wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be converted to a float.
    """
    embeddings_index = {}
    # The context manager closes the file even when a line fails to parse.
    with open(path, encoding="utf8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    print('Found %s word vectors.' % len(embeddings_index))

    return embeddings_index

In [5]:
def read_texts(imdb_dir: str = 'data/aclImdb') -> tuple[list[str], list[int]]:
    """Read the training reviews and their sentiment labels from disk.

    Reviews are loaded from ``<imdb_dir>/train/neg`` and
    ``<imdb_dir>/train/pos``; only files whose name ends in ``.txt`` are read.

    Args:
        imdb_dir: Root directory of the extracted data set. Defaults to the
            location the notebook has always used.

    Returns:
        A ``(texts, labels)`` pair of equal length. ``labels[i]`` is ``0`` when
        ``texts[i]`` came from the ``neg`` directory and ``1`` when it came
        from ``pos``. All negative reviews precede all positive ones.

    Raises:
        OSError: If a directory or review file cannot be read.
    """
    # os.path.join inserts the separator that plain string concatenation
    # dropped (which produced paths such as 'data/aclImdbtrainneg').
    train_dir = os.path.join(imdb_dir, 'train')

    labels = []
    texts = []

    # enumerate() yields 0 for 'neg' and 1 for 'pos'.
    for label, label_type in enumerate(['neg', 'pos']):
        dir_name = os.path.join(train_dir, label_type)
        # os.listdir() order is arbitrary; sort so reruns see the same order.
        for fname in sorted(os.listdir(dir_name)):
            if not fname.endswith('.txt'):
                continue
            with open(os.path.join(dir_name, fname), encoding="utf8") as review_file:
                texts.append(review_file.read())
            labels.append(label)

    return texts, labels

In [6]:
def texts_to_sequences(texts: list[str], maxlen: int, max_words: int) -> tuple[torch.Tensor, dict[str, int]]:
    """Tokenize ``texts`` and pack them into one fixed-width index tensor.

    Args:
        texts: Raw documents; they are used both to build the vocabulary and
            as the inputs to encode.
        maxlen: Width of every output row. Longer sequences are cut to their
            first ``maxlen`` entries, shorter ones are padded with the
            ``'<pad>'`` index.
        max_words: Forwarded to the tokenizer as ``num_tokens``.

    Returns:
        ``(data, word_index)`` where ``data`` stacks one padded sequence per
        text and ``word_index`` is the tokenizer's token-to-index mapping.
    """
    # NOTE(review): `num_tokens` is forwarded exactly as before; confirm that
    # the installed StaticTokenizerEncoder accepts this keyword.
    tokenizer = StaticTokenizerEncoder(texts, min_occurrences=1, reserved_tokens=['<pad>'], num_tokens=max_words)

    # Token -> index mapping built by the tokenizer.
    word_index = tokenizer.token_to_index
    print('Found %s unique tokens.' % len(word_index))

    padding_index = word_index['<pad>']
    padded_sequences = [
        # Slice before padding so no row exceeds `maxlen`; torch.stack needs
        # rows of equal size. The target length is passed positionally
        # because PyTorch-NLP names that parameter `length`, not `max_length`.
        pad_tensor(tokenizer.encode(text)[:maxlen], maxlen, padding_index=padding_index)
        for text in texts
    ]

    return torch.stack(padded_sequences), word_index

In [None]:
def get_embedding_matrix(max_words: int, embedding_index: dict[str, np.ndarray], word_index: dict[str, int]) -> np.ndarray:
    """Build a matrix whose row ``i`` is the embedding of the word at index ``i``.

    Args:
        max_words: Number of rows in the matrix. Words whose index is
            ``max_words`` or larger are ignored.
        embedding_index: Mapping from word to its embedding vector.
        word_index: Mapping from word to its row index.

    Returns:
        Array of shape ``(max_words, embedding_dimension)``. Rows for words
        that are missing from ``embedding_index`` stay all-zeros.

    Raises:
        ValueError: If ``embedding_index`` is empty, because the embedding
            dimension cannot be inferred.
    """
    if not embedding_index:
        raise ValueError('embedding_index is empty; cannot infer the embedding dimension.')
    # Read the dimension from any stored vector rather than from the word
    # 'the', which is not guaranteed to be present in every embedding file.
    embedding_dimension = next(iter(embedding_index.values())).shape[0]

    embedding_matrix = np.zeros((max_words, embedding_dimension))
    for word, i in word_index.items():
        if i >= max_words:
            continue
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [None]:
def split_data(data: np.ndarray, labels: list[int], train_part: float, validation_part: float, seed=None) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Shuffle the samples and split them into train, validation and test sets.

    Args:
        data: Samples, indexed along the first axis.
        labels: One label per sample, in the same order as ``data``.
        train_part: Fraction of the samples assigned to the training set.
        validation_part: Fraction of the samples assigned to the validation
            set. Whatever remains becomes the test set.
        seed: Optional integer seed for a reproducible shuffle. When ``None``
            (the default) the shuffle uses NumPy's global random state.

    Returns:
        ``(train_data, validation_data, test_data, train_labels,
        validation_labels, test_labels)``. Labels are returned as arrays.

    Raises:
        ValueError: If ``labels`` and ``data`` differ in length, or if the
            requested fractions are negative or exceed the available samples.
    """
    labels = np.asarray(labels)

    data_size = data.shape[0]
    if labels.shape[0] != data_size:
        raise ValueError(
            f"data has {data_size} samples but labels has {labels.shape[0]} entries."
        )

    train_size = int(data_size * train_part)
    validation_size = int(data_size * validation_part)
    if train_size < 0 or validation_size < 0 or train_size + validation_size > data_size:
        raise ValueError(
            "train_part and validation_part must be non-negative and together "
            "cover at most the whole data set."
        )

    if seed is None:
        # Keep the historical behaviour: draw from NumPy's global state.
        indices = np.arange(data_size)
        np.random.shuffle(indices)
    else:
        indices = np.random.default_rng(seed).permutation(data_size)

    validation_end = train_size + validation_size
    train_indices = indices[:train_size]
    validation_indices = indices[train_size:validation_end]
    test_indices = indices[validation_end:]

    train_data, train_labels = data[train_indices], labels[train_indices]
    validation_data, validation_labels = data[validation_indices], labels[validation_indices]
    test_data, test_labels = data[test_indices], labels[test_indices]

    print(f"Train data size: {train_data.shape[0]}")
    print(f"Validation data size: {validation_data.shape[0]}")
    print(f"Test data size: {test_data.shape[0]}")

    return train_data, validation_data, test_data, train_labels, validation_labels, test_labels