## Imports

In [146]:
import torch
from torch import nn
from torch import Tensor
import torch.optim as optim
from tqdm import tqdm
import torchtext
import torchtext.data

torch.cuda.is_available()

True

In [147]:
from torchnlp.encoders.text import StaticTokenizerEncoder
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import Dataset, DataLoader
import cv2
import os

In [148]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## Visualize the data

In [149]:
import matplotlib.pyplot as plt

def plot_history(train_acc_history, train_loss_history, val_acc_history, val_loss_history):
    epochs = range(1, len(train_loss_history) + 1)
    
    # Loss
    plt.figure(1)
    plt.plot(epochs, train_loss_history, 'b', label='Training loss (' + str(format(train_loss_history[-1], '.5f')) + ')')
    plt.plot(epochs, val_loss_history, 'g', label='Validation loss (' + str(format(val_loss_history[-1], '.5f')) + ')')
    
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    
    # Accuracy
    plt.figure(2)
    plt.plot(epochs, train_acc_history, 'b', label='Training accuracy (' + str(format(train_acc_history[-1], '.5f')) + ')')
    plt.plot(epochs, val_acc_history, 'g', label='Validation accuracy (' + str(format(val_acc_history[-1], '.5f')) + ')')

    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()


## Prepare data

### Files operations

In [150]:
def read_glove_file(path: str) -> dict[str, np.ndarray]:
    embeddings: dict = {}
    f = open(path, encoding ="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings[word] = coefs
    f.close()

    print('Found %s word vectors.' % len(embeddings))

    return embeddings

In [151]:
def read_texts() -> tuple[list[str], list[int]]:
    imdb_dir = 'kaggle/Imdb/'
    train_dir = imdb_dir + 'train/'

    labels = []
    texts = []

    for label_type in ['neg', 'pos']:
        dir_name = train_dir + label_type
        for fname in os.listdir(dir_name):
            if fname[-4:] == '.txt':
                f = open(os.path.join(dir_name, fname), encoding ="utf8")
                texts.append(f.read())
                f.close()
                if label_type == 'neg':
                    labels.append(0)
                else:
                    labels.append(1)

    return texts, labels

In [152]:
embeddings = read_glove_file('kaggle/glove.6B/glove.6B.100d.txt')
texts, labels = read_texts()

Found 400000 word vectors.


### Data operations

In [153]:
def texts_to_sequences(texts: list[str], maxlen: int, max_words: int) -> tuple[torch.Tensor, dict[any, int]]:
    tokenizer = torchtext.data.utils.get_tokenizer(None)    
    filter_tokens = lambda x: [word for word in tokenizer(x) if word in embeddings]
    text_sequences = [filter_tokens(text) for text in texts]
    all_tokens_set = set()
    for seq in text_sequences:
        for token in seq:
            all_tokens_set.add(token)
    
    all_embedding_tokens = set(embeddings.keys())
    tokens = all_tokens_set.intersection(all_embedding_tokens)

    word_index = {word: i + 1 for i, word in enumerate(tokens) if i + 1 < max_words}
    sequences = [[word_index[word] for word in text if word in word_index] for text in text_sequences]
    padded_sequences = torch.zeros(len(sequences), maxlen, dtype=torch.long)
    
    for i, seq in enumerate(sequences):
        if len(seq) > maxlen:
            seq = seq[:maxlen]
        else:
            seq = seq + [0] * (maxlen - len(seq))
        padded_sequences[i] = torch.tensor(seq)

    return padded_sequences, word_index
    

In [154]:
def get_embedding_matrix(max_words: int, embedding_index: dict[str, np.ndarray], word_index: dict[any, int]) -> np.ndarray:
    embedding_dimension = embedding_index.get('the').shape[0]

    embedding_matrix = np.zeros((max_words, embedding_dimension))
    for word, i in word_index.items():
        if i < max_words:
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                # Words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [155]:
def split_data(data: np.ndarray, labels: list[int], train_part: float, validation_part: float) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    labels = np.asarray(labels)

    data_size = data.shape[0]
    train_size = int(data_size * train_part)
    validation_size = int(data_size * validation_part)
    test_size = data_size - train_size - validation_size

    indices = np.arange(data_size)
    np.random.shuffle(indices)

    train_indices = indices[:train_size]
    validation_indices = indices[train_size:train_size + validation_size]
    test_indices = indices[train_size + validation_size:]

    train_data = data[train_indices]
    validation_data = data[validation_indices]
    test_data = data[test_indices]

    train_labels = labels[train_indices]
    validation_labels = labels[validation_indices]
    test_labels = labels[test_indices]

    print(f"Train data size: {train_data.shape[0]}")
    print(f"Validation data size: {validation_data.shape[0]}")
    print(f"Test data size: {test_data.shape[0]}")

    return train_data, validation_data, test_data, train_labels, validation_labels, test_labels

## Get the data

In [156]:
def prepare_data_for_model(maxlen: int, max_words: int, embeddings: dict[str, np.ndarray], texts: list[str], labels: list[int], train_part: float, validation_part: float) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, dict[any, int], np.ndarray]:
    data, word_index = texts_to_sequences(texts, maxlen=maxlen, max_words=max_words)
    embedding_matrix = get_embedding_matrix(max_words, embeddings, word_index)
    train_data, validation_data, test_data, train_labels, validation_labels, test_labels = split_data(data, labels, train_part, validation_part)
    return train_data, validation_data, test_data, train_labels, validation_labels, test_labels, word_index, embedding_matrix


In [157]:
train_data, validation_data, test_data, train_labels, validation_labels, test_labels, word_index, embedding_matrix = prepare_data_for_model(maxlen=100, max_words=10000, embeddings=embeddings, texts=texts, labels=labels, train_part=0.8, validation_part=0.1)

Train data size: 20000
Validation data size: 2500
Test data size: 2500


In [158]:
train_dataset = torch.utils.data.TensorDataset(train_data, torch.tensor(train_labels, dtype=torch.long))
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

validation_dataset = torch.utils.data.TensorDataset(validation_data, torch.tensor(validation_labels, dtype=torch.long))
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=32, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(test_data, torch.tensor(test_labels, dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

## Model

In [159]:
EPOCHS: int = 10

In [160]:
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size: int, embed_size: int, maxlen: int):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.conv = nn.Conv1d(embed_size, 32, kernel_size=3)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)
        conv_output_size = (maxlen - 2) // 2
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(32 * conv_output_size, 1)
        self.sigmoid = nn.Sigmoid()
        
    def set_embedding(self, embedding_matrix: np.ndarray) -> None:
        assert self.embedding.weight.shape == embedding_matrix.shape, "Embedding matrix shape mismatch"
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embedding.weight.requires_grad = False
        print("Embedding shape:", self.embedding.weight.data.shape)
        
    def forward(self, x: Tensor) -> Tensor:
        x = self.embedding(x)
        x = x.permute(0, 2, 1)  # Change shape to [batch_size, embed_size, seq_len] for Conv1d
        x = self.conv(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.flatten(x)
        x = self.dropout(x)
        x = self.linear(x)
        x = self.sigmoid(x)
        return x

In [163]:
def train_model(model: nn.Module, criterion: nn.Module, optimizer: optim.Optimizer,
                train_loader: torch.utils.data.DataLoader, validation_loader: torch.utils.data.DataLoader) -> tuple[list[float], list[float], list[float], list[float]]:
    model.to(device)
    criterion.to(device)
    # torch.cuda.empty_cache()
    accuracy_history: list = []
    loss_history: list = []
    val_accuracy_history: list = []
    val_loss_history: list = []
    
    print("Training model...")
    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        for data, target in tqdm(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target.float().view(-1, 1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            predicted = (output > 0.5).float()
            total += target.size(0)
            correct += (predicted == target.float().view(-1, 1)).sum().item()
        accuracy = correct / total
        loss_history.append(train_loss / len(train_loader))
        accuracy_history.append(accuracy)
        
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for data, target in validation_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                loss = criterion(output, target.float().view(-1, 1))
                val_loss += loss.item()
                predicted = (output > 0.5).float()
                total += target.size(0)
                correct += (predicted == target.float().view(-1, 1)).sum().item()
        val_accuracy = correct / total
        val_loss_history.append(val_loss / len(validation_loader))
        val_accuracy_history.append(val_accuracy)
        
        print(f"Epoch {epoch + 1}/{EPOCHS}:")
        print(f"Train loss: {loss_history[-1]}, accuracy: {accuracy}")
        print(f"Validation loss: {val_loss_history[-1]}, accuracy: {val_accuracy}")
        
    return accuracy_history, loss_history, val_accuracy_history, val_loss_history
        

In [164]:
model = TextClassificationModel(vocab_size=10000, embed_size=100, maxlen=100)
model.set_embedding(embedding_matrix)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
accuracy_history, loss_history, val_accuracy_history, val_loss_history = train_model(model, criterion, optimizer, train_loader, validation_loader)

Embedding shape: torch.Size([10000, 100])
Training model...


  0%|          | 0/625 [00:00<?, ?it/s]

100%|██████████| 625/625 [00:05<00:00, 113.76it/s]


Epoch 1/10:
Train loss: 0.6524694971084595, accuracy: 0.61135
Validation loss: 0.6140984964521625, accuracy: 0.6776


100%|██████████| 625/625 [00:04<00:00, 125.57it/s]


Epoch 2/10:
Train loss: 0.6028225239276886, accuracy: 0.6706
Validation loss: 0.6044006743763066, accuracy: 0.676


100%|██████████| 625/625 [00:05<00:00, 121.60it/s]


Epoch 3/10:
Train loss: 0.5810943242549896, accuracy: 0.6943
Validation loss: 0.5836772235888469, accuracy: 0.7032


100%|██████████| 625/625 [00:04<00:00, 125.88it/s]


Epoch 4/10:
Train loss: 0.5606883839607238, accuracy: 0.71165
Validation loss: 0.5881670558754402, accuracy: 0.6912


100%|██████████| 625/625 [00:06<00:00, 98.43it/s] 


Epoch 5/10:
Train loss: 0.5441222069740296, accuracy: 0.7231
Validation loss: 0.5753334353241739, accuracy: 0.7008


100%|██████████| 625/625 [00:05<00:00, 121.01it/s]


Epoch 6/10:
Train loss: 0.5265481192588806, accuracy: 0.7386
Validation loss: 0.577605883154688, accuracy: 0.706


100%|██████████| 625/625 [00:05<00:00, 119.83it/s]


Epoch 7/10:
Train loss: 0.5095807530403137, accuracy: 0.74805
Validation loss: 0.614638537922992, accuracy: 0.6832


100%|██████████| 625/625 [00:05<00:00, 118.51it/s]


Epoch 8/10:
Train loss: 0.4962110005378723, accuracy: 0.75705
Validation loss: 0.5967873636680313, accuracy: 0.6996


100%|██████████| 625/625 [00:05<00:00, 119.00it/s]


Epoch 9/10:
Train loss: 0.4837517150878906, accuracy: 0.76385
Validation loss: 0.5960542819922483, accuracy: 0.6928


100%|██████████| 625/625 [00:05<00:00, 117.89it/s]


Epoch 10/10:
Train loss: 0.4730104826450348, accuracy: 0.7699
Validation loss: 0.6033857453473007, accuracy: 0.6948
