# 1. Import requirements

In [1]:
# ! pip install -r requirements.txt

In [2]:
# from data_loader import *
# from text_cnn import TextCNN
# from model_controller import *
# from hyper_tuning import hyper_tuning

# import pandas as pd
import torch
# import torch.nn as nn

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# 2. Load dataset

In [4]:
# import torch
from torch.utils.data import Dataset, DataLoader

import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import string

from sklearn.model_selection import train_test_split
from collections import Counter

nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")


class Data_Staff:
    def __init__(self, language):
        # Init neccessary tools
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words(language))
        self.MAX_SEQ_LENGTH = 256  # Max length of sentence to encode

    # Sentence preprocessing function
    def preprocess_text(self, text):
        text = text.lower()  # Convert into lower case
        tokens = word_tokenize(text)
        tokens = [
            word for word in tokens if word not in string.punctuation
        ]  # Remove punctuation
        tokens = [
            word for word in tokens if word not in self.stop_words
        ]  # Remove stopwords
        tokens = [self.stemmer.stem(word) for word in tokens]  # Stemming
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]  # Lematizing
        return " ".join(tokens)

    # Build vocab
    def build_vocab(self, texts, max_vocab_size=10000):
        word_counts = Counter()
        for text in texts:
            word_counts.update(text.split())
        vocab = {
            word: idx + 2
            for idx, (word, _) in enumerate(word_counts.most_common(max_vocab_size))
        }
        vocab["<PAD>"] = 0
        vocab["<UNK>"] = 1
        return vocab

    # Indexing
    def encode_text(self, text, vocab):
        return [vocab.get(word, vocab["<UNK>"]) for word in text.split()]

    # Encode padding for train and test set
    def pad_sequences(self, sequences, max_length):
        padded_sequences = []
        for seq in sequences:
            if len(seq) > max_length:
                padded_sequences.append(seq[:max_length])
            elif len(seq) < max_length:
                padded_sequences.append(seq + [0] * (max_length - len(seq)))
            else:
                padded_sequences.append(seq)
        return torch.Tensor(padded_sequences)

    def staff(self, dataframe):
        # Preprocess review
        dataframe["review"] = dataframe["review"].apply(self.preprocess_text)

        # Split data into train and test
        X_train, X_test, y_train, y_test = train_test_split(
            dataframe["review"], dataframe["sentiment"], test_size=0.2, random_state=42
        )

        # Build vocab
        vocab = self.build_vocab(X_train)

        # Encode and padding
        X_train_encoded = [self.encode_text(text, vocab) for text in X_train]
        X_test_encoded = [self.encode_text(text, vocab) for text in X_test]
        X_train_padded = self.pad_sequences(X_train_encoded, self.MAX_SEQ_LENGTH)
        X_test_padded = self.pad_sequences(X_test_encoded, self.MAX_SEQ_LENGTH)

        # Convert into Tensor
        y_train = torch.tensor([1 if label == "positive" else 0 for label in y_train])
        y_test = torch.tensor([1 if label == "positive" else 0 for label in y_test])

        return vocab, X_train_padded, X_test_padded, y_train, y_test


# Define dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts.long()
        self.labels = labels.long()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]


# Get dataloader
def load_data(dataframe, batch_size):
    ds = Data_Staff(language="english")
    vocab, X_train, X_test, y_train, y_test = ds.staff(dataframe)

    # Create Dataset and DataLoader
    train_dataset = TextDataset(X_train, y_train)
    test_dataset = TextDataset(X_test, y_test)
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True
    )
    test_loader = DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False
    )

    return vocab, train_loader, test_loader


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\longt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\longt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\longt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\longt\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
df = pd.read_csv('data/imdb/review.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


In [6]:
vocab, train_loader, test_loader = load_data(df, batch_size=64)

# 3. Train and evaluate model

In [7]:
# import torch
import torch.nn as nn

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, kernel_sizes, num_filters, dropout=0.5):
        super(TextCNN, self).__init__()
        
        # Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Convolutional Layers with different kernel size
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embedding_dim)) for k in kernel_sizes
        ])
        
        self.dropout = nn.Dropout(dropout)
        
        # Fully Connected Layer
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def forward(self, x):
        # Get embedding
        x = self.embedding(x)  # (batch_size, seq_length, embedding_dim)
        x = x.unsqueeze(1)  # (batch_size, 1, seq_length, embedding_dim)

        # Apply Conv
        conv_results = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]  # (batch_size, num_filters, seq_len - k + 1)
        pooled_results = [torch.max(result, dim=2)[0] for result in conv_results]  # Max pooling

        # Concate result from kernel sizes
        x = torch.cat(pooled_results, dim=1)  # (batch_size, num_filters * len(kernel_sizes))
        
        x = self.dropout(x)
        
        # Fully connected layer to classification
        x = self.fc(x)  # (batch_size, num_classes)
        return x


In [8]:
# import torch
import tqdm


# Train model
def train(
    model, criterion, optimizer, train_loader, device=torch.device("cpu"), epochs=1
):
    model.to(device)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        with tqdm.tqdm(train_loader, unit="batch") as tepoch:
            for inputs, labels in tepoch:
                tepoch.set_description(f"Epoch {epoch}")
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Clear gradient
                optimizer.zero_grad()

                # Forward
                outputs = model(inputs)

                # Compute loss
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

                # Compute accuracy
                _, predicted = torch.max(outputs, 1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

            epoch_loss = running_loss / len(train_loader)
            epoch_acc = correct / total
            print(
                f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}"
            )

    return running_loss / len(train_loader)


# Evaluate model
def evaluate(model, test_loader, device=torch.device("cpu")):
    model.to(device)
    # Evaluate model on test set
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy


# Save model
def save(model, filepath):
    torch.save(model.state_dict(), filepath)


In [9]:
# Init model
vocab_size = len(vocab)
embedding_dim = 100
num_classes = 2  # Positive/Negative
kernel_sizes = [3, 4, 5]
num_filters = 100

model = TextCNN(vocab_size, embedding_dim, num_classes, kernel_sizes, num_filters)

In [10]:
# Define Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [11]:
train(model=model, criterion=criterion, optimizer=optimizer, train_loader=train_loader, device=device, epochs=5)

Epoch 0: 100%|██████████| 625/625 [00:06<00:00, 98.32batch/s] 


Epoch 1/5, Loss: 0.5689, Accuracy: 0.7029


Epoch 1: 100%|██████████| 625/625 [00:06<00:00, 103.44batch/s]


Epoch 2/5, Loss: 0.4164, Accuracy: 0.8056


Epoch 2: 100%|██████████| 625/625 [00:06<00:00, 103.44batch/s]


Epoch 3/5, Loss: 0.3459, Accuracy: 0.8492


Epoch 3: 100%|██████████| 625/625 [00:06<00:00, 102.82batch/s]


Epoch 4/5, Loss: 0.2955, Accuracy: 0.8748


Epoch 4: 100%|██████████| 625/625 [00:06<00:00, 102.23batch/s]

Epoch 5/5, Loss: 0.2590, Accuracy: 0.8920





0.258951305603981

In [12]:
evaluate(model=model, test_loader=test_loader, device=device)

Test Accuracy: 0.8800


0.88

In [13]:
save(model, 'models/model1.pth')

# 4. Hyperparameter tuning

In [14]:
# from data_loader import load_data
# from model_controller import train
# from text_cnn import TextCNN

# import torch.nn as nn

# from itertools import product
# from torch.optim import Adam, SGD



# # Fine tuning model
# def hyper_tuning(dataframe, epochs=5):
#     param_grid = {
#         'batch_size': [16, 32, 64],
#         'learning_rate': [1e-3, 5e-3],
#         'optimizer': ['SGD', 'Adam'],
#         'embedding_dim': [50, 100, 150],
#         'num_filters': [50, 100, 150]
#     }

#     param_combinations = list(product(*param_grid.values()))
    
#     best_params = None
#     best_loss = float('inf')
#     for params in param_combinations:
#         batch_size, lr, opt, emb_dim, num_fils = params
#         print(f"\nTesting with batch size={batch_size}, learning rate={lr}, optimizer={opt}, embedding_dim={emb_dim}, num_filters={num_fils}")
        
#         vocab, train_loader, _ = load_data(dataframe, batch_size=batch_size)
        
#         vocab_size = len(vocab)
#         embedding_dim = emb_dim
#         num_classes = 2  # Positive/Negative
#         kernel_sizes = [3, 4, 5]
#         num_filters = num_fils

#         model = TextCNN(vocab_size, embedding_dim, num_classes, kernel_sizes, num_filters)

#         criterion = nn.CrossEntropyLoss()
#         optimizer = SGD(model.parameters(), lr=lr) if opt == 'SGD' else Adam(model.parameters(), lr=lr)

#         avg_loss = train(model=model, criterion=criterion, optimizer=optimizer, train_loader=train_loader, epochs=epochs)
        
#         if avg_loss < best_loss:
#             best_loss = avg_loss
#             best_params = {
#                 'batch_size': batch_size,
#                 'learning_rate': lr,
#                 'optimizer': opt,
#                 'embedding_dim': emb_dim,
#                 'num_filters': num_fils
#             }

#     return best_params, best_loss

In [15]:
df = pd.read_csv('data/imdb/review.csv')

In [16]:
best_params, best_loss = hyper_tuning(df, epochs=5)

NameError: name 'hyper_tuning' is not defined

In [None]:
print(f'Best params: {best_params}\n\nBest loss: {best_loss}')