In [None]:
import polars as pl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


# Polars display options: hide the shape line of printed frames, show up to
# 60 characters of each string cell and up to 25 rows per table.
pl.Config.set_tbl_hide_dataframe_shape(True)
pl.Config.set_fmt_str_lengths(60)
pl.Config.set_tbl_rows(25)

# Silence every warning for the remainder of the session (this also hides
# deprecation notices, so re-enable warnings when debugging).
warnings.filterwarnings('ignore')

In [None]:
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset

# Fetch the 'm0saan/bikeshop' dataset through the `datasets` library.
bikeshop_dataset = load_dataset('m0saan/bikeshop')


# Convert the 'train' split to a pandas DataFrame and display it.
df = bikeshop_dataset['train'].to_pandas()
df

In [None]:
# Model inputs come from the `text` column, targets from the `intent` column.
X, y = df["text"], df["intent"]

# Encode each intent label as an integer class id.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Create a custom dataset class
class TextDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` tensor pairs.

    Each text is split on whitespace, truncated to ``max_len`` tokens, mapped
    to integer ids through a vocabulary and right-padded with the ``<PAD>`` id
    so that every sample has exactly ``max_len`` ids.

    Args:
        X: Texts, either a pandas Series or any positionally indexable sequence.
        y: Integer class labels aligned with ``X`` by position (pandas Series,
            numpy array or sequence).
        max_len: Fixed number of token ids per sample.
        vocab: Optional token -> id mapping containing ``"<PAD>"`` and
            ``"<UNK>"``. When omitted, the module-level ``dataset_vocab`` is
            looked up at access time (the original behaviour).
    """

    def __init__(self, X, y, max_len, vocab=None):
        self.X = X
        self.y = y
        self.max_len = max_len
        self.vocab = vocab

    def __len__(self):
        return len(self.X)

    @staticmethod
    def _item_at(seq, idx):
        """Return the element at position ``idx`` for pandas objects and plain sequences alike."""
        # pandas `[]` is label-based, so use `.iloc` whenever it exists to keep
        # texts and labels aligned even if the index is not 0..n-1.
        return seq.iloc[idx] if hasattr(seq, "iloc") else seq[idx]

    def __getitem__(self, idx):
        # Resolve the vocabulary lazily so the global may be defined after the class.
        vocab = dataset_vocab if self.vocab is None else self.vocab

        text = self._item_at(self.X, idx)
        label = self._item_at(self.y, idx)

        # Tokenize on whitespace and truncate to the fixed length.
        tokens = text.split()[:self.max_len]

        # Unknown tokens fall back to the <UNK> id.
        unk_id = vocab["<UNK>"]
        ids = [vocab.get(token, unk_id) for token in tokens]

        # Right-pad with the <PAD> id up to max_len.
        ids += [vocab["<PAD>"]] * (self.max_len - len(ids))

        # Pin dtypes: embedding lookups and CrossEntropyLoss both expect int64,
        # and an empty id list would otherwise become a float tensor.
        return (
            torch.tensor(ids, dtype=torch.long),
            torch.tensor(label, dtype=torch.long),
        )

# Build the token -> id vocabulary: ids 0 and 1 are reserved for padding and
# unknown tokens, every other whitespace-separated token gets the next free id
# in order of first appearance.
dataset_vocab = {"<PAD>": 0, "<UNK>": 1}
for sentence in X:
    for word in sentence.split():
        # setdefault leaves already-known words untouched.
        dataset_vocab.setdefault(word, len(dataset_vocab))

In [None]:
# Report the number of vocabulary entries (the <PAD> and <UNK> entries included).
print("Vocabulary size:", len(dataset_vocab))

In [None]:
# Every sample is padded/truncated to `max_len` ids; batches hold `batch_size` samples.
max_len, batch_size = 32, 32

# Wrap the texts and encoded labels, then serve them in shuffled mini-batches.
dataset = TextDataset(X, y_encoded, max_len)
train_loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Pull a single batch from the loader and display the shapes of its
# token-id tensor and label tensor as a sanity check.
xb, yb = next(iter(train_loader))
xb.shape, yb.shape

In [None]:
import torch.utils
import torch.utils.data


def train(
    dl_loader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    model: torch.nn.Module,
    ):
    """Train the model for one pass over the training loader.

    Args:
        dl_loader: Loader yielding ``(inputs, labels)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss taking ``(logits, labels)`` and returning a scalar
            that is the mean over the batch.
        model: Network mapping a batch of inputs to per-class logits.

    Returns:
        Tuple ``(mean_loss, accuracy)``, both averaged over
        ``len(dl_loader.dataset)`` samples.
    """
    total_loss, total_acc = 0.0, 0.0

    # Move batches to wherever the model lives instead of relying on a global
    # `device` variable that may not exist yet (or may disagree with the model).
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    model.train()
    for text, label in dl_loader:
        if target_device is not None:
            text = text.to(target_device)
            label = label.to(target_device)
        optimizer.zero_grad()
        logits = model(text)
        loss = criterion(logits, label)
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by batch size so a smaller final batch
        # does not skew the epoch average.
        total_loss += loss.item() * label.size(0)
        total_acc += (logits.argmax(dim=1) == label).float().sum().item()

    num_samples = len(dl_loader.dataset)
    return total_loss / num_samples, total_acc / num_samples

def evaluate(
    dl_loader: torch.utils.data.DataLoader,
    criterion: torch.nn.Module,
    model: torch.nn.Module,
    ):
    """Evaluate the model on a validation or test loader without updating it.

    Args:
        dl_loader: Loader yielding ``(inputs, labels)`` batches.
        criterion: Loss taking ``(logits, labels)`` and returning a scalar
            that is the mean over the batch.
        model: Network mapping a batch of inputs to per-class logits.

    Returns:
        Tuple ``(mean_loss, accuracy)``, both averaged over
        ``len(dl_loader.dataset)`` samples. The model is left in eval mode.
    """
    total_loss, total_acc = 0.0, 0.0

    # Batches must live on the same device as the model; without this move a
    # model on an accelerator fails with a device-mismatch error.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    model.eval()
    with torch.inference_mode():
        for text, label in dl_loader:
            if target_device is not None:
                text = text.to(target_device)
                label = label.to(target_device)
            logits = model(text)
            loss = criterion(logits, label)
            total_acc += (logits.argmax(dim=1) == label).float().sum().item()
            # Weight the batch-mean loss by batch size for a true per-sample mean.
            total_loss += loss.item() * label.size(0)

    num_samples = len(dl_loader.dataset)
    return total_loss / num_samples, total_acc / num_samples

In [None]:
from dataclasses import dataclass, field

@dataclass
class CNNConfig:
    """Hyper-parameters for the text classification model built in this notebook."""
    
    # Size of the token vocabulary. NOTE: this default is evaluated once, when
    # the class is defined, so re-run this cell after `dataset_vocab` changes.
    vocab_size: int = len(dataset_vocab)
    # Width of each token embedding vector.
    embed_dim: int = 100
    # NOTE(review): the two hidden sizes below are not referenced by CNNModel
    # in this notebook — presumably kept for other architectures; confirm.
    lstm_hidden_dim: int = 100
    fc_hidden_dim: int = 32
    # Intended number of output classes. NOTE(review): hard-coded — presumably
    # the number of distinct intents; verify against len(le.classes_).
    output_dim: int = 9
    # Dropout probability.
    dropout: float = 0.5

    # NOTE(review): `num_filters` is not referenced by CNNModel; the
    # per-convolution widths come from `num_channels` instead.
    num_filters: int = 100
    # Output channels of each convolution, paired element-wise with `kernel_sizes`.
    num_channels: list = field(default_factory=lambda: [100, 100, 100])
    kernel_sizes: list = field(default_factory=lambda: [3, 4, 5]) # Use default_factory for mutable defaults
    
    
# Instantiate with the defaults above and display the resulting configuration.
config = CNNConfig()
config

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

def load_glove_embeddings(glove_path, word_to_index, embed_dim):
    """Build an embedding matrix for a vocabulary from a GloVe text file.

    The file is streamed line by line and only vectors for words present in
    ``word_to_index`` are parsed, so memory use scales with the vocabulary
    rather than with the size of the GloVe file. Blank lines and lines for
    words outside the vocabulary are skipped without being parsed.

    Args:
        glove_path: Path to a text file with one ``word v1 v2 ... vN`` entry
            per line.
        word_to_index: Mapping from token to its row index in the matrix.
        embed_dim: Expected vector length ``N``.

    Returns:
        ``torch.float32`` tensor of shape ``(len(word_to_index), embed_dim)``.
        Rows of words missing from the file stay all-zero. Lookup is an exact,
        case-sensitive string match.

    Raises:
        ValueError: If a vector for a vocabulary word does not have exactly
            ``embed_dim`` components or contains a non-numeric value.
    """
    embedding_matrix = np.zeros((len(word_to_index), embed_dim), dtype=np.float32)
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            row = word_to_index.get(values[0])
            if row is None:
                # Not in our vocabulary; skip before the costly float parse.
                continue
            coefs = np.asarray(values[1:], dtype='float32')
            if coefs.shape[0] != embed_dim:
                raise ValueError(
                    f"GloVe vector for {values[0]!r} has {coefs.shape[0]} "
                    f"dimensions, expected {embed_dim}"
                )
            embedding_matrix[row] = coefs
    return torch.from_numpy(embedding_matrix)

class CNNModel(nn.Module):
    """Text classifier built from parallel 1-D convolutions over token embeddings.

    Every token is represented by the concatenation of a trainable embedding
    and a frozen pretrained embedding. Each convolution is followed by global
    average pooling over the sequence axis and a ReLU; the pooled features of
    all convolutions are concatenated, passed through dropout and projected to
    ``config.output_dim`` class logits.

    Args:
        config: Object exposing ``vocab_size``, ``embed_dim``, ``dropout``,
            ``output_dim``, ``num_channels`` and ``kernel_sizes``.
        glove_embeddings: 2-D float tensor of pretrained vectors, one row per
            vocabulary id; it is kept frozen during training.
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, config: "CNNConfig", glove_embeddings, **kwargs):
        super().__init__(**kwargs)
        # Trainable embedding table.
        self.embedding = nn.Embedding(config.vocab_size, config.embed_dim)

        # Pretrained embedding table, excluded from training.
        self.constant_embedding = nn.Embedding.from_pretrained(glove_embeddings, freeze=True)
        # Honour the configured dropout rate rather than a hard-coded 0.5.
        self.dropout = nn.Dropout(config.dropout)
        # One logit per class: a fixed width of 2 makes any label >= 2 an
        # out-of-range target for the cross-entropy loss.
        self.decoder = nn.Linear(sum(config.num_channels), config.output_dim)
        # Parameter-free global pooling, so a single instance is shared by all
        # convolutions. NOTE: this averages over time; it is not max-over-time pooling.
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.relu = nn.ReLU()
        # The convolutions see both embeddings stacked along the channel axis,
        # so the input width is the sum of the two embedding widths.
        in_channels = config.embed_dim + glove_embeddings.shape[1]
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels, out_channels, kernel_size)
            for out_channels, kernel_size in zip(config.num_channels, config.kernel_sizes)
        ])

    def forward(self, inputs):
        """Return class logits of shape (batch size, output_dim).

        ``inputs`` holds token ids with shape (batch size, no. of tokens); the
        convolutions are unpadded, so the number of tokens must be at least the
        largest kernel size.
        """
        # Concatenate the two embedding outputs, each of shape (batch size,
        # no. of tokens, vector dimension), along the vector dimension.
        embeddings = torch.cat((
            self.embedding(inputs), self.constant_embedding(inputs)), dim=2)
        # Conv1d expects channels in the second dimension.
        embeddings = embeddings.permute(0, 2, 1)
        # Each convolution + pooling yields (batch size, no. of channels, 1);
        # drop the trailing dimension and concatenate along channels.
        encoding = torch.cat([
            torch.squeeze(self.relu(self.pool(conv(embeddings))), dim=-1)
            for conv in self.convs], dim=1)
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [10]:
# Build the pretrained embedding matrix for the dataset vocabulary. The path is
# relative to the notebook's working directory and the GloVe file must already
# exist there; the vectors must be `config.embed_dim` wide.
glove_embeddings = load_glove_embeddings('../glove/glove.6B.100d.txt', dataset_vocab, config.embed_dim)

In [None]:
# Build a model instance from the shared config and the pretrained GloVe matrix.
net = CNNModel(config, glove_embeddings)

def init_weights(module):
    """Re-initialise the weight of a plain Linear or Conv1d layer with Xavier-uniform values.

    Any other module type (subclasses of the two included) is left untouched,
    as are biases. Intended to be passed to ``nn.Module.apply``.
    """
    layer_type = type(module)
    if layer_type is nn.Linear or layer_type is nn.Conv1d:
        nn.init.xavier_uniform_(module.weight)

# Run the initialiser over every submodule of `net`; as the cell's last
# expression, the returned module is also displayed.
# NOTE(review): the training cell further down uses a separately constructed
# `cnn_model`, which does not receive this initialisation — confirm intent.
net.apply(init_weights)

In [None]:
# Pick the best available accelerator instead of assuming Apple's MPS backend,
# so the notebook also runs on CUDA machines and on plain CPUs.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build the model that is actually trained and move it to the chosen device.
cnn_model = CNNModel(config, glove_embeddings).to(device)
print(cnn_model)

In [None]:
# Optimisation hyper-parameters.
lr = 0.001
num_epochs = 25

# Cross-entropy objective, optimised with Adam over the parameters of `cnn_model`.
criteria = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=lr)

for epoch in range(num_epochs):
    train_loss, train_acc = train(
        dl_loader=train_loader,
        optimizer=optimizer,
        criterion=criteria,
        model=cnn_model,
    )
    # Validation is disabled: no `valid_dataloader` is created in the cells shown.
    # valid_loss, valid_acc = evaluate(valid_dataloader, criteria, cnn_model)
    print(f'Epoch {epoch}  train loss: {train_loss:.4f} train accuracy: {train_acc:.4f}')
    