# Multilabel Classification Trainer Pipeline

In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR
from sklearn.metrics import precision_score, recall_score, f1_score, jaccard_score, accuracy_score, hamming_loss
import numpy as np
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw dataset and report its size before any cleaning.
df = pd.read_csv('./dataset.csv')
print(df.shape)

# Drop columns that are 0 in every row, then drop exact duplicate rows.
df = df.loc[:, ~(df == 0).all()]
df = df.drop_duplicates()

# Keep working on the cleaned frame: reading the CSV a second time at this
# point would throw away the column filter and de-duplication above.
# Every column from the third one onward is treated as a label column.
label_columns = df.columns[2:]
y = df[label_columns].values
phrases = df["Phrase"].tolist()

(62605, 16)


In [3]:
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    phrases,
    y,
    test_size=0.3,
    random_state=42,
)

In [4]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one phrase per item and pairs it with its label vector."""

    def __init__(self, phrases, labels, tokenizer, max_length):
        """Store the raw inputs; tokenization is deferred to ``__getitem__``.

        Args:
            phrases: Indexable collection of texts.
            labels: Indexable collection of per-sample label vectors,
                aligned with ``phrases``.
            tokenizer: Callable invoked as ``tokenizer(text, padding=...,
                truncation=..., max_length=..., return_tensors="pt")`` whose
                result exposes ``'input_ids'`` and ``'attention_mask'``.
            max_length: Length every encoded sequence is padded/truncated to.
        """
        self.phrases, self.labels = phrases, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of phrases."""
        return len(self.phrases)

    def __getitem__(self, index):
        """Return a dict with ``input_ids``, ``attention_mask`` and float ``labels``."""
        text, target = self.phrases[index], self.labels[index]
        encoded = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer is asked for batched tensors; squeeze(0) drops the
        # leading size-1 batch axis so the DataLoader can re-batch the items.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.float)
        return sample


In [5]:
class BertCustomModel(nn.Module):
    """BERT encoder with three pooled views feeding a multilabel classification head.

    The final hidden states of ``bert-base-uncased`` are summarised three ways:
    the CLS vector (768), a learned attention-weighted sum over tokens (768),
    and a two-layer 1-D CNN followed by global average pooling (128). The
    concatenation (1664) goes through ``fc1 -> ReLU -> Dropout -> fc2``.
    """

    def __init__(self, num_labels, dropout_rate=0.5):
        """Build the encoder and the pooling/classification layers.

        Args:
            num_labels: Size of the output layer (one logit per label).
            dropout_rate: Dropout probability applied after ``fc1``.
        """
        super().__init__()
        # Load pretrained BERT model
        self.bert = BertModel.from_pretrained("bert-base-uncased")

        # Convolutional Layers (kernel 3 with padding 1 keeps the sequence length)
        self.conv1 = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)

        # Attention Layer: one scalar score per token
        self.attention = nn.Linear(768, 1)

        # Fully Connected Layers
        self.fc1 = nn.Linear(768 + 768 + 128, 512)  # CLS + attention-pooled + CNN-pooled features
        self.fc2 = nn.Linear(512, num_labels)

        # Activation and Dropout
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits of shape ``(batch_size, num_labels)``.

        Args:
            input_ids: Token ids forwarded to BERT.
            attention_mask: Mask forwarded to BERT (non-zero = real token,
                0 = padding). It is also used to exclude padding from the
                attention pooling. ``None`` disables that extra masking.
        """
        # Extract BERT outputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        token_embeddings = outputs.last_hidden_state  # (batch_size, seq_len, 768)
        cls_token = token_embeddings[:, 0, :]  # CLS token (batch_size, 768)

        # Attention Mechanism
        attention_scores = self.attention(token_embeddings)  # (batch_size, seq_len, 1)
        if attention_mask is not None:
            # Padding positions must not receive attention mass. Use the most
            # negative finite value rather than -inf so a fully masked row
            # cannot turn the softmax into NaN.
            padding_positions = attention_mask.unsqueeze(-1) == 0
            attention_scores = attention_scores.masked_fill(
                padding_positions, torch.finfo(attention_scores.dtype).min
            )
        attention_weights = torch.softmax(attention_scores, dim=1)  # Normalize over seq_len
        weighted_embeddings = (token_embeddings * attention_weights).sum(dim=1)  # (batch_size, 768)

        # Pass through convolutional layers
        token_embeddings_permuted = token_embeddings.permute(0, 2, 1)  # (batch_size, 768, seq_len)
        x = self.conv1(token_embeddings_permuted)
        x = self.relu(x)
        x = self.conv2(x)
        x = torch.mean(x, dim=2)  # Global average pooling (batch_size, 128)

        # Combine CLS token, attention output and CNN features
        combined = torch.cat((cls_token, weighted_embeddings, x), dim=1)  # (batch_size, 768 + 768 + 128)

        # Pass through fully connected layers
        x = self.fc1(combined)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)

        return x

In [6]:
def train_model(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Sized iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.

    Raises:
        ZeroDivisionError: If ``dataloader`` is empty.
    """
    model.train()
    running_loss = 0
    for batch in dataloader:
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )
        # Clear gradients left over from the previous batch before backprop.
        optimizer.zero_grad()
        batch_loss = criterion(model(input_ids, attention_mask), labels)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(dataloader)

In [7]:
def evaluate_model(model, dataloader, criterion, device):
    """Return the mean batch loss over ``dataloader`` without updating the model.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; it is
            switched to eval mode and left that way.
        dataloader: Sized iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.

    Raises:
        ZeroDivisionError: If ``dataloader`` is empty.
    """
    model.eval()
    loss_sum = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = [
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            ]
            logits = model(input_ids, attention_mask)
            loss_sum += criterion(logits, labels).item()
    batch_count = len(dataloader)
    return loss_sum / batch_count

In [8]:
def compute_metrics(model, dataloader, device):
    """Compute multilabel metrics for ``model`` over ``dataloader``.

    Logits are passed through a sigmoid and thresholded at 0.5 to obtain
    binary predictions. Precision, recall, F1 and Jaccard are macro-averaged;
    precision/recall/F1 use ``zero_division=1``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; it is
            switched to eval mode.
        dataloader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the input tensors are moved to.

    Returns:
        Dict with keys ``"Precision"``, ``"Recall"``, ``"F1-Score"``,
        ``"Subset Accuracy"``, ``"Jaccard"`` and ``"Hamming Loss"``.
    """
    model.eval()
    pred_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Labels are only consumed by scikit-learn, so they go to NumPy directly.
            batch_labels = batch['labels'].cpu().numpy()
            logits = model(input_ids, attention_mask)
            probabilities = torch.sigmoid(logits).cpu().numpy()
            pred_chunks.append(probabilities > 0.5)
            label_chunks.append(batch_labels)

    y_pred = np.vstack(pred_chunks)
    y_true = np.vstack(label_chunks)

    macro_safe = {'average': 'macro', 'zero_division': 1}
    return {
        "Precision": precision_score(y_true, y_pred, **macro_safe),
        "Recall": recall_score(y_true, y_pred, **macro_safe),
        "F1-Score": f1_score(y_true, y_pred, **macro_safe),
        "Subset Accuracy": accuracy_score(y_true, y_pred),
        "Jaccard": jaccard_score(y_true, y_pred, average='macro'),
        "Hamming Loss": hamming_loss(y_true, y_pred),
    }

In [9]:
# Setup: device, tokenizer, datasets/loaders, model, optimizer, loss and LR schedule.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Both splits are padded/truncated to 128 tokens; only the training loader shuffles.
train_dataset = CustomDataset(X_train, y_train, tokenizer, max_length=128)
test_dataset = CustomDataset(X_test, y_test, tokenizer, max_length=128)
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=16,
)

# One output logit per label column.
model = BertCustomModel(num_labels=y.shape[1]).to(device)

# NOTE(review): lr=2e-7 looks very small for fine-tuning BERT with Adam —
# confirm this is intentional and not a typo for something like 2e-5.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=2e-7,
)
# BCEWithLogitsLoss consumes the raw logits returned by the model.
criterion = nn.BCEWithLogitsLoss()
# Multiply the learning rate by 0.5 every 3 scheduler steps.
scheduler = StepLR(
    optimizer,
    step_size=3,
    gamma=0.5,
)



In [None]:
# Train for a fixed number of epochs, evaluating on the test split after each one.
epochs = 20
for epoch in range(epochs):
    train_start = time.time()
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    train_end = time.time()

    # Evaluation time covers both the loss pass and the metrics pass.
    eval_start = time.time()
    test_loss = evaluate_model(model, test_loader, criterion, device)
    metrics = compute_metrics(model, test_loader, device)
    eval_end = time.time()

    print(
        f"Epoch {epoch + 1}/{epochs} | "
        f"Train Loss: {train_loss:.4f}, Time: {train_end - train_start:.2f}s | "
        f"Test Loss: {test_loss:.4f}, Time: {eval_end - eval_start:.2f}s | "
        f"Precision: {metrics['Precision']:.4f}, Recall: {metrics['Recall']:.4f}, F1-Score: {metrics['F1-Score']:.4f} | "
        f"Subset Accuracy: {metrics['Subset Accuracy']:.4f}, Jaccard: {metrics['Jaccard']:.4f}, Hamming Loss: {metrics['Hamming Loss']:.4f}"
    )
    print("-" * 50)

    # Advance the LR schedule once per epoch, after the optimizer updates.
    # Without this call the scheduler never changes the learning rate.
    scheduler.step()

In [None]:
import os

# Persist the trained weights (state_dict only, not the full module object).
model_save_path = "./model/label_model.pth"
# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
torch.save(model.state_dict(), model_save_path)

# Save tokenizer (optional, for inference later)
tokenizer_save_path = "./model/label_tokenizer"
tokenizer.save_pretrained(tokenizer_save_path)

('BERT_SQL_TOKENIZER/tokenizer_config.json',
 'BERT_SQL_TOKENIZER/special_tokens_map.json',
 'BERT_SQL_TOKENIZER/vocab.txt',
 'BERT_SQL_TOKENIZER/added_tokens.json')