In [14]:
# !pip install -U accelerate
# !pip install -U transformers

In [15]:
import pandas as pd
df = pd.read_csv("./dataset/problems.csv", usecols=["description", "labels"])
df.head(10)

Unnamed: 0,description,labels
0,John gave Jack a very hard problem. He wrote a...,['math']
1,Due to the recent popularity of the Deep learn...,"['dynamic programming', 'matrices']"
2,Bill is a famous mathematician in BubbleLand. ...,"['greedy', 'sorting']"
3,The competitors of Bubble Cup X gathered after...,"['shortest path', 'graphs', 'binary search']"
4,John has just bought a new car and is planning...,['dynamic programming']
5,"Consider an array A with N elements, all being...","['combinatorics', 'number theory', 'math']"
6,The citizens of BubbleLand are celebrating the...,"['dynamic programming', 'geometry']"
7,This story is happening in a town named Bubble...,"['trees', 'graphs']"
8,You are given an integer $$$x$$$ of $$$n$$$ di...,"['greedy', 'strings']"
9,You are given a Young diagram. Given diagram ...,"['greedy', 'dynamic programming', 'math']"


In [16]:
# df.shape
df.info()
# df.duplicated().sum()
# df['description'].str.len().plot.hist(bins=50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10912 entries, 0 to 10911
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  10912 non-null  object
 1   labels       10912 non-null  object
dtypes: object(2)
memory usage: 170.6+ KB


In [17]:
import ast

df['labels'] = df['labels'].apply(ast.literal_eval)
# labels_cnt = [l for lab in df['labels'] for l in lab]
# label_series = pd.Series(labels_cnt).value_counts()
# print(label_series)

# print("總共有", label_series.index.nunique(), "種 labels")

In [18]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import torch
multilabel = MultiLabelBinarizer()
labels = multilabel.fit_transform(df["labels"]).astype('float32') # NumPy ndarray # To align label format with model prediction (both are float)
texts = df["description"].tolist()

co_matrix = np.dot(labels.T, labels)  # labels: shape (N_samples, N_labels)
total = np.sum(co_matrix)
# 機率矩陣
P_ij = co_matrix / total
# 邊際機率 P(i)
P_i = np.diag(co_matrix) / total  # shape: (n_labels,)
# 外積計算 P(i) * P(j)
P_i_P_j = np.outer(P_i, P_i)
# PMI 計算，加上小常數避免 log(0)
PMI = np.log(P_ij / (P_i_P_j + 1e-10) + 1e-10)
np.fill_diagonal(PMI, PMI.max())
# Normalize PMI to [0, 1] for soft label weight
PMI_norm = (PMI - PMI.min()) / (PMI.max() - PMI.min())
PMI_tensor = torch.tensor(PMI_norm, dtype=torch.float32)

# soft_labels = torch.matmul(torch.tensor(labels), PMI_tensor)
# soft_labels = torch.clamp(soft_labels, 0.0, 1.0)
# soft_labels = torch.round(soft_labels * 10) / 10

# import seaborn as sns
# import matplotlib.pyplot as plt
# label_names = multilabel.classes_
# plt.figure(figsize=(12, 10))
# sns.heatmap(PMI, xticklabels=label_names, yticklabels=label_names, cmap='YlGnBu', annot=False)
# plt.title("Label Co-occurrence Matrix")
# plt.show()


In [40]:
from transformers import DistilBertTokenizer, DistilBertTokenizerFast
from skmultilearn.model_selection import IterativeStratification
# from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import numpy as np

In [20]:

texts_np = np.array(texts)  
labels_np = np.array(labels)

def iterative_kfold_split(X, y, k=5, test_size=0.2):
    folds = []
    # stratifier = IterativeStratification(n_splits=k, order=2, 
    #                                      sample_distribution_per_fold=[test_size, 1.0-test_size])
    stratifier = IterativeStratification(n_splits=k, order=1)
    # train_idx, val_idx = next(stratifier.split(X, y))          
    # X_train, y_train = X[train_idx], y[train_idx]
    # X_val, y_val = X[val_idx], y[val_idx]
    # folds.append((X_train, y_train, X_val, y_val))                      
    for train_idx, val_idx in stratifier.split(X, y):
        print(train_idx)
        print(val_idx)
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]
        folds.append((X_train, y_train, X_val, y_val))
    
    return folds

folds = iterative_kfold_split(texts_np,labels_np,k=5)

[    0     4     5 ... 10909 10910 10911]
[    1     2     3 ... 10886 10891 10900]
[    0     1     2 ... 10909 10910 10911]
[    5     6    13 ... 10898 10899 10902]
[    0     1     2 ... 10909 10910 10911]
[    8     9    12 ... 10895 10904 10908]
[    0     1     2 ... 10904 10905 10908]
[   16    18    21 ... 10909 10910 10911]
[    1     2     3 ... 10909 10910 10911]
[    0     4    11 ... 10893 10897 10905]


In [21]:
# calculate loss_weights to deal with dataset imbalance
label_counts = labels.sum(axis=0)
k = 100
weights = 1.0 / np.log(label_counts + k)
weights = weights / np.max(weights)  # normalize to [0, 1]
loss_weights = torch.tensor(weights, dtype=torch.float32)
# print(multilabel.classes_)
# print(label_counts)
print(loss_weights.max())
print(loss_weights.min())

tensor(1.)
tensor(0.7014)


In [34]:
# model
import torch
import torch.nn as nn
from transformers import DistilBertPreTrainedModel, DistilBertModel, DistilBertConfig

class DistilBertWithSoftLabel(DistilBertPreTrainedModel):
    def __init__(self, config, loss_weights=None):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = DistilBertModel(config)
        self.classifier = nn.Linear(config.dim, config.num_labels)
        self.loss_weights = loss_weights
        self.loss_fn = nn.BCEWithLogitsLoss(reduction='none')

        if loss_weights is None:
            self.loss_weights = torch.ones(config.num_labels)
        else:
            self.loss_weights = loss_weights

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0])

        if labels is not None:
            loss_matrix = self.loss_fn(logits, labels)
            loss_weights = self.loss_weights.to(logits.device)
            loss = (loss_matrix * loss_weights).mean()
            
            return {"logits": logits, "loss": loss}
        return {"logits": logits}
    
def data_collator(batch):
    return {
        'input_ids': torch.stack([x['input_ids'] for x in batch]),
        'attention_mask': torch.stack([x['attention_mask'] for x in batch]),
        # 'labels': torch.stack([torch.tensor(x['labels'], dtype=torch.float32) for x in batch])
        'labels': torch.stack([x['labels'] for x in batch])
    }


In [41]:
# model settings
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
label_counts = labels.sum(axis=0)
config = DistilBertConfig.from_pretrained("distilbert-base-uncased", num_labels=len(labels[0]))
# model = DistilBertWithSoftLabel(num_labels=len(labels[0]),loss_weights=loss_weights)

In [42]:
# tokenize
class CustomDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = self.labels[idx]
    # label = torch.tensor(self.labels[idx])
    if not isinstance(label, torch.Tensor):
        label = torch.tensor(label, dtype=torch.float32)
    else:
        label = label.detach().clone().float()

    encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')

    return {
        'input_ids': encoding['input_ids'].squeeze(0),
        'attention_mask': encoding['attention_mask'].squeeze(0),
        'token_type_ids': encoding.get('token_type_ids', torch.zeros_like(encoding['input_ids'])).squeeze(0),
        'labels': label
    }



In [25]:
# Multi-Label Classification Evaluation Metrics
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss, roc_curve
from transformers import EvalPrediction
import torch


def find_optimal_thresholds(y_true, y_probs):
    thresholds = []
    for i in range(y_true.shape[1]):
        fpr, tpr, th = roc_curve(y_true[:, i], y_probs[:, i])
        youdens_j = tpr - fpr
        best_th = th[np.argmax(youdens_j)]
        thresholds.append(best_th)
    # print("Optimal thresholds:", thresholds)
    return np.array(thresholds)


def find_f1_optimal_thresholds(y_true, y_probs):
    thresholds = []
    y_true = (y_true >= 0.5).astype(int)
    for i in range(y_true.shape[1]):
        best_f1 = 0
        best_th = 0.5
        for th in np.linspace(0.05, 0.95, 50):
            y_pred_i = (y_probs[:, i] >= th).astype(int)
            f1 = f1_score(y_true[:, i], y_pred_i)
            if f1 > best_f1:
                best_f1 = f1
                best_th = th
        thresholds.append(best_th)
    return np.array(thresholds)


def multi_labels_metrics(predictions, labels):
  sigmoid = torch.nn.Sigmoid()
  # probs = sigmoid(torch.Tensor(predictions))
  probs = sigmoid(torch.tensor(predictions)).detach().cpu().numpy()

  y_pred = np.zeros(probs.shape)
#   thresholds = np.maximum(find_optimal_thresholds(labels,probs), 0.05)
  thresholds = np.maximum(find_f1_optimal_thresholds(labels,probs), 0.05)
#   thresholds = np.full(probs.shape[1], 0.3)

  y_pred = (probs >= thresholds).astype(int)
  y_true = labels
  
  f1 = f1_score(y_true, y_pred, average = 'macro')
  roc_auc = roc_auc_score(y_true, probs, average = 'macro')
  hamming = hamming_loss(y_true, y_pred)

  metrics = {
      "roc_auc": roc_auc,
      "hamming_loss": hamming,
      "f1": f1
  }
  return metrics

def compute_metrics(p:EvalPrediction):
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  labels = (p.label_ids > 0.5).astype(int) # p.label_ids
  result = multi_labels_metrics(predictions=preds,
                                labels=labels)
  
  return result

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

import torch
import time

start = time.time()
all_fold_metrics = []

for fold_id, (X_train, y_train, X_val, y_val) in enumerate(folds):
    train_texts = X_train.ravel().tolist()
    val_texts = X_val.ravel().tolist()

    train_labels = torch.clamp(torch.matmul(torch.tensor(y_train), PMI_tensor), 0.0, 1.0)
    val_labels = torch.clamp(torch.matmul(torch.tensor(y_val), PMI_tensor), 0.0, 1.0)

    train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
    val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

    model = DistilBertWithSoftLabel(config,num_labels=len(labels[0]),loss_weights=loss_weights)
    
    args = TrainingArguments(
        output_dir=f"./results/fold_{fold_id}",
        per_device_train_batch_size=48,          # 提升到 32，RTX 4050 沒問題
        per_device_eval_batch_size=48,
        num_train_epochs=8,                      # 稍微延長訓練週期，讓稀有 label 能學到
        eval_strategy="epoch",             # 每個 epoch 評估一次
        save_strategy="epoch",                   # 每個 epoch 存一次模型
        save_total_limit=2,                      # 最多保留 2 個 checkpoint
        load_best_model_at_end=True,             # 使用 val F1 最佳的模型
        metric_for_best_model="f1",              # 根據 F1 分數挑選 best model
        greater_is_better=True,
        logging_dir=f"./logs/fold_{fold_id}",
        logging_strategy="epoch",                # 每 epoch log 資訊
        report_to=["tensorboard"],
        seed=42,
        dataloader_num_workers = 4
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    print(f"Fold {fold_id+1}: {len(train_texts)} train, {len(val_texts)} val")
    trainer.train()
    metrics = trainer.evaluate()
    
    all_fold_metrics.append(metrics)

torch.cuda.synchronize()
end = time.time()
print("Total training time:", end - start)

Fold 1: 8738 train, 2174 val


Epoch,Training Loss,Validation Loss,Roc Auc,Hamming Loss,F1,Runtime,Samples Per Second,Steps Per Second
1,0.4946,0.483675,0.649162,0.270992,0.84098,7.2636,299.3,6.333
2,0.4799,0.477894,0.671719,0.266476,0.842039,6.8442,317.642,6.721
3,0.463,0.484116,0.668496,0.270887,0.840966,7.0907,306.599,6.487
4,0.4368,0.492211,0.664894,0.269716,0.841264,7.0224,309.581,6.55


Fold 2: 8721 train, 2191 val


Epoch,Training Loss,Validation Loss,Roc Auc,Hamming Loss,F1,Runtime,Samples Per Second,Steps Per Second
1,0.4957,0.484442,0.648042,0.284138,0.833272,7.0378,311.317,6.536
2,0.4812,0.480994,0.657409,0.280756,0.834264,7.3039,299.978,6.298
3,0.4635,0.491401,0.643679,0.277229,0.835448,7.0873,309.145,6.49
4,0.4326,0.498373,0.6539,0.27864,0.834927,7.11,308.156,6.47
5,0.4071,0.516692,0.633753,0.279366,0.834661,7.0432,311.079,6.531


Fold 3: 8752 train, 2160 val


Epoch,Training Loss,Validation Loss,Roc Auc,Hamming Loss,F1,Runtime,Samples Per Second,Steps Per Second
1,0.4954,0.484144,0.66283,0.27601,0.836717,6.7366,320.636,6.68
2,0.4783,0.484936,0.660813,0.275926,0.836567,6.9741,309.717,6.452
3,0.4589,0.484898,0.657302,0.271591,0.838347,7.1632,301.541,6.282
4,0.428,0.514471,0.632475,0.27702,0.836722,7.159,301.719,6.286
5,0.4012,0.516271,0.639324,0.275568,0.837126,7.0784,305.152,6.357


Fold 4: 8699 train, 2213 val


Epoch,Training Loss,Validation Loss,Roc Auc,Hamming Loss,F1,Runtime,Samples Per Second,Steps Per Second
1,0.495,0.489968,0.6554,0.294725,0.824601,6.8506,323.039,6.861
2,0.4786,0.48607,0.676523,0.292774,0.825572,6.9555,318.166,6.757
3,0.4608,0.495718,0.667249,0.290576,0.82585,6.8368,323.687,6.875
4,0.4297,0.505685,0.663906,0.295855,0.824487,7.0167,315.389,6.698
5,0.4021,0.522492,0.655092,0.295095,0.824443,7.0504,313.881,6.666


Fold 5: 8738 train, 2174 val


Epoch,Training Loss,Validation Loss,Roc Auc,Hamming Loss,F1,Runtime,Samples Per Second,Steps Per Second
1,0.494,0.484771,0.624101,0.271682,0.841704,7.0436,308.649,6.531
2,0.4765,0.483876,0.642603,0.26934,0.842553,6.775,320.886,6.79
3,0.459,0.492317,0.640256,0.269675,0.842244,7.2015,301.882,6.388
4,0.4304,0.501851,0.64129,0.268713,0.842752,7.4018,293.712,6.215
5,0.4027,0.517897,0.639427,0.268734,0.842646,7.4025,293.685,6.214
6,0.3862,0.526435,0.633367,0.269047,0.842573,7.4716,290.969,6.157


Total training time: 1829.483668088913


In [43]:
for i, m in enumerate(all_fold_metrics):
    print(f"Fold {i+1} F1: {m['eval_f1']:.4f}")

avg_f1 = sum(m['eval_f1'] for m in all_fold_metrics) / len(all_fold_metrics)
print(f"\nAverage F1 across folds: {avg_f1:.4f}")


Fold 1 F1: 0.8420
Fold 2 F1: 0.8354
Fold 3 F1: 0.8383
Fold 4 F1: 0.8258
Fold 5 F1: 0.8428

Average F1 across folds: 0.8369


In [31]:
print(multilabel.classes_)

['binary search' 'bit manipulation' 'combinatorics' 'data structures'
 'divide and conquer' 'dynamic programming' 'game theory' 'geometry'
 'graphs' 'greedy' 'hashing' 'interactive' 'math' 'matrices'
 'number theory' 'probabilities' 'shortest path' 'sorting' 'strings'
 'trees' 'two pointers' 'union find']


In [44]:
from transformers import DistilBertTokenizerFast
checkpoint_dir = "./results/fold_0/m364"
output_dir = "./best_model"

# 1. 載入模型與 tokenizer
model = DistilBertWithSoftLabel(config,loss_weights=loss_weights)
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# 2. 存成部署用格式
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./best_model/tokenizer_config.json',
 './best_model/special_tokens_map.json',
 './best_model/vocab.txt',
 './best_model/added_tokens.json',
 './best_model/tokenizer.json')

## k=100, Threshold - roc optimal , p.label_ids > 0.8 as P
{'eval_loss': 0.5422307252883911,
 'eval_roc_auc': 0.6355803896134508,
 'eval_hamming_loss': 0.39180860367301046,
 'eval_f1': 0.5670835103814952,
 'eval_runtime': 22.0219,
 'eval_samples_per_second': 99.129,
 'eval_steps_per_second': 12.397,
 'epoch': 5.0}

## k=100, Threshold = 0.3 , p.label_ids > 0.8 as P
{'eval_loss': 0.5422307252883911,
 'eval_model_preparation_time': 0.0041,
 'eval_roc_auc': 0.6355803896134508,
 'eval_hamming_loss': 0.5140132428268022,
 'eval_f1': 0.6445478260151899,
 'eval_runtime': 21.2506,
 'eval_samples_per_second': 102.727,
 'eval_steps_per_second': 12.847}

## k=100, Threshold - f1 optimal , p.label_ids > 0.5 as P
{'eval_loss': 0.5422307252883911,
 'eval_model_preparation_time': 0.0029,
 'eval_roc_auc': 0.6209105006225866,
 'eval_hamming_loss': 0.2817015783117478,
 'eval_f1': 0.8335942488331942,
 'eval_runtime': 21.624,
 'eval_samples_per_second': 100.953,
 'eval_steps_per_second': 12.625}

## k=100, Threshold - roc optimal , p.label_ids > 0.5 as P
{'eval_loss': 0.5422307252883911,
 'eval_model_preparation_time': 0.0062,
 'eval_roc_auc': 0.6209105006225866,
 'eval_hamming_loss': 0.3794194811143964,
 'eval_f1': 0.7055115285615337,
 'eval_runtime': 19.9769,
 'eval_samples_per_second': 109.276,
 'eval_steps_per_second': 13.666}

In [28]:
print(multilabel.classes_)


['binary search' 'bit manipulation' 'combinatorics' 'data structures'
 'divide and conquer' 'dynamic programming' 'game theory' 'geometry'
 'graphs' 'greedy' 'hashing' 'interactive' 'math' 'matrices'
 'number theory' 'probabilities' 'shortest path' 'sorting' 'strings'
 'trees' 'two pointers' 'union find']


In [29]:
trainer.save_model("distilbert-finetuned-imdb-multi-label")

In [30]:
# import pickle
# with open("multi-label-binarizer.pkl", "wb") as f:
#   pickle.dump(multilabel, f)