In [60]:
# !pip install -U accelerate
# !pip install -U transformers

In [61]:
import pandas as pd
df = pd.read_csv("./dataset/problems.csv", usecols=["description", "labels"])
df.head(10)

Unnamed: 0,description,labels
0,John gave Jack a very hard problem. He wrote a...,['math']
1,Due to the recent popularity of the Deep learn...,"['dynamic programming', 'matrices']"
2,Bill is a famous mathematician in BubbleLand. ...,"['greedy', 'sorting']"
3,The competitors of Bubble Cup X gathered after...,"['shortest path', 'graphs', 'binary search']"
4,John has just bought a new car and is planning...,['dynamic programming']
5,"Consider an array A with N elements, all being...","['combinatorics', 'number theory', 'math']"
6,The citizens of BubbleLand are celebrating the...,"['dynamic programming', 'geometry']"
7,This story is happening in a town named Bubble...,"['trees', 'graphs']"
8,You are given an integer $$$x$$$ of $$$n$$$ di...,"['greedy', 'strings']"
9,You are given a Young diagram. Given diagram ...,"['greedy', 'dynamic programming', 'math']"


In [62]:
# df.shape
df.info()
# df.duplicated().sum()
# df['description'].str.len().plot.hist(bins=50)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10912 entries, 0 to 10911
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  10912 non-null  object
 1   labels       10912 non-null  object
dtypes: object(2)
memory usage: 170.6+ KB


In [63]:
import ast

df['labels'] = df['labels'].apply(ast.literal_eval)
# labels_cnt = [l for lab in df['labels'] for l in lab]
# label_series = pd.Series(labels_cnt).value_counts()
# print(label_series)

# print("總共有", label_series.index.nunique(), "種 labels")

In [64]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import torch
multilabel = MultiLabelBinarizer()
labels = multilabel.fit_transform(df["labels"]).astype('float32') # NumPy ndarray # To align label format with model prediction (both are float)
texts = df["description"].tolist()

co_matrix = np.dot(labels.T, labels)  # labels: shape (N_samples, N_labels)
total = np.sum(co_matrix)
# 機率矩陣
P_ij = co_matrix / total
# 邊際機率 P(i)
P_i = np.diag(co_matrix) / total  # shape: (n_labels,)
# 外積計算 P(i) * P(j)
P_i_P_j = np.outer(P_i, P_i)
# PMI 計算，加上小常數避免 log(0)
PMI = np.log(P_ij / (P_i_P_j + 1e-10) + 1e-10)
np.fill_diagonal(PMI, PMI.max())
# Normalize PMI to [0, 1] for soft label weight
PMI_norm = (PMI - PMI.min()) / (PMI.max() - PMI.min())
PMI_tensor = torch.tensor(PMI_norm, dtype=torch.float32)

# soft_labels = torch.matmul(torch.tensor(labels), PMI_tensor)
# soft_labels = torch.clamp(soft_labels, 0.0, 1.0)
# soft_labels = torch.round(soft_labels * 10) / 10

# import seaborn as sns
# import matplotlib.pyplot as plt
# label_names = multilabel.classes_
# plt.figure(figsize=(12, 10))
# sns.heatmap(PMI, xticklabels=label_names, yticklabels=label_names, cmap='YlGnBu', annot=False)
# plt.title("Label Co-occurrence Matrix")
# plt.show()


In [65]:
label_counts = labels.sum(axis=0)
k = 100
weights = 1.0 / np.log(label_counts + k)
weights = weights / np.max(weights)  # normalize to [0, 1]
loss_weights = torch.tensor(weights, dtype=torch.float32)
# print(multilabel.classes_)
# print(label_counts)
print(loss_weights.max())
print(loss_weights.min())

tensor(1.)
tensor(0.7014)


In [66]:
from transformers import DistilBertTokenizer
from transformers import DistilBertForSequenceClassification
# from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import numpy as np

In [67]:
from skmultilearn.model_selection import iterative_train_test_split
texts_np = np.array(texts)  
labels_np = np.array(labels)
# labels_np = soft_labels.numpy()


X_train, y_train, X_val, y_val = iterative_train_test_split( # need  nulti-hot vector should be integer
        texts_np.reshape(-1, 1), labels_np, test_size=0.2
    )


# 還原回原本格式
train_texts = X_train.ravel().tolist()
val_texts = X_val.ravel().tolist()
# train_labels = y_train
# val_labels = y_val

train_labels = torch.matmul(torch.tensor(y_train), PMI_tensor)
val_labels = torch.matmul(torch.tensor(y_val), PMI_tensor)
train_labels = torch.clamp(train_labels, 0.0, 1.0)
val_labels = torch.clamp(val_labels, 0.0, 1.0)


# 統計出現次數（要轉回 numpy）
# train_labels_cnt = train_labels.sum(dim=0).numpy()
# val_label_counts = val_labels.sum(dim=0).numpy()

# print("Train label counts:", train_labels_cnt)
# print("Val label counts:", val_label_counts)
print(labels_np[0])
print(val_labels.numpy()[0])


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0.8922318  0.59647906 0.5165833  0.8321489  0.7325249  0.7068355
 0.7786813  0.685588   0.71685344 1.         0.6454635  0.5578946
 0.7105639  0.5049038  0.57523435 0.40439364 0.68423986 1.
 0.66521287 0.67514175 0.94502926 0.78884274]


In [68]:
import torch
import torch.nn as nn
from transformers import DistilBertModel

class DistilBertWithSoftLabel(nn.Module):
    def __init__(self, num_labels, loss_weights):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.loss_fn = nn.BCEWithLogitsLoss(reduction='none')  # No reduction 
        self.loss_weights = loss_weights


    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0])  # [CLS] token
        
        if labels is not None: # label is soft_labels
            loss_matrix = self.loss_fn(logits, labels)
            # loss = loss_matrix.mean()  # or weighted sum
            loss_weights = self.loss_weights.to(logits.device)
            loss = (loss_matrix * loss_weights).mean()

            return {"logits": logits, "loss": loss}

        return {"logits": logits}

def data_collator(batch):
    return {
        'input_ids': torch.stack([x['input_ids'] for x in batch]),
        'attention_mask': torch.stack([x['attention_mask'] for x in batch]),
        # 'labels': torch.stack([torch.tensor(x['labels'], dtype=torch.float32) for x in batch])
        'labels': torch.stack([x['labels'] for x in batch])
    }


In [69]:
labels[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.], dtype=float32)

In [70]:
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
# model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=len(labels[0]),
#                                                             problem_type="multi_label_classification")
label_counts = labels.sum(axis=0)
model = DistilBertWithSoftLabel(num_labels=len(labels[0]),loss_weights=loss_weights)

In [71]:

class CustomDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = self.labels[idx]
    # label = torch.tensor(self.labels[idx])
    if not isinstance(label, torch.Tensor):
        label = torch.tensor(label, dtype=torch.float32)
    else:
        label = label.detach().clone().float()

    encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')

    return {
        'input_ids': encoding['input_ids'].squeeze(0),
        'attention_mask': encoding['attention_mask'].squeeze(0),
        'labels': label
    }

train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

In [72]:
# Multi-Label Classification Evaluation Metrics
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction
import torch


def multi_labels_metrics(predictions, labels, threshold=0.3):
  sigmoid = torch.nn.Sigmoid()
  probs = sigmoid(torch.Tensor(predictions))

  y_pred = np.zeros(probs.shape)
  y_pred[np.where(probs>=threshold)] = 1
  y_true = labels
  
  f1 = f1_score(y_true, y_pred, average = 'macro')
  roc_auc = roc_auc_score(y_true, probs, average = 'macro')
  hamming = hamming_loss(y_true, y_pred)
  # # PMI-aware accuracy
  # correct = 0
  # total = 0
  # N, L = y_true.shape # (num_of_samples,num_of_labels)
  # pmi_matrix = pmi_matrix.numpy()  # Convert to numpy if it's a tensor

  # for sample_idx in range(N):
  #   for label_idx in range(L):
  #     if y_true[sample_idx][label_idx] == 1:
  #       total += 1
  #       if y_pred[sample_idx][label_idx] == 1:
  #         correct += 1
  #       else:
  #         related_preds = np.where((y_pred[sample_idx] == 1) & 
  #                                  (pmi_matrix[label_idx] >= pmi_threshold))[0]
  #         if len(related_preds) > 0:
  #             correct += 0.5

  # pmi_acc = correct / total if total > 0 else 0.0

  metrics = {
      "roc_auc": roc_auc,
      "hamming_loss": hamming,
      "f1": f1
      # "pmi_aware_accuracy": pmi_acc
  }
  return metrics

def compute_metrics(p:EvalPrediction):
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  labels = (p.label_ids > 0.5).astype(int) # p.label_ids
  result = multi_labels_metrics(predictions=preds,
                                labels=labels)

  return result

In [73]:
# Training Arguments
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir = './results',
    num_train_epochs=5,
    save_steps=1000,
    save_total_limit=2
)

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_dataset,
                  eval_dataset = val_dataset,
                  compute_metrics=compute_metrics,
                  data_collator=data_collator)

In [74]:
trainer.train()

Step,Training Loss
500,0.5048
1000,0.487
1500,0.4816
2000,0.4851
2500,0.4719
3000,0.4617
3500,0.4471
4000,0.4357
4500,0.4234
5000,0.4094


TrainOutput(global_step=5460, training_loss=0.45647012899210165, metrics={'train_runtime': 731.6308, 'train_samples_per_second': 59.682, 'train_steps_per_second': 7.463, 'total_flos': 0.0, 'train_loss': 0.45647012899210165, 'epoch': 5.0})

In [75]:
trainer.evaluate()

{'eval_loss': 0.5321438312530518,
 'eval_roc_auc': 0.6209178427016014,
 'eval_hamming_loss': 0.27911051775209644,
 'eval_f1': 0.835518813834735,
 'eval_runtime': 22.7607,
 'eval_samples_per_second': 95.735,
 'eval_steps_per_second': 11.994,
 'epoch': 5.0}

In [76]:
print(multilabel.classes_)


['binary search' 'bit manipulation' 'combinatorics' 'data structures'
 'divide and conquer' 'dynamic programming' 'game theory' 'geometry'
 'graphs' 'greedy' 'hashing' 'interactive' 'math' 'matrices'
 'number theory' 'probabilities' 'shortest path' 'sorting' 'strings'
 'trees' 'two pointers' 'union find']


In [77]:
trainer.save_model("distilbert-finetuned-imdb-multi-label")

In [78]:
# import pickle
# with open("multi-label-binarizer.pkl", "wb") as f:
#   pickle.dump(multilabel, f)

In [79]:
from sklearn.metrics import classification_report
print("Evaluating...")
preds = trainer.predict(val_dataset).predictions
pred_binary = (preds > 0.5).astype(int)
val_labels_binary = (val_labels > 0.5).int().numpy()
print("\nClassification Report:")
label_names = multilabel.classes_
print(classification_report(val_labels_binary, pred_binary, target_names=label_names))


Evaluating...

Classification Report:
                     precision    recall  f1-score   support

      binary search       0.77      0.80      0.78      1597
   bit manipulation       0.76      0.76      0.76      1570
      combinatorics       0.83      0.73      0.78      1744
    data structures       0.78      0.79      0.79      1618
 divide and conquer       0.79      0.77      0.78      1641
dynamic programming       0.77      0.82      0.80      1618
        game theory       0.75      0.76      0.76      1537
           geometry       0.73      0.69      0.71      1421
             graphs       0.79      0.75      0.77      1619
             greedy       0.79      0.87      0.83      1673
            hashing       0.77      0.77      0.77      1597
        interactive       0.70      0.64      0.67      1329
               math       0.80      0.86      0.83      1687
           matrices       0.74      0.65      0.69      1484
      number theory       0.79      0.74      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
