In [None]:
!pip install -U accelerate
!pip install -U transformers

In [None]:
import pandas as pd
# Load only the two columns this notebook uses: the problem text and its labels.
df = pd.read_csv("./dataset/problems.csv", usecols=["description", "labels"])

# Only the last expression of a cell is rendered automatically, so the preview
# has to be shown explicitly (display() is provided by the notebook kernel).
display(df.head())

# Raw "labels" value of the row with index label 2, as read from the CSV
# (presumably a string-encoded list, since a later cell parses it with ast.literal_eval).
df.loc[2, "labels"]

In [None]:
# Other quick checks that can be run here by hand: df.shape, df.duplicated().sum()

# Column dtypes and non-null counts.
df.info()

# Histogram (50 bins) of the description length in characters.
df["description"].str.len().plot(kind="hist", bins=50)

In [None]:
import ast

# Each CSV cell stores the label list as Python-literal text; turn it into a real list.
# The isinstance guard keeps this cell re-runnable: once the column already holds
# lists, feeding them to ast.literal_eval again would raise.
df['labels'] = df['labels'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# Flatten to one entry per label occurrence, then count how often each label appears.
labels_cnt = [l for lab in df['labels'] for l in lab]
label_series = pd.Series(labels_cnt).value_counts()
print(label_series)

# Number of distinct labels in the dataset.
print("總共有", label_series.index.nunique(), "種 labels")

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encode every row's label list as one multi-hot row (one column per distinct label).
multilabel = MultiLabelBinarizer()
labels = multilabel.fit_transform(df["labels"])
# Cast to float32 so the targets are floats, like the model's predictions.
labels = labels.astype('float32')

# Plain Python list of the description strings, in the same row order as `labels`.
texts = df["description"].to_list()
# Handy to inspect by hand: labels, texts[:5]


In [None]:
import torch
from transformers import DistilBertTokenizer, AutoTokenizer
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import numpy as np

In [None]:

import numpy as np
from skmultilearn.model_selection import iterative_train_test_split

SPLIT_SEED = 42      # fixed seed so the train/validation split is the same on every run
VAL_FRACTION = 0.2   # share of the samples held out for validation

texts_np = np.array(texts)    # assumes `texts` is a list[str]
labels_np = np.array(labels)  # assumes `labels` is an (N, C) multi-hot array

# iterative_train_test_split takes no random_state argument, so seed NumPy's global
# RNG instead (presumably what the stratifier draws from; verify against the installed
# scikit-multilearn version). Without this, label counts and metrics drift between runs.
np.random.seed(SPLIT_SEED)

# The stratifier expects a 2-D feature matrix, hence the reshape to a single column.
# NOTE: a retry loop that re-split (up to 100 times) until every label had at least
# MIN_VAL_COUNT = 40 validation samples was sketched here but is currently disabled.
X_train, y_train, X_val, y_val = iterative_train_test_split(
    texts_np.reshape(-1, 1), labels_np, test_size=VAL_FRACTION
)

# Restore the original formats: flat lists of strings for texts, arrays for labels.
train_texts = X_train.ravel().tolist()
val_texts = X_val.ravel().tolist()
train_labels = y_train
val_labels = y_val

# The two parts must partition the dataset.
assert len(train_texts) + len(val_texts) == len(texts), "split lost or duplicated samples"

# Per-label occurrence counts in each part.
train_labels_cnt = np.sum(train_labels, axis=0)
val_label_counts = np.sum(val_labels, axis=0)

print("Train label counts:", train_labels_cnt)
print("Val label counts:", val_label_counts)




In [None]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

# The classification head gets one output per column of the multi-hot label matrix.
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    problem_type="multi_label_classification",
    num_labels=len(labels[0]),
)

In [None]:
# Peek at the first row of `labels`, the float32 multi-hot matrix built with the MultiLabelBinarizer.
labels[0]

In [None]:
# Custom dataset that tokenizes on access.
class CustomDataset(Dataset):
    """Pairs each text with its multi-hot label vector and tokenizes on access.

    Args:
        texts: Indexable collection of texts; each item is passed through ``str()``.
        labels: Indexable collection of per-sample label vectors, aligned with ``texts``.
        tokenizer: Callable tokenizer that returns a mapping with ``input_ids`` and
            ``attention_mask`` when called with ``return_tensors='pt'``.
        max_len: Length every sample is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors.

        Keys: ``input_ids``, ``attention_mask`` and ``labels``.
        """
        text = str(self.texts[idx])
        # Always emit float32 targets: the multi-label (BCE-with-logits) loss needs
        # float labels, so integer or float64 label arrays are coerced here instead
        # of failing later inside the loss.
        label = torch.tensor(self.labels[idx], dtype=torch.float32)

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch dimension of 1; flatten it
        # away so the DataLoader can stack samples itself.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label,
        }

# Wrap both parts of the split; max_len is left at the CustomDataset default.
train_dataset = CustomDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = CustomDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

In [None]:
# Peek at the first validation sample: a dict with 'input_ids', 'attention_mask' and 'labels'.
val_dataset[0]

In [None]:
# Multi-Label Classification Evaluation Metrics
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction
import torch


def multi_labels_metrics(predictions, labels, threshold=0.3):
    """Compute macro ROC-AUC, Hamming loss and macro F1 from multi-label logits.

    Args:
        predictions: Raw model logits, array-like; expected to line up element-wise
            with ``labels`` (one row per sample, one column per label).
        labels: Ground-truth multi-hot indicator array.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with the keys ``"roc_auc"``, ``"hamming_loss"`` and ``"f1"``.

    Note:
        ROC-AUC is undefined for a label column that contains only one class in
        ``labels``; depending on the scikit-learn version this raises or yields NaN.
    """
    logits = torch.as_tensor(np.asarray(predictions), dtype=torch.float32)
    probs = torch.sigmoid(logits).numpy()

    y_true = np.asarray(labels)
    y_pred = (probs >= threshold).astype(int)

    # ROC-AUC is a ranking metric: it must be given the continuous scores.
    # Feeding it the thresholded 0/1 predictions would reduce it to a single
    # operating point and make it depend on `threshold`.
    roc_auc = roc_auc_score(y_true, probs, average='macro')

    # zero_division=0 returns the same value as the default for labels that are
    # never predicted, without emitting a warning on every evaluation.
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
    hamming = hamming_loss(y_true, y_pred)

    return {
        "roc_auc": roc_auc,
        "hamming_loss": hamming,
        "f1": f1,
    }

def compute_metrics(p: EvalPrediction):
    """Trainer hook: unpack an EvalPrediction and score it with multi_labels_metrics.

    When ``p.predictions`` is a tuple, only its first element is used.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]

    return multi_labels_metrics(predictions=raw_predictions, labels=p.label_ids)

In [None]:
# Training Arguments
from transformers import TrainingArguments, Trainer

# Settings for the fine-tuning run; anything not listed keeps the library default.
args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
)

# Wire model, data and the multi-label metric hook together.
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Fine-tune the model on train_dataset with the TrainingArguments configured for the trainer.
trainer.train()

In [None]:
# Evaluate on the trainer's eval_dataset (val_dataset); metrics come from compute_metrics.
trainer.evaluate()

In [None]:
# Directory that receives the fine-tuned model.
# NOTE(review): the name mentions "imdb" although the data is read from problems.csv;
# confirm whether a rename is wanted.
MODEL_DIR = "distilbert-finetuned-imdb-multi-label"
trainer.save_model(MODEL_DIR)

# The Trainer was created without the tokenizer, so save_model() alone does not
# write the tokenizer files. Save them next to the weights so the directory can be
# reloaded for inference on its own.
tokenizer.save_pretrained(MODEL_DIR);

In [None]:
# import pickle
# with open("multi-label-binarizer.pkl", "wb") as f:
#   pickle.dump(multilabel, f)

In [None]:
from sklearn.metrics import classification_report
print("Evaluating...")
preds = trainer.predict(val_dataset).predictions
# Some models return a tuple of outputs; the logits come first
# (same convention as compute_metrics).
if isinstance(preds, tuple):
    preds = preds[0]

# `preds` are raw logits, not probabilities: thresholding them directly at 0.5
# would correspond to a probability cut-off of about 0.62. Apply the sigmoid first.
# NOTE(review): multi_labels_metrics defaults to threshold=0.3; align the two values
# if this report should be comparable with trainer.evaluate().
REPORT_THRESHOLD = 0.5
probs = torch.sigmoid(torch.as_tensor(preds, dtype=torch.float32)).numpy()
pred_binary = (probs > REPORT_THRESHOLD).astype(int)

print("\nClassification Report:")
label_names = multilabel.classes_
# zero_division=0 reports 0 (without warnings) for labels that are never predicted.
print(classification_report(val_labels, pred_binary, target_names=label_names, zero_division=0))
