In [23]:
import pandas as pd
from labels import LABELS

In [24]:
# Dataset location, relative to the notebook's working directory so the
# notebook is not tied to one machine's absolute path.
data = "./dataset_resume.csv"
# Read only the label columns plus the two free-text columns.
# NOTE(review): `LABELS + [...]` assumes LABELS is a list of column names — confirm in labels.py.
df = pd.read_csv(data, usecols=LABELS + ['description', 'resume'])


In [25]:
import numpy as np


def setup(df_orig: pd.DataFrame) -> pd.DataFrame:
    df = df_orig.copy()
    df = df.fillna(0)

    for index, row in df.iterrows():
        # s'il y a un résumé, on l'utilise comme description
        df.loc[index, "description"] = (
                row.resume
                if isinstance(row.resume, str)
                else row.description
            )

    df['label'] = df[LABELS].values.tolist()
    df = df.rename(columns={'description': 'text'})
    label = [list_label for list_label in df.label]
    return df.text.tolist(), label#.astype('float32')

# Label columns only, kept as a DataFrame.
features = df[LABELS]

# Texts and per-row label vectors for the whole dataset.
texts, labels = setup(df_orig=df)
# Display the first label vector as the cell output.
labels[0]

[0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [26]:
import torch
from transformers import CamembertTokenizer, AutoTokenizer
from transformers import CamembertForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [27]:
# Hold out 20% of the samples for validation; the fixed random_state keeps
# the split identical across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Pretrained checkpoint identifier shared by the tokenizer and the model.
checkpoint = "almanach/camembert-base"
tokenizer = CamembertTokenizer.from_pretrained(checkpoint)
# The classification head gets one output per entry of a label vector, and
# the problem type is declared as multi-label.
model = CamembertForSequenceClassification.from_pretrained(
    checkpoint,
    problem_type="multi_label_classification",
    num_labels=len(labels[0]),
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at almanach/camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
class EventDataset(Dataset):
    """Torch dataset pairing each text with its multi-label target vector.

    Texts are tokenized lazily, one item at a time, in ``__getitem__``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        """Store the samples and the tokenization settings.

        Args:
            texts: Sequence of input texts (each item is passed through ``str``).
            labels: Sequence of per-sample label vectors, aligned with ``texts``.
            tokenizer: Callable tokenizer invoked with ``padding``,
                ``max_length``, ``truncation`` and ``return_tensors``.
            max_len: Length every sample is padded or truncated to.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize one sample and return it with its target vector.

        Returns:
            A dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            float ``labels`` tensor.
        """
        text = str(self.texts[index])
        # Force floating-point targets: multi-label training uses a BCE-style
        # loss, which rejects integer targets. Without the explicit dtype,
        # integer label vectors would be inferred as int64.
        label = torch.tensor(self.labels[index], dtype=torch.float)

        encoding = self.tokenizer(text,
                                  padding="max_length",
                                  max_length=self.max_len,
                                  truncation=True,
                                  return_tensors="pt")

        # return_tensors="pt" yields a leading batch dimension of 1; flatten
        # drops it so the items can be stacked into batches.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
        }

In [30]:
# Wrap both splits; max_len is left at the EventDataset default.
train_ds = EventDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_ds = EventDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

In [31]:
# Multi-Label Classification Evaluation Metrics
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction
import torch


def multi_labels_metrics(predictions, labels, threshold=0.3):
  """Compute macro ROC AUC, Hamming loss and macro F1 for multi-label logits.

  Args:
    predictions: Raw model outputs (logits), one row per sample and one
      column per label.
    labels: Ground-truth indicator matrix with the same layout.
    threshold: Probability at or above which a label counts as predicted.

  Returns:
    Dict with the keys ``roc_auc``, ``hamming_loss`` and ``f1``.
  """
  # Independent per-label probabilities from the logits.
  sigmoid = torch.nn.Sigmoid()
  probs = sigmoid(torch.Tensor(predictions)).numpy()

  # Hard 0/1 decisions, used by the threshold-dependent metrics.
  y_pred = (probs >= threshold).astype(float)
  y_true = labels

  f1 = f1_score(y_true, y_pred, average = 'macro')
  # ROC AUC ranks samples by score, so it must receive the probabilities;
  # feeding it thresholded 0/1 predictions collapses the curve to one point.
  roc_auc = roc_auc_score(y_true, probs, average = 'macro')
  hamming = hamming_loss(y_true, y_pred)

  metrics = {
      "roc_auc": roc_auc,
      "hamming_loss": hamming,
      "f1": f1
  }

  return metrics

def compute_metrics(p:EvalPrediction):
  """Adapt an EvalPrediction to ``multi_labels_metrics``.

  When ``p.predictions`` is a tuple, only its first element is used as the
  predictions; ``p.label_ids`` supplies the ground truth.
  """
  if isinstance(p.predictions, tuple):
    logits = p.predictions[0]
  else:
    logits = p.predictions

  return multi_labels_metrics(predictions=logits, labels=p.label_ids)

In [32]:
# Training Arguments
from transformers import TrainingArguments, Trainer

# Hyper-parameters and checkpointing settings for the fine-tuning run.
# NOTE(review): no evaluation strategy is passed, so whether `compute_metrics`
# runs during `train()` depends on the TrainingArguments defaults — confirm.
args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
)

# Tie together the model, both dataset splits and the metric callback.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

In [33]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

  0%|          | 0/195 [01:08<?, ?it/s]
  3%|▎         | 5/195 [00:20<13:04,  4.13s/it]

KeyboardInterrupt: 