In [1]:
import pandas as pd
from labels import LABELS

In [2]:
data = "/teamspace/studios/this_studio/max/dataset_resume.csv"
#data = "./dataset_resume.csv"
df = pd.read_csv(data, usecols=LABELS + ['description', 'resume'])


In [3]:
import numpy as np


def setup(df_orig: pd.DataFrame) -> pd.DataFrame:
    df = df_orig.copy()
    df = df.fillna(0)

    for index, row in df.iterrows():
        # s'il y a un résumé, on l'utilise comme description
        df.loc[index, "description"] = (
                row.resume
                if isinstance(row.resume, str)
                else row.description
            )

    df['label'] = df[LABELS].values.tolist()
    df = df.rename(columns={'description': 'text'})
    label = [list_label for list_label in df.label]
    return df.text.tolist(), label#.astype('float32')

features = df[LABELS]

texts, labels = setup(df)

In [4]:
import torch
from transformers import CamembertTokenizer, AutoTokenizer
from transformers import CamembertForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [5]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=.2, random_state=42)

In [6]:
checkpoint = "almanach/camembert-base"
tokenizer = CamembertTokenizer.from_pretrained(checkpoint)
model = CamembertForSequenceClassification.from_pretrained(checkpoint, num_labels=len(labels[0]), problem_type="multi_label_classification")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at almanach/camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [134]:
class EventDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=324):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        label = torch.tensor(self.labels[index])

        encoding = self.tokenizer(text, 
                                  padding="max_length", 
                                  max_length=self.max_len,
                                  truncation=True,
                                  return_tensors="pt")

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
        }

In [135]:
train_ds = EventDataset(train_texts, train_labels, tokenizer)
val_ds = EventDataset(val_texts, val_labels, tokenizer)

In [136]:
# Multi-Label Classification Evaluation Metrics
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction
import torch


def multi_labels_metrics(predictions, labels, threshold=0.3):
  sigmoid = torch.nn.Sigmoid()
  probs = sigmoid(torch.Tensor(predictions))

  y_pred = np.zeros(probs.shape)
  y_pred[np.where(probs>=threshold)] = 1
  y_true = labels

  f1 = f1_score(y_true, y_pred, average = 'macro')
  roc_auc = roc_auc_score(y_true, y_pred, average = 'macro')
  hamming = hamming_loss(y_true, y_pred)

  metrics = {
      "roc_auc": roc_auc,
      "hamming_loss": hamming,
      "f1": f1
  }

  return metrics

def compute_metrics(p:EvalPrediction):
  preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

  result = multi_labels_metrics(predictions=preds,
                                labels=p.label_ids)

  return result

In [137]:
# Training Arguments
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

args = TrainingArguments(
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    output_dir = './results',
    num_train_epochs=300,
    save_steps=1000,
    save_total_limit=2,
    eval_steps=10,
    evaluation_strategy = "epoch",
    save_strategy = 'epoch',
    load_best_model_at_end=True
)

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_ds,
                  eval_dataset = val_ds, 
                  compute_metrics=compute_metrics,
                  callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
                  )

In [138]:
trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc,Hamming Loss,F1,Runtime,Samples Per Second,Steps Per Second
1,No log,0.166125,0.767039,0.058226,0.569341,0.4593,169.836,8.71
2,No log,0.163735,0.761228,0.056624,0.558799,0.4614,169.063,8.67
3,No log,0.168069,0.76142,0.060363,0.557858,0.4621,168.805,8.657
4,No log,0.169989,0.762445,0.055556,0.566533,0.4603,169.445,8.69
5,No log,0.168844,0.772779,0.057692,0.584103,0.4637,168.221,8.627
6,No log,0.16633,0.75809,0.059295,0.555409,0.4635,168.285,8.63
7,No log,0.169071,0.76736,0.05609,0.575887,0.4623,168.728,8.653


TrainOutput(global_step=91, training_loss=0.043861721898173237, metrics={'train_runtime': 52.2528, 'train_samples_per_second': 1791.29, 'train_steps_per_second': 74.637, 'total_flos': 363707751633408.0, 'train_loss': 0.043861721898173237, 'epoch': 7.0})

In [139]:
trainer.evaluate()

{'eval_loss': 0.163735494017601,
 'eval_roc_auc': 0.7612276623072908,
 'eval_hamming_loss': 0.056623931623931624,
 'eval_f1': 0.5587993872052183,
 'eval_runtime': 0.4631,
 'eval_samples_per_second': 168.424,
 'eval_steps_per_second': 8.637,
 'epoch': 7.0}

In [38]:
trainer.save_model("/teamspace/studios/this_studio/max/camembert-tourism-events")

In [39]:
import pickle
with open("/teamspace/studios/this_studio/max/features.pkl", "wb") as f:
  pickle.dump(features, f)

In [132]:
text = """Après de multiples tournées à succès et plus de 2 millions de billets vendus, Messmer, connu et reconnu comme le Maître Mondial de l’Hypnose revient près de chez vous !

Dans 13Hz, Messmer vous invite à entrer dans son mystérieux et hilarant univers où la frontière entre la réalité et l’illusion s’efface, pour diriger vos pensées vers des territoires inconnus.

Avec sa présence charismatique inégalée et son talent exceptionnel, le recordman en hypnose collective avec 1066 personnes hypnotisées en moins de 5 minutes, vous plonge au cœur de vos pensées les plus profondes avant de vous guider à travers un jeu subtil d’ondes cérébrales à 13Hz.

Le fascinateur vous entraîne vers un état de conscience unique où la volonté et le contrôle de nos vies prennent une nouvelle dimension.

Osez découvrir l’expérience Messmer, où la maîtrise de soi et la fascination se rencontrent."""

encoding = tokenizer(text, return_tensors='pt')
encoding.to(trainer.model.device)

outputs = trainer.model(**encoding)

In [133]:
import json
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(outputs.logits[0].cpu()).detach().numpy()
preds = np.zeros(probs.shape)
##preds[np.where(probs>=0.3)] = 1


def sort_dict_by_value(dict1):
  """Trie un dictionnaire par ordre décroissant des valeurs."""
  sorted_dict = {x: y for x, y in sorted(dict1.items(), key=lambda item: item[1], reverse=True)}
  return sorted_dict
  
preds = '{'
for idx, label in enumerate(LABELS):
    preds += f'"{label}": {probs[idx]},'

preds = preds.strip(',')
preds += '}'
preds = json.loads(preds)
sort_dict_by_value(preds)
#probs.
#features.inverse_transform(preds.reshape(1,-1))

{'Détente': 0.5166682600975037,
 'Atelier': 0.3122008740901947,
 'Spectacle': 0.09285923093557358,
 'Santé': 0.08189744502305984,
 'Concert': 0.07728264480829239,
 'Famille': 0.07240385562181473,
 'Danse': 0.048642776906490326,
 'Jeu': 0.028794407844543457,
 'Théatre': 0.017140071839094162,
 'Sport': 0.014915966428816319,
 'Exposition': 0.012590804137289524,
 'Culture': 0.009760606102645397,
 'Environnement': 0.009404394775629044,
 'Art': 0.009255075827240944,
 'Festival': 0.008814648725092411,
 'Fête': 0.00797022134065628,
 'Brocante': 0.007022770121693611,
 'Gastronomie': 0.006755351088941097,
 'Visite': 0.005003015510737896,
 'Balade': 0.003963761031627655,
 'Action': 0.0037308589089661837,
 'Marché': 0.0035765995271503925,
 'Conférence': 0.002429952146485448,
 'Histoire': 0.0015247827395796776}