In [3]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset
from transformers import DataCollatorWithPadding
from sklearn.metrics import precision_score, recall_score, fbeta_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import EarlyStoppingCallback
from transformers import get_linear_schedule_with_warmup
import torch.optim as optim
import torch
from transformers import AdamW
import torch.nn.functional as F

In [4]:
# Chargement des données

from google.cloud import bigquery

client = bigquery.Client(project="itg-cldataops-gbl-ww-pd")

query = f"""
    SELECT quote_id, topic_id, quote_text_normalized as quote_text, label
    FROM `itg-cldataops-gbl-ww-pd.bta_consumerloop_bqdset_labelled_data_eu_pd.tbl_topics_dataset_2024_08_30`
    WHERE country_id=3 and split="train" and topic_id in (548, 558)
"""

df = client.query(query).to_dataframe()

df



Unnamed: 0,quote_id,topic_id,quote_text,label
0,15225446003,548,"l appliqu etre un peu bizarr , pourquoi pas si...",True
1,1840193003,548,mais cel l avoir repulp un peu .,False
2,4603383003,548,point fort odeur facil d utilis po...,False
3,9409317003,548,dior avoir just chang le packaging ce derni an...,False
4,9063266003,548,mais cel demand un peu plus de temp que de s a...,False
...,...,...,...,...
12584,13640720003,548,a voir pour le autr essenc ... pas d' odeur de...,False
12585,9803863003,548,le pomp distribu un quantit suffis de produit .,True
12586,8098980003,548,en un clic on arriv a obten un quantit ampleme...,True
12587,9180479003,548,le bross etre gener et ne depos que le quantit...,True


In [5]:
topics=df["topic_id"].unique().tolist()

In [6]:
filtered_df = df[df['label'] == True]

# Grouper le DataFrame
grouped_df = df.groupby('quote_id').agg({
    'topic_id': lambda x: set(x),
    'quote_text': 'first'
}).reset_index()

# Créer le DataFrame avec les vecteurs binaire
mlb = MultiLabelBinarizer()
topic_matrix = mlb.fit_transform(grouped_df['topic_id'])

topic_df = pd.DataFrame(topic_matrix, columns=mlb.classes_)

# Créer la colonne 'labels'
true_labels = set(zip(filtered_df['quote_id'], filtered_df['topic_id']))

labels_matrix = [
    [1 if (quote_id, topic) in true_labels else 0 for topic in mlb.classes_]
    for quote_id in grouped_df['quote_id']
]

grouped_df['labels'] = labels_matrix


# Créer la colonne 'quote_associated'
all_combinations = set(zip(df['quote_id'], df['topic_id']))

associated_matrix = [
    [1 if (quote_id, topic) in all_combinations else 0 for topic in mlb.classes_]
    for quote_id in grouped_df['quote_id']
]

grouped_df['quote_associated'] = associated_matrix


result_df = grouped_df[['quote_id', 'quote_text', 'labels', 'quote_associated']]

result_df

Unnamed: 0,quote_id,quote_text,labels,quote_associated
0,200,mais c etre un pet quantit qui couvr beaucoup ! !,"[0, 0]","[1, 0]"
1,206,"le instruct disent d utilis un pet quantit , c...","[0, 0]","[1, 0]"
2,207,"en le voir deven bleu , vous pouvoir voir si l...","[0, 0]","[1, 0]"
3,208,j util un pet quantit et cel avoir aid a gard ...,"[0, 0]","[1, 0]"
4,209,"mais ensuit j avoir achet , et croi moi , vous...","[0, 0]","[1, 0]"
...,...,...,...,...
12560,379580006001,"quand j avoir ouvr le boit , j etre excit , je...","[0, 1]","[0, 1]"
12561,384072502001,mais j avoir avoir de mal a comprendr comment ...,"[0, 1]","[0, 1]"
12562,405098231001,mon seul problem etre que je trouv vrai diffic...,"[0, 0]","[0, 1]"
12563,414014706001,je devoir prevoir 20 minut supplementair lorsq...,"[0, 1]","[0, 1]"


In [7]:
result_df['labels'] = result_df['labels'].apply(lambda x: [float(i) for i in x])
result_df['quote_associated'] = result_df['quote_associated'].apply(lambda x: [float(i) for i in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['labels'] = result_df['labels'].apply(lambda x: [float(i) for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['quote_associated'] = result_df['quote_associated'].apply(lambda x: [float(i) for i in x])


In [8]:
# Train-Val Split

train_df, val_df = train_test_split(
    result_df,
    test_size=0.2,
    random_state=42,
    stratify=result_df['labels']
)

In [9]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [10]:
model_path = 'microsoft/deberta-v3-small'

tokenizer = AutoTokenizer.from_pretrained(model_path)



In [11]:
def preprocess_function(row):
    tokenized_example = tokenizer(row["quote_text"], truncation=True, max_length=512)
    tokenized_example["labels"] = row["labels"]
    tokenized_example["quote_associated"] = row["quote_associated"]
    return tokenized_example

tokenized_dataset_train = train_dataset.map(preprocess_function)
tokenized_dataset_val = val_dataset.map(preprocess_function)

Map:   0%|          | 0/10052 [00:00<?, ? examples/s]

Map:   0%|          | 0/2513 [00:00<?, ? examples/s]

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = sigmoid(predictions)
    predictions = (predictions > 0.5).astype(int)
    
    scores_dict = {}

    for index, topic in enumerate(topics):
        topic_predictions = predictions[:, index].reshape(-1)
        topic_labels = labels[:, index].reshape(-1)
        
        precision = precision_score(topic_labels, topic_predictions, zero_division=0)
        recall = recall_score(topic_labels, topic_predictions, zero_division=0)
        fbeta = fbeta_score(topic_labels, topic_predictions, beta=0.33, zero_division=0)
        f1 = f1_score(topic_labels, topic_predictions, zero_division=0)

        topic_str = str(topic)

        scores_dict[topic_str] = {
        'precision': precision,
        'recall': recall,
        'fbeta': fbeta,
        'f1' : f1
    }
    
    return scores_dict

In [17]:
class CustomLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        
        print(inputs.keys())
        
        labels = inputs.get("labels")
        quote_associated = inputs.get("quote_associated")
                
        outputs = model(**inputs)
        logits = outputs.logits
        
        bce_loss = F.binary_cross_entropy_with_logits(logits, labels.float(), reduction='none')
        
        weighted_loss = bce_loss * quote_associated

        loss = weighted_loss.mean()
        
        return (loss, outputs) if return_outputs else loss

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_path, 
    num_labels=len(topics),
    problem_type = "multi_label_classification",
    hidden_dropout_prob=0.2
)

training_args = TrainingArguments(
   output_dir="weighted_loss",
   learning_rate=1e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=35,
   weight_decay=0.1,
   evaluation_strategy="epoch", #essayer avec step
   save_strategy="epoch",
   load_best_model_at_end=True,
   logging_dir='weighted_loss/logs',  # Répertoire de logs pour TensorBoard
    logging_steps=1,
    logging_first_step=True,  # Enregistrer les logs dès la première étape
)

num_train_epochs = 35
total_steps = len(train_dataset) // training_args.per_device_train_batch_size * num_train_epochs
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.1)
#optimizer = optim.Adam(model.parameters(), lr=2e-6)


scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps= total_steps * 0.1,
    num_training_steps=total_steps
)

trainer = CustomLossTrainer(
       model=model,
       args=training_args,
       train_dataset=tokenized_dataset_train,
       eval_dataset=tokenized_dataset_val,
       tokenizer=tokenizer,
       #data_collator=data_collator,
       compute_metrics=compute_metrics,
       optimizers=(optimizer, scheduler)
   )

trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

trainer.train()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])


TypeError: unsupported operand type(s) for *: 'Tensor' and 'NoneType'