<a href="https://colab.research.google.com/github/mdjamina/model_relatives_psr_fr/blob/model_v2/src/psr_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation modules

In [None]:
!pip install datasets
!pip install -U transformers[torch]
!pip install -U accelerate

# Chargement du dataset

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset

#chargement de dataset à partir de Hugging Face
raw_datasets = load_dataset("djamina/relatives_psr")

In [4]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'psr_tags', 'psr_seq_tags'],
        num_rows: 392
    })
    validation: Dataset({
        features: ['id', 'tokens', 'psr_tags', 'psr_seq_tags'],
        num_rows: 99
    })
    test: Dataset({
        features: ['id', 'tokens', 'psr_tags', 'psr_seq_tags'],
        num_rows: 55
    })
})

# Préparation des données

In [5]:
task = "psr" # Should be one of "psr" or "psr_seq"

model_checkpoint = "almanach/camembert-large"
batch_size = 16

## Tokenizer




In [None]:
 #Importation de la classe AutoTokenizer de la bibliothèque 'transformers'
from transformers import CamembertTokenizerFast


tokenizer = CamembertTokenizerFast.from_pretrained(model_checkpoint, force_download=True)

In [9]:
# Vérification si le tokenizer est une version rapide

tokenizer.is_fast

True

### Traitement des labels

In [10]:
def show(words, labels,label_names):
  line1 = ""
  line2 = ""
  for word, label in zip(words, labels):
      full_label = label_names[label]
      max_length = max(len(word), len(full_label))
      line1 += word + " " * (max_length - len(word) + 1)
      line2 += full_label + " " * (max_length - len(full_label) + 1)

  # Affichage aligné des tokens et de leurs tags
  print(line1)
  print(line2)

In [11]:

# Extraction des features 'psr_tags' à partir du dataset
label_names = raw_datasets["train"].features[f"{task}_tags"].feature.names
print(f'label_names : {label_names}')
print("-----------------")

words = raw_datasets["train"][15]["tokens"]
labels = raw_datasets["train"][15][f"{task}_tags"]

show(words, labels,label_names)



label_names : ['O', 'DET', 'APPO', 'AMBIGUE']
-----------------
Notre peuple n' est pas formé d' un peu plus de soixante millions d' individus qui cohabiteraient . Il est indivisible , précisément parce que ce qui le tient est plus fort que de simples règles ou des organisations . C' est un engagement chaque jour répété qui fait que notre citoyenneté n' est jamais abstraite et froide , mais qu ' elle est pleine et entière par ce lien fraternel qui nous unit et dont nous devons retrouver la vigueur . 
O     O      O  O   O   O     O  O  O   O    O  O        O        O  O         DET O              O O  O   O           O O           O     O   O  DET O  O     O   O    O    O   O  O       O      O  O   O             O O  O   O  O          O      O    O      O   O    O   O     O           O  O   O      O         O  O      O O    O  O O    O   O      O  O       O   O  O    O         DET O    O    O  DET  O    O      O         O  O       O 


In [12]:
# cette méthode permet d'aligner les labels avec les nouveaux tokens générés par le tokenizer

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [13]:
# cette méthode permet de tokeniser le dataset (par ligne) et d'aligner les labels avec les tokens générés par le tokenizer

def tokenize_and_align_labels(rows):
    tokenized_inputs = tokenizer(
        rows["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = rows[f"{task}_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs




### Tokenization

Application de la méthode 'tokenize_and_align_labels' sur le dataset 'raw_datasets'


pour obtenir le dataset 'tokenized_datasets' qui sera utilisé pour l'entraînement du modèle

In [None]:
# psr_tags

tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [None]:
# psr_tags_seq
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 281
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 71
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 88
    })
})

In [None]:
print(tokenized_datasets["train"][15]["labels"])

[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


# Evaluate

In [16]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Méthode pour calculer les métriques d'évaluation
# On utilise les métriques de classification NER (Named Entity Recognition)
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [
        [label_names[l] for l in label if l != -100] for label in labels
    ]
    true_predictions = [
        [label_names[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Flatten the lists to calculate metrics
    true_labels_flat = [item for sublist in true_labels for item in sublist]
    true_predictions_flat = [item for sublist in true_predictions for item in sublist]

    results = {
        "precision": precision_score(true_labels_flat, true_predictions_flat, average='macro',zero_division=1),
        "recall": recall_score(true_labels_flat, true_predictions_flat, average='macro'),
        "f1": f1_score(true_labels_flat, true_predictions_flat, average='macro'),
        "accuracy": accuracy_score(true_labels_flat, true_predictions_flat),
    }
    return results

# Train

In [17]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

In [21]:
label_names = raw_datasets["train"].features[f"{task}_tags"].feature.names

id2label = {x:y for x,y in enumerate(label_names)}
label2id = {y:x for x,y in enumerate(label_names)}
label2id

{'O': 0, 'DET': 1, 'APPO': 2, 'AMBIGUE': 3}

In [22]:
from transformers import CamembertConfig, CamembertForTokenClassification

# Récupérer les noms des labels

num_labels = len(label_names)

# Définir la configuration du modèle
config = CamembertConfig(
    vocab_size=500_000,  # Taille du vocabulaire
    num_labels=num_labels,  # Nombre de labels dans votre dataset
    hidden_size=768,  # Taille des embeddings
    num_attention_heads=12,  #le lien entre les tokens, jusqu'au 12 eme token
    num_hidden_layers=12,  # Nombre de couches
)

config.label2id = label2id
config.id2label = id2label

# Initialiser le modèle avec la configuration
model = CamembertForTokenClassification(config)

In [None]:
label_names

['O', 'DET', 'APPO', 'AMBIGUE']

In [24]:
from transformers import TrainingArguments

model_name = "relatives_psr"
model_name =  f"relatives_{task}"

args = TrainingArguments(
    model_name,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    #weight_decay=0.01,
    push_to_hub=False,

)

In [25]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)


In [26]:
history = trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.060219,0.739755,0.630885,0.56918,0.976851
2,No log,0.058347,0.682092,0.630401,0.595208,0.978886
3,No log,0.058382,0.660397,0.609801,0.561016,0.977614
4,No log,0.056385,0.586838,0.610673,0.598057,0.976088
5,No log,0.053619,0.664043,0.645029,0.648263,0.980412


In [None]:
trainer.push_to_hub(commit_message="Training complete")

## Inference