<a href="https://colab.research.google.com/github/ludoveltz/test_github_fev25/blob/main/Exercice_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import numpy as np

class BERTNamedEntityRecognizer:
    def __init__(self):
        # Initialisation du modèle et du tokenizer
        self.model_name = 'dslim/bert-base-NER'
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)

        # Configuration du device (GPU si disponible, sinon CPU)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = self.model.to(self.device)

    def recognize_entities(self, text):
        # Tokenization avec les paramètres demandés
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Déplacement vers le device approprié
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Passage en mode évaluation et désactivation du gradient
        self.model.eval()
        with torch.no_grad():
            # Obtention des prédictions
            outputs = self.model(**inputs)
            predictions = outputs.logits.argmax(-1)

        # Conversion des prédictions en labels
        predictions = predictions[0].cpu().numpy()
        tokens = self.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        labels = [self.model.config.id2label[pred] for pred in predictions]

        # Extraction des entités avec le schéma B-I-O
        entities = []
        current_entity = {'text': [], 'type': None}

        for token, label in zip(tokens, labels):
            if label.startswith('B-'):  # Début d'une nouvelle entité
                if current_entity['text']:
                    entities.append({
                        'text': ' '.join(current_entity['text']).replace(' ##', ''),
                        'type': current_entity['type']
                    })
                current_entity = {'text': [token], 'type': label[2:]}

            elif label.startswith('I-') and current_entity['type'] == label[2:]:
                # Continuation d'une entité
                current_entity['text'].append(token)

            elif label == 'O':  # Token hors entité
                if current_entity['text']:
                    entities.append({
                        'text': ' '.join(current_entity['text']).replace(' ##', ''),
                        'type': current_entity['type']
                    })
                current_entity = {'text': [], 'type': None}

        # Ajout de la dernière entité si elle existe
        if current_entity['text']:
            entities.append({
                'text': ' '.join(current_entity['text']).replace(' ##', ''),
                'type': current_entity['type']
            })

        return entities

# Test du reconnaisseur d'entités
ner = BERTNamedEntityRecognizer()

# Texte d'exemple standard pour l'exercice
test_text = "John Smith works at Microsoft in New York, and he visited Google's headquarters in California last summer."

# Analyse des entités
results = ner.recognize_entities(test_text)

# Affichage des résultats
print("\nTexte analysé :")
print(test_text)
print("\nEntités reconnues :")
for entity in results:
    if entity['text'] not in ['[CLS]', '[SEP]', '[PAD]']:  # Filtrage des tokens spéciaux
        print(f"Type: {entity['type']}, Texte: {entity['text']}")



Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Texte analysé :
John Smith works at Microsoft in New York, and he visited Google's headquarters in California last summer.

Entités reconnues :
Type: PER, Texte: John Smith
Type: ORG, Texte: Microsoft
Type: LOC, Texte: New York
Type: ORG, Texte: Google
Type: LOC, Texte: California
Type: LOC, Texte: [PAD] [PAD] [PAD] [PAD]
Type: LOC, Texte: [PAD] [PAD] [PAD]
Type: LOC, Texte: [PAD] [PAD] [PAD]
Type: LOC, Texte: [PAD] [PAD]
Type: LOC, Texte: [PAD] [PAD]
Type: ORG, Texte: [PAD] [PAD]
Type: LOC, Texte: [PAD] [PAD]
Type: LOC, Texte: [PAD] [PAD] [PAD] [PAD]
Type: ORG, Texte: [PAD] [PAD]
Type: ORG, Texte: [PAD] [PAD]
Type: LOC, Texte: [PAD] [PAD] [PAD]
