In [1]:
import re
import json
from difflib import get_close_matches
import spacy
import nltk
from nltk.corpus import words
import random
from IPython.display import display, HTML
from spacy import displacy

In [2]:
# Déjà, je vais récuper tous les noms des pokémons pour corriger les typos du fichier NER-Pokemons.txt
# (Snorlx, Scyter, Garchommp et d'autres peut-être jsp.)
# https://github.com/sindresorhus/pokemon/blob/main/data/en.json
with open("all_pokemons.json", "r", encoding="utf-8") as file:
    all_pokemon_names = json.load(file)
#print(all_pokemon_names)

In [3]:
try:
    nltk.data.find('words')
except LookupError:
    nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\moqp3\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [4]:
# En gros là le but, c'est d'identifier les indices des caractères de la phrase, qui contiennent un nom de pokémon.
#{
#        "text": "Pikachu used Thunderbolt to defeat the wild Charmander",
#        "entities": [
#            [
#                0,
#                7,
#                "POKEMON"
#            ],
#            [
#                44,
#                54,
#                "POKEMON"
#            ]
#        ]
#    },
# De O à 7, on a pikachu, et de 44 à 54 on a charmander (salamèche)

# Liste des mots anglais
english_words = set(words.words())

# Liste des mots à exclure de la correction
exclude_words = ["shiny", "pokémon", "center"]

# Fonction pour corriger les fautes avec Levenshtein Distance
def correct_typo(word, pokemon_names, special_terms):
    if word.lower() in special_terms:
        return word.lower()
    
    # Vérifier si le mot est déjà un mot anglais
    if word.lower() in english_words:
        return word
    
    close_match = get_close_matches(word, pokemon_names, n=1, cutoff=0.8)
    
    # Correction uniquement si une correspondance proche est trouvée
    if close_match:
        return close_match[0]
    else:
        return word

# Fonction pour traiter et formater les données
def process_data(input_file, output_file, pokemon_names):
    json_data = []
    
    with open(input_file, 'r', encoding='utf-8') as file2:
        for line in file2:
            line = line.strip()
            if not line:
                continue

            # Extraction du texte
            match = re.match(r'^\d+\.\s*(.*)', line)
            if not match:
                continue
            extracted_text = match.group(1)

            # Utilisation de re.finditer pour capturer tous les mots, la ponctuation et les positions
            words_and_punctuation = list(re.finditer(r'\w+|[^\w\s]', extracted_text))

            corrected_text = []
            
            for item in words_and_punctuation:
                word = item.group(0)
                start_idx = item.start()
                end_idx = item.end()
                
                corrected_word = correct_typo(word, pokemon_names, exclude_words)
                corrected_text.append((corrected_word, start_idx, end_idx))

            # Reconstruction du texte et mise à jour des indices
            final_text = ""
            last_end = 0
            new_entities = []

            for corrected_word, start_idx, end_idx in corrected_text:
                final_text += extracted_text[last_end:start_idx] + corrected_word
                new_start_idx = len(final_text) - len(corrected_word)  # Nouvel index début
                new_end_idx = len(final_text)  # Nouvel index fin
                last_end = end_idx  # Met à jour la dernière position

                # Vérifier si c'est un Pokémon et ajouter les nouvelles coordonnées
                if corrected_word in pokemon_names:
                    new_entities.append([new_start_idx, new_end_idx, "POKEMON"])

            # Ajout de la dernière portion du texte après le dernier mot
            final_text += extracted_text[last_end:]

            # Sauvegarde des résultats dans le format json
            json_data.append({
                "text": final_text,
                "entities": new_entities
            })

    # Écriture du fichier JSON
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(json_data, outfile, ensure_ascii=False, indent=4)

#entree = 'NER-Pokemons.txt'  
#sortie = 'formatted_pokemon_data.json'  
entree='larger_NER-Pokemons.txt'
sortie='larger_formatted_pokemon_data.json'
process_data(entree, sortie, all_pokemon_names)

In [5]:
# Ici une petite preuve que ça fonctionne correctement
def extract_pokemon_names(json_file):

    with open(json_file, 'r', encoding='utf-8') as file3:
        formated_data = json.load(file3)

    for line in formated_data:
        sentence = line['text']
        pokemons_entities = line['entities']

        print(f"Phrase étudiée : {sentence}")

        for entity in pokemons_entities:
            start_index = entity[0]
            end_index = entity[1]
            pokemon_name = sentence[start_index:end_index]
            print(f"  Nom du Pokémon : {pokemon_name}")

#extract_pokemon_names('formatted_pokemon_data.json')
extract_pokemon_names('larger_formatted_pokemon_data.json')

Phrase étudiée : Pikachu used Thunderbolt to defeat the wild Charmander.
  Nom du Pokémon : Pikachu
  Nom du Pokémon : Charmander
Phrase étudiée : Ash's favorite pokémon is Charizard because of its powerful Flamethrower.
  Nom du Pokémon : Charizard
Phrase étudiée : Bulbasaur and Squirtle teamed up to battle against a wild Gengar.
  Nom du Pokémon : Bulbasaur
  Nom du Pokémon : Squirtle
  Nom du Pokémon : Gengar
Phrase étudiée : Team Rocket tried to capture Eevee, but it quickly escaped.
  Nom du Pokémon : Eevee
Phrase étudiée : Snorlax blocked the road, forcing the trainer to find another path.
  Nom du Pokémon : Snorlax
Phrase étudiée : Gyarados appeared from the water and scared all the nearby trainers.
  Nom du Pokémon : Gyarados
Phrase étudiée : Mewtwo is known as one of the most powerful legendary pokémon.
  Nom du Pokémon : Mewtwo
Phrase étudiée : Psyduck always has a headache, which makes it use Confuson randomly.
  Nom du Pokémon : Psyduck
Phrase étudiée : Jigglypuff sang a lu

In [6]:
# Maintenant, je vais transposer mon json formaté en data d'entrainement exploitable pour spacy
with open("larger_formatted_pokemon_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Transformer en format spaCy
TRAIN_DATA = []
for entry in data:
    text = entry["text"]
    entities = [(start, end, label) for start, end, label in entry["entities"]]
    TRAIN_DATA.append((text, {"entities": entities}))

print(TRAIN_DATA[:2])

[('Pikachu used Thunderbolt to defeat the wild Charmander.', {'entities': [(0, 7, 'POKEMON'), (44, 54, 'POKEMON')]}), ("Ash's favorite pokémon is Charizard because of its powerful Flamethrower.", {'entities': [(26, 35, 'POKEMON')]})]


In [7]:
# Je vais utiliser le modèle spacy
nlp=None
ner=None
try:
    # Attempt to load the model
    nlp = spacy.load("en_core_web_md")
    ner = nlp.get_pipe("ner")
    print("Model loaded successfully.")

except OSError as e:
    print(f"Error loading model: {e}")
    print("It seems the 'en_core_web_sm' model is not downloaded or not accessible.")
    print("Attempting to download the model...")
    try:
        # Download the model
        spacy.cli.download("en_core_web_sm")
        print("Model downloaded successfully.")
        # Load the model again after downloading
        nlp = spacy.load("en_core_web_sm")
        ner = nlp.get_pipe("ner")
        print("Model loaded successfully after download.")
    except Exception as download_error:
        print(f"Error downloading or loading the model: {download_error}")
        print("Please ensure you have an internet connection and try again.")
        print("You can also try downloading the model manually using the command:")
        print("python -m spacy download en_core_web_sm")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Model loaded successfully.


In [8]:
# Check if nlp and ner are defined before using them
if nlp is not None and ner is not None:
    # Ici j'ajoute le label pokémon
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

In [9]:
# Entraînement du modèle
optimizer = nlp.create_optimizer()
optimizer.learn_rate = 0.005

n_iter = 10
losses = {}

# Initialize losses for each pipe in the pipeline
for pipe_name in nlp.pipe_names:
    losses[pipe_name] = 0.0

for i in range(n_iter):
    random.shuffle(TRAIN_DATA)
    batches = spacy.util.minibatch(TRAIN_DATA, size=8)
    total_loss = 0.0
    for batch in batches:
        for text, annotations in batch:
            doc = nlp.make_doc(text)
            example = spacy.training.Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.3, losses=losses, sgd=optimizer)

    # Calculate the average loss for this iteration
    iteration_loss = sum(losses.values()) / len(losses) if len(losses) > 0 else 0.0
    print(f"Losses at iteration {i}: {losses} - Average Loss: {iteration_loss}")

    # Reset losses for the next iteration
    for pipe_name in losses:
        losses[pipe_name] = 0.0
        
nlp.to_disk("C:/Users/moqp3/Desktop/catho/S2/NLP/nlp/tp4/NER_results")

Losses at iteration 0: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'attribute_ruler': 0.0, 'lemmatizer': 0.0, 'ner': 116.66741919643233} - Average Loss: 19.444569866072055
Losses at iteration 1: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'attribute_ruler': 0.0, 'lemmatizer': 0.0, 'ner': 51.219086370127194} - Average Loss: 8.536514395021198
Losses at iteration 2: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'attribute_ruler': 0.0, 'lemmatizer': 0.0, 'ner': 45.45839686851091} - Average Loss: 7.576399478085151
Losses at iteration 3: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'attribute_ruler': 0.0, 'lemmatizer': 0.0, 'ner': 25.00497204273036} - Average Loss: 4.167495340455059
Losses at iteration 4: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'attribute_ruler': 0.0, 'lemmatizer': 0.0, 'ner': 20.64320806789741} - Average Loss: 3.440534677982902
Losses at iteration 5: {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'attribute_ruler': 0.0, 'lemmatizer': 0.0, 'ner': 23.600893454

In [10]:
import spacy
import json

def visualize_ner_predictions(trained_model, sentence_to_predict, list_of_pokemon_names):
    try:
        trained_nlp = spacy.load(trained_model)
    except OSError as error:
        print(f"Error loading model from {trained_model}: {error}")
        print("Please ensure the model path is correct and the model is saved there.")
        return

    processed_sentence = trained_nlp(sentence_to_predict)
    # Filter out entities that are not in all_pokemon_names
    valid_ents = [entity for entity in processed_sentence.ents if entity.text in list_of_pokemon_names]
    processed_sentence.ents = tuple(valid_ents)

    if not processed_sentence.ents:
        print("No entities found in the text.")
        return

    print("Entities found:")
    for entity in processed_sentence.ents:
        print(f"  - {entity.text} ({entity.label_})")

    colors = {"POKEMON": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
    options = {"ents": ["POKEMON"], "colors": colors}
    displacy.render(processed_sentence, style="ent", jupyter=True, options=options)

    highlighted_text = ""
    last_end = 0
    for token in processed_sentence:
        if token.text in list_of_pokemon_names and token.ent_type_ != "POKEMON":
            highlighted_text += sentence_to_predict[last_end:token.idx] + f"<mark style='background: lightgreen; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;'>{token.text}</mark>"

        last_end = token.idx + len(token.text) #Force last_end to update every loop.

    if last_end <= len(sentence_to_predict):
        highlighted_text += sentence_to_predict[last_end:]

    if highlighted_text != sentence_to_predict:
        highlighted_text+="\n<mark style='background:#d64b4e; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;'>Pokémon names highlighted (not detected as entities):</mark>"
        display(HTML(data=highlighted_text))

# Example usage (replace with your actual model path and text)
model_path = "C:/Users/moqp3/Desktop/catho/S2/NLP/nlp/tp4/NER_results"  # Replace with the path to your saved model
test_text = "Dragonite and Charizard are my favorite pokémon. I also like Pachirisu and Snorlax. What about you? I saw a wild Bulbasaur and a shiny Pikachu. I also saw a wild Pidgeot."
# Load all_pokemon_names if you haven't already
if 'all_pokemon_names' not in locals() and 'all_pokemon_names' not in globals():
    with open("all_pokemons.json", "r", encoding="utf-8") as file:
        all_pokemon_names = json.load(file)

visualize_ner_predictions(model_path, test_text, all_pokemon_names)

Entities found:
  - Dragonite (POKEMON)
  - Charizard (POKEMON)
  - Pachirisu (POKEMON)
  - Snorlax (POKEMON)
  - Bulbasaur (POKEMON)
  - Pikachu (POKEMON)
  - Pidgeot (POKEMON)


In [11]:
# The "Pokémon names highlighted (not detected as entities)" section is about names that:
# Are present in all_pokemon_names: This means they are known pokémon names according to the full list.
# Are NOT recognized as "POKEMON" entities by the trained NER model: this indicates that the model failed to identify them as such during its processing of the text.