<a href="https://colab.research.google.com/github/ljpetkovic/Charcot_KeyBERT_Keyphrase-Vectorizers/blob/main/scripts/KeyBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extraction des mots/phrases-clés avec `keybert` et `keyphrase-vectorizers`
---

# 1️⃣ `keybert`
* librairie Python pour extraire les mots/phrases-clés les plus similaires à un document en exploitant les plongements BERT<br>
⚠️ on doit spécifier la longueur des n-grammes à extraire, alors que l'on ne sait pas quelle est la longueur optimale<br>
⚠️ la grammaticalité des phrases n'est pas prise en compte

<p align="right"><a href="https://towardsdatascience.com/enhancing-keybert-keyword-extraction-results-with-keyphrasevectorizers-3796fa93f4db">Schopf, 2022</a></p>


In [30]:
# !pip install keybert

import os

import torch  # print(torch.__version__)
from google.colab import drive
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT

# Sentence model initialisation: this model derives semantically meaningful
# sentence embeddings, which can then be compared with cosine similarity.
sentence_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")

# KeyBERT instance that scores candidate keywords with the sentence model above
kw_model = KeyBERT(model=sentence_model)

# Mount Google Drive so the corpus stored there is reachable from Colab
drive.mount('/content/drive')

# Input and output locations; `file_path` is resolved relative to `path`
path = '/content/drive/MyDrive/ObTIC/ateliers/extraction_mots_cles/corpus/'
file_name = 'echantillon_charcot.txt'
file_path = '../output/test_keybert.txt'

# Keyword extraction
# 1) Splitting the text into slices of `chunk_size` lines
chunk_size = 200  # lower the number of lines per slice if the RAM runs out

# The corpus is accented French text, so the encoding is stated explicitly
# instead of relying on the platform default.
with open(os.path.join(path, file_name), 'r', encoding='utf-8') as myfile, \
     open(os.path.join(path, file_path), 'w', encoding='utf-8') as outfile:
    raw_data = myfile.readlines()
    # Iterate over slice starts rather than comparing an `end` index with the
    # length: the final partial slice (and a file shorter than one slice) is
    # processed instead of being silently skipped.
    for start in range(0, len(raw_data), chunk_size):
        data = " ".join(raw_data[start:start + chunk_size])
        keywords = kw_model.extract_keywords(
            data,
            keyphrase_ngram_range=(1, 3),
            stop_words=None,
            use_mmr=True,
            diversity=0.7,
        )
        for k in keywords:
            print(k)
            # Write each (keyword, score) tuple to the output file
            outfile.write(str(k) + '\n')

# 2) Passing the whole file at once
# (kept as a comment, not a bare string literal, so that it is not echoed as
# the cell's result)

# with open(os.path.join(path, file_name), 'r', encoding='utf-8') as myfile, \
#      open(os.path.join(path, file_path), 'w', encoding='utf-8') as outfile:
#     data = myfile.readlines(10000) # stop reading once roughly the first 10,000 characters are in
#     data = [line.strip('\n') for line in data]
#     data = ' '.join(data)
#     # print(data)
#     keywords = kw_model.extract_keywords(data, keyphrase_ngram_range=(1, 3), stop_words=None, use_mmr=True, diversity=0.7)
#     for k in keywords:
#         print(k)
#         # Write each keyword to the output file
#         outfile.write(str(k) + '\n')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
('mo√´lle √©pini√®re 45', 0.3381)
('topographie anatomo pathologique', 0.2841)
('paul auteur date', 0.2187)
('texte explicatif mais', 0.1587)
('du syst√®me', -0.1271)
('scientifique les planches', 0.4944)
('chambre photographique horizontale', 0.1779)
('objectif de appareil', 0.1028)
('et nous lui', 0.086)
('parfaire cette √©dition', 0.0582)
('ant√©rieure corne post√©rieure', 0.486)
('les cellules sont', 0.2331)
('ii canal central', 0.1677)
('16 et', 0.0078)
('fourni cet auteur', 0.0044)
('post√©rieure cordon post√©rieur', 0.5078)
('ces pi√®ces proviennent', 0.0734)
('25 et', 0.0328)
('m√©thode de weigert', -0.0611)
('extr√™mement intenses seules', -0.0653)
('avoisinante cellule ganglionnaire', 0.3836)
('la pr√©paration pr√©c√©dente', 0.2169)
('ros√©s figure 30', 0.1267)
('voit la diminution', 0.1194)
('m√™me sujet planche', 0.0422)
('faisceau pyramidal intac

# 2️⃣ PatternRank
* `keybert` + `keyphrase-vectorizers` = PatternRank<br>
❇️ pas besoin de spécifier la longueur des n-grammes à extraire, car la librairie l'infère elle-même<br>
❇️ la grammaticalité des phrases est prise en compte grâce à l'extraction des parties du discours (p. ex. `<N.*>*<ADJ.*>*<ADJ.*>+` → _sclérose latérale amyotrophique_)
* _cf._ [Schopf _et al._, 2022](https://arxiv.org/pdf/2210.05245.pdf) et [Schopf, 2022](https://towardsdatascience.com/enhancing-keybert-keyword-extraction-results-with-keyphrasevectorizers-3796fa93f4db)



In [31]:
# !pip install keyphrase-vectorizers
# !pip install keybert
# !pip install flair

from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT
from flair.embeddings import TransformerDocumentEmbeddings
import os
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')

# Corpus folder and file names (adjust to your own Drive layout);
# the output file name is given relative to `path`
path = '/content/drive/MyDrive/ObTIC/ateliers/extraction_mots_cles/corpus/'
input_file_name = 'echantillon_charcot.txt'
output_file_name = '../output/output_pattern_rank.txt'

# Multilingual KeyBERT model, backed by flair transformer document embeddings
document_embeddings = TransformerDocumentEmbeddings('google-bert/bert-base-multilingual-cased')
kw_model = KeyBERT(model=document_embeddings)

# Vectorizer settings: candidates are one or more noun-tagged tokens followed by
# zero or more adjective-tagged tokens, using the French spaCy pipeline
vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline='fr_core_news_lg',
    pos_pattern='<N.*>+<ADJ.*>*',
    stop_words='french',
)

# Number of corpus lines joined into each slice handed to KeyBERT
chunk_size = 22

# The corpus is accented French text, so the encoding is stated explicitly
# instead of relying on the platform default.
with open(os.path.join(path, input_file_name), 'r', encoding='utf-8') as input_file, \
     open(os.path.join(path, output_file_name), 'w', encoding='utf-8') as output_file:
    raw_data = input_file.readlines()
    # range() visits every slice start, so all the data (including the last,
    # shorter slice) is processed.
    for start in range(0, len(raw_data), chunk_size):
        # Join the lines and flatten the remaining newlines into spaces
        data = " ".join(raw_data[start:start + chunk_size]).replace('\n', ' ')
        try:
            # Extract the keyphrases for this slice
            kp = kw_model.extract_keywords(data, vectorizer=vectorizer)
        except ValueError as e:
            # `start` is still the offset of the slice that failed, so the
            # message points at the right place in the corpus.
            print(f"An error occurred while processing chunks starting at line {start}: {e}")
            # Optionally, write a placeholder or handle the error differently
            # output_file.write("Pas de phrases-clés extraites pour cette tranche.\n")
        else:
            for k in kp:
                print(k)
                output_file.write(str(k) + '\n')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
('45 planches', 0.8495)
('mo√´lle √©pini√®re', 0.814)
('h√©liogravure', 0.809)
('anatomie pathologique', 0.8019)
('cote', 0.7404)
('texte explicatif', 0.8046)
('soci√©t√© clinique', 0.7987)
('service photographique', 0.7931)
('soci√©t√©', 0.7725)
('libraire', 0.7705)
('soin scrupuleux', 0.8969)
('pathologie spinale', 0.873)
('fa√ßon exacte', 0.8588)
('clinique', 0.8459)
('aise', 0.8457)
('foyer primitif', 0.8844)
('cordon mince', 0.8575)
('vue', 0.8527)
('fait distinctes', 0.8457)
('√©tude anatomique', 0.8405)
('travaux r√©cents', 0.9086)
('nettet√©', 0.8844)
('anatomie morbide', 0.884)
('avantage inestimable', 0.8773)
('soin', 0.8544)
('anatomie pathologique contemplative', 0.8684)
('terres inconnues', 0.8463)
('attention √©gale', 0.844)
('vue', 0.8425)
('clinique', 0.8416)
('centre ovale', 0.8727)
('mani√®re √©clatante', 0.8573)
('clinique', 0.8448)
('le√ßo

# 📶 Trier les scores par ordre décroissant