# Projet PubMedNLP

**Élaboré par :** Nadia BEN YOUSSEF

---

## Phase d'annotation des articles extraits 

##### Bibliothèques

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import pipeline
import torch
import json
from torch.utils.data import Dataset, DataLoader
import re
import os

##### Lecture du Fichier JSON

In [None]:
def load_articles(file_path='/Users/nadiabenyoussef/Projet NLP/scraping/articles.json'):
    """
    Load the scraped articles from a JSON file.

    Prints a progress message, the number of loaded entries and, when the
    collection is not empty, a pretty-printed preview of its first entry.

    Args:
        file_path: Path of the JSON file to read. Defaults to the location
            used by this notebook.

    Returns:
        The decoded JSON content, or None when the file cannot be read,
        is not valid JSON, or does not have the expected list-like shape.
    """
    print("Chargement des articles...")
    try:
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            articles = json.load(file)
        print(f"Nombre d'articles chargés : {len(articles)}")
        # An empty collection is a valid result: skip the preview instead of
        # failing on articles[0] and reporting a load error.
        if articles:
            print("\nExemple d'article :")
            print(json.dumps(articles[0], indent=2))
        return articles
    except (OSError, ValueError, TypeError, LookupError) as e:
        # OSError: unreadable file; ValueError: invalid JSON or encoding;
        # TypeError / LookupError: content that is not a list of articles.
        print(f"Erreur lors du chargement : {e}")
        return None

# Load the article collection once; `articles` is None if loading failed.
articles = load_articles()

Chargement des articles...
Nombre d'articles chargés : 949

Exemple d'article :
{
  "Article Title": "FBP1 controls liver cancer evolution from senescent MASH hepatocytes.",
  "pmid": "39743585",
  "authors": "Gu L, Zhu Y, Nandi SP, Lee M, Watari K, Bareng B, Ohira M, Liu Y, Sakane S, Carlessi R, Sauceda C, Dhar D, Ganguly S, Hosseini M, Teneche MG, Adams PD, Gonzalez DJ, Kisseleva T; Liver Cancer Collaborative; Tirnitz-Parker JEE, Simon MC, Alexandrov LB, Karin M.",
  "source": "Nature",
  "publication_date": "2025 Jan 1.",
  "abstract": "Hepatocellular carcinoma (HCC) originates from differentiated hepatocytes undergoing compensatory proliferation in livers damaged by viruses or metabolic-dysfunction-associated steatohepatitis (MASH)1. While increasing HCC risk2, MASH triggers p53-dependent hepatocyte senescence3, which we found to parallel hypernutrition-induced DNA breaks. How this tumour-suppressive response is bypassed to license oncogenic mutagenesis and enable HCC evolution was

In [None]:
def check_structure(file_path='/Users/nadiabenyoussef/Projet NLP/scraping/articles.json'):
    """
    Print the field names and full content of the first article.

    Args:
        file_path: Path of the JSON file to inspect. Defaults to the
            location used by this notebook.

    Returns:
        The first article of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid JSON.
        IndexError: If the file holds an empty list.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        articles = json.load(file)

    print("Structure du premier article:")
    first_article = articles[0]
    # Iterating a dict yields its keys; no need for .keys().
    for key in first_article:
        print(f"- {key}")

    print("\nContenu du premier article:")
    print(json.dumps(first_article, indent=2))

    return first_article

# Inspect the schema of the scraped data; keeps the first article for reference.
first_article = check_structure()

Structure du premier article:
- Article Title
- pmid
- authors
- source
- publication_date
- abstract
- doi
- keywords

Contenu du premier article:
{
  "Article Title": "FBP1 controls liver cancer evolution from senescent MASH hepatocytes.",
  "pmid": "39743585",
  "authors": "Gu L, Zhu Y, Nandi SP, Lee M, Watari K, Bareng B, Ohira M, Liu Y, Sakane S, Carlessi R, Sauceda C, Dhar D, Ganguly S, Hosseini M, Teneche MG, Adams PD, Gonzalez DJ, Kisseleva T; Liver Cancer Collaborative; Tirnitz-Parker JEE, Simon MC, Alexandrov LB, Karin M.",
  "source": "Nature",
  "publication_date": "2025 Jan 1.",
  "abstract": "Hepatocellular carcinoma (HCC) originates from differentiated hepatocytes undergoing compensatory proliferation in livers damaged by viruses or metabolic-dysfunction-associated steatohepatitis (MASH)1. While increasing HCC risk2, MASH triggers p53-dependent hepatocyte senescence3, which we found to parallel hypernutrition-induced DNA breaks. How this tumour-suppressive response is by

--------

#### Test sur 50 articles

##### Nettoyage

In [None]:
def clean_entity(text):
    """
    Clean and normalise an extracted entity string.

    Surrounding whitespace and trailing punctuation (. , ; :) are removed,
    and an immediate repetition such as "X and X ..." is collapsed to "X".

    Args:
        text: Raw answer text, or None.

    Returns:
        The cleaned string (possibly empty), or None when the input is None
        or the cleaned text has more than 10 words.
    """
    if text is None:
        return None

    # Strip trailing punctuation and any whitespace interleaved with it.
    text = text.strip().rstrip('.,;: ')
    # The captured phrase must be non-empty and delimited by word boundaries.
    # A group that can match the empty string would make the back-reference
    # trivially true and truncate every "X and Y" to "X".
    text = re.sub(r'\b(.+?)\s+and\s+\1\b.*', r'\1', text)
    if len(text.split()) > 10:
        return None
    return text

# Generic terms that must never be accepted as a disease on their own.
# Stored lowercased because they are compared against lowercased text.
_NON_DISEASE_TERMS = frozenset({
    'tryptophan', 'bacteria', 'virus', 'protein', 'vitamin', 'nad', 'nadh',
    'cholera', 'vibrio', 'microscopy', 'inflammation', 'biofilm', 'pathway',
    'study', 'research', 'analysis', 'method', 'cells', 'unknown', 'meat',
    'single-cell', 'rna', 'dna', 'sequencing', 'abstract', 'mice', 'model',
    'akt', 'progenitor', 'gluconeogenic', 'nutrition', 'metagenomic', 'immune',
    'disease', 'tumor', 'growth', 'prognosis', 'progression', 'mortality',
})

# Substrings that mark a multi-word phrase as a disease name.
_DISEASE_INDICATORS = (
    'cancer', 'disease', 'tumor', 'syndrome', 'carcinoma', 'disorder',
    'infection', 'itis', 'osis', 'emia', 'pathy', 'failure', 'injury',
    'sclerosis', 'deficiency',
)

# Names accepted as-is, including single-word ones.
_COMMON_DISEASES = frozenset({
    'hepatocellular carcinoma', 'breast cancer', 'gastric cancer', 'lung cancer',
    'glioblastoma', 'alzheimer\'s disease', 'cardiovascular disease', 'diabetes',
    'osteoarthritis', 'autoimmune diseases', 'inflammatory bowel disease',
    'pancreatic cancer', 'cervical cancer', 'endometriosis', 'ferroptosis',
})


def verify_disease(text, title=None, abstract=None):
    """
    Tell whether `text` looks like a specific disease name.

    A phrase is accepted when it is one of the known common diseases, or
    when it has at least two words and contains a disease indicator
    (e.g. "cancer", "syndrome", the suffix "itis"). Generic single terms
    and phrases longer than 5 words are rejected.

    Args:
        text: Candidate disease name.
        title: Article title. Accepted for call compatibility; the current
            rules do not use it.
        abstract: Article abstract. Accepted for call compatibility; the
            current rules do not use it.

    Returns:
        True if the candidate is accepted, False otherwise.
    """
    text_lower = text.lower()
    word_count = len(text.split())

    if text_lower in _NON_DISEASE_TERMS or word_count > 5:
        return False

    if text_lower in _COMMON_DISEASES:
        return True

    # The former title/abstract fallback also required membership in the
    # common-disease list, so it could never accept anything not already
    # accepted above; it has been removed as unreachable.
    return word_count > 1 and any(
        indicator in text_lower for indicator in _DISEASE_INDICATORS
    )

def verify_disease_answer(text, title, abstract):
    """
    Validate a question-answering result as a disease name.

    Args:
        text: Answer text; a list of strings is joined with spaces.
        title: Article title, forwarded to `verify_disease`.
        abstract: Article abstract, forwarded to `verify_disease`.

    Returns:
        The stripped answer when it is accepted, otherwise None. Answers
        are rejected when empty, longer than 5 words, containing a noise
        term (study, cells, model, ...), using "disease"/"tumor" as
        anything other than the head noun of a longer name, or refused by
        `verify_disease`.
    """
    if text is None:
        return None
    if isinstance(text, list):
        text = ' '.join(text)

    text = str(text).strip()
    if not text or len(text.split()) > 5:
        return None

    lowered = text.lower()

    # Terms that disqualify an answer wherever they occur in it.
    noise_terms = ('study', 'research', 'analysis', 'investigation', 'method', 'reagent',
                   'interaction', 'cells', 'sequencing', 'technology', 'mice', 'model',
                   'implications', 'progenitor', 'nutrition', 'metagenomic',
                   'prognosis', 'progression')
    if any(term in lowered for term in noise_terms):
        return None

    # "disease" and "tumor" are generic alone or as a modifier
    # ("disease severity", "tumor growth") but legitimate as the head noun
    # of a name ("Alzheimer's disease", "autoimmune diseases"). A plain
    # substring test would reject those names even though verify_disease
    # explicitly lists them as known diseases.
    words = lowered.split()
    for head in ('disease', 'tumor'):
        if head in lowered:
            is_named = len(words) > 1 and words[-1] in (head, head + 's')
            if not is_named:
                return None

    return text if verify_disease(text, title, abstract) else None

def clean_disease_responses(diseases):
    """
    Deduplicate disease names and keep at most the 3 most specific ones.

    A name is dropped when, ignoring case, it equals or is contained in a
    name that is kept ("cancer" is dropped in favour of "breast cancer").
    Specificity is the number of words, then the length.

    Args:
        diseases: Iterable of disease name strings.

    Returns:
        A set of up to 3 names, or {"unspecified disease"} when nothing
        remains.
    """
    # Most specific first. A name can never have more words, or be longer,
    # than a name that contains it, so every container is visited before
    # the names it subsumes, whatever the input order.
    ranked = sorted(
        (disease for disease in diseases if disease.strip()),
        key=lambda name: (-len(name.split()), -len(name), name.lower()),
    )

    kept = []
    for disease in ranked:
        disease_lower = disease.lower()
        if any(disease_lower in other.lower() for other in kept):
            continue
        kept.append(disease)

    return set(kept[:3]) if kept else {"unspecified disease"}

def find_disease_in_title(title):
    """
    Return the first phrase of the title that is accepted as a disease.

    Patterns are tried in order; within one pattern, matches are examined
    from left to right. Each match is submitted to `verify_disease`, and the
    first accepted one is returned. None is returned when nothing qualifies.
    """
    disease_patterns = (
        r'(\w+\s)?cancer',
        r'(\w+\s)?carcinoma',
        r'(\w+\s)?disease',
        r'(\w+\s)?disorder',
        r'(\w+\s)?syndrome',
        r'\w+itis',
        r'\w+osis',
        r'\w+emia',
        r'\w+pathy',
    )

    # Lazy stream of candidates, so verification stops at the first success.
    phrases = (
        hit.group(0)
        for pattern in disease_patterns
        for hit in re.finditer(pattern, title, re.IGNORECASE)
    )
    return next(
        (phrase for phrase in phrases if verify_disease(phrase, title, None)),
        None,
    )

##### Définition du modèle BioBERT

In [None]:
class QuestionDataset(Dataset):
    """
    Flat collection of question-answering examples.

    For every article, one example is created per question. The context of
    each example is the article title followed by its abstract. Within one
    article, the DISEASE questions come first, followed by the questions of
    the remaining categories in the order of the `questions` mapping.
    """

    def __init__(self, articles, questions):
        self.examples = []
        for article in articles:
            context = f"{article['Article Title']}. {article['abstract']}"

            # DISEASE is queued first; every other category follows.
            ordered = [('DISEASE', questions['DISEASE'])]
            ordered.extend(
                (name, prompts)
                for name, prompts in questions.items()
                if name != 'DISEASE'
            )

            for name, prompts in ordered:
                for prompt in prompts:
                    self.examples.append({
                        'pmid': article['pmid'],
                        'question': prompt,
                        'context': context,
                        'category': name,
                    })

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

def load_articles(limit=50, file_path='/Users/nadiabenyoussef/Projet NLP/scraping/articles.json'):
    """
    Load the first `limit` articles from a JSON file.

    Args:
        limit: Maximum number of articles to return; None returns all.
        file_path: Path of the JSON file to read. Defaults to the location
            used by this notebook.

    Returns:
        The leading slice of the article list.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        articles = json.load(file)
    return articles[:limit]

In [None]:
# Load the BioBERT checkpoint fine-tuned for extractive question answering.
print("Initialisation du modèle BioBERT...")
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1-squad")
model = AutoModelForQuestionAnswering.from_pretrained("dmis-lab/biobert-large-cased-v1.1-squad")

# Device index handed to the pipeline below: 0 when CUDA is available, -1 otherwise.
device = 0 if torch.cuda.is_available() else -1
if device == 0:
    model = model.to('cuda')
    print("Modèle chargé sur GPU")
else:
    print("Modèle chargé sur CPU")

# Question-answering pipeline shared by the annotation cells.
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=device,
    batch_size=16
)
print("Pipeline configuré")

# Questions asked against every article, grouped by annotation category.
# The category names are used as keys of the annotation results.
QUESTIONS = {
    'DISEASE': [
        "What disease or medical condition is mentioned?",
        "What type of cancer is discussed?",
        "What is the main disease being studied?",
        "What specific tumor type is mentioned?",
        "What pathology is described?",
        "What medical condition is investigated?",
        "What disease is the focus of this research?",
        "What syndrome or disorder is discussed?",
        "What disease or disorder is being treated?",
        "What medical condition does this study focus on?",
        "What is the pathological condition studied?",
        "What disease or syndrome is being investigated?",
        "What specific medical diagnosis is discussed?",
        "What health condition is being researched?",
        "What is the main pathology in this study?",
        "What disease are the researchers studying?",
        "What neurodegenerative disease is discussed?",
        "What type of cancer is being investigated?",
        "What chronic condition is being studied?", 
        "Based on the title and description, what is the primary disease investigated?",
        "Reading the full text, what is the main medical condition being researched?",
        "Considering both the title and abstract, what disease is being studied?",
        "What disease are the researchers studying in this work?",
        "What is the primary medical condition discussed in this research?",
        "Which specific disease or disorder is being investigated here?",
        "What is the main pathology addressed in this study?",
        "What medical condition is the focus of this research?"
    ],
    'CELL_TYPE': [
        "What cell types are mentioned?",
        "What are all the cell types described in the text?",
        "What specific types of stem cells are discussed?",
        "What type of cells are being studied?",
        "What cellular components are investigated?",
        "What type of immune cells are mentioned?",
        "What specific cell populations are studied?",
        "What cell lineages are discussed?"
    ]
}

Initialisation du modèle BioBERT...


config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/467k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

Modèle chargé sur GPU
Pipeline configuré


#### Prétraitement des données

In [5]:
def refine_disease_text(text):
    """
    Remove filler phrases from an extracted disease answer.

    The fragments "is a", "highly" and "associated with" are deleted
    regardless of case, then whitespace is normalised so that a removal in
    the middle of the phrase does not leave a double space behind.

    Args:
        text: Answer text to clean.

    Returns:
        The cleaned text, with single spaces between words and no leading
        or trailing whitespace.
    """
    for filler in (r'\bis\s+a\b', r'\bhighly\b', r'\bassociated with\b'):
        text = re.sub(filler, '', text, flags=re.IGNORECASE)
    # split()/join collapses the gaps left by the removals and trims both ends.
    return ' '.join(text.split())

def clean_duplicate_diseases(diseases):
    """
    Remove redundant disease names and keep at most the 3 most specific ones.

    Parenthesised parts (e.g. "(HCC)") are stripped first. Names that
    differ only by case are merged, keeping the first form encountered.
    A name contained in a longer kept name is dropped ("cancer" is dropped
    in favour of "breast cancer"). Specificity is the number of words,
    then the length.

    Args:
        diseases: Iterable of disease name strings.

    Returns:
        A set of up to 3 names, or {"unspecified disease"} when nothing
        remains.
    """
    # lowercased name -> first spelling encountered
    first_forms = {}
    for disease in diseases:
        clean_disease = re.sub(r'\s*\(.*?\)', '', disease).strip()
        if clean_disease:
            first_forms.setdefault(clean_disease.lower(), clean_disease)

    # Most specific first. A name can never have more words, or be longer,
    # than a name that contains it, so every container is visited before
    # the names it subsumes, whatever the input order.
    ranked = sorted(
        first_forms,
        key=lambda lowered: (-len(lowered.split()), -len(lowered), lowered),
    )

    kept = []
    for lowered in ranked:
        if not any(lowered in longer for longer in kept):
            kept.append(lowered)

    cleaned = {first_forms[lowered] for lowered in kept[:3]}
    return cleaned if cleaned else {"unspecified disease"}

# Terms whose presence anywhere in a candidate disqualifies it.
_EXCLUDED_CELL_TERMS = frozenset({
    "technology", "research", "biomarker", "analysis", "study",
    "genetics", "biology", "method", "pathway",
    "unknown", "control", "data", "inflammation", "cancer",
    "meat", "abstract", "sequencing", "mice", "model", "implications",
    "nutrition", "metagenomic", "disease", "growth", "prognosis",
})

# Acronyms are matched as whole words on the lowercased candidate. A bare
# substring test would also hit ordinary words such as "maternal".
_EXCLUDED_CELL_ACRONYMS = re.compile(r'\b(?:rna|dna)\b')

# A candidate must contain one of these to be kept.
_VALID_CELL_TYPES = frozenset({
    "hepatocytes", "stem cells", "immune cells", "plasma cells",
    "macrophages", "b-lineage cells", "t cells", "glial cells",
    "neurons", "epithelial cells", "tumor cells", "progenitor cells",
    "hcc progenitor", "differentiated hepatocytes", "somatic cells",
    "endothelial cells", "fibroblasts", "mast cells", "nk cells",
})


def filter_cell_types(cell_types):
    """
    Filter CELL_TYPE candidates down to plausible cell type names.

    A candidate is kept when it has at most 3 words, contains no excluded
    term (matched case-insensitively) and contains one of the known cell
    types. Parenthesised parts are removed from the kept names.

    Args:
        cell_types: Iterable of candidate strings.

    Returns:
        A set with the retained, cleaned names.
    """
    filtered = set()

    for cell in cell_types:
        cell_lower = cell.lower()
        if len(cell.split()) > 3:
            continue
        if _EXCLUDED_CELL_ACRONYMS.search(cell_lower) or any(
            term in cell_lower for term in _EXCLUDED_CELL_TERMS
        ):
            continue

        clean_cell = re.sub(r'\s*\(.*?\)', '', cell).strip()
        # Removing text cannot add words, so the 3-word limit still holds.
        if clean_cell and any(valid in cell_lower for valid in _VALID_CELL_TYPES):
            filtered.add(clean_cell)

    return filtered

In [6]:
def _fallback_disease(title, abstract):
    """Pick a last-resort DISEASE label for an article with no validated answer."""
    disease_from_title = find_disease_in_title(title)
    if disease_from_title:
        return disease_from_title

    common_diseases = {'hepatocellular carcinoma', 'breast cancer', 'gastric cancer', 'lung cancer',
                       'glioblastoma', 'alzheimer\'s disease', 'cardiovascular disease', 'diabetes',
                       'osteoarthritis', 'autoimmune diseases', 'inflammatory bowel disease',
                       'pancreatic cancer', 'cervical cancer', 'endometriosis', 'ferroptosis'}
    context = (title + " " + (abstract or "")).lower()
    # Fixed scan order (longest name first, then alphabetical): iterating the
    # set directly would make the chosen label vary from run to run when
    # several names occur in the text.
    for disease in sorted(common_diseases, key=lambda name: (-len(name), name)):
        if disease in context:
            return disease
    return "unspecified disease"


def process_batch(articles, qa_pipeline):
    """
    Annotate articles with DISEASE and CELL_TYPE entities.

    Every question of `QUESTIONS` is asked against every article through
    `qa_pipeline`. Answers to DISEASE questions are validated and stored as
    diseases (at most 3 per article); answers to CELL_TYPE questions of at
    most 3 words are stored as cell type candidates. Articles left without
    a disease receive a fallback taken from the title, then from a list of
    known diseases, then the placeholder "unspecified disease". Both
    annotation sets are finally deduplicated and filtered.

    Args:
        articles: List of article dicts with the keys 'pmid',
            'Article Title' and 'abstract'.
        qa_pipeline: Callable invoked as `qa_pipeline(dataset, batch_size=16)`
            that yields, in dataset order, one dict with an 'answer' key
            per example.

    Returns:
        A dict mapping each PMID to
        {"pmid", "title", "annotations": {"DISEASE": set, "CELL_TYPE": set}}.
    """
    dataset = QuestionDataset(articles, QUESTIONS)
    all_answers = qa_pipeline(dataset, batch_size=16)

    # Index the articles once instead of scanning the list for every answer.
    # On duplicate PMIDs the first article wins, as a linear search would.
    articles_by_pmid = {}
    for article in articles:
        articles_by_pmid.setdefault(article['pmid'], article)

    results = {}
    for item, answer in zip(dataset.examples, all_answers):
        pmid = item['pmid']
        category = item['category']
        article = articles_by_pmid[pmid]

        if pmid not in results:
            results[pmid] = {
                "pmid": pmid,
                "title": article['Article Title'],
                "annotations": {"DISEASE": set(), "CELL_TYPE": set()}
            }
        annotations = results[pmid]["annotations"]

        answer_text = clean_entity(answer['answer'])
        if not answer_text:
            continue

        title = results[pmid]["title"]
        abstract = article['abstract']

        # Each answer only feeds the category of the question that produced
        # it, so a reply to a cell-type question is never stored as a disease.
        if category == "DISEASE":
            if len(annotations["DISEASE"]) >= 3:
                continue
            validated_disease = verify_disease_answer(answer_text, title, abstract)
            if validated_disease:
                refined_disease = refine_disease_text(validated_disease)
                # Re-check: refinement may have removed the disease indicator.
                if verify_disease(refined_disease, title, abstract):
                    annotations["DISEASE"].add(refined_disease)
        elif category == "CELL_TYPE":
            # answer_text is already cleaned; only the length limit remains.
            if len(answer_text.split()) <= 3:
                annotations["CELL_TYPE"].add(answer_text)

    for pmid, data in results.items():
        annotations = data["annotations"]

        # Guarantee at least one DISEASE entry per article.
        if not annotations["DISEASE"]:
            annotations["DISEASE"].add(
                _fallback_disease(data["title"], articles_by_pmid[pmid]['abstract'])
            )

        annotations["DISEASE"] = clean_duplicate_diseases(annotations["DISEASE"])
        annotations["CELL_TYPE"] = filter_cell_types(annotations["CELL_TYPE"])

    return results

In [7]:
def display_results(results):
    """
    Print the annotations of every article.

    For each entry, the PMID and title are printed, followed by the sorted
    items of each annotation group that is present and not empty.
    """
    sections = (("DISEASE", "DISEASE:"), ("CELL_TYPE", "CELL_TYPE:"))

    print("\nRésultats de l'annotation :")
    for pmid, data in results.items():
        print(f"\nPMID: {pmid}")
        print(f"Titre: {data['title']}")

        annotations = data["annotations"]
        for key, header in sections:
            # Missing and empty groups are both skipped.
            if key not in annotations or not annotations[key]:
                continue
            print(header)
            for entry in sorted(annotations[key]):
                print(f"  - {entry}")

In [58]:
# Trial run: annotate only the first 50 articles and print the outcome.
articles = load_articles(50)
results = process_batch(articles, qa_pipeline)
display_results(results)


Résultats de l'annotation :

PMID: 39743585
Titre: FBP1 controls liver cancer evolution from senescent MASH hepatocytes.
DISEASE:
  - Hepatocellular carcinoma
  - metabolic-dysfunction-associated steatohepatitis
CELL_TYPE:
  - HCC progenitor
  - HCC progenitor cells4,5
  - differentiated hepatocytes
  - hepatocytes
  - senescent MASH hepatocytes

PMID: 39743589
Titre: Aspartate signalling drives lung metastasis via alternative translation.
DISEASE:
  - breast cancer
CELL_TYPE:
  - immune cells

PMID: 39753770
Titre: Blood DNA virome associates with autoimmune diseases and COVID-19.
DISEASE:
  - multiple sclerosis

PMID: 39755327
Titre: Implications of an Off-Hours Setting in Patients Undergoing Transcatheter Edge-to-Edge Repair for Mitral Regurgitation.
DISEASE:
  - unspecified disease

PMID: 39753140
Titre: Infiltrating plasma cells maintain glioblastoma stem cells through IgG-Tumor binding.
DISEASE:
  - Glioblastoma
CELL_TYPE:
  - B-lineage cells
  - Plasma cells
  - glioblastoma st

-----

In [None]:
def load_articles(file_path='/Users/nadiabenyoussef/Projet NLP/scraping/articles.json'):
    """
    Load the complete article collection from a JSON file.

    Prints a progress message, the number of loaded entries and, when the
    collection is not empty, a pretty-printed preview of its first entry.

    Args:
        file_path: Path of the JSON file to read. Defaults to the location
            used by this notebook.

    Returns:
        The decoded JSON content, or None when the file cannot be read,
        is not valid JSON, or does not have the expected list-like shape.
    """
    print("Chargement des articles...")
    try:
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            articles = json.load(file)
        print(f"Nombre d'articles chargés : {len(articles)}")
        # An empty collection is a valid result: skip the preview instead of
        # failing on articles[0] and reporting a load error.
        if articles:
            print("\nExemple d'article :")
            print(json.dumps(articles[0], indent=2))
        return articles
    except (OSError, ValueError, TypeError, LookupError) as e:
        # OSError: unreadable file; ValueError: invalid JSON or encoding;
        # TypeError / LookupError: content that is not a list of articles.
        print(f"Erreur lors du chargement : {e}")
        return None

# Reload the whole collection (no limit) for the full annotation run.
articles = load_articles()

Chargement des articles...
Nombre d'articles chargés : 949

Exemple d'article :
{
  "Article Title": "FBP1 controls liver cancer evolution from senescent MASH hepatocytes.",
  "pmid": "39743585",
  "authors": "Gu L, Zhu Y, Nandi SP, Lee M, Watari K, Bareng B, Ohira M, Liu Y, Sakane S, Carlessi R, Sauceda C, Dhar D, Ganguly S, Hosseini M, Teneche MG, Adams PD, Gonzalez DJ, Kisseleva T; Liver Cancer Collaborative; Tirnitz-Parker JEE, Simon MC, Alexandrov LB, Karin M.",
  "source": "Nature",
  "publication_date": "2025 Jan 1.",
  "abstract": "Hepatocellular carcinoma (HCC) originates from differentiated hepatocytes undergoing compensatory proliferation in livers damaged by viruses or metabolic-dysfunction-associated steatohepatitis (MASH)1. While increasing HCC risk2, MASH triggers p53-dependent hepatocyte senescence3, which we found to parallel hypernutrition-induced DNA breaks. How this tumour-suppressive response is bypassed to license oncogenic mutagenesis and enable HCC evolution was

In [10]:
# Annotate every article currently held in `articles`.
#articles = load_articles()  
results = process_batch(articles, qa_pipeline)
print(f"Annotation terminée !")
#display_results(results) 

Annotation terminée !


In [None]:
# Preview only the first 10 annotated articles, in insertion order.
first_10_results = dict(list(results.items())[:10])
display_results(first_10_results)


Résultats de l'annotation :

PMID: 39743585
Titre: FBP1 controls liver cancer evolution from senescent MASH hepatocytes.
DISEASE:
  - Hepatocellular carcinoma
  - metabolic-dysfunction-associated steatohepatitis
CELL_TYPE:
  - HCC progenitor
  - HCC progenitor cells4,5
  - differentiated hepatocytes
  - hepatocytes
  - senescent MASH hepatocytes

PMID: 39743589
Titre: Aspartate signalling drives lung metastasis via alternative translation.
DISEASE:
  - breast cancer
CELL_TYPE:
  - immune cells

PMID: 39753770
Titre: Blood DNA virome associates with autoimmune diseases and COVID-19.
DISEASE:
  - multiple sclerosis

PMID: 39755327
Titre: Implications of an Off-Hours Setting in Patients Undergoing Transcatheter Edge-to-Edge Repair for Mitral Regurgitation.
DISEASE:
  - unspecified disease

PMID: 39753140
Titre: Infiltrating plasma cells maintain glioblastoma stem cells through IgG-Tumor binding.
DISEASE:
  - glioblastoma
CELL_TYPE:
  - B-lineage cells
  - Plasma cells
  - glioblastoma st

In [14]:
# JSON cannot encode sets: copy the results with every annotation set
# converted to a list.
results_serializable = {}
for pmid, data in results.items():
    results_serializable[pmid] = {
        "pmid": data["pmid"],
        "title": data["title"],
        "annotations": {
            "DISEASE": list(data["annotations"]["DISEASE"]),
            "CELL_TYPE": list(data["annotations"]["CELL_TYPE"])
        }
    }

In [None]:
# Write the serializable annotations to disk as indented JSON.
#output_file = '/kaggle/working/annotated_articles.json'
output_file = '/Users/nadiabenyoussef/Projet NLP/annotation/annotated_articles.json'
with open(output_file, 'w') as f:
    json.dump(results_serializable, f, indent=4)
print(f"Résultats sauvegardés dans {output_file}")

Résultats sauvegardés dans /kaggle/working/annotated_articles.json


In [None]:
#print("Fichiers dans /kaggle/working/ :", os.listdir('/kaggle/working/'))

Fichiers dans /kaggle/working/ : ['.virtual_documents', 'annotated_articles.json']


In [None]:
# Sanity check: read the saved file back and print its first 5 entries.
#output_file = '/kaggle/working/annotated_articles.json'
output_file = '/Users/nadiabenyoussef/Projet NLP/annotation/annotated_articles.json'

with open(output_file, 'r') as f:
    data = json.load(f)
first_5_entries = dict(list(data.items())[:5])

print("5 premières entrées du fichier annotated_articles.json :")
for pmid, info in first_5_entries.items():
    print(f"\nPMID: {pmid}")
    print(f"Titre: {info['title']}")
    print("DISEASE:", info['annotations']['DISEASE'])
    print("CELL_TYPE:", info['annotations']['CELL_TYPE'])

5 premières entrées du fichier annotated_articles.json :

PMID: 39743585
Titre: FBP1 controls liver cancer evolution from senescent MASH hepatocytes.
DISEASE: ['Hepatocellular carcinoma', 'metabolic-dysfunction-associated steatohepatitis']
CELL_TYPE: ['HCC progenitor cells4,5', 'HCC progenitor', 'differentiated hepatocytes', 'senescent MASH hepatocytes', 'hepatocytes']

PMID: 39743589
Titre: Aspartate signalling drives lung metastasis via alternative translation.
DISEASE: ['breast cancer']
CELL_TYPE: ['immune cells']

PMID: 39753770
Titre: Blood DNA virome associates with autoimmune diseases and COVID-19.
DISEASE: ['multiple sclerosis']
CELL_TYPE: []

PMID: 39755327
Titre: Implications of an Off-Hours Setting in Patients Undergoing Transcatheter Edge-to-Edge Repair for Mitral Regurgitation.
DISEASE: ['unspecified disease']
CELL_TYPE: []

PMID: 39753140
Titre: Infiltrating plasma cells maintain glioblastoma stem cells through IgG-Tumor binding.
DISEASE: ['glioblastoma']
CELL_TYPE: ['gli