# Projet PubMedNLP

**Élaboré par :** Nadia BEN YOUSSEF

---

## Phase de Scraping

Site : https://pubmed.ncbi.nlm.nih.gov/

In [None]:
from bs4 import BeautifulSoup
import requests
import time
from datetime import datetime
import json

In [None]:
def extract_keywords_from_url(url):
    """
    Extrait les mots-clés d'un article à partir de son URL.
    
    Args:
        url (str): L'URL de l'article.
        
    Returns:
        str: Mots-clés sous forme de texte séparés par des virgules ou 'NO_KEYWORDS' si aucun mot-clé n'est trouvé.
    """
    try:
        response = requests.get(url)
        response.raise_for_status() 
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        strong_tags = soup.find_all('strong', {'class': 'sub-title'})
        
        keywords_section = None
        for tag in strong_tags:
            if 'Keywords:' in tag.text.strip():
                keywords_section = tag
                break
        
        if keywords_section:
            keywords = keywords_section.find_parent('p').get_text()
            keywords = keywords.replace('Keywords:', '').strip()
            keywords_list = [keyword.strip() for keyword in keywords.split(';')]
            return ', '.join(keywords_list)
        else:
            return 'NO_KEYWORDS'
    
    except Exception as e:
        print(f"Error: {e}")
        return 'NO_KEYWORDS'

def scrape_pubmed_articles(max_pages=1, sleep_time=1):
    """
    Scrape PubMed articles with specified date filter.
    Args:
        max_pages (int): Maximum number of pages to scrape.
        sleep_time (int): Time to wait between requests to avoid being blocked.
    """
    base_url = "https://pubmed.ncbi.nlm.nih.gov/trending/"
    date_filter = "dates.2024-2025%2F1%2F7"
    articles = []
    
    try:
        for page in range(1, max_pages + 1):
            url = f"{base_url}?filter={date_filter}&page={page}"
            print(f"\nExtracting page {page}...")
            
            response = requests.get(url)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            article_blocks = soup.find_all('div', class_='docsum-content')
            
            if not article_blocks:
                print("No more articles found.")
                break
            
            for block in article_blocks:
                article = {}
                
                title_link = block.find('a', class_='docsum-title')
                if title_link and 'href' in title_link.attrs:
                    pmid = title_link['href'].strip('/').split('/')[-1]
                    article['pmid'] = pmid
                
                # Titre
                article['title'] = title_link.text.strip() if title_link else 'No title'
                
                # Auteurs
                authors_tag = block.find('span', class_='docsum-authors full-authors')
                article['authors'] = authors_tag.text.strip() if authors_tag else 'No authors'
                
                # Accéder à la page détaillée pour plus d'informations
                if title_link and 'href' in title_link.attrs:
                    full_url = f"https://pubmed.ncbi.nlm.nih.gov{title_link['href']}"
                    try:
                        detail_response = requests.get(full_url)
                        detail_response.raise_for_status()
                        detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
                        
                        source_button = detail_soup.find('button', {'class': 'journal-actions-trigger'})
                        if source_button:
                            article['source'] = source_button.get('title', 'No source')
                        else:
                            article['source'] = 'No source'
                        
                        date_span = detail_soup.find('span', class_='cit')
                        if date_span:
                            article['publication_date'] = date_span.text.strip()
                        else:
                            article['publication_date'] = 'No date'
                        
                        abstract_div = detail_soup.find('div', {'class': 'abstract-content', 'id': 'eng-abstract'})
                        if abstract_div:
                            abstract_p = abstract_div.find('p')
                            if abstract_p:
                                article['abstract'] = abstract_p.text.strip()
                            else:
                                article['abstract'] = abstract_div.text.strip()
                        else:
                            article['abstract'] = 'No abstract'
                        
                        doi_citation = detail_soup.find('span', class_='citation-doi')
                        if doi_citation:
                            article['doi'] = doi_citation.text.replace('doi:', '').strip().rstrip('.')
                        else:
                            doi_link = detail_soup.find('a', class_='id-link', attrs={'data-ga-action': 'DOI'})
                            if doi_link:
                                article['doi'] = doi_link.text.strip()
                            else:
                                article['doi'] = 'No DOI'
                        
                        article['keywords'] = extract_keywords_from_url(full_url)
                        time.sleep(sleep_time)
                        
                    except Exception as e:
                        print(f"Error accessing detail page: {e}")
                        continue
                
                articles.append(article)
                #print(f"Extracted article: {article['title'][:50]}...")
    
    except Exception as e:
        print(f"An error occurred: {e}")
    return articles

In [47]:
articles = scrape_pubmed_articles(max_pages=95)
'''for article in articles:
    print(f"Title: {article['title']}")
    print('-' * 80)'''


Extracting page 1...

Extracting page 2...

Extracting page 3...

Extracting page 4...

Extracting page 5...

Extracting page 6...

Extracting page 7...

Extracting page 8...

Extracting page 9...

Extracting page 10...

Extracting page 11...

Extracting page 12...

Extracting page 13...

Extracting page 14...

Extracting page 15...

Extracting page 16...

Extracting page 17...

Extracting page 18...

Extracting page 19...

Extracting page 20...

Extracting page 21...

Extracting page 22...

Extracting page 23...

Extracting page 24...

Extracting page 25...

Extracting page 26...

Extracting page 27...

Extracting page 28...

Extracting page 29...

Extracting page 30...

Extracting page 31...

Extracting page 32...

Extracting page 33...

Extracting page 34...

Extracting page 35...

Extracting page 36...

Extracting page 37...

Extracting page 38...

Extracting page 39...

Extracting page 40...

Extracting page 41...

Extracting page 42...

Extracting page 43...

Extracting page 44.

'for article in articles:\n    print(f"Title: {article[\'title\']}")\n    print(\'-\' * 80)'

In [43]:
def display_article_details(article):
    """
    Affiche les détails d'un article dans un format lisible.
    Args : article (dict): Un dictionnaire contenant les informations d'un article.
    """
    print(f"\nExtracted article details:")
    print(f"Title: {article['title']}")
    print(f"PMID: {article['pmid']}")
    print(f"Authors: {article['authors']}")
    print(f"Source: {article['source']}")
    print(f"Publication Date: {article['publication_date']}")
    print(f"Abstract: {article['abstract']}")
    print(f"DOI: {article['doi']}")
    print(f"Keywords: {article['keywords']}")
    print("-" * 80)

In [44]:
for article in articles :
    display_article_details(article)


Extracted article details:
Title: FBP1 controls liver cancer evolution from senescent MASH hepatocytes.
PMID: 39743585
Authors: Gu L, Zhu Y, Nandi SP, Lee M, Watari K, Bareng B, Ohira M, Liu Y, Sakane S, Carlessi R, Sauceda C, Dhar D, Ganguly S, Hosseini M, Teneche MG, Adams PD, Gonzalez DJ, Kisseleva T; Liver Cancer Collaborative; Tirnitz-Parker JEE, Simon MC, Alexandrov LB, Karin M.
Source: Nature
Publication Date: 2025 Jan 1.
Abstract: Hepatocellular carcinoma (HCC) originates from differentiated hepatocytes undergoing compensatory proliferation in livers damaged by viruses or metabolic-dysfunction-associated steatohepatitis (MASH)1. While increasing HCC risk2, MASH triggers p53-dependent hepatocyte senescence3, which we found to parallel hypernutrition-induced DNA breaks. How this tumour-suppressive response is bypassed to license oncogenic mutagenesis and enable HCC evolution was previously unclear. Here we identified the gluconeogenic enzyme fructose-1,6-bisphosphatase 1 (FBP1) 

In [48]:
def save_articles_info(articles, filename=None):
    """
    Sauvegarder les articles dans un fichier texte.
    Args:
        articles (list): Liste de dictionnaires contenant les informations des articles.
        filename (str, optional): Nom du fichier où les articles seront sauvegardés.
    """
    if not articles:
        print("No articles to save.")
        return

    if filename is None:
        filename = "articles.txt"
    
    with open(filename, 'w', encoding='utf-8') as f:
        for article in articles:
            f.write("\n" + "="*80 + "\n")
            f.write(f"Article Title: {article['title']}\n")
            for key, value in article.items():
                if key != 'title':  # On ne répète pas le titre
                    f.write(f"{key}: {value}\n")
            f.write("\n" + "="*80 + "\n")
    
    print(f"\nArticles saved to {filename}")

if __name__ == "__main__":
    save_articles_info(articles)


Articles saved to articles.txt


In [None]:
with open('/Users/nadiabenyoussef/Projet NLP/scraping/articles.txt', "r") as file:
    lines = file.readlines()
cleaned_lines = []
for i in range(len(lines)):
    if (
        i > 1
        and lines[i].strip() == "=" * 80
        and lines[i - 1].strip() == "" 
        and lines[i - 2].strip() == "=" * 80
    ):
        continue
    cleaned_lines.append(lines[i])
with open('/Users/nadiabenyoussef/Projet NLP/scraping/articles.txt', "w") as file:
    file.writelines(cleaned_lines)
print("Nettoyage terminé. Les lignes de séparateurs doubles avec ligne vide ont été supprimées.")

Nettoyage terminé. Les lignes de séparateurs doubles avec ligne vide ont été supprimées.


------

In [None]:
with open('/Users/nadiabenyoussef/Projet NLP/scraping/articles.txt', "r", encoding="utf-8") as file:
    articles = file.read()

In [None]:
with open('/Users/nadiabenyoussef/Projet NLP/scraping/articles.txt', "r", encoding="utf-8") as file:
    for i, line in enumerate(file):
        if i < 50:
            print(line.strip())
        else:
            break 

Article Title: FBP1 controls liver cancer evolution from senescent MASH hepatocytes.
pmid: 39743585
authors: Gu L, Zhu Y, Nandi SP, Lee M, Watari K, Bareng B, Ohira M, Liu Y, Sakane S, Carlessi R, Sauceda C, Dhar D, Ganguly S, Hosseini M, Teneche MG, Adams PD, Gonzalez DJ, Kisseleva T; Liver Cancer Collaborative; Tirnitz-Parker JEE, Simon MC, Alexandrov LB, Karin M.
source: Nature
publication_date: 2025 Jan 1.
abstract: Hepatocellular carcinoma (HCC) originates from differentiated hepatocytes undergoing compensatory proliferation in livers damaged by viruses or metabolic-dysfunction-associated steatohepatitis (MASH)1. While increasing HCC risk2, MASH triggers p53-dependent hepatocyte senescence3, which we found to parallel hypernutrition-induced DNA breaks. How this tumour-suppressive response is bypassed to license oncogenic mutagenesis and enable HCC evolution was previously unclear. Here we identified the gluconeogenic enzyme fructose-1,6-bisphosphatase 1 (FBP1) as a p53 target that

In [25]:
articles_list = articles.split("================================================================================")
print(f"Nombre d'articles : {len(articles_list)}")

Nombre d'articles : 950


In [26]:
print(articles_list[0])

Article Title: FBP1 controls liver cancer evolution from senescent MASH hepatocytes.
pmid: 39743585
authors: Gu L, Zhu Y, Nandi SP, Lee M, Watari K, Bareng B, Ohira M, Liu Y, Sakane S, Carlessi R, Sauceda C, Dhar D, Ganguly S, Hosseini M, Teneche MG, Adams PD, Gonzalez DJ, Kisseleva T; Liver Cancer Collaborative; Tirnitz-Parker JEE, Simon MC, Alexandrov LB, Karin M.
source: Nature
publication_date: 2025 Jan 1.
abstract: Hepatocellular carcinoma (HCC) originates from differentiated hepatocytes undergoing compensatory proliferation in livers damaged by viruses or metabolic-dysfunction-associated steatohepatitis (MASH)1. While increasing HCC risk2, MASH triggers p53-dependent hepatocyte senescence3, which we found to parallel hypernutrition-induced DNA breaks. How this tumour-suppressive response is bypassed to license oncogenic mutagenesis and enable HCC evolution was previously unclear. Here we identified the gluconeogenic enzyme fructose-1,6-bisphosphatase 1 (FBP1) as a p53 target that

-----

In [None]:
input_file = '"articles.txt"'
output_file = "articles.json"

with open(input_file, "r", encoding="utf-8") as file:
    data = file.read()

articles_raw = data.split("=" * 80)
articles = []

for article_raw in articles_raw:
    lines = article_raw.strip().split("\n")
    article_dict = {}
    for line in lines:
        if ":" in line:
            key, value = line.split(":", 1)
            article_dict[key.strip()] = value.strip()
    if article_dict:
        articles.append(article_dict)

with open(output_file, "w", encoding="utf-8") as json_file:
    json.dump(articles, json_file, ensure_ascii=False, indent=4)
print(f"Conversion terminée. Les articles sont enregistrés dans '{output_file}'.")

Conversion terminée. Les articles sont enregistrés dans 'articles.json'.
