In [None]:
import os
import requests
from bs4 import BeautifulSoup
import time

# Base URL of the MedlinePlus encyclopedia articles; an article ID followed
# by '.htm' is appended to it to build each page URL.
base_url = 'https://medlineplus.gov/ency/article/'

# Directory in which the per-disease text files are written.
save_dir = './data/medlineplus/'

# Create the save directory (including missing parents) before anything is
# written; exist_ok=True makes this a no-op when it is already there.
os.makedirs(save_dir, exist_ok=True)

def get_disease_info(article_id, timeout=10):
    """Fetch one MedlinePlus article and extract its name and definition.

    Args:
        article_id: Article identifier as it appears in the URL; it is
            inserted between ``base_url`` and ``.htm``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A dict with the keys ``'name'`` (stripped text of the first ``<h1>``)
        and ``'definition'`` (stripped text of the first
        ``<div class="section">``, or ``"No definition found."`` when there
        is none). Returns ``None`` when the request fails or the page has no
        ``<h1>`` heading.
    """
    disease_url = f'{base_url}{article_id}.htm'

    # Only the network calls can raise RequestException, so keep the try
    # body limited to them.
    try:
        response = requests.get(disease_url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching disease info from {disease_url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # A page without an <h1> would otherwise raise AttributeError and abort
    # the caller's loop; treat it like any other unusable page.
    heading = soup.find('h1')
    if heading is None:
        print(f"No disease name found at {disease_url}")
        return None

    disease_name = heading.text.strip()
    print(f"Fetching info for: {disease_name}")  # Debugging print

    # The first <div class="section"> is taken as the definition text.
    definition_section = soup.find('div', class_='section')
    if definition_section:
        definition = definition_section.text.strip()
    else:
        definition = "No definition found."

    return {
        'name': disease_name,
        'definition': definition,
    }

def save_disease_info(disease_info, directory=None):
    """Write a disease's name and definition to a UTF-8 text file.

    The file name is derived from ``disease_info['name']`` by keeping only
    alphanumeric characters, spaces and underscores, then trimming
    surrounding whitespace. If nothing usable remains, ``untitled`` is used
    so the result is never the hidden file ``.txt``.

    Args:
        disease_info: Mapping with the keys ``'name'`` and ``'definition'``.
        directory: Folder to write into. Defaults to the module-level
            ``save_dir`` when omitted.
    """
    if directory is None:
        directory = save_dir
    # Make sure the target exists even if it was removed after start-up or a
    # custom directory was supplied.
    os.makedirs(directory, exist_ok=True)

    # Create a valid filename by removing characters that are not allowed in
    # filenames; strip both ends so removed leading punctuation does not
    # leave a leading space behind.
    filename = "".join(
        c for c in disease_info['name'] if c.isalnum() or c in (' ', '_')
    ).strip()
    if not filename:
        filename = "untitled"
    filepath = os.path.join(directory, f"{filename}.txt")

    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(f"Name: {disease_info['name']}\n")
        file.write(f"Definition: {disease_info['definition']}\n")

    print(f"Saved: {filepath}")

def main():
    """Download and save the articles with IDs 0 through 19.

    Each numeric ID is zero-padded to six digits, fetched with
    ``get_disease_info`` and, when something was returned, stored with
    ``save_disease_info``. The loop pauses one second after every article,
    whether or not it was saved.
    """
    # Sample ID range; widen it to crawl more articles.
    for number in range(20):
        padded_id = str(number).zfill(6)  # e.g. 7 -> '000007'
        info = get_disease_info(padded_id)
        if info:
            save_disease_info(info)
        # Throttle requests so the server is not overloaded.
        time.sleep(1)

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()