# Download Raw Data

This notebook downloads the raw HTML pages from the links specified in the README and saves them under `data/raw`.


In [2]:
import requests
from pathlib import Path
import os
from urllib.parse import urlparse


In [3]:
# Define the data sources to download
data_sources = [
    {
        "url": "https://medlineplus.gov/diabetes.html",
        "condition": "diabetes",
        "type": "condition",
        "filename": "diabetes_medlineplus_overview.html"
    },
    {
        "url": "https://medlineplus.gov/highbloodpressure.html",
        "condition": "hypertension",
        "type": "condition",
        "filename": "hypertension_medlineplus_overview.html"
    },
    {
        "url": "https://medlineplus.gov/diabetesmedicines.html",
        "condition": "diabetes",
        "type": "treatment",
        "filename": "diabetes_medlineplus_treatment.html"
    },
    {
        "url": "https://medlineplus.gov/bloodpressuremedicines.html",
        "condition": "hypertension",
        "type": "treatment",
        "filename": "hypertension_medlineplus_treatment.html"
    }
]


In [4]:
# Define the base directory
base_dir = Path("../data/raw")
base_dir.mkdir(parents=True, exist_ok=True)


In [5]:
def download_and_save(url, filepath):
    """
    Downloads content from a URL and saves it to a file.
    
    Args:
        url: URL to download
        filepath: Path where to save the file
    """
    try:
        print(f"Downloading: {url}")
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise exception if HTTP error occurs
        
        # Create directory if it doesn't exist
        filepath.parent.mkdir(parents=True, exist_ok=True)
        
        # Save the content
        with open(filepath, 'wb') as f:
            f.write(response.content)
        
        print(f"✓ Saved to: {filepath}")
        print(f"  Size: {len(response.content)} bytes\n")
        return True
    except requests.exceptions.RequestException as e:
        print(f"✗ Error downloading {url}: {e}\n")
        return False


In [6]:
# Download all data
results = []

for source in data_sources:
    # Determine the path based on type
    if source["type"] == "condition":
        save_dir = base_dir / "health_education" / "conditions" / source["condition"]
    elif source["type"] == "treatment":
        save_dir = base_dir / "health_education" / "treatments" / source["condition"]
    else:
        save_dir = base_dir / "health_education" / source["type"] / source["condition"]
    
    filepath = save_dir / source["filename"]
    
    success = download_and_save(source["url"], filepath)
    
    results.append({
        "url": source["url"],
        "filepath": str(filepath),
        "success": success
    })


Downloading: https://medlineplus.gov/diabetes.html
✓ Saved to: ../data/raw/health_education/conditions/diabetes/diabetes_medlineplus_overview.html
  Size: 177127 bytes

Downloading: https://medlineplus.gov/highbloodpressure.html
✓ Saved to: ../data/raw/health_education/conditions/hypertension/hypertension_medlineplus_overview.html
  Size: 173310 bytes

Downloading: https://medlineplus.gov/diabetesmedicines.html
✓ Saved to: ../data/raw/health_education/treatments/diabetes/diabetes_medlineplus_treatment.html
  Size: 95497 bytes

Downloading: https://medlineplus.gov/bloodpressuremedicines.html
✓ Saved to: ../data/raw/health_education/treatments/hypertension/hypertension_medlineplus_treatment.html
  Size: 89025 bytes



In [7]:
# Summary of downloads
print("=" * 60)
print("DOWNLOAD SUMMARY")
print("=" * 60)

successful = sum(1 for r in results if r["success"])
total = len(results)

print(f"\nTotal: {total} files")
print(f"Successful: {successful}")
print(f"Failed: {total - successful}")

print("\n" + "-" * 60)
print("Saved files:")
print("-" * 60)

for result in results:
    status = "✓" if result["success"] else "✗"
    print(f"{status} {result['filepath']}")


DOWNLOAD SUMMARY

Total: 4 files
Successful: 4
Failed: 0

------------------------------------------------------------
Saved files:
------------------------------------------------------------
✓ ../data/raw/health_education/conditions/diabetes/diabetes_medlineplus_overview.html
✓ ../data/raw/health_education/conditions/hypertension/hypertension_medlineplus_overview.html
✓ ../data/raw/health_education/treatments/diabetes/diabetes_medlineplus_treatment.html
✓ ../data/raw/health_education/treatments/hypertension/hypertension_medlineplus_treatment.html


In [8]:
# Verify that files were saved correctly
print("\n" + "=" * 60)
print("FILE VERIFICATION")
print("=" * 60)

for result in results:
    if result["success"]:
        filepath = Path(result["filepath"])
        if filepath.exists():
            size = filepath.stat().st_size
            print(f"✓ {filepath.name}: {size:,} bytes")
        else:
            print(f"✗ {filepath.name}: File not found")



FILE VERIFICATION
✓ diabetes_medlineplus_overview.html: 177,127 bytes
✓ hypertension_medlineplus_overview.html: 173,310 bytes
✓ diabetes_medlineplus_treatment.html: 95,497 bytes
✓ hypertension_medlineplus_treatment.html: 89,025 bytes
