### Download the QM9 dataset

In [1]:
import sys
from pathlib import Path

# Resolve the project root: when the working directory is notebooks/,
# the root is its parent (thesis/); otherwise the working directory is used as-is.
project_root = Path.cwd()
if project_root.name == "notebooks":
    project_root = project_root.parent
sys.path.append(str(project_root))

# Imported only after sys.path has been extended with the project root.
from src.data.download import download_qm9

# Toggle: when True, an existing QM9 file is deleted before download_qm9() is called.
QM9_FORCE_REDOWNLOAD = False
qm9_path = project_root.joinpath("data", "QM9", "dsgdb9nsd.xyz")

if QM9_FORCE_REDOWNLOAD:
    # Banner: blank line, rule, title, rule.
    print("\n" + "=" * 50, "QM9 FORCE REDOWNLOAD INITIATED", "=" * 50, sep="\n")
    if not qm9_path.exists():
        print("No existing QM9 file found")
    else:
        qm9_path.unlink()
        print("Removed existing QM9 file")

# download_qm9() is always called; its truthiness decides which status line is printed.
print("\nDownloading QM9 dataset...")
qm9_success = download_qm9()
if not qm9_success:
    print("❌ QM9 download failed - check error messages")
else:
    # Size on disk is reported in megabytes (bytes / 1e6).
    print(f"✅ QM9 verification complete - {qm9_path.stat().st_size/1e6:.1f} MB")





Downloading QM9 dataset...
QM9 Dataset already exists at:
/home/pc/Python_Projects/thesis/data/QM9/dsgdb9nsd.xyz
✅ QM9 verification complete - 86.1 MB


### Download the ThermoML dataset

In [None]:
import shutil
import sys
from pathlib import Path

# Add project root to Python path (from notebooks/ to thesis/).
project_root = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
# Guarded so that re-running this cell does not keep appending duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Imported only after sys.path has been extended with the project root.
from src.data.download import download_thermoML

# Force redownload if needed
THERMO_ML_FORCE_REDOWNLOAD = False  # Set to True to force redownload
thermo_dir = project_root / "data" / "ThermoML"
# Same location as in the QM9 cell; recomputed here so the final verification
# below works on a fresh kernel even if the QM9 cell has not been executed.
qm9_path = project_root / "data" / "QM9" / "dsgdb9nsd.xyz"

xml_files = list(thermo_dir.glob("*.xml")) if thermo_dir.exists() else []

# Only download if forced OR no XML files exist
if not xml_files or THERMO_ML_FORCE_REDOWNLOAD:
    if THERMO_ML_FORCE_REDOWNLOAD and thermo_dir.exists():
        # A forced run removes the whole directory before downloading again.
        shutil.rmtree(thermo_dir)
        print("\nRemoved existing ThermoML directory")

    print("\n2. Processing ThermoML dataset...")
    thermo_success = download_thermoML()
    # Re-scan the directory so the counts below reflect what is on disk now.
    xml_files = list(thermo_dir.glob("*.xml")) if thermo_dir.exists() else []
else:
    # XML files are already present and no redownload was requested.
    thermo_success = True

print(f"ThermoML Status: {'✅ SUCCESS' if thermo_success else '❌ FAILED'}")
print(f"XML files found: {len(xml_files)}")

# Final verification
print("\n" + "="*50)
print("FINAL VERIFICATION")
print("="*50)
# stat() raises FileNotFoundError for a missing file, so the size is only
# queried when the file exists; otherwise report it as missing.
if qm9_path.exists():
    print(f"QM9: Exists ({qm9_path.stat().st_size/1e6:.1f} MB)")
else:
    print("QM9: Missing")
print(f"ThermoML: {len(xml_files)} XML files found")


2. Processing ThermoML dataset...
Downloading ThermoML schema...


### Download the ESOL dataset

In [1]:
import sys
from pathlib import Path

# Resolve the project root: when the working directory is notebooks/,
# the root is its parent; otherwise the working directory is used as-is.
project_root = Path.cwd()
if project_root.name == "notebooks":
    project_root = project_root.parent
sys.path.append(str(project_root))

# Imported only after sys.path has been extended with the project root.
from src.data.download import download_esol

# Toggle: when True, an existing ESOL file is deleted before download_esol() is called.
ESOL_FORCE_REDOWNLOAD = False
esol_path = project_root.joinpath("data", "ESOL", "delaney-processed.csv")

if ESOL_FORCE_REDOWNLOAD:
    # Banner: blank line, rule, title, rule.
    print("\n" + "=" * 50, "ESOL FORCE REDOWNLOAD INITIATED", "=" * 50, sep="\n")
    if not esol_path.exists():
        print("No existing ESOL file found")
    else:
        esol_path.unlink()
        print("Removed existing ESOL file")

# download_esol() is always called; its truthiness decides which status lines are printed.
print("\nDownloading ESOL dataset...")
esol_success = download_esol()
if not esol_success:
    print("❌ ESOL download failed - check error messages")
else:
    # Size on disk is reported in megabytes (bytes / 1e6), followed by the resolved path.
    print(f"✅ ESOL verification complete - {esol_path.stat().st_size/1e6:.1f} MB")
    print(f"File location: {esol_path.resolve()}")


Downloading ESOL dataset...
Dataset already exists at:
/home/pc/Python_Projects/thesis/data/ESOL/delaney-processed.csv
✅ ESOL verification complete - 0.1 MB
File location: /home/pc/Python_Projects/thesis/data/ESOL/delaney-processed.csv
