In [3]:
import os
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
def get_xray_raw_link(rruff_id, timeout=30):
    """Find the download URL of the RAW powder X-ray file for a RRUFF sample.

    Fetches the sample's RRUFF display page and scans its anchors for the
    first ``href`` that contains the sample ID together with the markers
    ``"Powder__Xray"`` and ``"RAW"``.

    Parameters
    ----------
    rruff_id : str
        RRUFF sample identifier, e.g. ``"R060173"``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str or None
        URL of the first matching link, resolved against the page URL so it
        is always absolute. ``None`` when the request fails, the page does
        not answer with HTTP 200, or no anchor matches; a message is printed
        in each of those cases.
    """
    url = f"https://rruff.info/{rruff_id}/display=default/"
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request for RRUFF ID {rruff_id} page failed: {exc}")
        return None

    if response.status_code != 200:
        # Report HTTP failures separately so they are not mistaken for a
        # sample that simply has no RAW powder file.
        print(f"Failed to fetch page for RRUFF ID {rruff_id} (HTTP {response.status_code})")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    for link in soup.find_all('a', href=True):
        href = link['href']
        if rruff_id in href and "Powder__Xray" in href and "RAW" in href:
            # urljoin leaves absolute links untouched and resolves relative
            # or protocol-relative ones against the page URL.
            return urljoin(url, href)

    print(f"No matching RAW X-ray Powder file found for {rruff_id}")
    return None

def download_xray_raw(csv_path, output_dir, timeout=30):
    """Download the RAW powder X-ray file for each RRUFF ID listed in a CSV.

    Reads ``csv_path``, takes the ``RRUFFID`` column, looks up each sample's
    RAW file link with ``get_xray_raw_link`` and saves the response body to
    ``<output_dir>/<RRUFFID>.txt``. Failures for individual samples are
    printed and skipped so that one bad sample does not abort the batch.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file that has an ``RRUFFID`` column.
    output_dir : str
        Directory that receives the downloaded files; created if missing.
    timeout : float, optional
        Seconds to wait for each file download before giving up (default 30).

    Returns
    -------
    None
    """
    df = pd.read_csv(csv_path)
    # NOTE(review): keep=False discards *every* row whose RRUFFID occurs more
    # than once, so repeated IDs are never downloaded. If the goal is only to
    # avoid fetching the same file twice, keep='first' is the usual choice -
    # confirm the intent before changing.
    df = df.drop_duplicates(subset='RRUFFID', keep=False)
    # A missing ID cannot be looked up (it would break the substring match in
    # the link search), so drop it up front.
    rruff_ids = df['RRUFFID'].dropna()
    os.makedirs(output_dir, exist_ok=True)

    for rruff_id in tqdm(rruff_ids, total=len(rruff_ids), desc="Downloading X-ray RAW files"):
        try:
            raw_link = get_xray_raw_link(rruff_id)
            if not raw_link:
                continue
            response = requests.get(raw_link, timeout=timeout)
        except requests.RequestException as exc:
            # Best-effort batch: report the failure and move on.
            print(f"Failed to download RAW file for RRUFF ID {rruff_id}: {exc}")
            continue

        if response.status_code == 200:
            raw_path = os.path.join(output_dir, f"{rruff_id}.txt")
            with open(raw_path, 'wb') as file:
                file.write(response.content)
        else:
            print(f"Failed to download RAW file for RRUFF ID {rruff_id}")

In [5]:
# Fetch the RAW powder X-ray files for the RRUFF IDs listed in
# matching_compositions.csv and store them under data/ruff.
download_xray_raw('matching_compositions.csv', 'data/ruff')

Downloading X-ray RAW files:   0%|          | 0/60 [00:00<?, ?it/s]

Downloading X-ray RAW files:  27%|██▋       | 16/60 [00:37<01:31,  2.08s/it]

No matching RAW X-ray Powder file found for R060173


Downloading X-ray RAW files:  30%|███       | 18/60 [00:41<01:22,  1.97s/it]

No matching RAW X-ray Powder file found for R060543


Downloading X-ray RAW files:  32%|███▏      | 19/60 [00:42<01:14,  1.82s/it]

No matching RAW X-ray Powder file found for R060612


Downloading X-ray RAW files:  43%|████▎     | 26/60 [00:58<01:09,  2.04s/it]

No matching RAW X-ray Powder file found for R050156


Downloading X-ray RAW files:  48%|████▊     | 29/60 [01:04<01:00,  1.95s/it]

No matching RAW X-ray Powder file found for R060245


Downloading X-ray RAW files:  70%|███████   | 42/60 [01:35<00:38,  2.13s/it]

No matching RAW X-ray Powder file found for R050163


Downloading X-ray RAW files:  85%|████████▌ | 51/60 [01:56<00:19,  2.11s/it]

No matching RAW X-ray Powder file found for R050215


Downloading X-ray RAW files:  87%|████████▋ | 52/60 [01:57<00:14,  1.85s/it]

No matching RAW X-ray Powder file found for R050584


Downloading X-ray RAW files: 100%|██████████| 60/60 [02:16<00:00,  2.28s/it]
