In [2]:
import os
import pandas as pd
import requests
from tqdm import tqdm

In [4]:
def download_cif(csv_path, output_dir, timeout=30, overwrite=True):
    """Download one CIF file per COD ID listed in a CSV file.

    Reads ``csv_path``, takes the IDs from its ``cod_id`` column and fetches
    ``https://www.crystallography.net/cod/<cod_id>.cif`` for each of them,
    saving the response body as ``<output_dir>/<cod_id>.cif``. A failure for
    one ID (non-200 status, network error, file-system error) is reported
    with ``print`` and the loop moves on to the next ID.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file containing a ``cod_id`` column.
    output_dir : str or path-like
        Directory that receives the ``.cif`` files; created if missing.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot block the
        whole batch indefinitely. Defaults to 30 seconds.
    overwrite : bool, optional
        When True (default) every file is downloaded again and replaced.
        When False, IDs whose target file already exists are skipped, which
        makes re-running the cell cheap.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the CSV has no ``cod_id`` column.
    """
    df = pd.read_csv(csv_path)
    # keep=False discards *every* row whose cod_id occurs more than once, not
    # just the extra copies, so such IDs are never downloaded.
    # NOTE(review): confirm this is intended rather than keep='first'.
    df = df.drop_duplicates(subset='cod_id', keep=False)

    os.makedirs(output_dir, exist_ok=True)

    # Iterate over the column itself instead of df.iterrows(): iterrows()
    # builds one Series per row and may upcast an integer ID to float
    # (1000041 -> 1000041.0) when other numeric columns are present, which
    # would end up in the URL and the file name.
    for cod_id in tqdm(df['cod_id'], total=len(df), desc="Downloading CIF files"):
        cif_url = f'https://www.crystallography.net/cod/{cod_id}.cif'
        # Built before the try block so the error message below can always
        # refer to the file belonging to the current ID.
        cif_path = os.path.join(output_dir, f'{cod_id}.cif')

        if not overwrite and os.path.exists(cif_path):
            continue

        try:
            response = requests.get(cif_url, timeout=timeout)
            if response.status_code != 200:
                print(f'Failed to download CIF for COD ID {cod_id} '
                      f'(HTTP {response.status_code})')
                continue
            # Store the raw bytes exactly as served; going through
            # response.text would re-encode a guessed decoding.
            with open(cif_path, 'wb') as file:
                file.write(response.content)
        except (requests.RequestException, OSError) as exc:
            print(f'Download failed for: {cif_path} ({exc})')

In [5]:
# Example usage: read the COD IDs from the CSV in the working directory and
# save the downloaded CIF files under data/cifs.
download_cif(
    csv_path='matching_compositions.csv',
    output_dir='data/cifs',
)

Downloading CIF files:   0%|          | 0/1574 [00:00<?, ?it/s]

Downloading CIF files: 100%|██████████| 1574/1574 [04:24<00:00,  5.96it/s]
