In [8]:
import requests
import hashlib

In [9]:
# Download metadata for each data split: figshare URL, expected MD5 hex digest,
# and the local filename the split is stored under.
datasets = {
    split: {
        "url": f"https://ndownloader.figshare.com/files/{file_id}",
        "md5": md5_hex,
        "filename": f"{split}_guacamol.smiles",
    }
    # One row per split: (split name, figshare file id, MD5 checksum)
    for split, file_id, md5_hex in (
        ("train", "13612760", "05ad85d871958a05c02ab51a4fde8530"),
        ("valid", "13612766", "e53db4bff7dc4784123ae6df72e3b1f0"),
        ("test", "13612757", "677b757ccec4809febd83850b43e1616"),
    )
}

In [10]:
def download_file_with_md5(url, filename, expected_md5, timeout=60):
    """
    Download a file from a URL and verify its MD5 checksum.

    The payload is streamed to a temporary ``<filename>.part`` file and is only
    moved to ``filename`` after the checksum matches. A failed or corrupted
    download therefore never creates or overwrites ``filename``.

    :param url: URL to the file to be downloaded
    :param filename: Local filename to save the downloaded file
    :param expected_md5: Expected MD5 checksum of the file (hex digest)
    :param timeout: Timeout in seconds passed to ``requests.get``
    :raises requests.HTTPError: If the server responds with an error status
    :raises ValueError: If the MD5 checksum of the download does not match
    """
    import os  # local import so this cell does not depend on edits to the import cell

    partial_path = f"{filename}.part"
    md5 = hashlib.md5()
    try:
        # The context manager releases the streamed connection even on errors
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Ensure the download was successful

            # Compute MD5 checksum while writing the file
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                    md5.update(chunk)

        actual_md5 = md5.hexdigest()
        if actual_md5 != expected_md5:
            raise ValueError(f"MD5 mismatch: expected {expected_md5}, got {actual_md5}")

        # Only a verified payload is published under the final name
        os.replace(partial_path, filename)
    finally:
        # After a successful replace the partial file is gone; on any failure
        # (HTTP error, interrupted transfer, checksum mismatch) remove it.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"Downloaded and verified {filename}")

In [11]:
def read_smiles_file(filename):
    """Load a .smiles file into a list with one whitespace-stripped entry per line."""
    with open(filename, 'r') as handle:
        # Iterating the handle yields the file line by line; strip() drops the
        # trailing newline along with any surrounding whitespace.
        return [line.strip() for line in handle]

In [12]:
# Download each split unless a copy with the expected checksum is already on disk
for dataset in datasets.values():
    try:
        # Hash the cached copy in 1 MiB blocks so it is never held in memory at once
        cached_md5 = hashlib.md5()
        with open(dataset['filename'], 'rb') as cached_file:
            for block in iter(lambda: cached_file.read(1 << 20), b''):
                cached_md5.update(block)
    except FileNotFoundError:
        download_file_with_md5(dataset['url'], dataset['filename'], dataset['md5'])
    else:
        if cached_md5.hexdigest() == dataset['md5']:
            print(f"File {dataset['filename']} already exists")
        else:
            # A leftover from an interrupted or corrupted download must not be
            # loaded silently; fetch a fresh copy instead.
            print(f"File {dataset['filename']} failed MD5 verification, downloading again")
            download_file_with_md5(dataset['url'], dataset['filename'], dataset['md5'])


# Read the splits through the same filenames that were used for the download
training_smiles = read_smiles_file(datasets['train']['filename'])
validation_smiles = read_smiles_file(datasets['valid']['filename'])
test_smiles = read_smiles_file(datasets['test']['filename'])

print(f"Number of train molecules in data set: {len(training_smiles)}")
print(f"First 5 SMILES strings in training set: {training_smiles[:5]}")
print(f"Number of validation molecules in data set: {len(validation_smiles)}")
print(f"First 5 SMILES strings in validation set: {validation_smiles[:5]}")
print(f"Number of test molecules in data set: {len(test_smiles)}")
print(f"First 5 SMILES strings in test set: {test_smiles[:5]}")


Downloaded and verified train_guacamol.smiles
Downloaded and verified valid_guacamol.smiles
Downloaded and verified test_guacamol.smiles
Number of train molecules in data set: 1273104
First 5 SMILES strings in training set: ['CCC(C)(C)Br', 'CCCN(CCc1cccc(-c2ccccc2)c1)C(=O)C1OC(C(=O)O)=CC(N)C1NC(C)=O', 'Oc1ccc(C2CC(c3ccccc3)=NN2C(=S)Nc2ccccc2)cc1', 'CC1(C)OCC2OC3(C4OC(C)(C)OC4CO)OC(C)(C)OC3C2O1', 'COC(=O)c1cc(C(=CCCCC(=O)SC)c2cc(Cl)c(OC)c(C(=O)OC)c2)cc(Cl)c1OC']
Number of validation molecules in data set: 79568
First 5 SMILES strings in validation set: ['CCCC(=O)NNC(=O)Nc1ccccc1', 'CC(=O)NC1CCC2(C)C(CCC3(C)C2C(=O)C=C2C4C(C)C(C)CCC4(C)CCC23C)C1(C)C(=O)O', 'CC(=O)NC(C)Cc1ccc(C#Cc2ccnc(N3CCCC(F)C3)n2)cc1', 'Cc1cccc(CCNC(=O)C2CCC(=O)N(Cc3ccc(Cl)cc3)C2)n1', 'CC1C=CN(N(C)C)C2=C1C(=O)c1cnccc1C2=O']
Number of test molecules in data set: 238706
First 5 SMILES strings in test set: ['Cc1cc2c(c3oc(CCCC#N)cc13)C(=O)c1c(O)cccc1C2=O', 'C=CCN1C(=O)C(=NNC(=S)NC2OC(COC(C)=O)C(OC(C)=O)C(OC(C)=O)C2OC(C)=O)