In [4]:
import gzip
import pickle
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import rdFingerprintGenerator
from tqdm.auto import tqdm
from pathlib import Path
import os

# Input: gzipped, tab-separated ChEMBL chemical-representations dump.
CHEMBL_PATH = 'chembl_35_chemreps.txt.gz'

# Output: pickle file that caches the computed fingerprints.
FP_CACHE = 'chembl_fps.pkl'

# Morgan fingerprint parameters (same values that process_and_save passes
# to GetMorganGenerator).
FP_RADIUS = 3
FP_SIZE = 2048

# Module-level containers, empty at this point.
# NOTE(review): presumably filled from the cache by code not shown here.
chembl_smiles, chembl_fps = [], []

def process_and_save():
    """Build the ChEMBL Morgan-fingerprint cache once and pickle it.

    Reads the ``canonical_smiles`` column of the gzipped, tab-separated file
    at ``CHEMBL_PATH``, parses every SMILES with RDKit and computes a Morgan
    fingerprint (``FP_RADIUS`` / ``FP_SIZE``) for each molecule that parses.
    The result is pickled to ``FP_CACHE`` as
    ``{'smiles': [...], 'fps': [...]}``; the two lists are index-aligned.

    If ``FP_CACHE`` already exists the function prints a notice and returns
    without doing any work.

    Rows that are not strings (e.g. missing values), that RDKit cannot
    parse, or whose fingerprinting raises are skipped; the number of
    skipped rows is printed at the end.

    Returns:
        None. The result is written to disk and progress is printed.
    """
    cache_path = Path(FP_CACHE)
    if cache_path.exists():
        print("Файл с фингерпринтами уже существует!")
        return

    print("Начало обработки ChEMBL...")
    # Use the module-level settings so that changing the constants actually
    # changes the generated fingerprints.
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(
        radius=FP_RADIUS, fpSize=FP_SIZE
    )

    # The file only needs to stay open while pandas parses it, not for the
    # whole fingerprinting loop.
    with gzip.open(CHEMBL_PATH, 'rt') as f:
        df = pd.read_csv(f, sep='\t', usecols=['canonical_smiles'])

    fps = []
    smiles = []
    skipped = 0

    for smi in tqdm(df['canonical_smiles'], total=len(df)):
        # Empty cells come back from pandas as float NaN, not as strings.
        if not isinstance(smi, str):
            skipped += 1
            continue

        try:
            mol = Chem.MolFromSmiles(smi)
            fp = mfpgen.GetFingerprint(mol) if mol is not None else None
        except Exception:
            # Best-effort: one bad record must not abort the whole run.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt / SystemExit stop the loop.
            fp = None

        if fp is None:
            skipped += 1
            continue

        fps.append(fp)
        smiles.append(smi)

    # Write to a temporary file and rename it into place, so an interrupted
    # dump never leaves a truncated FP_CACHE that the existence check above
    # would later accept as a finished cache.
    tmp_path = cache_path.with_name(cache_path.name + '.tmp')
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump({'smiles': smiles, 'fps': fps}, f)
        tmp_path.replace(cache_path)
    finally:
        # After a successful rename the temp file is gone; this only
        # removes the leftover of a failed or interrupted write.
        if tmp_path.exists():
            tmp_path.unlink()

    if skipped:
        print(f"Пропущено записей: {skipped}")
    print(f"Сохранено {len(fps)} фингерпринтов в {FP_CACHE}")
# Run the preprocessing when this cell executes; the call returns right away
# if the fingerprint cache file already exists.
process_and_save()

Начало обработки ChEMBL...


  0%|                                                                          | 2383/2474590 [00:02<46:22, 888.44it/s][08:51:57] Can't kekulize mol.  Unkekulized atoms: 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
  2%|█▏                                                                      | 41208/2474590 [00:24<22:46, 1780.94it/s][08:52:19] Explicit valence for atom # 13 P, 7, is greater than permitted
  2%|█▌                                                                      | 52433/2474590 [00:30<21:39, 1864.10it/s][08:52:25] Explicit valence for atom # 29 P, 7, is greater than permitted
  2%|█▌                                                                      | 54878/2474590 [00:31<21:49, 1847.44it/s][08:52:26] Explicit valence for atom # 91 P, 7, is greater than permitted
 16%|███████████▏                                                           | 391234/2474590 [03:55<22:59, 

Сохранено 2474569 фингерпринтов в chembl_fps.pkl
