In [6]:
import pandas as pd
from pathlib import Path
import numpy as np
from tqdm.notebook import tqdm
import tables
from concurrent.futures import ThreadPoolExecutor
import warnings
import rdkit.RDLogger as rdlog
rdlog.DisableLog('rdApp.*')  # Désactive les logs RDKit
from rdkit import Chem
import warnings
import rdkit.RDLogger as rdlog
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm

# Désactiver les avertissements RDKit et pandas
rdlog.DisableLog('rdApp.*')
warnings.filterwarnings('ignore')

In [2]:
def canonicalize_smiles(smiles):
    """Canonicalise un SMILES en utilisant RDKit"""
    try:
        if pd.isna(smiles):
            return None
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except:
        return None

In [9]:
# Définition du chemin de base
base_path = Path('data/input/databases')

# Définition des catégories et leurs fichiers
ref_files = {
    'BIOCID': base_path / 'biocides.csv',
    'DW': base_path / 'drinkingwater.csv',
    'DOA': base_path / 'drugofabuse.csv',
    'FOODC': base_path / 'foodcontactchemicals.csv',
    'HUME': base_path / 'humanmetabolite.csv',
    'HUTOX': base_path / 'humanneurotoxins.csv',
    'INDOOR': base_path / 'indoor.csv',
    'IND': base_path / 'industrialchemicals.csv',
    'NATOX': base_path / 'naturaltoxins.csv',
    'PMT': base_path / 'persistentmobile.csv',
    'PCP': base_path / 'personalcareproduct.csv',
    'PFAS': base_path / 'PFAS.csv',
    'PHARMA': base_path / 'pharma.csv',
    'PPP': base_path / 'plantproductionproducts.csv',
    'PLAST': base_path / 'plasticadditives.csv',
    'SMOKE': base_path / 'smokecompounds.csv',
    'SURF': base_path / 'surfactants.csv'
}


def canonicalize_smiles_batch(smiles_list):
    return [canonicalize_smiles(s) for s in smiles_list]

# Charger et vérifier un fichier de référence
def process_reference_file(category_file):
    category, filepath = category_file
    try:
        # Vérifier si le fichier existe
        if not filepath.exists():
            print(f"File not found: {filepath}")
            return category, {'smiles': set(), 'names': set()}
        
        # Afficher le contenu du fichier
        print(f"\nLoading {category} from {filepath}")
        df = pd.read_csv(filepath, low_memory=False)
        #print(f"Columns in {category}: {df.columns.tolist()}")
        
        smiles_col = next((col for col in df.columns if col.upper() == 'SMILES'), None)
        name_col = next((col for col in df.columns if col.upper() == 'NAME'), None)
        
        if smiles_col:
            smiles_list = list(df[smiles_col].dropna())
            canonical_smiles = []
            
            # Traitement par lots avec barre de progression
            for smiles in tqdm(smiles_list, desc=f"Processing {category} SMILES"):
                can_smiles = canonicalize_smiles(smiles)
                if can_smiles:
                    canonical_smiles.append(can_smiles)
            
            smiles_set = set(canonical_smiles)
        else:
            print(f"No SMILES column found in {category}")
            smiles_set = set()
            
        if name_col:
            names_set = set(df[name_col].dropna().str.lower().unique())
        else:
            print(f"No Name column found in {category}")
            names_set = set()
        
        return category, {
            'smiles': smiles_set,
            'names': names_set
        }
        
    except Exception as e:
        print(f"Error processing {category}: {str(e)}")
        return category, {'smiles': set(), 'names': set()}
    
    
# Chargement des fichiers de référence
print("Processing reference files...")
reference_data = {}

# Traiter chaque fichier séquentiellement avec barre de progression
for category, filepath in tqdm(ref_files.items(), desc="Overall progress"):
    result = process_reference_file((category, filepath))
    reference_data[result[0]] = result[1]
    print(f"{result[0]}: {len(result[1]['smiles'])} SMILES, {len(result[1]['names'])} names")

print("\nFinal results:")
for category, data in reference_data.items():
    print(f"{category}: {len(data['smiles'])} SMILES, {len(data['names'])} names")

Processing reference files...


Overall progress:   0%|          | 0/17 [00:00<?, ?it/s]


Loading BIOCID from data/input/databases/biocides.csv


Processing BIOCID SMILES:   0%|          | 0/146 [00:00<?, ?it/s]

BIOCID: 143 SMILES, 159 names

Loading DW from data/input/databases/drinkingwater.csv


Processing DW SMILES:   0%|          | 0/4930 [00:00<?, ?it/s]

DW: 4889 SMILES, 5286 names

Loading DOA from data/input/databases/drugofabuse.csv


Processing DOA SMILES:   0%|          | 0/537 [00:00<?, ?it/s]

DOA: 537 SMILES, 538 names

Loading FOODC from data/input/databases/foodcontactchemicals.csv


Processing FOODC SMILES:   0%|          | 0/7043 [00:00<?, ?it/s]

FOODC: 6975 SMILES, 13120 names

Loading HUME from data/input/databases/humanmetabolite.csv


Processing HUME SMILES:   0%|          | 0/65353 [00:00<?, ?it/s]

HUME: 64398 SMILES, 65570 names

Loading HUTOX from data/input/databases/humanneurotoxins.csv


Processing HUTOX SMILES:   0%|          | 0/1465 [00:00<?, ?it/s]

HUTOX: 1451 SMILES, 1548 names

Loading INDOOR from data/input/databases/indoor.csv


Processing INDOOR SMILES:   0%|          | 0/1945 [00:00<?, ?it/s]

INDOOR: 1939 SMILES, 1947 names

Loading IND from data/input/databases/industrialchemicals.csv


Processing IND SMILES:   0%|          | 0/281 [00:00<?, ?it/s]

IND: 279 SMILES, 286 names

Loading NATOX from data/input/databases/naturaltoxins.csv


Processing NATOX SMILES:   0%|          | 0/3933 [00:00<?, ?it/s]

NATOX: 3920 SMILES, 3936 names

Loading PMT from data/input/databases/persistentmobile.csv


Processing PMT SMILES:   0%|          | 0/399 [00:00<?, ?it/s]

PMT: 399 SMILES, 402 names

Loading PCP from data/input/databases/personalcareproduct.csv


Processing PCP SMILES:   0%|          | 0/3315 [00:00<?, ?it/s]

PCP: 3303 SMILES, 3317 names

Loading PFAS from data/input/databases/PFAS.csv


Processing PFAS SMILES:   0%|          | 0/4592 [00:00<?, ?it/s]

PFAS: 4505 SMILES, 5710 names

Loading PHARMA from data/input/databases/pharma.csv


Processing PHARMA SMILES:   0%|          | 0/9620 [00:00<?, ?it/s]

PHARMA: 9587 SMILES, 9627 names

Loading PPP from data/input/databases/plantproductionproducts.csv


Processing PPP SMILES:   0%|          | 0/1691 [00:00<?, ?it/s]

PPP: 1673 SMILES, 1738 names

Loading PLAST from data/input/databases/plasticadditives.csv


Processing PLAST SMILES:   0%|          | 0/3071 [00:00<?, ?it/s]

PLAST: 3048 SMILES, 4386 names

Loading SMOKE from data/input/databases/smokecompounds.csv


Processing SMOKE SMILES:   0%|          | 0/94 [00:00<?, ?it/s]

SMOKE: 94 SMILES, 95 names

Loading SURF from data/input/databases/surfactants.csv


Processing SURF SMILES:   0%|          | 0/1628 [00:00<?, ?it/s]

SURF: 1578 SMILES, 2527 names

Final results:
BIOCID: 143 SMILES, 159 names
DW: 4889 SMILES, 5286 names
DOA: 537 SMILES, 538 names
FOODC: 6975 SMILES, 13120 names
HUME: 64398 SMILES, 65570 names
HUTOX: 1451 SMILES, 1548 names
INDOOR: 1939 SMILES, 1947 names
IND: 279 SMILES, 286 names
NATOX: 3920 SMILES, 3936 names
PMT: 399 SMILES, 402 names
PCP: 3303 SMILES, 3317 names
PFAS: 4505 SMILES, 5710 names
PHARMA: 9587 SMILES, 9627 names
PPP: 1673 SMILES, 1738 names
PLAST: 3048 SMILES, 4386 names
SMOKE: 94 SMILES, 95 names
SURF: 1578 SMILES, 2527 names


In [10]:
def classify_molecule(row, reference_dict):
    """
    Classifie une molécule en vérifiant ses SMILES et son nom dans les références
    """
    categories = set()
    
    smiles = row.get('SMILES', '')
    name = str(row.get('Name', '')).lower() if 'Name' in row else ''
    
    if pd.isna(smiles) and not name:
        return ["UNCLASSIFIED"]
    
    for category, ref_data in reference_dict.items():
        if not pd.isna(smiles) and smiles in ref_data['smiles']:
            categories.add(category)
        if name and name in ref_data['names']:
            categories.add(category)
    
    return list(categories) if categories else ["UNCLASSIFIED"]

In [11]:
# Chargement de la base principale
print("Loading main database...")
database_path = base_path / 'norman_all_ccs_all_rt_pos_neg_with_ms2.h5'

df_pos = pd.read_hdf(database_path, key='positive')
df_neg = pd.read_hdf(database_path, key='negative')

print(f"\nLoaded:")
print(f"Positive mode: {len(df_pos)} entries")
print(f"Negative mode: {len(df_neg)} entries")

Loading main database...

Loaded:
Positive mode: 591654 entries
Negative mode: 289856 entries


In [23]:
from tqdm.notebook import tqdm
tqdm.pandas()  # Activer tqdm pour pandas
# Classification des molécules
print("\nClassifying positive mode molecules...")
df_pos['categories'] = df_pos.progress_apply(lambda row: classify_molecule(row, reference_data), axis=1)

print("\nClassifying negative mode molecules...")
df_neg['categories'] = df_neg.progress_apply(lambda row: classify_molecule(row, reference_data), axis=1)


Classifying positive mode molecules...


  0%|          | 0/591654 [00:00<?, ?it/s]


Classifying negative mode molecules...


  0%|          | 0/289856 [00:00<?, ?it/s]

In [24]:
# Statistiques pour les molécules uniques
print("\nClassification statistics for unique molecules:")

# Obtenir tous les SMILES uniques et leurs catégories
all_smiles = set(df_pos['SMILES']).union(set(df_neg['SMILES']))
print(f"Total unique SMILES: {len(all_smiles)}")

# Créer un dictionnaire SMILES -> catégories
unique_mol_categories = {}
for df in [df_pos, df_neg]:
    smiles_cat_dict = df.set_index('SMILES')['categories'].to_dict()
    for smiles, cats in smiles_cat_dict.items():
        if smiles in unique_mol_categories:
            unique_mol_categories[smiles].update(cats)
        else:
            unique_mol_categories[smiles] = set(cats)

# Compter les catégories pour les molécules uniques
unique_categories_count = {}
for cats in unique_mol_categories.values():
    for cat in cats:
        unique_categories_count[cat] = unique_categories_count.get(cat, 0) + 1

print("\nNumber of unique molecules per category:")
for cat, count in sorted(unique_categories_count.items()):
    print(f"{cat}: {count} unique molecules")

# Statistiques supplémentaires
print("\nAdditional statistics:")
print(f"Total number of unique molecules: {len(unique_mol_categories)}")
multiple_cats = sum(1 for cats in unique_mol_categories.values() if len(cats) > 1)
print(f"Molecules with multiple categories: {multiple_cats} ({multiple_cats/len(unique_mol_categories)*100:.2f}%)")


Classification statistics for unique molecules:
Total unique SMILES: 72577

Number of unique molecules per category:
BIOCID: 136 unique molecules
DOA: 475 unique molecules
DW: 4414 unique molecules
FOODC: 6034 unique molecules
HUME: 49949 unique molecules
HUTOX: 1285 unique molecules
IND: 258 unique molecules
INDOOR: 1569 unique molecules
NATOX: 1549 unique molecules
PCP: 2855 unique molecules
PFAS: 3662 unique molecules
PHARMA: 7861 unique molecules
PLAST: 2795 unique molecules
PMT: 367 unique molecules
PPP: 964 unique molecules
SMOKE: 92 unique molecules
SURF: 477 unique molecules
UNCLASSIFIED: 12040 unique molecules

Additional statistics:
Total number of unique molecules: 72577
Molecules with multiple categories: 14907 (20.54%)


In [26]:
print("Statistiques détaillées par mode :")

# Fonction pour obtenir les statistiques par catégorie
def get_category_stats(df, category):
    total = len(df[df['categories'].apply(lambda x: category in x)]['SMILES'].unique())
    ms2 = len(df[df['categories'].apply(lambda x: category in x)].dropna(subset=['peaks_ms2_mz'])['SMILES'].unique())
    rt = len(df[df['categories'].apply(lambda x: category in x)].dropna(subset=['Observed_RT'])['SMILES'].unique())
    rt_ms2 = len(df[df['categories'].apply(lambda x: category in x)].dropna(subset=['Observed_RT', 'peaks_ms2_mz'])['SMILES'].unique())
    return total, ms2, rt, rt_ms2

# Statistiques globales pour chaque mode
for mode, df in [("Mode positif", df_pos), ("Mode négatif", df_neg)]:
    print(f"\n{mode}:")
    total = len(df['SMILES'].unique())
    ms2 = len(df.dropna(subset=['peaks_ms2_mz'])['SMILES'].unique())
    rt = len(df.dropna(subset=['Observed_RT'])['SMILES'].unique())
    rt_ms2 = len(df.dropna(subset=['Observed_RT', 'peaks_ms2_mz'])['SMILES'].unique())
    
    print(f"Total composés uniques: {total}")
    print(f"Avec MS2: {ms2} ({ms2/total*100:.1f}%)")
    print(f"Avec RT observé: {rt} ({rt/total*100:.1f}%)")
    print(f"Avec RT et MS2: {rt_ms2} ({rt_ms2/total*100:.1f}%)")
    
    print("\nStatistiques par catégorie:")
    for category in sorted(set(cat for cats in df['categories'] for cat in cats)):
        if category != "UNCLASSIFIED":
            total, ms2, rt, rt_ms2 = get_category_stats(df, category)
            print(f"\n{category}:")
            print(f"  Total: {total}")
            print(f"  Avec MS2: {ms2} ({ms2/total*100:.1f}% de la catégorie)")
            print(f"  Avec RT: {rt} ({rt/total*100:.1f}% de la catégorie)")
            print(f"  Avec RT et MS2: {rt_ms2} ({rt_ms2/total*100:.1f}% de la catégorie)")

# Statistiques pour les molécules uniques tous modes confondus
print("\nStatistiques globales (tous modes confondus):")
all_smiles = set(df_pos['SMILES']).union(set(df_neg['SMILES']))
all_smiles_ms2 = set(df_pos.dropna(subset=['peaks_ms2_mz'])['SMILES']).union(
    set(df_neg.dropna(subset=['peaks_ms2_mz'])['SMILES']))
all_smiles_rt = set(df_pos.dropna(subset=['Observed_RT'])['SMILES']).union(
    set(df_neg.dropna(subset=['Observed_RT'])['SMILES']))
all_smiles_rt_ms2 = set(df_pos.dropna(subset=['Observed_RT', 'peaks_ms2_mz'])['SMILES']).union(
    set(df_neg.dropna(subset=['Observed_RT', 'peaks_ms2_mz'])['SMILES']))

print(f"\nTotal molécules uniques: {len(all_smiles)}")
print(f"Avec MS2: {len(all_smiles_ms2)} ({len(all_smiles_ms2)/len(all_smiles)*100:.1f}%)")
print(f"Avec RT: {len(all_smiles_rt)} ({len(all_smiles_rt)/len(all_smiles)*100:.1f}%)")
print(f"Avec RT et MS2: {len(all_smiles_rt_ms2)} ({len(all_smiles_rt_ms2)/len(all_smiles)*100:.1f}%)")

Statistiques détaillées par mode :

Mode positif:
Total composés uniques: 72576
Avec MS2: 2024 (2.8%)
Avec RT observé: 216 (0.3%)
Avec RT et MS2: 100 (0.1%)

Statistiques par catégorie:

BIOCID:
  Total: 136
  Avec MS2: 43 (31.6% de la catégorie)
  Avec RT: 21 (15.4% de la catégorie)
  Avec RT et MS2: 17 (12.5% de la catégorie)

DOA:
  Total: 475
  Avec MS2: 59 (12.4% de la catégorie)
  Avec RT: 4 (0.8% de la catégorie)
  Avec RT et MS2: 3 (0.6% de la catégorie)

DW:
  Total: 4414
  Avec MS2: 690 (15.6% de la catégorie)
  Avec RT: 116 (2.6% de la catégorie)
  Avec RT et MS2: 75 (1.7% de la catégorie)

FOODC:
  Total: 6034
  Avec MS2: 414 (6.9% de la catégorie)
  Avec RT: 44 (0.7% de la catégorie)
  Avec RT et MS2: 31 (0.5% de la catégorie)

HUME:
  Total: 49949
  Avec MS2: 1765 (3.5% de la catégorie)
  Avec RT: 154 (0.3% de la catégorie)
  Avec RT et MS2: 93 (0.2% de la catégorie)

HUTOX:
  Total: 1285
  Avec MS2: 243 (18.9% de la catégorie)
  Avec RT: 50 (3.9% de la catégorie)
  Avec 

In [28]:
# Sauvegarde des résultats
output_path = base_path / 'norman_all_ccs_all_rt_pos_neg_with_ms2_classified.h5'
print("Saving results...")
df_pos.to_hdf(output_path, key='positive', mode='w')
df_neg.to_hdf(output_path, key='negative')
print(f"Results saved successfully to {output_path}!")

Saving results...
Results saved successfully to data/input/databases/norman_all_ccs_all_rt_pos_neg_with_ms2_classified.h5!
