# Intégration des données MS2 de MassBank

# Introduction
Ce notebook permet d'enrichir la base de données avec les spectres MS2 issus de MassBank, en se concentrant sur les données acquises en ESI-QTOF et LC-ESI-QTOF.

# Import des bibliothèques et définition des fonctions

In [None]:
import pandas as pd
from rdkit import Chem

def read_msp(file_path):
    """Parse an MSP spectral library file into a list of record dicts.

    Records are separated by blank lines. Inside a record, a line that
    contains ``:`` is split at the first colon into a key and a value (both
    stripped of surrounding whitespace); the ``Num Peaks`` field is skipped.
    Any other line is treated as a peak line: when it has exactly two
    whitespace-separated tokens they are stored as a ``(float, float)``
    tuple in the record's ``'peaks'`` list.

    Args:
        file_path: Path of the MSP file to read (decoded as UTF-8).

    Returns:
        A list with one dict per record. Metadata values are strings; the
        ``'peaks'`` key is only present when the record had at least one
        line without a colon.

    Raises:
        ValueError: If a two-token peak line cannot be converted to floats.
    """
    records = []
    current_record = {}

    # Explicit encoding: the platform default differs between systems and
    # would decode non-ASCII metadata (e.g. compound names) inconsistently.
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()

            # A blank line closes the record being built (if any).
            if not line:
                if current_record:
                    records.append(current_record)
                    current_record = {}
                continue

            if ':' in line:
                key, value = (part.strip() for part in line.split(':', 1))
                # The peak count is implied by the length of the peak list.
                if key != 'Num Peaks':
                    current_record[key] = value
                continue

            # Peak line: the list is created even when the line is skipped
            # below for not having exactly two tokens.
            peaks = current_record.setdefault('peaks', [])
            parts = line.split()
            if len(parts) == 2:
                peaks.append((float(parts[0]), float(parts[1])))

    # The last record is not followed by a blank line in every file.
    if current_record:
        records.append(current_record)
    return records

def standardize_smiles(smiles):
    """Round-trip a SMILES string through RDKit to get a normalised form.

    Args:
        smiles: SMILES string to normalise. Non-string values (``None``,
            NaN from a DataFrame column, ...) are treated as missing.

    Returns:
        The SMILES written back by ``Chem.MolToSmiles`` with
        ``isomericSmiles=True``, or ``None`` when the input is missing,
        cannot be parsed, or RDKit raises while processing it.
    """
    # Missing values cannot be parsed; answer directly instead of relying
    # on RDKit raising for a wrong argument type.
    if not isinstance(smiles, str):
        return None

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, isomericSmiles=True)
    except Exception:
        # Best effort: an unprocessable structure becomes None. Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
        return None

# Chargement et préparation des données MS2


In [None]:
# Read the MSP library and build one DataFrame row per record
file_path = 'MassBank_NIST.msp'
msp_data = read_msp(file_path)
msp_df = pd.DataFrame(msp_data)

# Keep only MS2 spectra acquired on the targeted instrument types.
# .copy() makes the result an independent frame: columns are assigned to
# msp_df later on, and assigning to a slice of the unfiltered frame
# triggers pandas' SettingWithCopyWarning.
is_ms2 = msp_df['Spectrum_type'] == 'MS2'
is_target_instrument = msp_df['Instrument_type'].isin(['ESI-QTOF', 'LC-ESI-QTOF'])
msp_df = msp_df[is_ms2 & is_target_instrument].copy()

print(f"Nombre de composés uniques : {len(msp_df['Name'].unique())}")

# Préparation des données spectrales

In [None]:
# Split each spectrum's (m/z, intensity) pairs into two parallel lists.
# A record without any peak line has no 'peaks' entry, which shows up as
# NaN in the DataFrame; treat it as an empty spectrum instead of crashing.
peak_lists = msp_df['peaks'].apply(lambda p: p if isinstance(p, list) else [])
msp_df['peaks_ms2_mz'] = peak_lists.apply(lambda p: [mz for mz, _ in p])
msp_df['peaks_ms2_intensities'] = peak_lists.apply(
    lambda p: [intensity for _, intensity in p]
)

# Normalise the SMILES so they can be used as a join key
msp_df["SMILES"] = msp_df['SMILES'].apply(standardize_smiles)

# Keep only the columns needed downstream. Rows whose SMILES could not be
# standardised (None) are dropped: they cannot identify a compound, and
# pandas.merge matches null keys with each other, so they could otherwise
# be attached to rows of the other table that also lack a SMILES.
msp_df_filtered = msp_df[['SMILES', 'Precursor_type', 'peaks_ms2_mz',
                         'peaks_ms2_intensities', 'Instrument_type',
                         'Ion_mode', 'Collision_energy']]
msp_df_filtered = msp_df_filtered.dropna(subset=['SMILES'])

# Chargement de la base existante

In [None]:
# Load the Norman tables stored under the 'positive' and 'negative' keys.
# mode='r': the store is only read here. The default append mode would
# create an empty file when the path is wrong and requires write access.
with pd.HDFStore("norman_all_ccs_all_rt_pos_neg.h5", mode='r') as store:
    norman_pos = store['positive']
    norman_neg = store['negative']

# Stack both tables; the concatenation does not need the store open.
norman_combined = pd.concat([norman_pos, norman_neg])

# Fusion des données

In [None]:
# Left-join the MS2 spectra onto the Norman table: a spectrum is attached
# when both the SMILES and the adduct / precursor type agree. The
# right-hand key column duplicates 'adduct', so it is dropped afterwards.
norman_combined = norman_combined.merge(
    msp_df_filtered,
    how='left',
    left_on=['SMILES', 'adduct'],
    right_on=['SMILES', 'Precursor_type'],
).drop(columns=['Precursor_type'])

# Nettoyage

In [None]:
# Columns that are not needed in the final table
columns_to_drop = [
    'Norman_SusDat_ID',
    'Name_Dashboard',
    'Name_ChemSpider',
    'Name_IUPAC',
    'Synonyms_ChemSpider',
    'Reliability_of_Synonyms_ChemSpider',
    'CAS_RN',
    'CAS_RN_Dashboard',
    'CAS_RN_PubChem',
    'CAS_RN_Cactus',
    'CAS_RN_ChemSpider',
    'Reliability_of_CAS_ChemSpider',
    'Validation_Level',
    'SMILES_Dashboard',
    'StdInChI',
    'StdInChIKey',
    'MS_Ready_SMILES',
    'MS_Ready_StdInChI',
    'MS_Ready_StdInChIKey',
    'PubChem_CID',
    'ChemSpiderID',
    'DTXSID',
    'Pred_RTI_Positive_ESI',
    'Uncertainty_RTI_pos',
    'Pred_RTI_Negative_ESI',
    'Uncertainty_RTI_neg',
    'logKow_EPISuite',
    'Exp_logKow_EPISuite',
    'ChemSpider_ID_based_on_InChIKey',
    'alogp_ChemSpider',
    'xlogp_ChemSpider',
    'Species',
    'Uncertainty',
    'ExposureScore_Water_KEMI',
    'HazScore_EcoChronic_KEMI',
    'ValidationLevel_KEMI',
    'Prob. of GC',
    'Prob. RPLC',
    'Pred. Chromatography',
    'Prob. of both Ionization Source',
    'Prob. EI',
    'Prob. ESI',
    'Pred. Ionization source',
    'Prob. both ESI mode',
    'Prob. +ESI',
    'Prob. -ESI',
    'Pred. ESI mode',
    'Preferable Platform by decision Tree',
    'Synonyms',
    'Koc_min_experimental (L/kg)',
    'Koc_max_experimental (L/kg)',
    'Koc_min_predicted (L/kg)',
    'Koc_max_predicted (L/kg)',
    'Source',
]

# errors='ignore': listed columns that are absent from the table are skipped
norman_final = norman_combined.drop(columns=columns_to_drop, errors='ignore')

# Show which columns remain
print("Colonnes conservées :", norman_final.columns.tolist(), sep="\n")

# Sauvegarde

In [None]:
# Split by ionisation mode using the trailing sign of the adduct string.
# na=False: for a missing adduct str.endswith returns NaN, and pandas
# refuses a mask containing NaN for boolean indexing. Rows without an
# adduct (or ending in neither sign) end up in neither table.
adduct = norman_final['adduct']
norman_positive = norman_final[adduct.str.endswith('+', na=False)]
norman_negative = norman_final[adduct.str.endswith('-', na=False)]

# Write both tables to the same HDF5 file under separate keys
output_path = "norman_all_ccs_all_rt_pos_neg_with_ms2.h5"
norman_positive.to_hdf(output_path, key="positive")
norman_negative.to_hdf(output_path, key="negative")