In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
from module import (
    numerical_separator,
    standardizer,
    clean,
    constant_predictors_remover,
    correlation_remover,
    correlation_matrix,
    plot_correlation_matrix,
    NaN_checker
)

In [2]:
#PRETRAITEMENT
 
TrainingData = pd.read_csv('train.csv') 

#clean
TrainingData = clean(TrainingData)

## Feature Enrichment

From the non numerical features, we want to extract relevent information and/or transorm them into numerical treatable information. 

This is the case for the Feature Coumpound, SMILES, Lab, and mol.

### 'mol' enrichment
From the feature **mol** in RDKit formst we can directly **extract** different exploitable features : 
- molecular wheight
- nb of atoms
- nb of Cs
- nb of rings
- ...

These new features are added at the end of the df

In [6]:
def extract_features_from_mol(df):
    """
    Extract molecular features from a column of molecules in a DataFrame.

    Parameters:
    - df (pd.DataFrame): DataFrame containing a column named 'mol' with RDKit molecule objects.

    Returns:
    - pd.DataFrame: Original DataFrame with additional columns for extracted molecular features.
    """
    df['mol'] = df['SMILES'].apply(Chem.MolFromSmiles)
    
    # Initialize an empty DataFrame
    mol_features_df = pd.DataFrame()

    # Loop through each molecule in the 'mol' column
    for molecule in df['mol']:
        # Calculate features 
        mol_weight = Descriptors.MolWt(molecule)
        total_atoms = molecule.GetNumAtoms()
        carbon_atoms = Descriptors.HeavyAtomCount(molecule)
        num_rings = Descriptors.RingCount(molecule)
        rotatable_bonds = Descriptors.NumRotatableBonds(molecule)
        tpsa = Descriptors.TPSA(molecule)
        mol_log_p = Descriptors.MolLogP(molecule)
        num_h_acceptors = Lipinski.NumHAcceptors(molecule)
        num_h_donors = Lipinski.NumHDonors(molecule)
        num_valence_electrons = Descriptors.NumValenceElectrons(molecule)
        num_aliphatic_carbocycles = Lipinski.NumAliphaticCarbocycles(molecule)
        num_aliphatic_heterocycles = Lipinski.NumAliphaticHeterocycles(molecule)

        # Append features to a temporary DataFrame
        temp_df = pd.DataFrame({
            'MolecularWeight': [mol_weight],
            'TotalAtoms': [total_atoms],
            'CarbonAtoms': [carbon_atoms],
            'NumRings': [num_rings],
            'RotatableBonds': [rotatable_bonds],
            'TPSA': [tpsa],
            'MolLogP': [mol_log_p],
            'NumHAcceptors': [num_h_acceptors],
            'NumHDonors': [num_h_donors],
            'NumValenceElectrons': [num_valence_electrons],
            'NumAliphaticCarbocycles': [num_aliphatic_carbocycles],
            'NumAliphaticHeterocycles': [num_aliphatic_heterocycles]    
        })

        # Concatenate temporary DataFrame to the main DataFrame
        mol_features_df = pd.concat([mol_features_df, temp_df], ignore_index=True)

    # Concatenate molecular features to the original DataFrame
    df = pd.concat([df, mol_features_df], axis=1)
    df.drop('mol', axis=1, inplace=True)
    
    return df

# Usage
TrainingData = extract_features_from_mol(TrainingData)
TrainingData.head()

Unnamed: 0,Compound,SMILES,Lab,RT,ECFP_1,ECFP_2,ECFP_3,ECFP_4,ECFP_5,ECFP_6,...,CarbonAtoms,NumRings,RotatableBonds,TPSA,MolLogP,NumHAcceptors,NumHDonors,NumValenceElectrons,NumAliphaticCarbocycles,NumAliphaticHeterocycles
0,Hydroxytriazolam,OCc1nnc2n1-c1ccc(Cl)cc1C(c1ccccc1Cl)=NC2,CFSRE,7.02,-0.099043,1.089098,-0.283867,-0.155853,-0.525333,-0.088172,...,24,4,2,63.3,3.4174,5,1,120,0,1
1,5-MeO-DIPT,COc1ccc2[nH]cc(CCN(C(C)C)C(C)C)c2c1,Aarhus,4.45,-0.099043,1.089098,-0.283867,-0.155853,-0.525333,-0.088172,...,20,2,6,28.26,3.8378,2,1,110,0,0
2,MDMA,CNC(C)Cc1ccc2c(c1)OCO2,Ghent University,3.14,-0.099043,1.089098,-0.283867,-0.155853,-0.525333,-0.088172,...,14,2,3,30.49,1.5657,3,1,76,0,1
3,Despropionyl N-Benzyl para-Fluoro Norfentanyl,Fc1ccc(NC2CCN(Cc3ccccc3)CC2)cc1,San Francisco OCME,5.95,-0.099043,-0.918191,-0.283867,-0.155853,-0.525333,-0.088172,...,21,3,4,15.27,3.9022,2,1,110,0,1
4,N-Ethylpentylone,CCCC(NCC)C(=O)c1ccc2c(c1)OCO2,Ghent University,4.21,-0.099043,1.089098,-0.283867,-0.155853,-0.525333,-0.088172,...,18,2,6,47.56,2.3762,4,1,98,0,1


### Enrichment Function

In [8]:
def enrich(data): #only for training data 
    """Enriches the training data by developping the feature mol, merging cddd and standardizing the new enriching features

    Args:
        data (panda.DataFrame): imput data

    Returns:
        panda.DataFrame: enriched dataframe 
    """
    enriched_Data = extract_features_from_mol(data)
    cddd = pd.read_csv('cddd.csv')
    enriched_Data_cd = pd.merge(enriched_Data, cddd, on='SMILES', how='left')
    if NaN_checker(enriched_Data_cd) == True : 
         nned, ned = numerical_separator(enriched_Data_cd)
         moyennes_colonnes = ned.mean()
         ned = ned.fillna(moyennes_colonnes)
         nned.reset_index(drop=True, inplace=True)
         ned.reset_index(drop=True, inplace=True)
         enriched_Data_wn = pd.concat([nned, ned], axis=1)
    else : 
        enriched_Data_wn = enriched_Data_cd
    enriched_Data_st = standardizer(enriched_Data_wn)
    return enriched_Data_st

We can save the new training data containing the enriched features in a new cvs file 

In [35]:
#copy file 
enriched_TrainingData = enriched_TrainingData.copy() 

# Charger le fichier CSV initial
file_path = 'TrainingData.csv'

# Save the modified DataFrame to a CSV file
output_file_path = 'enriched_train.csv'
enriched_TrainingData.to_csv(output_file_path, index=False)

In [39]:
enriched_TrainingData.head()

Unnamed: 0,Compound,SMILES,Lab,RT,ECFP_1,ECFP_2,ECFP_3,ECFP_4,ECFP_5,ECFP_6,...,CarbonAtoms,NumRings,RotatableBonds,TPSA,MolLogP,NumHAcceptors,NumHDonors,NumValenceElectrons,NumAliphaticCarbocycles,NumAliphaticHeterocycles
0,Hydroxytriazolam,OCc1nnc2n1-c1ccc(Cl)cc1C(c1ccccc1Cl)=NC2,CFSRE,7.02,-0.099576,1.088512,-0.285502,-0.156704,-0.528913,-0.088645,...,0.349863,0.973688,-1.25787,1.016091,-0.086185,1.729665,0.367446,0.123866,-0.384111,0.813517
1,5-MeO-DIPT,COc1ccc2[nH]cc(CCN(C(C)C)C(C)C)c2c1,Aarhus,4.45,-0.099576,1.088512,-0.285502,-0.156704,-0.528913,-0.088645,...,-0.359283,-0.645463,0.467451,-0.625066,0.210107,-0.840342,0.367446,-0.224759,-0.384111,-0.799081
2,MDMA,CNC(C)Cc1ccc2c(c1)OCO2,Ghent University,3.14,-0.099576,1.088512,-0.285502,-0.156704,-0.528913,-0.088645,...,-1.423002,-0.645463,-0.826539,-0.52062,-1.391237,0.016327,0.367446,-1.410083,-0.384111,0.813517
3,Despropionyl N-Benzyl para-Fluoro Norfentanyl,Fc1ccc(NC2CCN(Cc3ccccc3)CC2)cc1,San Francisco OCME,5.95,-0.099576,-0.918685,-0.285502,-0.156704,-0.528913,-0.088645,...,-0.181996,0.164113,-0.395209,-1.233474,0.255496,-0.840342,0.367446,-0.224759,-0.384111,0.813517
4,N-Ethylpentylone,CCCC(NCC)C(=O)c1ccc2c(c1)OCO2,Ghent University,4.21,-0.099576,1.088512,-0.285502,-0.156704,-0.528913,-0.088645,...,-0.713856,-0.645463,0.467451,0.278882,-0.820008,0.872996,0.367446,-0.643109,-0.384111,0.813517
