# ¿Que más se podría hacer?

In [84]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


- Test con data augmentation y la LSTM
- Usar los tokenizadores usados en los transformers para ver como funcionan
- TSNE de embeddings y analizar si hay interpretación
- Probar con otras encimas o proteinas
- Usar los embeddings entrenados para analizar resultados de proteinas o encimas con menos data
- Entrenar una red neuronal con los features (fingerprints por ejemplo) y comparar los resultados con los embeddings


# Tome cualquiera de estas propuestas o alguna suya y desarrolle

<b>Entreno una red neuronal utilizando los descriptores de Lipinski</b>

In [85]:
import pandas as pd
import numpy as np
from pandas_summary import DataFrameSummary

## Obtención de datos para entrenamiento

In [86]:
DATA_PATH = 'data/acetylcholinesterase_02_bioactivity_data_preprocessed_full.pkl'

In [129]:
try:

    df = pd.read_pickle(DATA_PATH)

except FileNotFoundError:

    print("Preparando dataset...\n")
    
    # Descargo Data
    from chembl_webresource_client.new_client import new_client

    target = new_client.target
    target_query = target.search('acetylcholinesterase')
    targets = pd.DataFrame.from_dict(target_query)


    selected_target = targets.target_chembl_id[0]
    print(f"selected_target: {selected_target}\n")


    activity = new_client.activity
    res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

    res_cols = []
    for i, r in enumerate(res):
        print(f'downloaded: {i}\r', end='')
        res_cols.append(r)

    df = pd.DataFrame(res_cols)

    # Limpieza Data
    df = df.dropna(subset=['standard_value', 'canonical_smiles'])
    df = df.drop_duplicates(['canonical_smiles'])

    selection = ['molecule_chembl_id','canonical_smiles','standard_value']
    df = df[selection]    
    
    # Preprocesamiento y normalización
    df['standard_value'] = df['standard_value'].apply(pd.to_numeric)
    df['standard_value_norm'] = df['standard_value'].apply(lambda x: (x>1e8)*1e8 + (x<=1e8)*x)
    df['pIC50'] = df['standard_value'].apply(lambda x: -np.log10(x*(10**-9)))

    # Elimino datos no numericos de la columna target. De otro modo luego ocasionan inconvenientes durante 
    # el proceso de entrenamiento
    df = df[~df.isin([np.nan, np.inf, -np.inf]).any(1)]

    df.to_pickle(DATA_PATH)
    print("\nDataset descargado y listo para trabajar")

In [132]:
df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,standard_value_norm,pIC50
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.0,750.0,6.124939
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.0,100.0,7.0
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0,50000.0,4.30103
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.0,300.0,6.522879
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.0,800.0,6.09691


## Lipinski descriptors

In [104]:
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

In [105]:
# df = df[df.standard_value.notna()]
# df = df[df.canonical_smiles.notna()]
# df = df.drop_duplicates(['canonical_smiles'])
# df = df[['molecule_chembl_id','canonical_smiles','standard_value', 'standard_value_norm', 'pIC50']]

In [121]:
# df["canonical_smiles_bkp"] = df["canonical_smiles"].copy()
# df_no_smiles = df.drop(columns='canonical_smiles')

In [131]:
df["canonical_smiles"].apply(
    lambda x: max(
        x.split('.'), key=len
    )
)

0                   CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1
1              O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1
2       CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1
3           O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F
4               CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C
                              ...                        
8384    CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCCNC(=O)c4cc(...
8385    CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)Nc4cn[n...
8386    CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)N4CCNC(...
8387    CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)NCc4cc[...
8388    COc1cc(CNC(=O)CCCCCCCCNc2c3c(nc4cc(Cl)ccc24)CC...
Name: canonical_smiles, Length: 5823, dtype: object

In [92]:
def lipinski(smiles, verbose=False):

    moldata= []
    for elem in smiles:
        try:
            mol=Chem.MolFromSmiles(elem) 
        except:
            print(elem)
            raise
        moldata.append(mol)
       
    baseData= np.arange(1,1)
    i=0  
    for mol in moldata:        
       
        desc_MolWt = Descriptors.MolWt(mol)
        desc_MolLogP = Descriptors.MolLogP(mol)
        desc_NumHDonors = Lipinski.NumHDonors(mol)
        desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)
           
        row = np.array([desc_MolWt,
                        desc_MolLogP,
                        desc_NumHDonors,
                        desc_NumHAcceptors])   
    
        if(i==0):
            baseData=row
        else:
            baseData=np.vstack([baseData, row])
        i=i+1      
    
    columnNames=["MW","LogP","NumHDonors","NumHAcceptors"]   
    descriptors = pd.DataFrame(
        data=baseData,
        columns=columnNames
    )
    
    return descriptors
     


In [135]:
df_lipinski = lipinski(df.canonical_smiles)

Unnamed: 0,MW,LogP,NumHDonors,NumHAcceptors
0,312.325,2.8032,0.0,6.0
1,376.913,4.5546,0.0,5.0
2,426.851,5.3574,0.0,5.0
3,404.845,4.7069,0.0,5.0
4,346.334,3.0953,0.0,6.0
...,...,...,...,...
5818,562.154,7.8680,4.0,5.0
5819,506.094,7.3886,3.0,4.0
5820,523.121,5.9852,2.0,4.0
5821,547.143,7.0315,3.0,4.0


In [144]:
df.reset_index(drop=True, inplace=True)
df_lipinski.reset_index(drop=True, inplace=True)

In [147]:
df_combined = pd.concat([df, df_lipinski], axis=1)

In [151]:
df_combined.head(3)

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,standard_value_norm,pIC50,MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.0,750.0,6.124939,312.325,2.8032,0.0,6.0
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.0,100.0,7.0,376.913,4.5546,0.0,5.0
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0,50000.0,4.30103,426.851,5.3574,0.0,5.0


In [152]:
DATA_PATH = 'data/acetylcholinesterase_02_bioactivity_data_preprocessed_with_lipinkski_desc.pkl'

df_combined.to_pickle(DATA_PATH)

## Network Model

In [157]:
from sklearn.model_selection import train_test_split

y = df_combined.pIC50
X = df_combined[['MW', 'LogP', 'NumHDonors', 'NumHAcceptors']]

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=88
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, 
    y_train, 
    test_size=0.2, 
    random_state=88
)

len(X_train), len(X_val), len(X_test)

(3726, 932, 1165)

In [158]:
X_train

Unnamed: 0,MW,LogP,NumHDonors,NumHAcceptors
2700,267.376,3.94360,2.0,2.0
5407,523.615,3.09802,1.0,8.0
4797,551.562,8.01310,3.0,3.0
3302,386.442,4.30110,1.0,3.0
5335,554.090,7.14050,3.0,5.0
...,...,...,...,...
2951,290.363,2.76298,0.0,5.0
5077,425.437,2.65300,0.0,8.0
570,303.834,-0.54070,1.0,4.0
3355,365.473,4.70084,0.0,4.0


In [160]:
""" """
from tensorflow.keras import Sequential
from tensorflow.keras.layers import (
    Dense, Dropout, Activation
)

import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)


def create_model() -> Sequential:
    """ """
    
    input_len = 4
    
    # Model params 
    dropout_rate = 0.5
    add_bn = False
    hidden_layer_config = [
        # {"units": 100, "activation": "relu", "dropout_rate": dropout_rate},
        {"units": 10, "activation": "relu", "dropout_rate": dropout_rate},
        ]

    output_layer_activation = None  # "linear"
    
    # Models
    model = Sequential()
    
    model.add(Dense(input_len, activation="relu"))
    
    # Top fully-connected layer
    for layer in hidden_layer_config:
        dropout_rate = layer.get("dropout_rate", 0)
        if dropout_rate:
            model.add(Dropout(dropout_rate))
        model.add(Dense(layer["units"]))
        if layer.get("add_bn", add_bn):
            model.add(BatchNormalization())  # util para redes profundas
        model.add(Activation(layer["activation"]))

    # Output layer
    model.add(Dense(1, activation=output_layer_activation))

    return model


2023-02-21 17:55:47.423127: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# df.head(3)
# df.head(1).to_dict("records")

features = [
    "activity_properties",

    "assay_type",
    "assay_variant_accession",
    "assay_variant_mutation",

    "bao_endpoint",
    "bao_format",
    "bao_label",

    "canonical_smiles",

    "ligand_efficiency",

    "pchembl_value",
    "potential_duplicate",
    "relation",

    "target_organism",
    "target_pref_name",
    "target_tax_id"
]

In [6]:
# uniques = DataFrameSummary(df[cat_vars]).summary().loc[['uniques']]

In [None]:
DataFrameSummary(df[features]).summary()

TypeError: unhashable type: 'list'

In [8]:
df[features]

Unnamed: 0,activity_properties,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,bao_label,canonical_smiles,ligand_efficiency,pchembl_value,potential_duplicate,relation,target_organism,target_pref_name,target_tax_id
0,[],B,,,BAO_0000190,BAO_0000357,single protein format,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,"{'bei': '19.61', 'le': '0.36', 'lle': '3.32', ...",6.12,0,=,Homo sapiens,Acetylcholinesterase,9606
1,[],B,,,BAO_0000190,BAO_0000357,single protein format,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,"{'bei': '18.57', 'le': '0.38', 'lle': '2.45', ...",7.00,0,=,Homo sapiens,Acetylcholinesterase,9606
2,[],B,,,BAO_0000190,BAO_0000357,single protein format,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,,,0,>,Homo sapiens,Acetylcholinesterase,9606
3,[],B,,,BAO_0000190,BAO_0000357,single protein format,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,"{'bei': '16.11', 'le': '0.34', 'lle': '1.81', ...",6.52,0,=,Homo sapiens,Acetylcholinesterase,9606
4,[],B,,,BAO_0000190,BAO_0000357,single protein format,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,"{'bei': '17.60', 'le': '0.36', 'lle': '3.00', ...",6.10,0,=,Homo sapiens,Acetylcholinesterase,9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8384,"[{'comments': None, 'relation': '=', 'result_f...",B,,,BAO_0000190,BAO_0000357,single protein format,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCCNC(=O)c4cc(...,"{'bei': '15.28', 'le': '0.29', 'lle': '0.72', ...",8.59,0,=,Homo sapiens,Acetylcholinesterase,9606
8385,"[{'comments': None, 'relation': '=', 'result_f...",B,,,BAO_0000190,BAO_0000357,single protein format,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)Nc4cn[n...,"{'bei': '17.04', 'le': '0.33', 'lle': '1.23', ...",8.62,0,=,Homo sapiens,Acetylcholinesterase,9606
8386,"[{'comments': None, 'relation': '=', 'result_f...",B,,,BAO_0000190,BAO_0000357,single protein format,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)N4CCNC(...,"{'bei': '17.94', 'le': '0.35', 'lle': '3.40', ...",9.39,0,=,Homo sapiens,Acetylcholinesterase,9606
8387,"[{'comments': None, 'relation': '=', 'result_f...",B,,,BAO_0000190,BAO_0000357,single protein format,CC1=CC2Cc3nc4cc(Cl)ccc4c(NCCCCCCCCC(=O)NCc4cc[...,"{'bei': '16.82', 'le': '0.32', 'lle': '2.17', ...",9.20,0,=,Homo sapiens,Acetylcholinesterase,9606
