In [1]:
import os

import pandas as pd
import numpy as np
from spacecutter.models import OrdinalLogisticModel
import torch
from torch import nn
import datamol as dm
import matplotlib.pyplot as plt

from skorch import NeuralNet
from skorch.dataset import Dataset
from skorch.helper import SkorchDoctor
from skorch.callbacks import EarlyStopping

from spacecutter.callbacks import AscensionCallback
from spacecutter.losses import CumulativeLinkLoss
from sklearn.metrics import mean_absolute_error
from scipy.stats import kendalltau

from sklearn.preprocessing import RobustScaler

from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root directory of the ADMET project data; every input path below is built from it.
# The location is machine-specific, so it can be overridden by setting the
# POLARIS_ADMET_DIR environment variable before starting the kernel. When the
# variable is unset, the original local path is used unchanged.
# NOTE(review): the default embeds a personal account name; consider replacing it
# with a repository-relative path before sharing the notebook.
proj_dir = os.environ.get(
    'POLARIS_ADMET_DIR',
    '/Users/robertarbon/Library/CloudStorage/GoogleDrive-robert.arbon@gmail.com/My Drive/Polaris_ASAP_competition/polaris_challenge/admet',
)

In [3]:
# All three tables are renamed with the same mapping so that 'Molecule.Name'
# can serve as the join key below.
name_map = {'Molecule Name': 'Molecule.Name', 'LogMDR1-MDCKII':'LogMDR1.MDCKII'}

# Full table, loaded to recover the CXSMILES column, which the two feature
# tables no longer carry (the reason it was dropped is unknown).
df_smiles = pd.read_csv(f'{proj_dir}/data/train_admet_all.csv').rename(columns=name_map)
smiles_by_name = df_smiles[['Molecule.Name', 'CXSMILES']]

# Imputed training data, with CXSMILES re-attached per molecule name.
df_imp = (
    pd.read_csv(f'{proj_dir}/dm_features/ordinal_data_split_2/train_admet_split2_log_pmm_imputed.csv')
    .rename(columns=name_map)
    .merge(smiles_by_name, on='Molecule.Name', how='left')
)

# Non-imputed data used for validation, with CXSMILES re-attached the same way.
# NOTE(review): this reads a file named 'train_..._features.csv' although it is
# described as validation data - confirm this is the intended file.
df_val = (
    pd.read_csv(f'{proj_dir}/dm_features/ordinal_data_split_2/train_admet_split2_features.csv')
    .rename(columns=name_map)
    .merge(smiles_by_name, on='Molecule.Name', how='left')
)

In [5]:
# Featurise every CXSMILES string in df_smiles with the pretrained
# "ChemBERTa-77M-MTR" transformer; `feat` holds the result, one entry per input.
smiles_strings = df_smiles['CXSMILES'].values
transformer = PretrainedHFTransformer(
    kind="ChemBERTa-77M-MTR",
    notation='smiles',
    dtype=float,
)
feat = transformer(smiles_strings)

                                               





In [10]:
# Preview the first five rows of the table the features were computed from.
df_smiles.head(n=5)

Unnamed: 0,CXSMILES,HLM,KSOL,LogD,MDR1-MDCKII,MLM,Molecule.Name,n_missing,in-vitro_MLM_bienta: CLint (Num) (uL/min/mg),in-vitro_MLM_bienta: CLint (Mod),in-vitro_HLM_bienta: CLint (Num) (uL/min/mg),in-vitro_HLM_bienta: CLint (Mod),in-vitro_KSOL-PBS_bienta: mean_solubility (Num) (uM),in-vitro_KSOL-PBS_bienta: mean_solubility (Mod),in-vitro_LogD_bienta: LogD (Num),in-vitro_LogD_bienta: LogD (Mod),in-vitro_MDR1-MDCKII-Papp_bienta: mean_Papp_A_to_B (Num) (10^-6 cm/s),in-vitro_MDR1-MDCKII-Papp_bienta: mean_Papp_A_to_B (Mod)
0,COC1=CC=CC(Cl)=C1NC(=O)N1CCC[C@H](C(N)=O)C1 |a...,10.0,400.0,0.3,2.0,10.0,ASAP-0032437,3,10.0,<,10.0,<,400.0,≥,,,,
1,O=C(NCC(F)F)[C@H](NC1=CC2=C(C=C1Br)CNC2)C1=CC(...,,333.0,2.9,0.2,,ASAP-0031915,2,,,,,,,,,,
2,O=C(NCC(F)F)[C@H](NC1=CC=C2CNCC2=C1)C1=CC(Br)=...,,400.0,0.4,0.5,,ASAP-0031884,3,,,,,400.0,≥,,,,
3,NC(=O)[C@H]1CCCN(C(=O)CC2=CC=CC3=C2C=CO3)C1 |&...,10.0,376.0,1.0,8.5,10.0,ASAP-0031848,2,10.0,<,10.0,<,,,,,,
4,CC1=CC(CC(=O)N2CCC[C@H](C(N)=O)C2)=CC=N1 |&1:11|,10.0,375.0,-0.3,0.9,10.0,ASAP-0031813,2,10.0,<,10.0,<,,,,,,


In [12]:
# Pair each feature row with its molecule name and write the table to CSV.
#
# pd.concat(axis=1) aligns on index labels, not on position. If `feat` was computed
# from a different version of df_smiles (e.g. cells run out of order), a row-count
# mismatch would silently produce NaN-padded rows, so fail loudly instead.
n_molecules = len(df_smiles)
if len(feat) != n_molecules:
    raise ValueError(
        f'feature rows ({len(feat)}) do not match molecules in df_smiles ({n_molecules}); '
        're-run the featurisation cell against the current df_smiles'
    )

# Give the features df_smiles' own index so the column-wise concat pairs rows
# one-to-one regardless of what that index looks like.
feat_df = pd.DataFrame(feat, index=df_smiles.index)
feat_df = pd.concat((df_smiles.loc[:, ['Molecule.Name']], feat_df), axis=1)

# Written relative to the current working directory; the index is not saved.
feat_df.to_csv('chemberta_77M_mtr.csv', index=False)