In [2]:
from pathlib import Path
# Location of the tab-separated dataset, given relative to the working directory.
dataset_path = Path("./dockstring-dataset.tsv")
# Fail fast with a message naming the path, so a wrong working directory or a
# missing download is obvious instead of surfacing as a bare AssertionError.
assert dataset_path.exists(), (
    f"Dataset file not found: {dataset_path} (relative to the current working directory)"
)

In [10]:
import pandas as pd
# Load the tab-delimited dataset and key each row by its "inchikey" column.
df = pd.read_csv(dataset_path, sep="\t", index_col="inchikey")
df  # bare last expression -> rich display of the loaded frame

Unnamed: 0_level_0,smiles,PPARD,ABL1,ADAM17,ADRB1,ADRB2,AKT2,MAOB,CASP3,DHFR,...,EGFR,F10,GBA,MAPK1,MAPK14,PLK1,SRC,THRB,F2,KDR
inchikey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
UMVWYQXKBPJMOF-UHFFFAOYNA-N,C1=C(C2=C(C=C1O)OC(C(C2=O)=O)C3=CC=C(C(=C3)O)O)O,-8.2,-9.1,-9.0,-9.3,-9.7,-8.7,-8.4,-7.2,-8.9,...,-9.1,-8.4,-9.1,-9.3,-8.3,-9.0,-8.4,-8.8,-8.2,-8.0
NGOGFTYYXHNFQH-UHFFFAOYNA-N,O=S(=O)(N1CCNCCC1)C2=CC=CC=3C2=CC=NC3,-7.1,-9.5,-7.0,-7.6,-7.7,-8.2,-6.3,-6.1,-8.4,...,-7.5,-6.6,-8.0,-8.3,-6.9,-8.6,-7.7,-8.1,-6.8,-7.4
BGVLELSCIHASRV-QPEQYQDCNA-N,C=1C=C2S/C(/N(CC)C2=CC1OC)=C\C(=O)C,-6.6,-7.4,-5.9,-7.0,-7.0,-7.1,-6.9,-5.5,-7.0,...,-6.6,-5.9,-6.4,-7.1,-6.3,-7.0,-6.2,-7.9,-5.8,-6.2
KTUFNOKKBVMGRW-RPGFEBOUNA-N,C=1(N=C(C=2C=NC=CC2)C=CN1)NC=3C=C(NC(C4=CC=C(C...,-10.8,-10.2,-11.0,-10.4,-11.5,-11.0,-3.3,-9.4,-10.1,...,-11.0,-9.0,-9.3,-10.2,-8.8,-10.4,-9.5,-7.4,-9.7,-11.4
LLJRXVHJOJRCSM-UHFFFAOYNA-N,C1=CC=2C(=CNC2C=C1)C=3C=CN=CC3,-7.7,-8.6,-7.4,-8.4,-8.2,-7.7,-7.9,-6.2,-8.1,...,-8.0,-7.0,-7.7,-8.0,-7.8,-7.9,-6.8,-8.5,-6.6,-7.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DWPVTFJKTGBFRK-UHFFFAOYNA-N,ClC1=CC(S(=O)(=O)N2N=NC=3C2=CC=CC3)=C(OCC)C=C1,-7.5,-9.0,-7.8,-8.1,-8.4,-7.8,-7.4,-6.9,-9.2,...,-8.0,-7.8,-8.2,-8.3,-7.3,-8.2,-7.6,-8.7,-7.8,-7.8
GZKYOKRGPTXXIJ-YAQRNVERNA-N,ClC=1C=CC(C(=O)NN=C2CCCC2)=CC1,-8.1,-8.6,-8.0,-8.1,-8.1,-7.3,-8.5,-6.3,-8.3,...,-7.9,-7.1,-8.0,-7.6,-7.0,-8.2,-7.0,-8.3,-7.2,-8.2
SFJOYBYSPNCEKG-UHFFFAOYNA-N,O=C1N(C(=O)N(C2=C3N(C(=C21)C4=CC=CC=C4)C=5C(N=...,-8.9,-10.3,-9.6,-11.5,-7.7,-8.9,-6.3,-8.9,-8.9,...,-10.5,-9.5,-8.7,-8.8,-8.2,-10.9,-9.1,-6.3,-9.2,-7.5
QUXNNXZZXGGGPV-UHFFFAOYNA-N,O=C(N1C=2C(C(=C1)C(OC)=O)=CC=CC2)C3=CC(=C(C=C3...,-9.6,-9.3,-9.1,-9.3,-9.7,-8.0,-8.8,-7.9,-9.4,...,-8.7,-8.1,-8.8,-8.6,-8.3,-9.2,-8.7,-11.1,-7.7,-7.0


In [11]:
# Copy the LCK column into a column named "affinity"; the export cell further down
# selects "affinity" by name when it builds the output CSV.
df["affinity"] = df["LCK"]

In [12]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split

# Function to calculate properties using RDKit
def calculate_properties(smiles):
    """Compute QED, LogP and TPSA for a single SMILES string using RDKit.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule. Any non-string value (for example
        ``None`` or the float NaN pandas uses for a missing cell) is treated as
        an invalid molecule instead of being handed to RDKit.

    Returns
    -------
    pd.Series
        Three values labelled ``'qeds'``, ``'logps'`` and ``'tpsas'``. All three
        are ``None`` when the input is not a string or RDKit cannot parse it, so
        the caller can remove such rows with ``dropna``.
    """
    labels = ['qeds', 'logps', 'tpsas']

    # Only strings are passed to the parser: RDKit expects a string argument, and a
    # missing value reaching it would abort the whole DataFrame.apply run.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

    # MolFromSmiles signals an unparseable SMILES by returning None.
    if mol is None:
        return pd.Series([None, None, None], index=labels)

    qed = Descriptors.qed(mol)
    logp = Descriptors.MolLogP(mol)
    tpsa = Descriptors.TPSA(mol)
    return pd.Series([qed, logp, tpsa], index=labels)

import time

start_time = time.time()

# Compute the descriptors for every molecule. Because calculate_properties returns a
# labelled Series, apply() yields one column per label, assigned here in one step.
property_columns = ['qeds', 'logps', 'tpsas']
df[property_columns] = df['smiles'].apply(calculate_properties)

end_time = time.time()
print(f"Property calculation completed in {end_time - start_time:.2f} seconds.")

# Drop rows that lack a value in any column we actually export (failed property
# calculation, missing affinity or missing SMILES). A bare dropna() would also
# discard molecules that merely have a gap in one of the other target columns
# still present in `df`, which are irrelevant to this export.
export_columns = property_columns + ['affinity', 'smiles']
df = df.dropna(subset=export_columns)

# Split the data into training (80%) and testing (20%) sets; the fixed random_state
# makes the partition reproducible across runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Tag each partition. assign() returns a new frame, which avoids the
# SettingWithCopyWarning that in-place column assignment on the split results can raise.
df_train = df_train.assign(split='train')
df_test = df_test.assign(split='test')

# Combine the train and test data back into a single DataFrame
final_df = pd.concat([df_train, df_test])

# Select and reorder the columns for the output file
final_df = final_df[['qeds', 'logps', 'tpsas', 'affinity', 'smiles', 'split']]

# Save the final DataFrame to a CSV file. index=False means the inchikey index is
# not written; only the six selected columns appear in the file.
output_file = 'lck_dockstring_data.csv'
final_df.to_csv(output_file, index=False)

print(f"Successfully generated '{output_file}' with columns: {final_df.columns.tolist()}")

Property calculation completed in 844.20 seconds.
Successfully generated 'lck_dockstring_data.csv' with columns: ['qeds', 'logps', 'tpsas', 'affinity', 'smiles', 'split']
