In [None]:
import pickle
import random
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from rdkit import Chem
%matplotlib inline

In [None]:
# Add the parent directory to the module search path; presumably this is what
# lets the `molbart` import in the next cell resolve — verify against repo layout.
import sys
sys.path.append("..")

In [None]:
from molbart.tokeniser import MolEncTokeniser

In [None]:
# Input/output locations, given relative to the notebook's working directory.
CHEMBL_PATH = "../../data/chembl_27.txt"
CHEMBL_SMILES_PATH = "../../data/chembl_27.pickle"
VOCAB_PATH = "../bart_vocab.txt"
# Passed to MolEncTokeniser.from_vocab_file together with the vocab and regex.
CHEM_TOKEN_START_IDX = 272
# Pattern used to split a SMILES string into tokens. Written as a raw string so
# the regex escapes (\[, \(, \. ...) are not treated as (invalid) Python string
# escapes; the resulting pattern text is unchanged.
REGEX = r"\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9]"

In [None]:
# Wrap both configured locations in Path objects for the I/O calls below.
chembl_path, chembl_smiles_path = map(Path, (CHEMBL_PATH, CHEMBL_SMILES_PATH))

In [None]:
# Load the raw ChEMBL export; the file is read as tab-separated text.
chembl_df = pd.read_csv(chembl_path, sep="\t")

In [None]:
# Preview the first rows of the raw table.
chembl_df.head()

In [None]:
# Report how many rows were loaded.
print(f"Number of rows: {len(chembl_df)}")

In [None]:
# Keep only the identifier and SMILES columns for the rest of the notebook.
chembl_smiles_df = chembl_df[["chembl_id", "canonical_smiles"]]
# Selecting columns must leave the number of rows untouched.
assert chembl_smiles_df.shape[0] == chembl_df.shape[0]
# Release the reference to the wide table; it is not used again below.
chembl_df = None

In [None]:
# Preview the reduced two-column table.
chembl_smiles_df.head()

In [None]:
smiles = chembl_smiles_df["canonical_smiles"].tolist()

molecules = []      # one entry per row that RDKit accepted as input
err_idxs = []       # positions whose SMILES value made MolFromSmiles raise TypeError
unparsed_idxs = []  # positions where MolFromSmiles returned None (invalid SMILES)

for idx, smi in enumerate(smiles):
    # Only the RDKit call is guarded, so a TypeError raised anywhere else is
    # not mistaken for a bad SMILES value.
    try:
        mol = Chem.MolFromSmiles(smi)
    except TypeError:
        print(idx)
        print(smi)
        err_idxs.append(idx)
        continue

    # MolFromSmiles signals an unparsable string by returning None instead of
    # raising. The entry is still appended so `molecules` keeps the same
    # contents as before, but the position is recorded so it is not silent.
    if mol is None:
        unparsed_idxs.append(idx)
    molecules.append(mol)

print(f"Rows with a non-string SMILES value: {len(err_idxs)}")
print(f"Rows RDKit could not parse (molecule is None): {len(unparsed_idxs)}")

In [None]:
# Number of entries collected in `molecules` by the parsing loop.
print(len(molecules))

In [None]:
# Show every row that failed in the parsing loop. The positions come from
# err_idxs rather than a hard-coded row number, so this stays correct if the
# input file changes.
chembl_smiles_df.iloc[err_idxs]

In [None]:
# Remove every row that failed in the parsing loop. err_idxs holds positions,
# while drop() expects index labels, so translate positions to labels first.
processed_df = chembl_smiles_df.drop(chembl_smiles_df.index[err_idxs])
# NOTE(review): this clears the *path* variable, not a DataFrame; the path is
# re-created before the frame is pickled further down. Possibly
# `chembl_smiles_df = None` was intended — confirm.
chembl_smiles_path = None

In [None]:
# Row count after the failing row(s) were dropped.
print(f"Number of rows: {len(processed_df)}")

In [None]:
# Renumber the index from 0 so labels and positions coincide again after the drop.
processed_df = processed_df.reset_index(drop=True)
# Inspect the row that now occupies position 622133.
processed_df.iloc[622133]

In [None]:
# Show the rows at positions 622130-622134 (the slice end is exclusive).
processed_df.iloc[622130:622135]

In [None]:
# Attach the parsed molecules as a new column; `molecules` needs exactly one
# entry per remaining row for this assignment to succeed.
processed_df["molecules"] = molecules

In [None]:
# Check SMILES are correct

# For the rows at positions 622130-622134, print the SMILES RDKit writes for
# the stored molecule, then the SMILES string stored in the table, then a
# blank line, so the two can be compared by eye.
window = slice(622130, 622135)
mols = processed_df["molecules"].tolist()[window]
smis = processed_df["canonical_smiles"].tolist()[window]

for mol, smi in zip(mols, smis):
    print(Chem.MolToSmiles(mol), smi, "", sep="\n")

In [None]:
# Preview the table now that it carries the `molecules` column.
processed_df.head()

In [None]:
# Build the tokeniser from the vocabulary file, the SMILES regex and the
# chemical-token start index defined in the configuration cell.
tokeniser = MolEncTokeniser.from_vocab_file(VOCAB_PATH, REGEX, CHEM_TOKEN_START_IDX)

In [None]:
smiles = processed_df["canonical_smiles"].tolist()
# Print the first five SMILES strings, each followed by an empty line.
for smi in smiles[:5]:
    print(smi, end="\n\n")

In [None]:
# Tokenise every SMILES string and keep the "original_tokens" entry of the result.
tokens = tokeniser.tokenise(smiles)["original_tokens"]

In [None]:
# Length of each token sequence, in the same order as `tokens`.
seq_lengths = list(map(len, tokens))

In [None]:
# Report the shortest and longest token sequence.
shortest, longest = min(seq_lengths), max(seq_lengths)
print(f"Min length: {shortest}")
print(f"Max length: {longest}")

In [None]:
# Histogram of token-sequence lengths over all molecules, using 20 bins.
plt.hist(seq_lengths, bins=20)
plt.show()

In [None]:
# Sequences longer than 128 tokens: count them and plot their length distribution.
long_lengths = [length for length in seq_lengths if length > 128]
print(len(long_lengths))
plt.hist(long_lengths, bins=20)
plt.show()

In [None]:
# Sequences of at most 128 tokens: these are the ones kept further down.
short_lengths = [length for length in seq_lengths if length <= 128]
print(len(short_lengths))

In [None]:
# Positions of every sequence longer than 128 tokens.
drop_idxs = [pos for pos, length in enumerate(seq_lengths) if length > 128]

In [None]:
# Remove the over-long rows. drop() works on index labels; these match the
# positions in drop_idxs because the frame's index was reset earlier.
shorter_seq_df = processed_df.drop(drop_idxs)
# Release the reference to the unfiltered frame.
processed_df = None

In [None]:
# Renumber the index from 0 after the drop, discarding the old labels.
shorter_seq_df = shorter_seq_df.reset_index(drop=True)

In [None]:
# Preview the length-filtered table.
shorter_seq_df.head()

In [None]:
# Number of rows remaining after the length filter.
print(len(shorter_seq_df))

In [None]:
# Fraction of rows sampled for the validation set, and again for the test set.
SPLIT = 0.05

In [None]:
# Seed Python's RNG so the random split below is the same on every
# Restart & Run All. The seed value itself is arbitrary.
random.seed(0)

# Candidate row positions and the number of rows to sample per held-out set.
idxs = range(len(shorter_seq_df))
num_idxs = int(len(idxs) * SPLIT)
# Draw the validation positions without replacement.
val_idxs = random.sample(idxs, num_idxs)
print(len(val_idxs))

In [None]:
# Positions not already taken by the validation set.
rem_idxs = set(idxs) - set(val_idxs)
# random.sample needs a sequence. Sorting (instead of list(set)) gives it a
# well-defined order, since set iteration order is not guaranteed, so the draw
# is repeatable whenever the RNG state is the same.
test_idxs = random.sample(sorted(rem_idxs), num_idxs)
print(len(test_idxs))

In [None]:
# Whatever is left after removing the test positions forms the training set.
train_idxs = rem_idxs.difference(test_idxs)
# The three groups together must account for every row exactly once.
split_total = sum(map(len, (train_idxs, val_idxs, test_idxs)))
assert split_total == len(shorter_seq_df)

In [None]:
# Turn both sampled lists into sets before they are used for the membership
# tests in process_idx.
val_idxs, test_idxs = set(val_idxs), set(test_idxs)

In [None]:
def process_idx(idx, train, val, test):
    """Return the name of the split that contains ``idx``.

    Args:
        idx: Row position to classify.
        train: Container of positions assigned to the training split.
        val: Container of positions assigned to the validation split.
        test: Container of positions assigned to the test split.

    Returns:
        ``"train"``, ``"val"`` or ``"test"``. The containers are checked in
        that order, so the first one containing ``idx`` wins.

    Raises:
        ValueError: If ``idx`` is in none of the three containers.
    """
    if idx in train:
        return "train"
    if idx in val:
        return "val"
    if idx in test:
        return "test"
    # Name the offending index so a broken split is easy to diagnose.
    raise ValueError(f"Index {idx!r} is not assigned to train, val or test")

In [None]:
# Label every row position with the split it belongs to, in row order.
data_type = [process_idx(idx, train_idxs, val_idxs, test_idxs) for idx in idxs]
print(len(data_type))

In [None]:
# Store the split label of each row in a new "set" column.
shorter_seq_df["set"] = data_type

In [None]:
# Preview the table with the new "set" column.
shorter_seq_df.head()

In [None]:
# Re-create the output path (the variable was set to None earlier) and write
# the filtered, split-labelled frame to it as a pickle.
chembl_smiles_path = Path(CHEMBL_SMILES_PATH)
shorter_seq_df.to_pickle(chembl_smiles_path)

In [None]:
# Compute the token sequence lengths and add them to the saved dataframe

In [None]:
# Reload the frame that was pickled above. Unpickling can execute arbitrary
# code, so only read pickle files that this notebook (or a trusted source) wrote.
chembl_df = pd.read_pickle(chembl_smiles_path)

In [None]:
# Regenerate a SMILES string from each stored molecule object.
# NOTE(review): the 128-token filter earlier was applied to the
# `canonical_smiles` column; these regenerated strings may tokenise to
# different lengths — confirm this is intended.
mols = chembl_df["molecules"].tolist()
smiles = [Chem.MolToSmiles(mol) for mol in mols]

In [None]:
# Tokenise the regenerated SMILES and keep the "original_tokens" entry of the result.
tokens = tokeniser.tokenise(smiles)["original_tokens"]

In [None]:
# Token count of each regenerated SMILES, in row order.
seq_lengths = list(map(len, tokens))

In [None]:
# Store each row's token count in a new "lengths" column.
chembl_df["lengths"] = seq_lengths

In [None]:
# Overwrite the pickle with the frame that now includes the "lengths" column.
chembl_df.to_pickle(chembl_smiles_path)