# Description

This notebook concatenates all sources of raw SMILES data and prepares the combined data so that the `cleanup_smiles.py` script can parse the original data (used for training the RNN).

In [None]:
import pandas as pd

In [None]:
# Load the MOSES dataset (comma-separated); the SMILES strings are read from
# its 'SMILES' column.
moses_smiles = pd.read_csv('./datasets/moses_dataset_v1.txt', sep=',')
print(moses_smiles.shape)
# Drop rows that have no SMILES string. dropna() returns a new frame, and the
# derived columns below are added with assign() rather than by item assignment
# on a boolean-mask slice, which avoids pandas' SettingWithCopyWarning.
moses_smiles = moses_smiles.dropna(subset=['SMILES'])
print(moses_smiles.shape)
# 'smiles' is the normalized lowercase column name shared by all sources;
# 'length' is the character length of each SMILES string.
moses_smiles = moses_smiles.assign(
    smiles=moses_smiles['SMILES'],
    length=moses_smiles['SMILES'].str.len(),
)
moses_smiles.head()

In [None]:
# Load the ChEMBL 25 molecule export (semicolon-separated); the SMILES strings
# are read from its 'Smiles' column.
chembl_smiles = pd.read_csv('./datasets/CHEMBL25-chembl_molecule.csv', sep=';')
print(chembl_smiles.shape)
# Drop rows that have no SMILES string. dropna() returns a new frame, and the
# derived columns below are added with assign() rather than by item assignment
# on a boolean-mask slice, which avoids pandas' SettingWithCopyWarning.
chembl_smiles = chembl_smiles.dropna(subset=['Smiles'])
print(chembl_smiles.shape)
# 'smiles' is the normalized lowercase column name shared by all sources;
# 'length' is the character length of each SMILES string.
chembl_smiles = chembl_smiles.assign(
    smiles=chembl_smiles['Smiles'],
    length=chembl_smiles['Smiles'].str.len(),
)
chembl_smiles.head()

In [None]:
# Read the SMILES list from the original dataset. Because explicit column
# names are supplied, every line of the file is treated as data and the
# column is labelled 'smiles'.
old_smiles = pd.read_csv(
    './datasets/dataset.smi',
    names=["smiles"],
)
# Record the character length of each SMILES string alongside it.
old_smiles = old_smiles.assign(length=old_smiles["smiles"].str.len())

In [None]:
# Keep only the normalized 'smiles' column and remove exact duplicate strings.
# Per the original note, these strings are meant to be run through a canonizer
# before being appended; only exact-string duplicates are dropped here.
# NOTE: this rebinds moses_smiles from a DataFrame to a Series, so the cell is
# meant to run once after the load cell.
moses_smiles = moses_smiles.loc[:, 'smiles']
print(moses_smiles.shape)
moses_smiles = moses_smiles.drop_duplicates(keep='first')
print(moses_smiles.shape)

In [None]:
# Keep only the normalized 'smiles' column and remove exact duplicate strings.
# Per the original note, these strings are meant to be run through a canonizer
# before being appended; only exact-string duplicates are dropped here.
# NOTE: this rebinds chembl_smiles from a DataFrame to a Series, so the cell is
# meant to run once after the load cell.
chembl_smiles = chembl_smiles.loc[:, 'smiles']
print(chembl_smiles.shape)
chembl_smiles = chembl_smiles.drop_duplicates(keep='first')
print(chembl_smiles.shape)

In [None]:
# Keep only the 'smiles' column of the original dataset and remove exact
# duplicate strings.
# NOTE: this rebinds old_smiles from a DataFrame to a Series, so the cell is
# meant to run once after the load cell.
old_smiles = old_smiles.loc[:, 'smiles']
print(old_smiles.shape)
old_smiles = old_smiles.drop_duplicates(keep='first')
print(old_smiles.shape)

In [None]:
# Combine the three de-duplicated SMILES series into one, removing exact
# duplicates after each step and printing the shape so the effect of every
# merge is visible.
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; like append, concat keeps the original index labels.
smiles = pd.concat([moses_smiles, chembl_smiles])
print(smiles.shape)
smiles = smiles.drop_duplicates()
print(smiles.shape)
smiles = pd.concat([smiles, old_smiles])
print(smiles.shape)
smiles = smiles.drop_duplicates()
print(smiles.shape)

In [None]:
# Preview the first five entries of the combined SMILES series.
smiles.head(n=5)

In [None]:
# Write the combined SMILES strings to disk, one per line, without a header
# row or index column.
# NOTE(review): mode='a' appends to an existing file, so re-running this cell
# adds the whole list again; confirm whether append (rather than overwrite)
# is intended here.
smiles.to_csv(
    './datasets/all_smiles.smi',
    sep='\t',
    header=None,
    index=None,
    mode='a',
)

In [None]:
# Read the written file back (tab-separated, no header row) to inspect it.
# Per the original note: the HIV inhibitor list is appended manually and
# duplicates are removed to see whether the data already contains any of them.
all_smiles_test = pd.read_csv(
    './datasets/all_smiles.smi',
    header=None,
    sep='\t',
)
print(all_smiles_test.shape)
all_smiles_test.head(n=5)

In [None]:
# These lists can now be combined and run through the data cleaning Python script.

HIV inhibitor drug SMILES that were found and are to be added manually above:

https://pubchem.ncbi.nlm.nih.gov/

Tipranavir - 85
CCCC1(CC(=C(C(=O)O1)C(CC)C2=CC(=CC=C2)NS(=O)(=O)C3=NC=C(C=C3)C(F)(F)F)O)CCC4=CC=CC=C4
Darunavir - 71
CC(C)CN(CC(C(CC1=CC=CC=C1)NC(=O)OC2COC3C2CCO3)O)S(=O)(=O)C4=CC=C(C=C4)N
Amprenavir - 66
CC(C)CN(CC(C(CC1=CC=CC=C1)NC(=O)OC2CCOC2)O)S(=O)(=O)C3=CC=C(C=C3)N
Lopinavir - 84
CC1=C(C(=CC=C1)C)OCC(=O)NC(CC2=CC=CC=C2)C(CC(CC3=CC=CC=C3)NC(=O)C(C(C)C)N4CCCNC4=O)O
Atazanavir - 98
CC(C)(C)C(C(=O)NC(CC1=CC=CC=C1)C(CN(CC2=CC=C(C=C2)C3=CC=CC=N3)NC(=O)C(C(C)(C)C)NC(=O)OC)O)NC(=O)OC
Saquinavir - 89
CC(C)(C)NC(=O)C1CC2CCCCC2CN1CC(C(CC3=CC=CC=C3)NC(=O)C(CC(=O)N)NC(=O)C4=NC5=CC=CC=C5C=C4)O
Indinavir - 82
CC(C)(C)NC(=O)C1CN(CCN1CC(CC(CC2=CC=CC=C2)C(=O)NC3C(CC4=CC=CC=C34)O)O)CC5=CN=CC=C5
Ritonavir - 93
CC(C)C1=NC(=CS1)CN(C)C(=O)NC(C(C)C)C(=O)NC(CC2=CC=CC=C2)CC(C(CC3=CC=CC=C3)NC(=O)OCC4=CN=CS4)O
Nelfinavir - 70
CC1=C(C=CC=C1O)C(=O)NC(CSC2=CC=CC=C2)C(CN3CC4CCCCC4CC3C(=O)NC(C)(C)C)O
Efavirenz - 45
C1CC1C#CC2(C3=C(C=CC(=C3)Cl)NC(=O)O2)C(F)(F)F

GS-8374 - 89
CCOP(=O)(COC1=CC=C(C=C1)CC(C(CN(CC(C)C)S(=O)(=O)C2=CC=C(C=C2)OC)O)NC(=O)OC3COC4C3CCO4)OCC
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3957959/

SPI-256 - ???
http://www.natap.org/2006/CROI/CROI_04.htm

See this for more:
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5228633/
TMC310911 - 96
CC(C)CN(CC(C(CC1=CC=CC=C1)NC(=O)OC2COC3C2CCO3)O)S(=O)(=O)C4=CC5=C(C=C4)N=C(S5)NC6CCN(CC6)C7CCCC7
CTP-518 - 98
CC(C)(C)C(C(=O)NC(CC1=CC=CC=C1)C(CN(CC2=CC=C(C=C2)C3=CC=CC=N3)NC(=O)C(C(C)(C)C)NC(=O)OC)O)NC(=O)OC
PPL-100 - 83
CC(C)CN(C(CCCCNC(=O)C(C(C1=CC=CC=C1)C2=CC=CC=C2)NC(=O)OC)CO)S(=O)(=O)C3=CC=C(C=C3)N