## Initialization

In [1]:
# Reload edited modules automatically before each cell is executed, so changes
# to the project-local helper modules are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2
import warnings


def warn(*args, **kwargs):
    """Do nothing: drop-in replacement for ``warnings.warn`` that swallows every call.

    Accepts any positional and keyword arguments and always returns ``None``.
    It is installed below to silence scikit-learn warnings.
    """
    return None


# Two layers of silencing: an "ignore" filter for all warnings, and the
# emitter function itself replaced by the no-op defined above.
warnings.filterwarnings('ignore')
warnings.warn = warn

# Global Imports
# from collections import Counter
# import glob
from pathlib import Path
import sys

import pandas as pd
import numpy as np

# from scipy.stats import median_absolute_deviation as mad

from rdkit import DataStructs
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors as Desc
# from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

from Contrib.NP_Score import npscorer

import matplotlib.pyplot as plt
import seaborn as sns

from cellpainting3 import processing as cpp


from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Project-local Imports
# Make the project directory importable for the project-local modules imported below.
# `Path("..").absolute()` is the unresolved "<cwd>/..", so `parents[1]` is the
# directory that contains the notebook's working directory.
PROJECT_DIR = Path("..").absolute().parents[1]
# Guarded append: re-running this cell must not pile up duplicate sys.path entries.
if str(PROJECT_DIR) not in sys.path:
    sys.path.append(str(PROJECT_DIR))
import plt_style
import utils as u
from utils import lp

* Cell Painting v3
cellpainting3.processing                      ((master      )  commit: c9269f5 ( 2020-09-08 13:21:42 ))


In [2]:
# Small toy frame with one missing value per column, used for the pandas
# experiments in the following cells.
df = pd.DataFrame(
    dict(
        Name=["A", "B", np.nan, "D", "E"],
        x=[1.0, 2.0, 3.0, np.nan, 5.0],
    )
)
df

Unnamed: 0,Name,x
0,A,1.0
1,B,2.0
2,,3.0
3,D,
4,E,5.0


In [None]:
# Iterate over the rows of the frame as namedtuples and print each one.
# The loop variable is deliberately not named `u`: that would rebind the
# `utils` alias (`import utils as u`) that other cells rely on.
for row in df.itertuples(index=False):
    print(row)

In [None]:
# Show the frame with missing "x" values replaced by a "*" marker.
# - A single `.loc[rows, column]` assignment is used instead of chained
#   indexing (`df["x"].loc[...] = ...`), which may write to a temporary copy.
# - The marker is a string, so the column is cast to object first.
# - The work is done on a copy: `df["x"]` stays numeric, which the numeric
#   filters on `df` in the following cells require.
df_marked = df.copy()
df_marked["x"] = df_marked["x"].astype(object)
df_marked.loc[df_marked["x"].isnull(), "x"] = "*"
df_marked

In [5]:
# Boolean-mask selection of the rows whose "x" value is below 3.0.
df.loc[df["x"].lt(3.0)]

Unnamed: 0,Name,x
0,A,1.0
1,B,2.0


In [7]:
# Same selection as the boolean-mask variant, written as a query expression.
df.query(expr="x < 3.0")

Unnamed: 0,Name,x
0,A,1.0
1,B,2.0


In [20]:
# Toy structure table: three stereo variants of the same alcohol, one
# unparsable entry ("xxx") and one unrelated amide.
df = pd.DataFrame(
    dict(
        Name=[1, 2, 3, 4, 5],
        Smiles=["CC[C@@H](C)O", "xxx", "CC[C@H](C)O", "CCC(C)O", "c1ccccc1C(=O)N"],
    )
)
df

Unnamed: 0,Name,Smiles
0,1,CC[C@@H](C)O
1,2,xxx
2,3,CC[C@H](C)O
3,4,CCC(C)O
4,5,c1ccccc1C(=O)N


In [8]:
# Standardizer instance used by the standardization loop further down.
# NOTE(review): this import lives outside the notebook's top import cell.
from rdkit.Chem.MolStandardize.standardize import Standardizer
molvs_s = Standardizer()

In [10]:
# The two stereo-annotated forms and the stereo-unspecified form of one alcohol.
smiles = [
    "CC[C@@H](C)O",
    "CC[C@H](C)O",
    "CCC(C)O",
]

In [12]:
# Standardize every SMILES and print the SMILES of the standardized molecule.
# NOTE: the printed output keeps the stereo annotation; `molvs_s.stereo_parent`
# is the alternative call that was left disabled here.
for smi in smiles:
    mol_std = molvs_s.standardize(Chem.MolFromSmiles(smi))
    print(Chem.MolToSmiles(mol_std))

CC[C@@H](C)O
CC[C@H](C)O
CCC(C)O


In [21]:
# Compute a stereo-free standardized SMILES per row into "Smiles_Std", then let
# it take the place of the original "Smiles" column.
df = (
    u.apply_to_smiles(
        df,
        "Smiles",
        {"Smiles_Std": lambda smi: u.standardize_mol(smi, remove_stereo=True)},
    )
    .drop(columns="Smiles")
    .rename(columns={"Smiles_Std": "Smiles"})
)
df

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




RDKit ERROR: [13:04:42] SMILES Parse Error: syntax error while parsing: xxx
RDKit ERROR: [13:04:42] SMILES Parse Error: Failed parsing SMILES 'xxx' for input: 'xxx'


Unnamed: 0,Name,Smiles
0,1,CCC(C)O
1,2,
2,3,CCC(C)O
3,4,CCC(C)O
4,5,NC(=O)c1ccccc1


In [19]:
# Apply the "Duplicates" filter on the "Smiles" column; the returned frame is
# only displayed here and not bound to a name.
u.filter_mols(df, "Smiles", ["Duplicates"])

Applying filters (1)...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))


Applied filter Duplicates: 2


Unnamed: 0,Name,Smiles
0,1,CCC(C)O
3,4,NC(=O)c1ccccc1


In [22]:
# Well-level annotation: keep only the id and the compound class, report the
# shape, then reduce to one row per Well_Id (first occurrence is kept) and
# report the shape again.
well_ids = pd.read_csv("../tmp/well_ids.tsv", sep="\t")
well_ids = well_ids.loc[:, ["Well_Id", "CpdClass"]]
lp(well_ids)
well_ids = well_ids.drop_duplicates(subset=["Well_Id"])
lp(well_ids)

Shape                                             :        342 /    2  [ Well_Id, CpdClass ]  
Shape                                             :        183 /    2  [ Well_Id, CpdClass ]  


In [6]:
# Load the "DATASTORE" resource; presumably this is what makes `cpp.DATASTORE`
# (used in the merge further down) available — confirm in cellpainting3.processing.
cpp.load_resource("DATASTORE")

  - loading resource:                      (DATASTORE)
    This is the dataset that went public on datavis on 11-Nov-2020



In [23]:
# Restrict the datastore to the wells listed in `well_ids` (inner join on
# "Well_Id"), materialize the result and report its shape.
merged = cpp.DATASTORE.merge(well_ids, how="inner", on="Well_Id")
ds = merged.compute()
lp(ds, "Internal data set")

Shape Internal data set                           :        183 /  594  


In [24]:
# Persist the merged data set as a tab-separated file without the index column.
ds.to_csv("../Input Data/internal_data.tsv", sep="\t", index=False)

In [25]:
# Number of rows left after keeping one row per Compound_Id.
ds.drop_duplicates(subset="Compound_Id").shape[0]

166

In [26]:
# Compound list with class labels; report its shape and the number of rows
# left after keeping one row per Compound_Id.
# NOTE(review): absolute, user-specific path — it will not resolve on other
# machines; consider a location relative to the project directory.
cpd_ids = pd.read_csv(
    "/home/pahl/comas/notebooks/projects/michael/200831_cross-sim_np-like_qed/cpd_ids.tsv",
    sep="\t",
)
lp(cpd_ids)
cpd_ids.drop_duplicates(subset="Compound_Id").shape[0]

Shape                                             :        263 /    2  [ Compound_Id, CpdClass ]  


263

In [27]:
# Distinct compound class labels present in the compound list.
cpd_ids["CpdClass"].unique()

array(['QN-C-S', 'QN-C-R', 'QD-C-S', 'QD-C-R', 'QN-I', 'QD-I', 'S-C-S',
       'S-C-R', 'S-I-closed', 'S-I-opened', 'G-I-1', 'G-I-2', 'G-THPI-β'],
      dtype=object)

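Background on the fingerprint types compared below — Riniker & Landrum, "Open-source platform to benchmark fingerprints for ligand-based virtual screening", J. Cheminform. 5, 26 (2013): https://link.springer.com/article/10.1186/1758-2946-5-26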

In [15]:
# Two test molecules for the fingerprint comparison: benzamide and a pyridine
# analogue of it.
mol1, mol2 = map(Chem.MolFromSmiles, ["c1ccccc1C(=O)N", "c1cnccc1C(=O)N"])

In [8]:
# ecfc4 (default): Morgan fingerprint with radius 2
fp1, fp2 = [Chem.GetMorganFingerprint(mol, radius=2) for mol in (mol1, mol2)]
DataStructs.TanimotoSimilarity(fp1, fp2)

0.47058823529411764

In [18]:
# fcfc6: feature-based Morgan fingerprint with radius 3
fp1, fp2 = [Chem.GetMorganFingerprint(mol, 3, useFeatures=True) for mol in (mol1, mol2)]
DataStructs.TanimotoSimilarity(fp1, fp2)

0.4

In [16]:
# ecfp6: Morgan bit vector with radius 3, folded to 1024 bits
fp1, fp2 = [Chem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024) for mol in (mol1, mol2)]
DataStructs.TanimotoSimilarity(fp1, fp2)

0.42857142857142855

In [20]:
# fcfp6: feature-based Morgan bit vector with radius 3, folded to 1024 bits
fp1, fp2 = [
    Chem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024, useFeatures=True)
    for mol in (mol1, mol2)
]
DataStructs.TanimotoSimilarity(fp1, fp2)

0.44

In [21]:
# RDK: RDKit fingerprint with its default settings
fp1, fp2 = [Chem.RDKFingerprint(mol) for mol in (mol1, mol2)]
DataStructs.TanimotoSimilarity(fp1, fp2)

0.39285714285714285

1024