Load dependencies

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw
import tmap as tm
from pandarallel import pandarallel
import numpy as np
pandarallel.initialize(nb_workers=2,progress_bar=False)
import pandas as pd
from map4.map4 import MAP4Calculator
from scipy.spatial import distance

# Module-level MAP4 calculator configured with dimensions=1024; calc_map4 below
# calls its calculate() method for each molecule.
map4_instance = MAP4Calculator(dimensions=1024)
def calc_map4(smiles):
    """Compute the MAP4 fingerprint of a molecule given as a SMILES string.

    The molecule is first re-written as a non-isomeric SMILES
    (isomericSmiles=False) and re-parsed, so the fingerprint is computed on
    that re-parsed molecule rather than on the input as given.

    Arguments:
      smiles {str} -- SMILES string of the molecule.

    Returns:
      numpy.ndarray -- The output of map4_instance.calculate as an array.

    Raises:
      ValueError -- If RDKit cannot parse the SMILES string.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None instead of
    # raising; fail here with a readable message rather than deeper in RDKit.
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    flat_smiles = Chem.MolToSmiles(mol, isomericSmiles=False)
    flat_mol = Chem.MolFromSmiles(flat_smiles)
    return np.array(map4_instance.calculate(flat_mol))

def get_distance(a, b):
    """Estimate the Jaccard distance of two sets from their MinHash arrays.

    The Jaccard distance measures dissimilarity between sample sets and is
    the complement of the Jaccard coefficient (1 - coefficient). For MinHash
    arrays the coefficient is estimated as the fraction of positions at which
    both arrays hold the same hash value.

    Arguments:
      a {numpy.ndarray} -- An array containing hash values.
      b {numpy.ndarray} -- An array containing hash values, same shape as a.

    Returns:
      float -- The estimated Jaccard distance.

    Raises:
      ValueError -- If the inputs differ in shape or are empty.
    """
    # Convert up front: comparing two plain lists with == yields a single
    # bool, not an element-wise result, which would silently give a wrong
    # distance.
    a = np.asarray(a)
    b = np.asarray(b)

    if a.shape != b.shape:
        raise ValueError(
            f"Hash arrays must have the same shape, got {a.shape} and {b.shape}"
        )
    if a.size == 0:
        raise ValueError("Cannot compute a distance between empty hash arrays")

    # Fraction of matching positions estimates the Jaccard coefficient;
    # its complement is the distance.
    return 1.0 - float(np.count_nonzero(a == b)) / float(len(a))

Load Database

In [None]:
# Pickled MIBiG table; the cells below read its 'MAP4' column.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
mibig = 'mibig_3.1_filtered_prop.pkl'
df = pd.read_pickle(mibig)

# Fail early if the pickle does not hold the expected table.
is_dataframe = isinstance(df, pd.DataFrame)
if not is_dataframe:
    raise ValueError("The loaded pickle file does not contain a DataFrame")

has_map4_column = 'MAP4' in df.columns
if not has_map4_column:
    raise ValueError("The DataFrame does not contain a MAP4 column")

Define input

In [None]:
# Input here a SMILES string of your query molecule
query_smiles = 'C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OP(=O)(N[C@@H](CC(C)C)C(=O)N[C@@H](CC2=CNC3=CC=CC=C32)C(=O)O)O)O)O)O'
# Write here a threshold from 0 to 1, with 1 as max distance to your input molecule (I suggest 0.85 for a first try)
threshold = 0.85

# Reject values outside the documented 0..1 range before doing any work.
if not 0 <= threshold <= 1:
    raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")

# Parse the query once up front: MolFromSmiles returns None for an invalid
# SMILES, so check here and fail with a clear message.
mol = Chem.MolFromSmiles(query_smiles)
if mol is None:
    raise ValueError(f"Could not parse query SMILES: {query_smiles!r}")

# Calculate fingerprint
fingerprint = calc_map4(query_smiles)

# Print input molecule
img = Draw.MolToImage(mol, legend="Input molecule")
display(img)

Calculate distances

In [None]:
# Collect every database row whose estimated distance to the query is below
# the threshold, together with that distance.
#
# The distance is computed with get_distance (fraction of differing MinHash
# positions). scipy.spatial.distance.jaccard is intended for boolean vectors;
# newer SciPy releases convert numeric input to bool first, which would make
# all non-zero hash arrays compare as identical.
rows_with_distance = []
for index, map4_ in enumerate(df['MAP4']):
    dist = get_distance(fingerprint, np.asarray(map4_))
    if dist < threshold:
        row_dict = df.iloc[index].to_dict()
        row_dict['distance'] = dist
        rows_with_distance.append(row_dict)

if rows_with_distance:
    result_df = pd.DataFrame(rows_with_distance)
else:
    print("No similar molecules found within the specified threshold.")
    # Keep the expected columns so the filtering, sorting and CSV export
    # below still work on an empty result instead of raising KeyError.
    result_df = pd.DataFrame(columns=[*df.columns, 'distance'])

# Drop rows without a distance value
filtered_df = result_df[result_df['distance'].notna()]

# Closest matches first, with a fresh 0..n-1 index
sorted_df = filtered_df.sort_values(by='distance').reset_index(drop=True)

sorted_df

Save the output as .csv

In [None]:
# Base name of the output file; the ".csv" extension is appended below.
out_name = "example"

# Write the sorted hits without the DataFrame index column.
csv_path = f"{out_name}.csv"
sorted_df.to_csv(csv_path, index=False)