In [1]:
import numpy as np
import pandas as pd
import pickle
import dimod
import os
import dwave_token
import dwave.inspector

from dwave.system import DWaveSampler, EmbeddingComposite, FixedEmbeddingComposite
# from dwave.system import LeapHybridCQMSampler

# Sampler with an automatically generated embedding: the D-Wave sampler is
# wrapped in EmbeddingComposite, so no embedding has to be supplied by hand.
# The API token is read from the local `dwave_token` module.
sampler = EmbeddingComposite(DWaveSampler(token=dwave_token.value))

In [2]:
def csv_to_matrix(file_name): # read csv file to get row names and matrix values
    """Read row names and matrix values from a ';'-separated CSV file.

    The first line of the file is treated as a header and the first column
    as the row names. As many value columns are read as there are data rows.

    Parameters
    ----------
    file_name : str or path-like
        Path of the CSV file.

    Returns
    -------
    rows : numpy.ndarray
        Row names taken from the first column (header line excluded).
    matrix : numpy.ndarray
        Float values of columns 1..len(rows) for every data line.
    """
    label_column = pd.read_csv(file_name, delimiter=';', usecols=[0]).to_numpy()
    rows = np.array([element[0] for element in label_column])
    # The number of data rows gives the matrix dimension. Taking it from the
    # label column avoids parsing the whole file a further time with the
    # default whitespace delimiter, which fails for labels containing blanks.
    dimension = len(rows)
    matrix = np.genfromtxt(file_name, delimiter=';', skip_header=1, usecols=range(1, dimension + 1))
    return(rows, matrix)

def distance_to_similarity(Q, neutral_distance=1): # convert distance matrix to similarity
    """Turn distances into similarities via ``1 - Q / neutral_distance``.

    A distance equal to ``neutral_distance`` maps to similarity 0, smaller
    distances give positive and larger distances negative similarity. A
    distance of 1 therefore maps to ``(neutral_distance - 1) / neutral_distance``.

    Parameters
    ----------
    Q : array-like
        Distance values.
    neutral_distance : number, optional
        Distance that is mapped to similarity 0 (default 1).

    Returns
    -------
    numpy.ndarray
        Similarities with the same shape as ``Q``.
    """
    scaled_distance = np.array(Q) / neutral_distance
    return 1 - scaled_distance

def load_mapping(file_name): # load mapping file and convert to normalized number values
    """Load the ';'-separated mapping file and translate classes to numbers.

    The file must provide the columns ``ROWID`` and ``Class``. The classes
    "Approved", "Preclinical" and "Withdrawn" become 1, 0 and -1; any other
    class raises a ``KeyError``.

    Parameters
    ----------
    file_name : str or path-like
        Path of the mapping CSV file.

    Returns
    -------
    frame : pandas.DataFrame
        The mapping file as read.
    dictionary : dict
        Numeric class value for every ``ROWID``.
    """
    class_to_number = {"Approved": 1, "Preclinical": 0, "Withdrawn": -1}
    frame = pd.read_csv(file_name, delimiter=';')
    dictionary = {}
    for row_id, label in zip(frame['ROWID'], frame['Class']):
        dictionary[row_id] = class_to_number[label]
    return frame, dictionary

In [26]:
# Input files: the pairwise distance matrix and the class mapping of the compounds.
folder = 'Compound data/'
file_name = 'Matrix_ECFP4.csv'
mapping_file_name = 'mapping.csv'

# Row names with their distances, and the mapping file with its numeric class values.
rows, distance_J = csv_to_matrix(os.path.join(folder, file_name))
mapping, bias_dict0 = load_mapping(os.path.join(folder, mapping_file_name))
N = len(rows)

# Similarity as an upper-triangular matrix; a distance of 0.3 earns neither
# reward nor penalty.
J = np.triu(distance_to_similarity(distance_J, neutral_distance=0.3))

# One coupling per pair i < j, negated because the problem is minimized.
J_dict = {(rows[i], rows[j]): -J[i, j] for i, j in zip(*np.triu_indices(N, k=1))}

# Linear biases: the class value scaled relative to the matrix size. `bias`
# keeps the original sign per row, `bias_dict` is negated for minimization.
bias_scaling = 0.05 * len(J)
bias = np.array([bias_scaling * bias_dict0[row] for row in rows])
bias_dict = {key: -bias_scaling * value for key, value in bias_dict0.items()}

In [21]:
# If feasible, load a previously saved embedding and build a sampler that reuses it.
embedding_folder = 'Embeddings' # folder where embedding is saved
#embedding_folder = folder
embedding_file_name = str(N) + '_Ising' + '_embedding.pkl'
#embedding_file_name =  os.path.splitext(file_name)[0] + '_embedding.pkl'
# NOTE: pickle.load can execute arbitrary code; only load embedding files
# that you created yourself.
with open(os.path.join(embedding_folder,embedding_file_name), 'rb') as f:
    embedding = pickle.load(f)
if len(embedding) != N:
    print('Choose an embedding of correct size!')
elif rows[0] not in embedding.keys():
    # The stored embedding is not keyed by the current row names: re-key it.
    try:
        # First assume the keys are the positions 0..N-1.
        embedding = {row: embedding[i] for i, row in enumerate(rows)}
    except KeyError:
        # Otherwise assign the stored chains to the rows in stored order
        # (both have length N in this branch).
        embedding = dict(zip(rows, embedding.values()))
fixed_sampler = FixedEmbeddingComposite(child_sampler=DWaveSampler(token=dwave_token.value), embedding=embedding)

In [22]:
# # artificially create withdrawn compounds:
# for i in range(len(J)-10,len(J)):
#     bias_dict[rows[i]] = bias_scaling
#     bias[i] = - bias_scaling

In [23]:
# Build the Ising model: bias_dict supplies the linear terms and J_dict the
# pairwise couplings (both were negated when they were built, for minimization).
bqm = dimod.BinaryQuadraticModel.from_ising(bias_dict,J_dict) # model creation

In [24]:
# The chain strength follows the bias scaling so that it stays close to the
# largest coefficient of the model.
chain_strength = bias_scaling

# Sample with the fixed, saved embedding. To generate an embedding on the fly
# instead, call sampler.sample(...) with the same arguments.
sampleset = fixed_sampler.sample(bqm, num_reads=1000, chain_strength=chain_strength)

print(f'minimum: {sampleset.lowest().first.energy!s}')
# The physical qubit count is the total length of all chains in the embedding.
print(
    f'{N!s} variables encoded in '
    f"{sum(len(chain) for chain in sampleset.info['embedding_context']['embedding'].values())!s}"
    ' physical qubits'
)

# Best solution as a dictionary keyed by row name, then as an array in row order.
solution_dict = sampleset.lowest().first.sample
solution = np.array([solution_dict[row] for row in rows])

minimum: -106.38766928721535
102 variables encoded in 1405 physical qubits


In [25]:
# Compare the class of each compound (sign of its bias) with the spin it
# received in the best solution: negative spin counts as "bad", positive as "good".
print(
    f'Approved which are bad: {np.count_nonzero((np.sign(bias) == 1) & (solution < 0))!s}\n',
    f'Withdrawn which are good: {np.count_nonzero((np.sign(bias) == -1) & (solution > 0))!s}\n',
    f'Preclinical which are bad {np.count_nonzero((np.sign(bias) == 0) & (solution < 0))!s}\n',
    f'Preclinical which are good {np.count_nonzero((np.sign(bias) == 0) & (solution > 0))!s}\n'
)
print(solution)

Approved which are bad: 20
 Withdrawn which are good: 0
 Preclinical which are bad 29
 Preclinical which are good 22

[ 1  1 -1  1  1  1  1  1  1  1  1 -1  1 -1  1 -1  1 -1  1  1  1  1  1 -1
  1  1  1 -1 -1 -1 -1 -1 -1  1  1 -1 -1 -1  1  1 -1  1  1  1 -1 -1  1 -1
 -1  1  1 -1  1 -1 -1  1 -1  1  1  1  1  1 -1  1  1 -1  1 -1 -1  1 -1  1
 -1 -1  1  1  1 -1 -1 -1  1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1  1
  1 -1  1  1 -1 -1]


In [61]:
# save embedding:
# The file is named after the matrix file and written into `folder`. The
# loading cell reads from `embedding_folder` with an N-based name by default,
# so switch it to the commented-out alternatives there to load this file.
embedding_file_name = os.path.splitext(file_name)[0] + '_embedding.pkl'
# Use the file name computed above; `dict_file_name` was never defined.
with open(os.path.join(folder,embedding_file_name), 'wb') as f:
    pickle.dump(sampleset.to_serializable()['info']['embedding_context']['embedding'], f)

## Comparison with classical computation:
For each compound, we sum its similarity to all approved compounds (excluding itself) and then rank the compounds by that sum:

$sim_i = \sum_{j\neq i\ \text{approved}} similarity(i,j)$

In [106]:
# Classical baseline: rank every compound by its summed similarity to the
# approved compounds.
# Convert the distance matrix once, outside the comprehension, so it is not
# recomputed for every (i, j) pair.
similarity = distance_to_similarity(distance_J, 0.3)
similarity_dict = {(rows[i], rows[j]): similarity[i, j] for i in range(N) for j in range(N)}
# Approved compounds are those whose class value is positive.
approved_list = [row for row in rows if np.sign(bias_dict0[row]) == 1]
# Summed similarity of each compound to all approved compounds other than itself.
sim_dict = {row: sum(similarity_dict[row, other] for other in approved_list if other != row) for row in rows}
# List of (row, summed similarity) pairs, most similar first.
sorted_dict = sorted(sim_dict.items(), key=lambda x: x[1], reverse=True)

In [108]:
# Show the ten compounds with the highest summed similarity to the approved group.
print(sorted_dict[:10])
# With the neutral distance of 0.3 all sums are strongly negative, i.e. all
# compounds seem to be far away from the group.

[('Row25', -94.07054901472613), ('Row50', -94.794893548157), ('Row39', -96.35794760686892), ('Row32', -96.3900904668777), ('Row37', -96.69211663090114), ('Row22', -96.91378022072725), ('Row11', -96.9213318987318), ('Row0', -97.28615864743844), ('Row7', -97.5112293736308), ('Row2', -97.57237662470456)]


In [111]:
# Walk through the ranking from most to least similar and print whether each
# compound is itself approved.
for key,val in sorted_dict:
    print(key in approved_list)
# But at least it looks like the approved group is the closest to itself

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
False
True
True
True
False
True
True
True
True
False
False
True
True
True
False
False
False
True
False
True
True
False
False
True
False
False
False
True
False
False
True
False
True
True
False
True
False
False
False
False
False
True
False
False
False
False
False
False
False
False
True
False
True
False
False
False
False
True
True
True
False
False
False
False
True
False
False
False
False
True
True
False
False
False
False
False
False
True
True
False
True
