In [1]:
import re
import copy
import pandas as pd
import numpy as np

## Fingerprints into single strings

In [3]:
# Input files for the fingerprint-to-string conversion step.
# Both paths are relative to the notebook's working directory.
drug_names_file = "valid_data/drugs_names.txt"
fingerprints_file = "valid_data/drug_fingerprints.txt"

In [4]:
# Read the drug-name file into a single-column DataFrame.
# Column names are supplied explicitly, so the first line of the file is
# treated as data rather than as a header row.
col_names = ["chemical_name"]
drugs_names_df = pd.read_csv(
    drug_names_file,
    header=None,
    names=col_names,
)

In [5]:
# Materialise the name column as a plain Python list in one call.
# This avoids one label-based Series lookup per row and does not depend on the
# frame's index being exactly 0..n-1.
drugs_names = drugs_names_df["chemical_name"].tolist()

In [6]:
# Pre-allocate one placeholder per drug: a list of 1024 empty strings that
# get_fingerprints() later overwrites position by position.
# The comprehension builds a separate list object for every name.
drugs_fingerprints_dict = {name: [""] * 1024 for name in drugs_names}

In [7]:
def get_fingerprints(file_handle, drugs_fingerprints_dict, drugs_list, n_bits=1024):
    """Copy fingerprint bits from a text file into ``drugs_fingerprints_dict``.

    Every line of the file is stripped and split on a comma or tab followed by
    optional whitespace. The first field is taken as the drug name and the
    next ``n_bits`` fields as the fingerprint positions. For each line whose
    name appears in ``drugs_list``, the first character of each of those
    fields is written into the sequence already stored under that name in
    ``drugs_fingerprints_dict``. The number of matching lines is printed once
    the file has been read; nothing is returned.

    Parameters
    ----------
    file_handle : str
        Path of the fingerprint file (it is passed straight to ``open``).
    drugs_fingerprints_dict : dict
        Maps drug name to a mutable sequence with at least ``n_bits`` slots.
        Modified in place.
    drugs_list : iterable
        Names of the drugs to extract. Elements must be hashable because they
        are collected into a set for fast lookup.
    n_bits : int, optional
        Number of fingerprint fields read per matching line. Defaults to 1024.

    Raises
    ------
    IndexError
        If a matching line has fewer than ``n_bits`` fields after the name, or
        if one of those fields is empty.
    KeyError
        If a matching name has no entry in ``drugs_fingerprints_dict``.
    """
    # Set membership is O(1); testing against the list would rescan it for
    # every line of the file.
    wanted = set(drugs_list)
    matched = 0
    # The with-block closes the file, so no explicit close() is needed.
    with open(file_handle) as fh:
        # Iterate the file lazily instead of loading every line up front.
        for raw_line in fh:
            fields = re.split(r'[,\t]\s*', raw_line.strip())
            drug_name = fields[0]
            if drug_name not in wanted:
                continue
            matched += 1
            bits = drugs_fingerprints_dict[drug_name]
            for i in range(1, n_bits + 1):
                # Only the first character of each field is kept.
                bits[i - 1] = str(fields[i])[0]
    print(matched)

In [8]:
# Fill the placeholder lists from the fingerprint file; the function prints
# how many lines of the file matched a requested drug name.
get_fingerprints(
    file_handle=fingerprints_file,
    drugs_fingerprints_dict=drugs_fingerprints_dict,
    drugs_list=drugs_names,
)

41464


In [9]:
# Collapse each drug's list of single characters into one string, replacing
# the list stored in the dictionary.
for drug in drugs_names:
    drugs_fingerprints_dict[drug] = ''.join(drugs_fingerprints_dict[drug])

In [11]:
# Write one "name,fingerprint" record per line.
with open('fingerprints_strings.txt', 'w') as f:
    for drug, bit_string in drugs_fingerprints_dict.items():
        f.writelines((drug, ',', bit_string, '\n'))

## Calculate similarity

In [2]:
# Input files for the similarity calculation (paths relative to the working
# directory): the drug-name list and the two fingerprint samples.
drug_names_file_s = "sim_test/name_sample.csv"
fingerprints_file_1 = "sim_test/fingerprints_sample1.csv"
fingerprints_file_2 = "sim_test/fingerprints_sample2.csv"

In [3]:
# Read the sample drug names into a single-column DataFrame.
# Column names are supplied explicitly, so the first line of the file is
# treated as data rather than as a header row.
col_names = ["chemical_name"]
drugs_names_df_s = pd.read_csv(
    drug_names_file_s,
    header=None,
    names=col_names,
)

In [4]:
# Number of rows in the name table; this sets the side length of the
# similarity matrix.
size_drugs = drugs_names_df_s.shape[0]

In [5]:
col_names_s = ["chemical_name", "fingerprint"]

# Keep the fingerprint column as text. Without an explicit dtype pandas infers
# the type, and an all-digit column may be parsed as a number; that drops
# leading zeros and breaks the later int(fingerprint, 2) / len(fingerprint)
# calls, which need strings.
fingerprint_dtypes = {"fingerprint": str}

fingerprints_df_1 = pd.read_csv(
    fingerprints_file_1, names=col_names_s, dtype=fingerprint_dtypes
)
fingerprints_df_2 = pd.read_csv(
    fingerprints_file_2, names=col_names_s, dtype=fingerprint_dtypes
)

In [6]:
# Stack the two samples into one table; ignore_index=True renumbers the rows
# 0..n-1 so they can be addressed by matrix position.
frames = [
    fingerprints_df_1,
    fingerprints_df_2,
]
fingerprints_df_s = pd.concat(objs=frames, ignore_index=True)

In [7]:
# Square matrix of zeros, one row and one column per drug.
matrix_shape = (size_drugs, size_drugs)
similarity_matrix = np.zeros(matrix_shape)

In [8]:
# Row and column index arrays of every cell strictly above the main diagonal
# (k=1 excludes the diagonal itself), i.e. each unordered pair of drugs once.
upper_indexes = np.triu_indices(n=size_drugs, k=1)
num_indexes = upper_indexes[0].size

In [9]:
# Parse each fingerprint bit string into an integer once, instead of
# re-parsing both strings for every pair. str() is applied uniformly so a
# value that was not loaded as text is still handled.
fingerprint_ints = [
    int(str(fingerprints_df_s.fingerprint[row]), 2) for row in range(size_drugs)
]

# Fill only the cells above the diagonal; the diagonal and the lower triangle
# keep their initial zeros.
for x, y in zip(upper_indexes[0], upper_indexes[1]):
    bits_x = fingerprint_ints[x]
    bits_y = fingerprint_ints[y]
    # Popcounts: bits set in both fingerprints, and bits set in exactly one.
    shared_bits = bin(bits_x & bits_y).count("1")
    differing_bits = bin(bits_x ^ bits_y).count("1")
    # shared + differing is the number of bits set in either fingerprint.
    union_bits = shared_bits + differing_bits
    if union_bits == 0:
        # Both fingerprints have no set bits; the ratio is undefined, so the
        # cell is left at 0.0 instead of raising ZeroDivisionError.
        # NOTE(review): 0.0 is a convention choice - confirm it suits the analysis.
        tanimoto = 0.0
    else:
        tanimoto = float(shared_bits) / union_bits
    similarity_matrix[x, y] = tanimoto

In [10]:
# Write the matrix as comma-separated text with six decimal places per entry.
np.savetxt(
    fname='similarity_test2.txt',
    X=similarity_matrix,
    delimiter=',',
    fmt='%.6f',
)