In [60]:
import numpy as np 
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem import rdMolDescriptors
from rdkit import DataStructs

In [2]:
df = pd.read_csv('processed_df.csv')

In [3]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,drug1,db_no1,drug2,db_no2,general_syntax,smiles1,smiles2
0,8,Pravastatin,DB00175,Alectinib,DB11363,the serum concentration of drug1 can be increa...,[H][C@]12[C@H](C[C@H](O)C=C1C=C[C@H](C)[C@@H]2...,CCC1=CC2=C(C=C1N1CCC(CC1)N1CCOCC1)C(C)(C)C1=C(...


In [4]:
smile = 'CCN1N=NN(CCN2CCC(COC)(CC2)N(C(=O)CC)C2=CC=CC=C2)C1=O'
# smile = 'CCC1=CC2=C(C=C1N1CCC(CC1)N1CCOCC1)C(C)(C)C1=C(C3=C(N1)C=C(C=C3)C#N)C2=O'
s_mol = Chem.MolFromSmiles(smile)
x = Chem.RDKFingerprint(s_mol, maxPath=3, fpSize=60, useBondOrder=True, branchedPaths=True)
# s_mol
x.ToBitString()

'011001111100011111111111111110111111111111111101001111111110'

In [73]:
df['fingerprint1'] = 'None'
df['fingerprint2'] = 'None'

In [77]:
def smiles_to_maccs(smile):
    mol = Chem.MolFromSmiles(smile)
    return MACCSkeys.GenMACCSKeys(mol).ToBitString()

In [78]:
drop_list = []

In [88]:
for i in df.index:
    smiles1 = df.at[i, 'smiles1']
    smiles2 = df.at[i, 'smiles2']
    try:
        df.at[i, 'fingerprint1'] = str(smiles_to_maccs(smiles1))
        df.at[i, 'fingerprint2'] = str(smiles_to_maccs(smiles2))
    except Exception as e:
        print(i, ' ', e)
    if((df.at[i, 'fingerprint1'] == 'None') or (df.at[i, 'fingerprint2'] == 'None')):
        drop_list.append(i)
        continue

In [93]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,drug1,db_no1,drug2,db_no2,general_syntax,smiles1,smiles2,fingerprint1,fingerprint2,combined_fingerprint
0,8,Pravastatin,DB00175,Alectinib,DB11363,the serum concentration of drug1 can be increa...,[H][C@]12[C@H](C[C@H](O)C=C1C=C[C@H](C)[C@@H]2...,CCC1=CC2=C(C=C1N1CCC(CC1)N1CCOCC1)C(C)(C)C1=C(...,0000000000000000000000000010000000000000000000...,0000000000000000000000000000000000000000010000...,


In [86]:
df.drop(index=drop_list, inplace = True)

In [91]:
df.to_csv('fingerprint.csv')

In [92]:
df['combined_fingerprint'] = 'None'

In [102]:
for i in df.index:
    fingerprint1 = df.at[i, 'fingerprint1'] = str(smiles_to_maccs(smiles1))
    fingerprint2 = df.at[i, 'fingerprint2'] = str(smiles_to_maccs(smiles2))
    # print(len(fingerprint1))
    result = ''
    for x in range(0, 167):
        if fingerprint1[x] == '0' and fingerprint2[x] == '0':
            result += '0'
        if fingerprint1[x] == '0' and fingerprint2[x] == '1':
            result += '2'
        if fingerprint1[x] == '1' and fingerprint2[x] == '0':
            result += '1'
        if fingerprint1[x] == '1' and fingerprint2[x] == '1':
            result += '3'
    df.at[i, 'combined_fingerprint'] = result
    
    

In [103]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,drug1,db_no1,drug2,db_no2,general_syntax,smiles1,smiles2,fingerprint1,fingerprint2,combined_fingerprint
0,8,Pravastatin,DB00175,Alectinib,DB11363,the serum concentration of drug1 can be increa...,[H][C@]12[C@H](C[C@H](O)C=C1C=C[C@H](C)[C@@H]2...,CCC1=CC2=C(C=C1N1CCC(CC1)N1CCOCC1)C(C)(C)C1=C(...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000001000000000...,0000000000000000000000000000000000002000000000...


In [104]:
df.to_csv('fingerprint.csv')