In [27]:
import pandas as pd
import numpy as np
import tqdm
import copy
from scipy import sparse

In [3]:
#load and drop duplicate records 
df =pd.read_csv('classes_data_pchembl.csv').drop_duplicates(['chembl_id', 'lig_chemblid'])

#filter for number of ligands per target:
low = 500
high = 5000
df = df.groupby('chembl_id').filter(lambda x : (len(x)>500) and (len(x)<5000) ) 

In [18]:
num_instances = df['lig_chemblid'].unique().shape[0]
num_targets = df['chembl_id'].unique().shape[0]

#interaction matrix:
interaction_matrix = np.zeros([num_instances, num_targets])
#interaction dates:
interaction_dates = copy.copy(interaction_matrix)

In [22]:
###setting up column indices, to use in filling in the matrices above
tids = df.sort_values('chembl_id')['chembl_id'].unique()
cids = df.sort_values('lig_chemblid')['lig_chemblid'].unique()
target_indices = dict()
for count, i in enumerate(tids):
    target_indices[i]=count

instance_indices = dict()
for count, i in enumerate(cids):
    instance_indices[i]=count


#Actually filling the values:
for count, item in tqdm.tqdm_notebook(df.iterrows(), 
                                      total=len(df),
                                      smoothing=0):
    t_id = item['chembl_id']
    i_id = item['lig_chemblid']
    date = item['year']

    row = instance_indices[i_id]
    column = target_indices[t_id]

    interaction_matrix[row, column] = 1
    interaction_dates[row, column] = date

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=454516.0), HTML(value='')))




In [25]:
#Do a little test to make sure some randomly chosen positives in the interaction_matrix line up with real entries in the df.

for _ in range(100):
    row = np.random.choice(interaction_matrix.shape[0]) #select random instance
    col = np.random.choice(interaction_matrix[row].nonzero()[0]) #select from positives of that instance
    assert tids[col] in list(df[df['lig_chemblid']==cids[row]]['chembl_id'])
    
print('passed')
print('Matrix shape:', interaction_matrix.shape)

passed
Matrix shape: (318026, 300)


In [28]:

fname = 'pchembl'

##Save all the data
sparse.save_npz('./interaction_dates_'+fname+'.npz', sparse.csr_matrix(interaction_dates))
sparse.save_npz('./interaction_matrix_'+fname+'.npz', sparse.csr_matrix(interaction_matrix))


df.sort_values('lig_chemblid').drop_duplicates(['lig_chemblid'])[['lig_chemblid', 'canonical_smiles']].to_csv('./'+fname+'_chemicals.csv', index=False)
df.sort_values('chembl_id').drop_duplicates(['chembl_id'])['pref_name'].to_csv('subset_targets', index=False, header=None)
df.sort_values('chembl_id').drop_duplicates(['chembl_id']).to_csv('subset_targets.csv', index=False)