# Import packages

In [1]:
import pandas as pd

from pd_process import str2list

# Read CSV: Open Targets -> UniProt

In [2]:
# Load the open-target -> uniprot table, then report its row count and the
# number of distinct values in each of its first three columns.
ot2uniprot_path = '../OT_disease_to_target/target_associated_with_AML_grouped_clean.csv'
ot2uniprot = pd.read_csv(ot2uniprot_path)
print(len(ot2uniprot))
print(ot2uniprot.iloc[:, :3].nunique())

7851
uniprot_id       7851
symbol           7836
overall_score    4376
dtype: int64


In [3]:
# Lookup: uniprot_id -> row label of that record in ot2uniprot.
# reset_index() exposes the row labels as an 'index' column, which becomes the
# dictionary values once uniprot_id is made the index.
ot2target_dict = (
    ot2uniprot
    .reset_index()
    .set_index('uniprot_id')['index']
    .to_dict()
)

# Read the target2uniprot (specific and acceptable) CSV

In [4]:
# Load the target_id -> target_uniprot_ids table.
target2uniprot_path = '../target2uni/AML_accep_specific_uni_id_df.csv'
target2uniprot = pd.read_csv(target2uniprot_path)

# Sanity check: every target_id that appears on more than one row must carry
# the same target_uniprot_ids on all of its rows. A later cell keeps only the
# first row per target_id, which is only safe if this holds.
# Each duplicated target_id is checked once (the duplicates are de-duplicated).
target_list = (
    target2uniprot
    .loc[target2uniprot['target_id'].duplicated(keep="first"), 'target_id']
    .drop_duplicates()
    .to_list()
)
for target in target_list:
    uniprot_ids = target2uniprot.loc[target2uniprot['target_id'] == target, 'target_uniprot_ids']
    # The assertion message is the offending target_id itself. The previous
    # `assert ..., print(target)` produced a message of None, because print()
    # returns None.
    assert len(uniprot_ids) > 0 and (uniprot_ids == uniprot_ids.iloc[0]).all(), (
        f"target_id {target!r} maps to inconsistent target_uniprot_ids"
    )


print(f"Total: {len(target2uniprot)}")
print(target2uniprot.nunique())

Total: 4422
parent_compound_id        17
target_id               1780
target_uniprot_ids       826
compound_specificity       2
dtype: int64


In [5]:
# Lookup: target_id -> target_uniprot_ids, taken from the first row of each target_id.
target2uniprot_dict = (
    target2uniprot
    .drop_duplicates(subset=['target_id'])
    .set_index('target_id')['target_uniprot_ids']
    .to_dict()
)

# Lookup: parent_compound_id -> compound_specificity, taken from the first row
# of each parent_compound_id.
compound_specificity = (
    target2uniprot
    .drop_duplicates(subset=['parent_compound_id'])
    .set_index('parent_compound_id')['compound_specificity']
    .to_dict()
)

## drug-assay-target relationship

In [6]:
# Load the drug / assay / target ChEMBL-id relationship table and summarise it.
drugs_assay_target_relationship_path = '../Get_surrounding_assay_from_ChEMBL/acute_myeloid_leukemia_approved_drugs_assay_target_chembl_id_relationship.csv'
drugs_assay_target_relationship = pd.read_csv(drugs_assay_target_relationship_path)

n_relationships = len(drugs_assay_target_relationship)
print(f"Total: {n_relationships}")
print(drugs_assay_target_relationship.nunique())

Total: 28172
parent_compound_chembl_id       23
assay_chembl_id              21242
target_chembl_id              2121
dtype: int64


In [7]:
# Attach, to every relationship row, the specificity of its parent compound and
# the uniprot ids of its target. Compounds missing from compound_specificity
# get the placeholder values used below.
parent_compound_specificity = []
target_uniprot_ids = []

compound_target_pairs = zip(
    drugs_assay_target_relationship['parent_compound_chembl_id'],
    drugs_assay_target_relationship['target_chembl_id'],
)
for compound_id, target_id in compound_target_pairs:
    if compound_id not in compound_specificity:
        target_uniprot_ids.append('Not interested!')
        parent_compound_specificity.append('wide-used')
        continue
    # A target missing from target2uniprot_dict raises KeyError here.
    target_uniprot_ids.append(target2uniprot_dict[target_id])
    parent_compound_specificity.append(compound_specificity[compound_id])

drugs_assay_target_relationship['parent_compound_specificity'] = parent_compound_specificity
drugs_assay_target_relationship['target_uniprot_ids'] = target_uniprot_ids

In [8]:
# Logic check: for each relationship, count how many of its target uniprot ids
# are present in ot2target_dict, then show the distribution of those counts.
hits_list = [
    sum(1 for uniprot_id in str2list(uniprot_ids) if uniprot_id in ot2target_dict)
    for uniprot_ids in drugs_assay_target_relationship['target_uniprot_ids']
]

print(pd.Series(hits_list).value_counts())

0    23915
1     4257
Name: count, dtype: int64


In [9]:
# For each relationship, collect the ot2uniprot row matching its target:
# no match gives a row of None, one match gives that row's values, and more
# than one match is an error.
infos = []
n_ot_columns = len(ot2uniprot.columns)
for uniprot_ids in drugs_assay_target_relationship['target_uniprot_ids']:
    matched_rows = [
        ot2target_dict[uniprot_id]
        for uniprot_id in str2list(uniprot_ids)
        if uniprot_id in ot2target_dict
    ]

    if not matched_rows:
        infos.append([None] * n_ot_columns)
    elif len(matched_rows) == 1:
        infos.append(ot2uniprot.loc[matched_rows[0]].to_list())
    else:
        raise ValueError('More than one hits!')

In [10]:
# Put the collected ot2uniprot rows in a frame whose columns carry an "OT_"
# prefix, append it column-wise to the relationship table, and tag every row
# with the disease label.
infos_df = pd.DataFrame(infos, columns=ot2uniprot.columns).add_prefix("OT_")
drugs_assay_target_disease_mapped_relationship = pd.concat(
    [drugs_assay_target_relationship, infos_df],
    axis=1,
).assign(disease_focused="AML")


# Annotate the important relationships

In [11]:
# Preview the ten rows with the highest OT_overall_score (ties broken by
# OT_uniprot_id, both descending).
(
    drugs_assay_target_disease_mapped_relationship
    .sort_values(by=['OT_overall_score', 'OT_uniprot_id'], ascending=False)
    .head(10)
)

Unnamed: 0,parent_compound_chembl_id,assay_chembl_id,target_chembl_id,parent_compound_specificity,target_uniprot_ids,OT_uniprot_id,OT_symbol,OT_overall_score,OT_chembl_score,OT_uniprot_variants_score,...,OT_genomics_england_score,OT_gene2phenotype_score,OT_clingen_score,OT_orphanet_score,OT_impc_score,OT_crispr_score,OT_expression_atlas_score,OT_reactome_score,OT_ot_genetics_portal_score,disease_focused
18827,CHEMBL3301622,CHEMBL3706339,CHEMBL1974,specific,"['A0AVG9', 'B7ZLT7', 'B7ZLT8', 'F5H0A0', 'P368...",P36888,['FLT3'],0.821082,0.93404,,...,,,,,0.520993,,,,,AML
18928,CHEMBL3301622,CHEMBL3991719,CHEMBL1974,specific,"['A0AVG9', 'B7ZLT7', 'B7ZLT8', 'F5H0A0', 'P368...",P36888,['FLT3'],0.821082,0.93404,,...,,,,,0.520993,,,,,AML
19112,CHEMBL3301622,CHEMBL4360649,CHEMBL1974,specific,"['A0AVG9', 'B7ZLT7', 'B7ZLT8', 'F5H0A0', 'P368...",P36888,['FLT3'],0.821082,0.93404,,...,,,,,0.520993,,,,,AML
19118,CHEMBL3301622,CHEMBL4418514,CHEMBL1974,specific,"['A0AVG9', 'B7ZLT7', 'B7ZLT8', 'F5H0A0', 'P368...",P36888,['FLT3'],0.821082,0.93404,,...,,,,,0.520993,,,,,AML
19121,CHEMBL3301622,CHEMBL4418517,CHEMBL1974,specific,"['A0AVG9', 'B7ZLT7', 'B7ZLT8', 'F5H0A0', 'P368...",P36888,['FLT3'],0.821082,0.93404,,...,,,,,0.520993,,,,,AML
19140,CHEMBL3301622,CHEMBL4768829,CHEMBL1974,specific,"['A0AVG9', 'B7ZLT7', 'B7ZLT8', 'F5H0A0', 'P368...",P36888,['FLT3'],0.821082,0.93404,,...,,,,,0.520993,,,,,AML
19141,CHEMBL3301622,CHEMBL4814855,CHEMBL1974,specific,"['A0AVG9', 'B7ZLT7', 'B7ZLT8', 'F5H0A0', 'P368...",P36888,['FLT3'],0.821082,0.93404,,...,,,,,0.520993,,,,,AML
19142,CHEMBL3301622,CHEMBL4815226,CHEMBL1974,specific,"['A0AVG9', 'B7ZLT7', 'B7ZLT8', 'F5H0A0', 'P368...",P36888,['FLT3'],0.821082,0.93404,,...,,,,,0.520993,,,,,AML
19143,CHEMBL3301622,CHEMBL4815227,CHEMBL1974,specific,"['A0AVG9', 'B7ZLT7', 'B7ZLT8', 'F5H0A0', 'P368...",P36888,['FLT3'],0.821082,0.93404,,...,,,,,0.520993,,,,,AML
19157,CHEMBL3301622,CHEMBL5107691,CHEMBL1974,specific,"['A0AVG9', 'B7ZLT7', 'B7ZLT8', 'F5H0A0', 'P368...",P36888,['FLT3'],0.821082,0.93404,,...,,,,,0.520993,,,,,AML


In [12]:
# Base importance: 1 when the row has an OT_uniprot_id, 0 when it is missing.
assay_importance = (
    drugs_assay_target_disease_mapped_relationship['OT_uniprot_id']
    .notna()
    .astype('int64')
    .to_list()
)
drugs_assay_target_disease_mapped_relationship['assay_importance'] = assay_importance

In [13]:
# Display the column labels of the merged relationship table.
drugs_assay_target_disease_mapped_relationship.columns

Index(['parent_compound_chembl_id', 'assay_chembl_id', 'target_chembl_id',
       'parent_compound_specificity', 'target_uniprot_ids', 'OT_uniprot_id',
       'OT_symbol', 'OT_overall_score', 'OT_chembl_score',
       'OT_uniprot_variants_score', 'OT_cancer_gene_census_score',
       'OT_intogen_score', 'OT_eva_somatic_score', 'OT_eva_score',
       'OT_uniprot_literature_score', 'OT_gene_burden_score',
       'OT_slapenrich_score', 'OT_cancer_biomarkers_score',
       'OT_europepmc_score', 'OT_genomics_england_score',
       'OT_gene2phenotype_score', 'OT_clingen_score', 'OT_orphanet_score',
       'OT_impc_score', 'OT_crispr_score', 'OT_expression_atlas_score',
       'OT_reactome_score', 'OT_ot_genetics_portal_score', 'disease_focused',
       'assay_importance'],
      dtype='object')

In [14]:
# Show one row per target_chembl_id: after sorting by OT_overall_score and
# assay_importance (both descending), the first row of each target is kept.
(
    drugs_assay_target_disease_mapped_relationship
    .sort_values(by=['OT_overall_score', 'assay_importance'], ascending=False)
    .drop_duplicates(subset=['target_chembl_id'])
)

Unnamed: 0,parent_compound_chembl_id,assay_chembl_id,target_chembl_id,parent_compound_specificity,target_uniprot_ids,OT_uniprot_id,OT_symbol,OT_overall_score,OT_chembl_score,OT_uniprot_variants_score,...,OT_gene2phenotype_score,OT_clingen_score,OT_orphanet_score,OT_impc_score,OT_crispr_score,OT_expression_atlas_score,OT_reactome_score,OT_ot_genetics_portal_score,disease_focused,assay_importance
18827,CHEMBL3301622,CHEMBL3706339,CHEMBL1974,specific,"['A0AVG9', 'B7ZLT7', 'B7ZLT8', 'F5H0A0', 'P368...",P36888,['FLT3'],0.821082,0.934040,,...,,,,0.520993,,,,,AML,1
1003,CHEMBL178,CHEMBL1613933,CHEMBL2093862,acceptable,"['A8MV94', 'B2RMS4', 'D3DSG1', 'O60472', 'O604...",Q01196,['RUNX1'],0.758695,,,...,,,,0.598117,,,,,AML,1
106,CHEMBL1489,CHEMBL1613995,CHEMBL4096,acceptable,"['P04637', 'Q15086', 'Q15087', 'Q15088', 'Q165...",P04637,['TP53'],0.753801,0.270253,,...,,,,0.748968,,,,,AML,1
18781,CHEMBL3989908,CHEMBL4309368,CHEMBL3991501,specific,"['B2R6L6', 'B4DFL2', 'P48735', 'Q96GT3']",P48735,['IDH2'],0.752305,0.799058,,...,,,,,,,,,AML,1
345,CHEMBL1489,CHEMBL2354311,CHEMBL2007625,acceptable,"['O75874', 'Q567U4', 'Q6FHQ6', 'Q7Z3V0', 'Q930...",O75874,['IDH1'],0.750635,0.778973,,...,,,,,,,,,AML,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27903,CHEMBL90555,CHEMBL3739136,CHEMBL613290,wide-used,Not interested!,,,,,,...,,,,,,,,,AML,0
27973,CHEMBL90555,CHEMBL3998533,CHEMBL4523578,wide-used,Not interested!,,,,,,...,,,,,,,,,AML,0
27991,CHEMBL90555,CHEMBL4042130,CHEMBL4513128,wide-used,Not interested!,,,,,,...,,,,,,,,,AML,0
28045,CHEMBL90555,CHEMBL4252847,CHEMBL4296492,wide-used,Not interested!,,,,,,...,,,,,,,,,AML,0


In [15]:
# Report the row count and the number of distinct values in each column.
total_rows = len(drugs_assay_target_disease_mapped_relationship)
print(f"Total: {total_rows}")
print(drugs_assay_target_disease_mapped_relationship.nunique())

Total: 28172
parent_compound_chembl_id         23
assay_chembl_id                21242
target_chembl_id                2121
parent_compound_specificity        3
target_uniprot_ids               827
OT_uniprot_id                    567
OT_symbol                        567
OT_overall_score                 492
OT_chembl_score                   66
OT_uniprot_variants_score          2
OT_cancer_gene_census_score       15
OT_intogen_score                   8
OT_eva_somatic_score              10
OT_eva_score                       7
OT_uniprot_literature_score        1
OT_gene_burden_score               2
OT_slapenrich_score               27
OT_cancer_biomarkers_score         5
OT_europepmc_score               435
OT_genomics_england_score          2
OT_gene2phenotype_score            0
OT_clingen_score                   0
OT_orphanet_score                  0
OT_impc_score                     69
OT_crispr_score                    9
OT_expression_atlas_score         36
OT_reactome_score        

# Read assay cluster id

In [16]:
# Load the clustered assay table and report how many rows it has.
assay_cluster_path = '../assay_clustering/AML_assays_clustered.csv'
assay_cluster = pd.read_csv(assay_cluster_path)
print(len(assay_cluster))

5001


In [17]:
# Lookup: check_assay_chembl_id -> embedding_cluster.
cluster_info = (
    assay_cluster
    .set_index('check_assay_chembl_id')['embedding_cluster']
    .to_dict()
)

In [18]:
# Annotate every relationship with the embedding cluster of its assay and give
# one extra importance point to assays that are present in cluster_info.
assay_ids = drugs_assay_target_disease_mapped_relationship['assay_chembl_id']
in_cluster_info = pd.Series(
    [assay_id in cluster_info for assay_id in assay_ids],
    index=drugs_assay_target_disease_mapped_relationship.index,
    dtype=bool,
)

# The cluster id is stored as the string form of its integer value; assays
# without a cluster get the placeholder label.
drugs_assay_target_disease_mapped_relationship['assay_cluster'] = [
    str(int(cluster_info[assay_id])) if clustered else 'Not AML cell lines'
    for assay_id, clustered in zip(assay_ids, in_cluster_info)
]

# Recompute the score from its two components (OT match + clustered assay)
# instead of incrementing the stored column. The in-place `+= 1` inflated the
# score each time this cell was re-run; this form gives the same result on
# every run.
drugs_assay_target_disease_mapped_relationship['assay_importance'] = (
    drugs_assay_target_disease_mapped_relationship['OT_uniprot_id'].notna().astype('int64')
    + in_cluster_info.astype('int64')
)

In [19]:
# Display the relationship table after the assay_cluster annotation.
drugs_assay_target_disease_mapped_relationship

Unnamed: 0,parent_compound_chembl_id,assay_chembl_id,target_chembl_id,parent_compound_specificity,target_uniprot_ids,OT_uniprot_id,OT_symbol,OT_overall_score,OT_chembl_score,OT_uniprot_variants_score,...,OT_clingen_score,OT_orphanet_score,OT_impc_score,OT_crispr_score,OT_expression_atlas_score,OT_reactome_score,OT_ot_genetics_portal_score,disease_focused,assay_importance,assay_cluster
0,CHEMBL2362016,CHEMBL935685,CHEMBL6035,specific,"['O89049', 'Q5U344', 'Q9JKZ3', 'Q9JKZ4', 'Q9R1...",,,,,,...,,,,,,,,AML,1,542
1,CHEMBL2362016,CHEMBL935686,CHEMBL2403,specific,"['O95840', 'Q96IJ2', 'Q9H2Z5', 'Q9NNW7', 'Q9NZ...",Q9NNW7,['TXNRD2'],0.00577,,,...,,,,,,,,AML,2,518
2,CHEMBL2362016,CHEMBL935687,CHEMBL6035,specific,"['O89049', 'Q5U344', 'Q9JKZ3', 'Q9JKZ4', 'Q9R1...",,,,,,...,,,,,,,,AML,1,542
3,CHEMBL2362016,CHEMBL936488,CHEMBL6035,specific,"['O89049', 'Q5U344', 'Q9JKZ3', 'Q9JKZ4', 'Q9R1...",,,,,,...,,,,,,,,AML,1,542
4,CHEMBL2362016,CHEMBL931509,CHEMBL6035,specific,"['O89049', 'Q5U344', 'Q9JKZ3', 'Q9JKZ4', 'Q9R1...",,,,,,...,,,,,,,,AML,1,542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28167,CHEMBL90555,CHEMBL5161635,CHEMBL614004,wide-used,Not interested!,,,,,,...,,,,,,,,AML,0,Not AML cell lines
28168,CHEMBL90555,CHEMBL5161636,CHEMBL612545,wide-used,Not interested!,,,,,,...,,,,,,,,AML,0,Not AML cell lines
28169,CHEMBL90555,CHEMBL5162749,CHEMBL3832942,wide-used,Not interested!,,,,,,...,,,,,,,,AML,0,Not AML cell lines
28170,CHEMBL90555,CHEMBL5162765,CHEMBL2095182,wide-used,Not interested!,,,,,,...,,,,,,,,AML,0,Not AML cell lines


# Save relationship

In [20]:
# Write the full annotated table (all columns, without the row index) to CSV.
drugs_assay_target_disease_mapped_relationship.to_csv('AML_drugs_assay_target_disease_mapped_relationship_raw.csv', index=False)

In [23]:
# Tidy view: keep a fixed subset of columns, order rows by importance, cluster
# label and overall score (all descending), renumber the rows and save to CSV.
d_a_t_d_mapped_relation_tidy = (
    drugs_assay_target_disease_mapped_relationship[[
        'disease_focused',
        'assay_importance',
        'parent_compound_chembl_id',
        'parent_compound_specificity',
        'assay_chembl_id',
        'assay_cluster',
        'target_chembl_id',
        'target_uniprot_ids',
        'OT_uniprot_id',
        'OT_symbol',
        'OT_overall_score',
    ]]
    .sort_values(by=['assay_importance', 'assay_cluster', 'OT_overall_score'], ascending=False)
    .reset_index(drop=True)
)
d_a_t_d_mapped_relation_tidy.to_csv('AML_drugs_assay_target_disease_mapped_relationship_tidy.csv', index=False)

In [24]:
# Drop the rows whose parent compound is labelled 'wide-used', renumber the
# remaining rows and save them to CSV.
d_a_t_d_tidy_specific_accept = (
    d_a_t_d_mapped_relation_tidy
    .loc[d_a_t_d_mapped_relation_tidy['parent_compound_specificity'] != 'wide-used']
    .reset_index(drop=True)
)
d_a_t_d_tidy_specific_accept.to_csv('AML_drugs_assay_target_disease_mapped_relationship_tidy_non_wide_used.csv', index=False)

In [25]:
# Keep only the rows with assay_importance equal to 2, renumber them and save
# them to CSV.
d_a_t_d_tidy_assay_important = (
    d_a_t_d_tidy_specific_accept
    .loc[d_a_t_d_tidy_specific_accept['assay_importance'] == 2]
    .reset_index(drop=True)
)
d_a_t_d_tidy_assay_important.to_csv('AML_drugs_assay_target_disease_mapped_relationship_tidy_assay_importance_2.csv', index=False)

In [26]:
# Count the distinct values in each column of the importance-2 subset.
d_a_t_d_tidy_assay_important.nunique()

disease_focused                   1
assay_importance                  1
parent_compound_chembl_id        15
parent_compound_specificity       2
assay_chembl_id                1669
assay_cluster                    99
target_chembl_id                407
target_uniprot_ids              399
OT_uniprot_id                   399
OT_symbol                       399
OT_overall_score                347
dtype: int64