In [1]:
# Import relevant libraries to make HTTP requests and parse JSON response
import requests
import json
import pandas as pd

from pd_process import expand_by_column, str2list

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# API query

In [2]:
# Disease of interest, identified by its EFO (Experimental Factor Ontology) id
efoId = "EFO_0000222"  # acute myeloid leukemia

# GraphQL query: every target associated with the disease, together with its
# protein identifiers, the overall association score and the per-datasource scores.
# A single page of 10000 rows is requested; the final assert checks nothing was cut off.
query_string = """
  query targetsAssociatedWithDisease($efoId: String!){
    disease(efoId: $efoId){
      id
      name
      associatedTargets (page: {size: 10000, index: 0}){
        count
        rows{
          target{
            id
            approvedSymbol
            proteinIds{
              id
              source
            }
          }
          score
          datasourceScores {
            id
            score
          }
        }
      }
    }
  }
"""

# Set variables object of arguments to be passed to endpoint
variables = {"efoId": efoId}

# Set base URL of GraphQL API endpoint
base_url = "https://api.platform.opentargets.org/api/v4/graphql"

# Perform POST request and check status code of response.
# The timeout keeps the cell from hanging forever if the endpoint is unreachable.
r = requests.post(base_url, json={"query": query_string, "variables": variables}, timeout=120)
assert r.status_code == 200, "Error: Response code was not 200 OK"

# Transform API response from JSON into a Python dictionary
api_response = json.loads(r.text)

# GraphQL reports query-level failures inside a 200 response, so check them explicitly
# instead of failing later with an opaque TypeError when indexing into None.
assert not api_response.get('errors'), f"Error: GraphQL query failed: {api_response.get('errors')}"
assert api_response['data']['disease'] is not None, f"Error: no disease found for {efoId}"

# The query fetches one page only; make sure it contains every associated target
assert api_response['data']['disease']['associatedTargets']['count'] <= 10000, "Error: should set a higher page size"

In [3]:
# Report how many association rows the API returned
n_target_rows = len(api_response['data']['disease']['associatedTargets']['rows'])
print(f"Target hits: {n_target_rows}")

Target hits: 8311


In [4]:
# Flatten each association row of the response into one record:
#   symbol, overall_score, one "<datasource id>_score" entry per datasource score,
#   and uniprot_id (stringified list of Swiss-Prot accessions, or "Not found").
# Records are keyed by their position in the response.
targets = api_response['data']['disease']['associatedTargets']['rows']
targets_dict = {}
for idx, each_target in enumerate(targets):
    record = {
        'symbol': each_target['target']['approvedSymbol'],
        'overall_score': each_target['score'],
    }
    for each_datasource in each_target['datasourceScores']:
        record[f"{each_datasource['id']}_score"] = each_datasource['score']

    # Keep only the protein ids whose source is 'uniprot_swissprot'.
    # A plain list comprehension avoids building pandas DataFrames for every
    # single target; `or []` covers a missing/empty proteinIds field.
    swissprot_ids = [
        protein['id']
        for protein in (each_target['target']['proteinIds'] or [])
        if protein.get('source') == 'uniprot_swissprot'
    ]
    # Stored as the string form of a list (e.g. "['P42771', 'Q8N726']")
    record['uniprot_id'] = str(swissprot_ids) if swissprot_ids else "Not found"

    targets_dict[idx] = record

In [5]:
# Turn the per-target records into a table (one row per target), move the key
# columns to the front and order the rows from highest to lowest overall score.
main_colname = ['symbol', 'overall_score', 'uniprot_id']        # put important columns first
targets_df = pd.DataFrame(targets_dict).T
rest_of_colname = [col for col in targets_df.columns if col not in main_colname]
targets_df = (
    targets_df[main_colname + rest_of_colname]
    .sort_values(by='overall_score', ascending=False)
)

In [27]:
# Path of the CSV checkpoint that holds the association table built above.
targets_df_path = 'target_associated_with_AML.csv'
# NOTE(review): the checkpoint write below is disabled, so the "read checkpoint"
# cell further down only works if this CSV already exists on disk; confirm that
# this is intended before relying on Restart & Run All.
# targets_df.to_csv(targets_df_path, index=False)

# Post-processing: resolve duplicated UniProt IDs
1. Find the symbols that map to multiple UniProt IDs and expand them so that each row holds a single UniProt ID.
2. If more than one symbol (gene name) points to the same UniProt ID, group those rows by UniProt ID.

In [2]:
# Reload the association table from its CSV checkpoint and report the row count
targets_df_path = 'target_associated_with_AML.csv'
targets_df = pd.read_csv(targets_df_path)
print(len(targets_df))

8311


## Expand symbols (genes) that have multiple UniProt IDs

In [3]:
# Preview the first rows of the reloaded table; uniprot_id is still a stringified list here
targets_df.head()

Unnamed: 0,symbol,overall_score,uniprot_id,chembl_score,uniprot_variants_score,cancer_gene_census_score,intogen_score,eva_somatic_score,eva_score,uniprot_literature_score,...,europepmc_score,genomics_england_score,gene2phenotype_score,clingen_score,orphanet_score,impc_score,crispr_score,expression_atlas_score,reactome_score,ot_genetics_portal_score
0,DNMT3A,0.847393,['Q9Y6K1'],0.940675,0.865457,0.841731,0.838003,0.832124,0.805562,0.759913,...,0.984963,,,,,,,,,
1,CEBPA,0.838892,['P49715'],,0.607931,0.829572,0.699588,0.894862,0.918328,0.759913,...,0.979052,0.886059,0.607931,0.607931,0.607931,0.455447,,,,
2,FLT3,0.821082,['P36888'],0.93404,,0.819313,0.875227,0.877781,0.43771,0.379957,...,0.997918,,,,,0.520993,,,,
3,TET2,0.793266,['Q6N021'],,,0.88264,0.739032,,,,...,0.961804,,,,,0.671552,,,,
4,SRSF2,0.782598,['Q01130'],,,0.874196,0.607931,0.656565,,,...,0.620911,,,,,,,,,


In [4]:
# Expand the 'uniprot_id' column with the project helper and print the new row count.
# NOTE(review): expand_by_column lives in pd_process; presumably it emits one row
# per id in the stringified list - confirm against the helper.
targets_df_expand = expand_by_column(targets_df, 'uniprot_id', 'uniprot_id')
print(len(targets_df_expand))

8326


### test and validate

In [5]:
# List every row position of the un-expanded table whose uniprot_id parses
# (via str2list) to more than one entry, together with the parsed ids.
parsed_uniprot_ids = map(str2list, targets_df['uniprot_id'].values)
for idx, uniprot_list in enumerate(parsed_uniprot_ids):
    if len(uniprot_list) <= 1:
        continue
    print(idx, uniprot_list)

147 ['P42771', 'Q8N726']
191 ['P39880', 'Q13948']
480 ['O95467', 'P63092', 'Q5JWF2']
629 ['B7ZAP0', 'Q5R372']
870 ['E9PAV3', 'Q13765']
2167 ['Q96PG8', 'Q9BXH1']
3065 ['O00241', 'Q5TFQ8']
3336 ['P0DI83', 'Q9BZG1']
3339 ['P01258', 'P06881']
3454 ['P0CAP2', 'Q6EEV4']
6137 ['P58400', 'Q9ULB1']
7290 ['P0DPB5', 'P0DPB6']
8018 ['Q5JU69', 'Q8N2E6']
8200 ['P42166', 'P42167']


In [6]:
# Spot-check one multi-id row (position 147) before expansion
targets_df.iloc[147].loc[['symbol', 'overall_score', 'uniprot_id']]

symbol                         CDKN2A
overall_score                0.396773
uniprot_id       ['P42771', 'Q8N726']
Name: 147, dtype: object

In [7]:
# Same gene after expansion: the rows whose symbol is CDKN2A
targets_df_expand.loc[targets_df_expand['symbol'].eq('CDKN2A')]

Unnamed: 0,symbol,overall_score,uniprot_id,chembl_score,uniprot_variants_score,cancer_gene_census_score,intogen_score,eva_somatic_score,eva_score,uniprot_literature_score,...,europepmc_score,genomics_england_score,gene2phenotype_score,clingen_score,orphanet_score,impc_score,crispr_score,expression_atlas_score,reactome_score,ot_genetics_portal_score
147,CDKN2A,0.396773,P42771,,,0.603709,,,,,...,0.750376,,,,,0.514537,,,,
147,CDKN2A,0.396773,Q8N726,,,0.603709,,,,,...,0.750376,,,,,0.514537,,,,


## Handle UniProt IDs shared by multiple symbols
When several symbols map to the same UniProt ID, their rows are merged and each score is averaged.

In [8]:
# Split the expanded table on whether a UniProt id is available.
has_uniprot_id = targets_df_expand['uniprot_id'] != "Not found"

# Rows without an id are kept in their existing order
uniprot_non_found_df = targets_df_expand[~has_uniprot_id]

# Rows with an id are ordered by id, then by overall score (both descending)
uniprot_found_df = (
    targets_df_expand[has_uniprot_id]
    .sort_values(by=['uniprot_id', 'overall_score'], ascending=False)
)

In [9]:
# Collapse the expanded table to one row per identifier, averaging every score.
#
# Score columns are 'overall_score' plus one "<datasource id>_score" column per
# datasource. They are derived from the frame instead of being hard-coded twice,
# so the cell keeps working when the API returns a different set of datasources.
score_columns = [col for col in targets_df_expand.columns if str(col).endswith('_score')]
mean_of_scores = dict.fromkeys(score_columns, 'mean')

# Targets WITH a UniProt id: one row per id; the symbols sharing that id are
# collected into a stringified list (in the row order of uniprot_found_df).
uniprot_found_df_new = (
    uniprot_found_df
    .groupby(['uniprot_id'])
    .agg({'symbol': lambda x: str(x.tolist()), **mean_of_scores})
    .reset_index()
    .sort_values(by='overall_score', ascending=False)
    .reset_index(drop=True)
)

# Targets WITHOUT a UniProt id: one row per symbol, keeping the "Not found" marker.
# NOTE: this rebinds uniprot_non_found_df, so re-running this cell without first
# re-running the cell that splits found/not-found rows would wrap the symbols again.
uniprot_non_found_df = (
    uniprot_non_found_df
    .groupby(['symbol'])
    .agg({'uniprot_id': lambda x: str(x.to_list()[0]), **mean_of_scores})
    .reset_index()
)

# Give 'symbol' the same stringified-list form as in the found table, e.g. "['ABO']".
# str([...]) is the exact formatting used by the found branch above.
uniprot_non_found_df['symbol'] = [str([each]) for each in uniprot_non_found_df['symbol']]

# Full grouped table: both parts stacked and ranked by overall score
targets_df_new = (
    pd.concat([uniprot_found_df_new, uniprot_non_found_df], axis=0)
    .sort_values(by='overall_score', ascending=False)
    .reset_index(drop=True)
)



In [10]:
# Display the grouped targets that have no UniProt id (uniprot_id == "Not found")
uniprot_non_found_df

Unnamed: 0,symbol,uniprot_id,overall_score,chembl_score,uniprot_variants_score,cancer_gene_census_score,intogen_score,eva_somatic_score,eva_score,uniprot_literature_score,...,europepmc_score,genomics_england_score,gene2phenotype_score,clingen_score,orphanet_score,impc_score,crispr_score,expression_atlas_score,reactome_score,ot_genetics_portal_score
0,['ABO'],Not found,0.065943,,,,,,,,...,0.542355,,,,,,,,,
1,['AIRN'],Not found,0.007790,,,,,,,,...,0.064066,,,,,,,,,
2,['ALG1L1P'],Not found,0.035184,,,,,,,,...,,,,,,0.289375,,,,
3,['ALYREF'],Not found,0.007761,,,,,,,,...,0.063833,,,,,,,,,
4,['ANKRD20A8P'],Not found,0.003696,,,,,,,,...,0.030397,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424,['XRCC6P5'],Not found,0.008344,,,,,,,,...,0.068625,,,,,,,,,
425,['Y_RNA'],Not found,0.002439,,,,,,,,...,,,,,,,,0.020056,,
426,['ZEB2-AS1'],Not found,0.001478,,,,,,,,...,0.012159,,,,,,,,,
427,['ZFAS1'],Not found,0.102281,,,,,,,,...,0.841222,,,,,,,,,


In [11]:
# Display the combined grouped table, ordered by overall_score (descending)
targets_df_new

Unnamed: 0,uniprot_id,symbol,overall_score,chembl_score,uniprot_variants_score,cancer_gene_census_score,intogen_score,eva_somatic_score,eva_score,uniprot_literature_score,...,europepmc_score,genomics_england_score,gene2phenotype_score,clingen_score,orphanet_score,impc_score,crispr_score,expression_atlas_score,reactome_score,ot_genetics_portal_score
0,Q9Y6K1,['DNMT3A'],0.847393,0.940675,0.865457,0.841731,0.838003,0.832124,0.805562,0.759913,...,0.984963,,,,,,,,,
1,P49715,['CEBPA'],0.838892,,0.607931,0.829572,0.699588,0.894862,0.918328,0.759913,...,0.979052,0.886059,0.607931,0.607931,0.607931,0.455447,,,,
2,P36888,['FLT3'],0.821082,0.934040,,0.819313,0.875227,0.877781,0.437710,0.379957,...,0.997918,,,,,0.520993,,,,
3,Q6N021,['TET2'],0.793266,,,0.882640,0.739032,,,,...,0.961804,,,,,0.671552,,,,
4,Q01130,['SRSF2'],0.782598,,,0.874196,0.607931,0.656565,,,...,0.620911,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8275,Q15404,['RSU1'],0.001064,,,,,,,,...,,,,,,,,0.008753,,
8276,Q6TFL3,['CCDC171'],0.001055,,,,,,,,...,,,,,,,,0.008675,,
8277,O75157,['TSC22D2'],0.001055,,,,,,,,...,,,,,,,,0.008674,,
8278,Q8WVY7,['UBLCP1'],0.001052,,,,,,,,...,,,,,,,,0.008656,,


In [12]:
# Export both result tables without the index column:
#   *_grouped.csv        - all targets, including those without a UniProt id
#   *_grouped_clean.csv  - only the targets that have a UniProt id
csv_exports = (
    (targets_df_new, 'target_associated_with_AML_grouped.csv'),
    (uniprot_found_df_new, 'target_associated_with_AML_grouped_clean.csv'),
)
for export_frame, export_name in csv_exports:
    export_frame.to_csv(export_name, index=False)