In [11]:
import json
import os

import pandas as pd
import requests
import tqdm.notebook as tqdm

In [12]:
# define a function to parse topological domains
def parse_uniprot_topological_domains(int_protein_id, timeout=30):
    """Fetch one UniProt entry and tabulate its "Topological domain" features.

    Parameters
    ----------
    int_protein_id : str
        UniProt accession; it is concatenated into the entry URL as-is.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. ``requests`` applies no timeout by
        default, so a single stalled connection would otherwise block a
        whole batch download indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        One row per topological domain with the columns ``uniprot_id``,
        ``description``, ``start`` and ``end``. ``None`` is returned when the
        decoded response has no ``features`` field (a notice is printed in
        that case) or when none of the features is a topological domain.

    Raises
    ------
    json.JSONDecodeError
        If the response body is not valid JSON.
    KeyError
        If a topological-domain feature lacks the
        ``location -> start/end -> value`` structure.

    Errors raised by ``requests.get`` itself (connection failure, timeout)
    are not caught here and propagate to the caller.
    """
    columns = ['uniprot_id', 'description', 'start', 'end']

    # create uniprot url and decode the JSON body of the response
    uniprot_url = "https://www.uniprot.org/uniprot/" + int_protein_id + ".json"
    response = requests.get(uniprot_url, timeout=timeout)
    uniprot_data = json.loads(response.text)

    # entries without a "features" field carry nothing to parse
    if 'features' not in uniprot_data:
        print("No features for " + int_protein_id)
        return None

    # Collect all rows first and build the frame once: growing a DataFrame
    # with pd.concat per row is quadratic and concatenating onto an empty
    # frame is deprecated in recent pandas.
    rows = [
        [
            int_protein_id,
            # a feature without a description yields an empty string
            # instead of aborting the whole entry with a KeyError
            feature.get('description', ''),
            feature['location']['start']['value'],
            feature['location']['end']['value'],
        ]
        for feature in uniprot_data['features']
        if feature.get('type') == "Topological domain"
    ]
    if not rows:
        return None
    return pd.DataFrame(rows, columns=columns)

def download_topological_domains(input_glyco_file, output_file, uniprot_id_column = "Protein ID"):
    """Download topological domains for every protein ID in a CSV and save them.

    Parameters
    ----------
    input_glyco_file : str or path-like
        CSV file read with ``pd.read_csv``; it must contain
        ``uniprot_id_column``.
    output_file : str, path-like or file object
        Destination handed to ``DataFrame.to_csv``. When it is a path, its
        parent directory is created if it does not exist yet.
    uniprot_id_column : str, optional
        Name of the column holding the UniProt accessions.

    Returns
    -------
    None
        The combined table (columns ``uniprot_id``, ``description``,
        ``start``, ``end``) is written to ``output_file`` without the index.
        A header-only file is written when no protein yields any domain.
    """
    columns = ['uniprot_id', 'description', 'start', 'end']

    # get the unique uniprot ids from the input file; missing values are
    # dropped because a NaN id cannot be concatenated into a request URL
    df = pd.read_csv(input_glyco_file)
    id_list = df[uniprot_id_column].dropna().unique()

    # parse each protein and keep only the ones that returned a table
    frames = []
    for protein_id in tqdm.tqdm(id_list):
        parsed_df = parse_uniprot_topological_domains(protein_id)
        if parsed_df is not None:
            frames.append(parsed_df)

    # concatenate once at the end instead of once per protein
    if frames:
        out_df = pd.concat(frames, ignore_index=True)
    else:
        out_df = pd.DataFrame(columns=columns)

    # to_csv does not create missing directories; make sure a long download
    # is not lost at the very last step because the folder is absent
    if isinstance(output_file, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(output_file))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # write the output file
    out_df.to_csv(output_file, index=False)

In [13]:
# Human cell lines: fetch the topological domains for every protein ID
# listed in the input table and store them as one CSV.
input_file = "data/raw_data/hek_hela_nglyco.csv"
output_file = "data/topological_domains/human_topological_domains.csv"
download_topological_domains(input_glyco_file=input_file, output_file=output_file)

  0%|          | 0/1336 [00:00<?, ?it/s]

No features for A2RUG3
No features for P0DN76


In [16]:
# Mouse brain: fetch the topological domains for every protein ID
# listed in the input table and store them as one CSV.
input_file = "data/raw_data/mouse_brain_nglyco.csv"
output_file = "data/topological_domains/mouse_topological_domains.csv"
download_topological_domains(input_glyco_file=input_file, output_file=output_file)

  0%|          | 0/3199 [00:00<?, ?it/s]

No features for E9Q5E3
No features for E9PYH1
No features for D3Z3Y0
No features for E9PW22
No features for A2AC16
No features for D6RFU9
No features for D6RIL8
No features for A0A0J9YUD5
No features for A0A0G2JGP7
No features for H3BJV9
No features for B2RX70
No features for A0A494BAA2
No features for Q80X68
No features for A0A0A6YXS5
No features for S4R197
