In [16]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# References

Where to get domain information:
- [UniProtKB column names for programmatic access
](https://www.uniprot.org/help/return_fields)

General Uniprot API
- [Uniprot API: Programmatic access - Retrieving entries via queries](https://www.uniprot.org/help/api_queries)
- [Programmatic access - Retrieving individual entries
](https://www.uniprot.org/help/api_retrieve_entries)
- **[REST API - Access the UniProt website programmatically](https://www.uniprot.org/help/api)**
- **[REST API - Retrieve entries](https://www.uniprot.org/help/api_retrieve_entries)**
- **[REST API - ID Mapping](https://www.uniprot.org/help/id_mapping)**
- **[REST API - Retrieving entries via queries](https://www.uniprot.org/help/api_queries)**
- **[REST API - Downloading](https://www.uniprot.org/help/api_downloading)**

# Read in UniProt API request data 

In [94]:
import gzip
import json
import os
import time
from collections import defaultdict
from io import BytesIO, StringIO
from pprint import pprint

import pandas as pd
import polars as pl
import requests
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

# tqdm.auto()

api_url = "https://rest.uniprot.org/uniprotkb/search"


# Thanks ChatGPT :)
def fetch_data(url, params=None):
    """
    Fetch data from a REST API endpoint.

    :param url: URL of the REST API endpoint.
    :param params: Dictionary of query parameters, defaults to None.
    :return: Parsed JSON data from the API response.
    """
    try:
        response = requests.get(url, params=params)
        # Check if the request was successful
        response.raise_for_status()

        return response
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Error connecting to the server: {conn_err}")
    except requests.exceptions.Timeout as timeout_err:
        print(f"Timeout error: {timeout_err}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred: {req_err}")
    except json.JSONDecodeError as json_err:
        print(f"Error decoding JSON: {json_err}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return None


def extract_cursor_value(link):
    # Example link:
    # '<https://rest.uniprot.org/uniprotkb/search?fields=accession&fields=gene_names&fields=id&fields=lineage&fields=lineage_ids&fields=organism_id&fields=organism_name&query=%28taxonomy_id%3A2759%29&cursor=1mkycb2xwxbouu0sjmt6c5coxaq82nladhrl&size=25>; rel="next"'
    split = link.split("cursor=")[-1]
    cursor = split.split("&")[0]
    return cursor


def fetch_data_paginate(url, params=None):
    """
    Fetch data from a REST API endpoint, paginate for uniprot.

    :param url: URL of the REST API endpoint.
    :param params: Dictionary of query parameters, defaults to None.
    :return: Parsed JSON data from the API response.
    """
    cursor = None

    response_format = params.get("format", "json")
    while True:
        # Add serialized pagination
        params["cursor"] = cursor

        response = fetch_data(url, params)
        total = response.headers["x-total-results"]

        # Parse the JSON response
        if response_format == "json":
            data = response.json()["results"]
        else:
            data = response.content
        yield data

        # Example link:
        # '<https://rest.uniprot.org/uniprotkb/search?fields=accession&fields=gene_names&fields=id&fields=lineage&fields=lineage_ids&fields=organism_id&fields=organism_name&query=%28taxonomy_id%3A2759%29&cursor=1mkycb2xwxbouu0sjmt6c5coxaq82nladhrl&size=25>; rel="next"'
        if "Link" in response.headers:
            cursor = extract_cursor_value(response.headers["Link"])
        else:
            # From UniProt documentation
            # "Note: whenever there are more results to retrieve, there will
            # be a corresponding the 'Link' header pointing to the next page of results.
            # When there are no more results to retrieve, no 'Link' header is returned."

            break

## Test pagination

In [21]:
human_query = "human cdc7"
data = {
    "query": human_query,
    "fields": [
        "id",
        "accession",
        "gene_names",
        "organism_name",
        "organism_id",
        "lineage",
        "lineage_ids",
    ],
    "size": 25,
    "format": "tsv",
}


n_results = 0

for response in fetch_data_paginate(api_url, data):
    n = len(response)
    n_results += n
    print(f"Got {n} results, so far {n_results}")

Got 31111 results, so far 31111
Got 30596 results, so far 61707
Got 22902 results, so far 84609
Got 17479 results, so far 102088
Got 17332 results, so far 119420
Got 11797 results, so far 131217


In [103]:
response = fetch_data(api_url, data)
# response.__dict__[

In [32]:
df = pl.read_csv(BytesIO(response), separator="\t")
df.head()
# df['Taxonomic lineage (Ids)'].head()
# pl.read_csv(StringIO(response), sep="\t")

Entry,Gene Names,Entry Name,Taxonomic lineage,Taxonomic lineage (Ids),Organism (ID),Organism
str,str,str,str,str,i64,str
"""A0A0C2F690""","""SPBR_05471""","""A0A0C2F690_9PE…","""cellular organ…","""131567 (no ran…",1398154,"""Sporothrix bra…"
"""A0A8H4GHV6""","""CNMCM8060_0015…","""A0A8H4GHV6_9EU…","""cellular organ…","""131567 (no ran…",293939,"""Aspergillus le…"
"""J3KCS9""","""CIMG_04103""","""J3KCS9_COCIM""","""cellular organ…","""131567 (no ran…",246410,"""Coccidioides i…"
"""A0A8H4GJT1""","""CNMCM6457_0004…","""A0A8H4GJT1_9EU…","""cellular organ…","""131567 (no ran…",340414,"""Aspergillus fu…"
"""A0A397GXE0""","""CDV56_105079""","""A0A397GXE0_9EU…","""cellular organ…","""131567 (no ran…",41047,"""Aspergillus th…"


# Query for Eukaryota reviewed sequences

In [19]:
# dict.get(

In [20]:
def get_raw_fields(entry, raw_fields):
    info = {}
    for field in raw_fields:
        info[field] = entry.get(field, None)
    return info


def get_organism_info(entry, raw_fields=["scientificName", "commonName", "taxonId"]):
    organism = entry["organism"]

    organism_info = get_raw_fields(organism, raw_fields)
    return organism_info


def join_lineage(entry):
    lineages = entry["lineages"]

    lineage_joined = " > ".join(lineage["scientificName"] for lineage in lineages)
    taxonIds_joined = " > ".join(str(lineage["taxonId"]) for lineage in lineages)
    lineage_info = dict(lineage=lineage_joined, lineage_taxonIds=taxonIds_joined)
    return lineage_info

### Compute total

In [5]:
n_results_from_website = 75701375
size = 500
n_results_from_website / size

151402.75

#### Six digits of results -> `zfill(6)`

# Try UniProt API's Python example code

In [142]:
import re

import requests
from requests.adapters import HTTPAdapter, Retry

re_next_link = re.compile(r'<(.+)>; rel="next"')
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))


def get_next_link(headers):
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)


def get_batch(batch_url):
    while batch_url:
        response = session.get(batch_url)
        response.raise_for_status()
        total = response.headers["x-total-results"]
        yield response, total
        batch_url = get_next_link(response.headers)


data = {
    "query": eukaryota_query,
    "fields": [
        "id",
        "accession",
        "gene_names",
        "organism_name",
        "organism_id",
        "lineage",
        "lineage_ids",
    ],
    "size": 500,
    "format": "tsv",
}


# insulin_interactors_url = "https://rest.uniprot.org/uniprotkb/search?fields=accession%2Ccc_interaction&format=tsv&query=Insulin%20AND%20%28reviewed%3Atrue%29&size=500"
eukaryota_url = "https://rest.uniprot.org/uniprotkb/search?query=%28taxonomy_id%3A2759%29&fields=id&fields=accession&fields=gene_names&fields=organism_name&fields=organism_id&fields=lineage&fields=lineage_ids&size=500&format=tsv"
progress = 0


start_time = time.time()
notify_every = 10000
size = 500

with gzip.open("../data/2024-03-30__uniprot_eukaryota.tsv.gz", "wb") as f:
    for i, (batch, total) in enumerate(get_batch(eukaryota_url)):
        # f.write(batch.content)
        # lines = batch.content.splitlines()
        header, body = batch.content.split(b"\n", maxsplit=1)
        if not progress:
            f.write(header)
            # print(lines[0], file=f)
        for line in lines[1:]:
            f.write(body)
            # print(line, file=f)
        # progress += len(lines[1:])
        # This is approximate but good enough for now
        progress += size
        # print(f"{progress} / {total}")

        if progress % notify_every == 0:
            time_so_far = time.time() - start_time
            time_per_result = progress / time_so_far
            iter_per_s = (progress / 500) / time_so_far
            print(
                f"Iteration: {i}\tParsed {progress} in {time_so_far/60:.2f} minutes, "
                f"{time_per_result:.1f} results/s, {iter_per_s:.2f} iter/s"
            )

Iteration: 19	Parsed 10000 in 0.82 minutes, 203.8 results/s, 0.41 iter/s
Iteration: 39	Parsed 20000 in 1.71 minutes, 194.5 results/s, 0.39 iter/s


KeyboardInterrupt: 

In [138]:
%timeit len(re.findall('\n', batch.text))

379 µs ± 72.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [139]:
%timeit len(batch.text.splitlines())

763 µs ± 91.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [140]:
%timeit batch.content.split(b"\n", maxsplit=1)

25.7 µs ± 5.68 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


501

In [123]:
batch.text[:500]

'Entry\tGene Names\tEntry Name\tTaxonomic lineage\tTaxonomic lineage (Ids)\tOrganism (ID)\tOrganism\nA0A0C5B5G6\tMT-RNR1\tMOTSC_HUMAN\tcellular organisms (no rank), Eukaryota (superkingdom), Opisthokonta (no rank), Metazoa (kingdom), Eumetazoa (no rank), Bilateria (no rank), Deuterostomia (no rank), Chordata (phylum), Craniata (subphylum), Vertebrata (no rank), Gnathostomata (no rank), Teleostomi (no rank), Euteleostomi (no rank), Sarcopterygii (superclass), Dipnotetrapodomorpha (no rank), Tetrapoda (no ra'

In [121]:
print(batch.text[:500])

Entry	Gene Names	Entry Name	Taxonomic lineage	Taxonomic lineage (Ids)	Organism (ID)	Organism
A0A0C5B5G6	MT-RNR1	MOTSC_HUMAN	cellular organisms (no rank), Eukaryota (superkingdom), Opisthokonta (no rank), Metazoa (kingdom), Eumetazoa (no rank), Bilateria (no rank), Deuterostomia (no rank), Chordata (phylum), Craniata (subphylum), Vertebrata (no rank), Gnathostomata (no rank), Teleostomi (no rank), Euteleostomi (no rank), Sarcopterygii (superclass), Dipnotetrapodomorpha (no rank), Tetrapoda (no ra


In [119]:
# print(batch.content)

In [120]:
# batch.content.splitlines()

# REturn a JSON

## Iterate over all results -> Append to dict of lists

In [68]:
column_headers = [
    "entryType",
    "primaryAccession",
    "uniProtkbId",
    "scientificName",
    "commonName",
    "taxonId",
    "lineage",
    "lineage_taxonIds",
]
dict.fromkeys(column_headers, [])

{'entryType': [],
 'primaryAccession': [],
 'uniProtkbId': [],
 'scientificName': [],
 'commonName': [],
 'taxonId': [],
 'lineage': [],
 'lineage_taxonIds': []}

### Let's not do a progress bar

In [145]:
n_results = 0

# ! mkdir ../data

outdir = "../data"

total = int(n_results_from_website / size) + 1
print("total:", total)

column_headers = [
    "entryType",
    "primaryAccession",
    "uniProtkbId",
    "scientificName",
    "commonName",
    "taxonId",
    "lineage",
    "lineage_taxonIds",
]

start_time = time.time()
notify_every = 10000

euk_params_output_json = {
    "query": eukaryota_query,
    "fields": [
        "id",
        "accession",
        "gene_names",
        "organism_name",
        "organism_id",
        "lineage",
        "lineage_ids",
    ],
    "size": 500,
    "format": "json",
}


eukaryota_query = "(taxonomy_id:2759)"


for i, response in enumerate(fetch_data_paginate(api_url, euk_params_output_json)):
    n_results += len(response)
    # print(f"Got {n} results, so far {n_results}")

    response_parsed = {
        "entryType": [],
        "primaryAccession": [],
        "uniProtkbId": [],
        "scientificName": [],
        "commonName": [],
        "taxonId": [],
        "lineage": [],
        "lineage_taxonIds": [],
    }
    for entry in response:
        raw_keys = ["entryType", "primaryAccession", "uniProtkbId"]
        organism_info = get_organism_info(entry)
        lineage_info = join_lineage(entry)

        entry_parsed = get_raw_fields(entry, raw_keys)
        entry_parsed.update(organism_info)
        entry_parsed.update(lineage_info)
        # print(entry_parsed)
        # break
        for key in column_headers:
            response_parsed[key].append(entry_parsed[key])
    df = pd.DataFrame(response_parsed)
    # display(df.head())
    # csv = os.path.join(outdir, f"uniprot_eukaryota_{str(i).zfill(6)}.csv")
    # df.to_csv(csv)
    pq = os.path.join(
        outdir, f"2024-03-30__uniprot_eukaryota_{str(i).zfill(6)}.parquet"
    )
    df.to_parquet(pq)
    if n_results % notify_every == 0:
        time_so_far = time.time() - start_time
        time_per_result = n_results / time_so_far
        iter_per_s = (n_results / 500) / time_so_far
        estimated_total_hours = total * iter_per_s
        print(
            f"Iteration: {i}\tParsed {n_results} in {time_so_far/60:.2f} minutes, "
            f"{time_per_result:.1f} results/s, {iter_per_s:.2f} iter/s -> "
            f"{estimated_total_hours:.2f} hour estimate"
        )
    # print(csv)
    # pprint(lineage_info)

total: 151403
Iteration: 19	Parsed 10000 in 0.29 minutes, 582.8 results/s, 1.17 iter/s -> 176464.43 hour estimate
Iteration: 39	Parsed 20000 in 0.63 minutes, 532.9 results/s, 1.07 iter/s -> 161372.07 hour estimate
Iteration: 59	Parsed 30000 in 1.00 minutes, 501.6 results/s, 1.00 iter/s -> 151898.97 hour estimate
Iteration: 79	Parsed 40000 in 1.37 minutes, 487.0 results/s, 0.97 iter/s -> 147469.32 hour estimate


KeyboardInterrupt: 

## Iterate over all results -> Append to dict of lists

In [68]:
column_headers = [
    "entryType",
    "primaryAccession",
    "uniProtkbId",
    "scientificName",
    "commonName",
    "taxonId",
    "lineage",
    "lineage_taxonIds",
]
dict.fromkeys(column_headers, [])

{'entryType': [],
 'primaryAccession': [],
 'uniProtkbId': [],
 'scientificName': [],
 'commonName': [],
 'taxonId': [],
 'lineage': [],
 'lineage_taxonIds': []}

In [92]:
n_results = 0

# ! mkdir ../data

outdir = "../data"

total = int(n_results_from_website / size) + 1
print("total:", total)

column_headers = [
    "entryType",
    "primaryAccession",
    "uniProtkbId",
    "scientificName",
    "commonName",
    "taxonId",
    "lineage",
    "lineage_taxonIds",
]


eukaryota_query = "(taxonomy_id:2759)"


for i, response in tqdm(
    enumerate(fetch_data_paginate(api_url, data)), total=total, miniters=5
):

    # print(f"Got {n} results, so far {n_results}")

    response_parsed = {
        "entryType": [],
        "primaryAccession": [],
        "uniProtkbId": [],
        "scientificName": [],
        "commonName": [],
        "taxonId": [],
        "lineage": [],
        "lineage_taxonIds": [],
    }
    for entry in response:
        raw_keys = ["entryType", "primaryAccession", "uniProtkbId"]
        organism_info = get_organism_info(entry)
        lineage_info = join_lineage(entry)

        entry_parsed = get_raw_fields(entry, raw_keys)
        entry_parsed.update(organism_info)
        entry_parsed.update(lineage_info)
        # print(entry_parsed)
        # break
        for key in column_headers:
            response_parsed[key].append(entry_parsed[key])
    df = pd.DataFrame(response_parsed)
    # display(df.head())
    # csv = os.path.join(outdir, f"uniprot_eukaryota_{str(i).zfill(6)}.csv")
    # df.to_csv(csv)
    pq = os.path.join(
        outdir, f"2024-03-30__uniprot_eukaryota_{str(i).zfill(6)}.parquet"
    )
    df.to_parquet(pq)
    # print(csv)
    # pprint(lineage_info)

total: 151403


  0%|          | 0/151403 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Try parallelizing with joblib

In [76]:
n_results = 0

# ! mkdir ../data

outdir = "../data"

total = int(n_results_from_website / size) + 1
print("total:", total)

column_headers = [
    "entryType",
    "primaryAccession",
    "uniProtkbId",
    "scientificName",
    "commonName",
    "taxonId",
    "lineage",
    "lineage_taxonIds",
]


eukaryota_query = "(taxonomy_id:2759)"


def parse_uniprot_response(i, response):
    response_parsed = {
        "entryType": [],
        "primaryAccession": [],
        "uniProtkbId": [],
        "scientificName": [],
        "commonName": [],
        "taxonId": [],
        "lineage": [],
        "lineage_taxonIds": [],
    }
    for entry in response:
        raw_keys = ["entryType", "primaryAccession", "uniProtkbId"]
        organism_info = get_organism_info(entry)
        lineage_info = join_lineage(entry)

        entry_parsed = get_raw_fields(entry, raw_keys)
        entry_parsed.update(organism_info)
        entry_parsed.update(lineage_info)
        # print(entry_parsed)
        # break
        for key in column_headers:
            response_parsed[key].append(entry_parsed[key])
    df = pd.DataFrame(response_parsed)
    # display(df.head())
    # csv = os.path.join(outdir, f"uniprot_eukaryota_{str(i).zfill(6)}.csv")
    # df.to_csv(csv)
    pq = os.path.join(
        outdir, f"2024-03-30__uniprot_eukaryota_{str(i).zfill(6)}.parquet"
    )
    df.to_parquet(pq)
    pass


Parallel(n_jobs=4)(
    delayed(parse_uniprot_response)(i, response)
    for i, response in tqdm(enumerate(fetch_data_paginate(api_url, data)), total=total)
)


# print(csv)
# pprint(lineage_info)

total: 151403


  0%|          | 0/151403 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Let's try profiling

#### Make a function to run the queries

In [79]:
def parse_single_response_dict_of_lists(response):
    response_parsed = {
        "entryType": [],
        "primaryAccession": [],
        "uniProtkbId": [],
        "scientificName": [],
        "commonName": [],
        "taxonId": [],
        "lineage": [],
        "lineage_taxonIds": [],
    }
    for entry in response:
        raw_keys = ["entryType", "primaryAccession", "uniProtkbId"]
        organism_info = get_organism_info(entry)
        lineage_info = join_lineage(entry)

        entry_parsed = get_raw_fields(entry, raw_keys)
        entry_parsed.update(organism_info)
        entry_parsed.update(lineage_info)
        # print(entry_parsed)
        # break
        for key in column_headers:
            response_parsed[key].append(entry_parsed[key])
    return response_parsed


def parse_single_response_list_of_dicts(response):
    response_parsed = []
    for entry in response:
        raw_keys = ["entryType", "primaryAccession", "uniProtkbId"]
        organism_info = get_organism_info(entry)
        lineage_info = join_lineage(entry)

        entry_parsed = get_raw_fields(entry, raw_keys)
        entry_parsed.update(organism_info)
        entry_parsed.update(lineage_info)
        response_parsed.append(entry_parsed)
    return response_parsed


def create_and_save_pandas(data, parquet):
    df = pd.DataFrame(response_parsed)
    df.to_parquet(pq)


def create_and_save_polars(data, parquet):
    df = pl.DataFrame(response_parsed)
    df.write_parquet(pq)


PARSERS = {
    "dict_of_lists": parse_single_response_dict_of_lists,
    "list_of_dicts": parse_single_response_list_of_dicts,
}


DF_MAKERS = {"pandas": create_and_save_pandas, "polars": create_and_save_polars}


def query_first_n_batches(
    first_n=5, parser_style="dict_of_lists", df_maker="pandas", outdir="../data"
):

    n_results = 0

    column_headers = [
        "entryType",
        "primaryAccession",
        "uniProtkbId",
        "scientificName",
        "commonName",
        "taxonId",
        "lineage",
        "lineage_taxonIds",
    ]

    eukaryota_query = "(taxonomy_id:2759)"

    parser = PARSERS[parser_style]
    df_maker = DF_MAKERS[df_maker]

    for i, response in enumerate(fetch_data_paginate(api_url, data)):
        n = len(response)
        n_results += n
        parsed = parser(response)

        pq = os.path.join(
            outdir, f"2024-03-30__uniprot_eukaryota_{str(i).zfill(6)}.parquet"
        )
        df_maker(parsed, pq)

        if i > first_n:
            break

In [84]:
%timeit query_first_n_batches(parser_style='dict_of_lists', df_maker='pandas')

5.89 s ± 87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [85]:
%timeit query_first_n_batches(parser_style='list_of_dicts', df_maker='pandas')

5.86 s ± 124 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [86]:
%timeit query_first_n_batches(parser_style='dict_of_lists', df_maker='polars')

5.97 s ± 217 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [87]:
%timeit query_first_n_batches(parser_style='list_of_dicts', df_maker='polars')

5.9 s ± 92.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### So it's ~6 seconds for the first 5 batches -> how long total?

In [89]:
(6 * total / 5) / 3600

50.467666666666666

#### -> should be 50 hours, why does it jump up?

In [71]:
%load_ext line_profiler

In [72]:
%lprun -f query_first_n_batches()

total: 151403


  0%|          | 0/151403 [00:00<?, ?it/s]

  profile = LineProfiler(*funcs)


Timer unit: 1e-09 s

## Iterate over all results -> append to list of dicts

In [62]:
n_results = 0

# ! mkdir ../data

outdir = "../data"

total = int(n_results_from_website / size) + 1
print("total:", total)

data = {
    "query": eukaryota_query,
    "fields": [
        "id",
        "accession",
        "gene_names",
        "organism_name",
        "organism_id",
        "lineage",
        "lineage_ids",
    ],
    "size": 500,
    "format": "json",
}

for i, response in tqdm(enumerate(fetch_data_paginate(api_url, data)), total=total):
    n = len(response)
    n_results += n
    # print(f"Got {n} results, so far {n_results}")

    lines = []
    for entry in response:
        raw_keys = ["entryType", "primaryAccession", "uniProtkbId"]
        organism_info = get_organism_info(entry)
        lineage_info = join_lineage(entry)

        entry_parsed = get_raw_fields(entry, raw_keys)
        entry_parsed.update(organism_info)
        entry_parsed.update(lineage_info)
        # print(entry_parsed)
        lines.append(entry_parsed)
    df = pd.DataFrame(lines)
    # csv = os.path.join(outdir, f"uniprot_eukaryota_{str(i).zfill(6)}.csv")
    # df.to_csv(csv)
    pq = os.path.join(
        outdir, f"2024-03-30__uniprot_eukaryota_{str(i).zfill(6)}.parquet"
    )
    df.to_parquet(pq)
    # print(csv)
    # pprint(lineage_info)

total: 151403


  0%|          | 0/151403 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Iterate over all results -> Append to dict of lists, use Polars

In [61]:
n_results = 0

# ! mkdir ../data

outdir = "../data"

total = int(n_results_from_website / size) + 1
print("total:", total)

column_headers = [
    "entryType",
    "primaryAccession",
    "uniProtkbId",
    "scientificName",
    "commonName",
    "taxonId",
    "lineage",
    "lineage_taxonIds",
]

data = {
    "query": eukaryota_query,
    "fields": [
        "id",
        "accession",
        "gene_names",
        "organism_name",
        "organism_id",
        "lineage",
        "lineage_ids",
    ],
    "size": 500,
    "format": "json",
}

for i, response in tqdm(enumerate(fetch_data_paginate(api_url, data)), total=total):
    n = len(response)
    n_results += n
    # print(f"Got {n} results, so far {n_results}")

    response_parsed = {
        "entryType": [],
        "primaryAccession": [],
        "uniProtkbId": [],
        "scientificName": [],
        "commonName": [],
        "taxonId": [],
        "lineage": [],
        "lineage_taxonIds": [],
    }
    for entry in response:
        raw_keys = ["entryType", "primaryAccession", "uniProtkbId"]
        organism_info = get_organism_info(entry)
        lineage_info = join_lineage(entry)

        entry_parsed = get_raw_fields(entry, raw_keys)
        entry_parsed.update(organism_info)
        entry_parsed.update(lineage_info)
        # print(entry_parsed)
        for key in column_headers:
            to_add = entry_parsed[key]
            response_parsed[key].append(to_add)
            # print("--- key:", key)
            # print(f"\t response_parsed[{key}]: {response_parsed[key]}")

        # break
    df = pl.DataFrame(response_parsed)
    # csv = os.path.join(outdir, f"uniprot_eukaryota_{str(i).zfill(6)}.csv")
    # df.to_csv(csv)
    pq = os.path.join(
        outdir, f"2024-03-30__uniprot_eukaryota_{str(i).zfill(6)}.parquet"
    )
    df.write_parquet(pq)
    # print(csv)
    # pprint(lineage_info)

total: 151403


  0%|          | 0/151403 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [48]:
entry_parsed[key]

'131567 > 2759 > 33154 > 33208 > 6072 > 33213 > 33511 > 7711 > 89593 > 7742 > 7776 > 117570 > 117571 > 8287 > 1338369 > 32523 > 32524 > 40674 > 32525 > 9347 > 1437010 > 314146 > 9443 > 376913 > 314293 > 9526 > 314295 > 9604 > 207598 > 9605'

In [49]:
key

'lineage_taxonIds'

In [50]:
entry_parsed.keys()

dict_keys(['entryType', 'primaryAccession', 'uniProtkbId', 'scientificName', 'commonName', 'taxonId', 'lineage', 'lineage_taxonIds'])

In [51]:
entry_parsed["entryType"]

'UniProtKB reviewed (Swiss-Prot)'

In [52]:
response_parsed["entryType"]

['UniProtKB reviewed (Swiss-Prot)',
 'A0A0C5B5G6',
 'MOTSC_HUMAN',
 'Homo sapiens',
 'Human',
 9606,
 'cellular organisms > Eukaryota > Opisthokonta > Metazoa > Eumetazoa > Bilateria > Deuterostomia > Chordata > Craniata > Vertebrata > Gnathostomata > Teleostomi > Euteleostomi > Sarcopterygii > Dipnotetrapodomorpha > Tetrapoda > Amniota > Mammalia > Theria > Eutheria > Boreoeutheria > Euarchontoglires > Primates > Haplorrhini > Simiiformes > Catarrhini > Hominoidea > Hominidae > Homininae > Homo',
 '131567 > 2759 > 33154 > 33208 > 6072 > 33213 > 33511 > 7711 > 89593 > 7742 > 7776 > 117570 > 117571 > 8287 > 1338369 > 32523 > 32524 > 40674 > 32525 > 9347 > 1437010 > 314146 > 9443 > 376913 > 314293 > 9526 > 314295 > 9604 > 207598 > 9605']

In [53]:
response_parsed

{'entryType': ['UniProtKB reviewed (Swiss-Prot)',
  'A0A0C5B5G6',
  'MOTSC_HUMAN',
  'Homo sapiens',
  'Human',
  9606,
  'cellular organisms > Eukaryota > Opisthokonta > Metazoa > Eumetazoa > Bilateria > Deuterostomia > Chordata > Craniata > Vertebrata > Gnathostomata > Teleostomi > Euteleostomi > Sarcopterygii > Dipnotetrapodomorpha > Tetrapoda > Amniota > Mammalia > Theria > Eutheria > Boreoeutheria > Euarchontoglires > Primates > Haplorrhini > Simiiformes > Catarrhini > Hominoidea > Hominidae > Homininae > Homo',
  '131567 > 2759 > 33154 > 33208 > 6072 > 33213 > 33511 > 7711 > 89593 > 7742 > 7776 > 117570 > 117571 > 8287 > 1338369 > 32523 > 32524 > 40674 > 32525 > 9347 > 1437010 > 314146 > 9443 > 376913 > 314293 > 9526 > 314295 > 9604 > 207598 > 9605'],
 'primaryAccession': ['UniProtKB reviewed (Swiss-Prot)',
  'A0A0C5B5G6',
  'MOTSC_HUMAN',
  'Homo sapiens',
  'Human',
  9606,
  'cellular organisms > Eukaryota > Opisthokonta > Metazoa > Eumetazoa > Bilateria > Deuterostomia > Chor

In [None]:
%debug

# Return a TSV

## Write the raw tsv

In [38]:
n_results = 0

# ! mkdir ../data

outdir = "../data"

total = int(n_results_from_website / size) + 1
print("total:", total)

column_headers = [
    "entryType",
    "primaryAccession",
    "uniProtkbId",
    "scientificName",
    "commonName",
    "taxonId",
    "lineage",
    "lineage_taxonIds",
]

data = {
    "query": eukaryota_query,
    "fields": [
        "id",
        "accession",
        "gene_names",
        "organism_name",
        "organism_id",
        "lineage",
        "lineage_ids",
    ],
    "size": 500,
    "format": "tsv",
}

for i, response in tqdm(enumerate(fetch_data_paginate(api_url, data)), total=total):
    n = len(response)
    n_results += n
    # print(f"Got {n} results, so far {n_results}")

    tsv = os.path.join(outdir, f"2024-03-30__uniprot_eukaryota_{str(i).zfill(6)}.tsv")
    with open(tsv, "wb") as f:
        f.write(response)

total: 151403


  0%|          | 0/151403 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Write the tsv.gz

In [37]:
n_results = 0

# ! mkdir ../data

outdir = "../data"

total = int(n_results_from_website / size) + 1
print("total:", total)

column_headers = [
    "entryType",
    "primaryAccession",
    "uniProtkbId",
    "scientificName",
    "commonName",
    "taxonId",
    "lineage",
    "lineage_taxonIds",
]

data = {
    "query": eukaryota_query,
    "fields": [
        "id",
        "accession",
        "gene_names",
        "organism_name",
        "organism_id",
        "lineage",
        "lineage_ids",
    ],
    "size": 500,
    "format": "tsv",
}

for i, response in tqdm(enumerate(fetch_data_paginate(api_url, data)), total=total):
    n = len(response)
    n_results += n
    # print(f"Got {n} results, so far {n_results}")

    tsv = os.path.join(
        outdir, f"2024-03-30__uniprot_eukaryota_{str(i).zfill(6)}.tsv.gz"
    )
    with gzip.open(tsv, "wb") as f:
        f.write(response)

total: 151403


  0%|          | 0/151403 [00:00<?, ?it/s]

Exception ignored in: <finalize object at 0x16af52fa0; dead>
Traceback (most recent call last):
  File "/Users/olgabot/anaconda3/envs/data-analysis/lib/python3.12/weakref.py", line 585, in __call__
    def __call__(self, _=None):

KeyboardInterrupt: 


KeyboardInterrupt: 

## Parse with polars

In [35]:
n_results = 0

# ! mkdir ../data

outdir = "../data"

total = int(n_results_from_website / size) + 1
print("total:", total)

column_headers = [
    "entryType",
    "primaryAccession",
    "uniProtkbId",
    "scientificName",
    "commonName",
    "taxonId",
    "lineage",
    "lineage_taxonIds",
]

data = {
    "query": eukaryota_query,
    "fields": [
        "id",
        "accession",
        "gene_names",
        "organism_name",
        "organism_id",
        "lineage",
        "lineage_ids",
    ],
    "size": 500,
    "format": "tsv",
}

for i, response in tqdm(enumerate(fetch_data_paginate(api_url, data)), total=total):
    n = len(response)
    n_results += n
    # print(f"Got {n} results, so far {n_results}")

    df = pl.read_csv(BytesIO(response), separator="\t")
    # csv = os.path.join(outdir, f"uniprot_eukaryota_{str(i).zfill(6)}.csv")
    # df.to_csv(csv)
    pq = os.path.join(
        outdir, f"2024-03-30__uniprot_eukaryota_{str(i).zfill(6)}.parquet"
    )
    df.write_parquet(pq)
    # print(csv)
    # pprint(lineage_info)

total: 151403


  0%|          | 0/151403 [00:00<?, ?it/s]


KeyboardInterrupt



In [None]:
response_parsed.keys()

In [None]:
# response_parsed

In [10]:
entry_parsed.keys()

dict_keys(['entryType', 'primaryAccession', 'uniProtkbId', 'scientificName', 'commonName', 'taxonId', 'lineage', 'lineage_taxonIds'])