In [43]:
import pandas as pd
import altair as alt
import numpy as np
from jsonapi_client import Session, Modifier
from jsonapi_client.exceptions import DocumentError
from pathlib import Path
import json
import requests

In [None]:
# Location of the raw search-result JSON; change this path to match your local copy.
JSON_PATH = Path("../data/raw/63080F7C-11DE-11EF-8972-B537756F2D03.1.json")

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with JSON_PATH.open("r", encoding="utf-8") as f:
    result = json.load(f)

In [None]:
# One row per hit, one column per field of the hit records.
hit_records = result["results"]["hits"]
df_hits = pd.DataFrame(hit_records)
df_hits

In [None]:
# Make sure 'evalue' is numeric before binning (no log transform is applied here).
df_hits["evalue"] = pd.to_numeric(df_hits["evalue"])

# Histogram of e-values: at most 30 bins on x, number of records per bin on y.
evalue_bins = alt.X("evalue:Q", bin=alt.Bin(maxbins=30))  # :Q marks the field as quantitative
chart = alt.Chart(df_hits).mark_bar().encode(evalue_bins, y="count()")

chart

In [None]:
# Show the first hit record to see which fields each search result carries.
first_hit = result["results"]["hits"][0]
first_hit

In [None]:


# Collect one record per hit: the first sample accession when the hit lists any
# samples, otherwise the first run accession. Hits with neither are skipped.
hit_metadata = []
for item in result["results"]["hits"]:
    peptide_name = item['name']
    for hit_type in ['samples', 'runs']:
        accessions = item['mgnify'][hit_type]
        # Treat an empty list like a missing (None) entry instead of failing
        # on the [0][0] lookup below.
        if not accessions:
            continue
        # First element of the first entry — presumably the accession id; verify
        # against the search-result schema.
        acc_id = accessions[0][0]
        hit_metadata.append({"accession": acc_id, "peptide_name": peptide_name, "hit_type": hit_type})
        # Samples take precedence over runs; 'runs' is the last type anyway.
        break

# Explicit columns keep the frame usable (e.g. `.accession`) even with zero records.
df_hits_metadata = pd.DataFrame(hit_metadata, columns=["accession", "peptide_name", "hit_type"])
df_hits_metadata.accession.value_counts()


In [None]:
def get_sample_endpoint(sample_id):
    """Fetch the JSON records of the samples matching ``sample_id``.

    Args:
        sample_id: Value for the ``accession`` query parameter of the
            ``samples`` endpoint. Several accessions can be passed as one
            ``%2C``-joined string.

    Returns:
        A list with the ``.json`` payload of every resource yielded by the
        session's ``iterate`` call.
    """
    sample_endpoint = f'samples?accession={sample_id}'
    with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
        # Materialize inside the `with` block: a lazy iterator would only be
        # consumed by the caller after the session has already been closed.
        return [resource.json for resource in mgnify.iterate(sample_endpoint)]

# Join the distinct sample accessions into a single URL-encoded, comma-separated
# query value, echo the resulting URL, then fetch the matching sample records.
sample_id = "%2C".join(
    df_hits_metadata.loc[df_hits_metadata["hit_type"] == "samples", "accession"].unique()
)
print(f'https://www.ebi.ac.uk/metagenomics/api/v1/samples?accession={sample_id}')
list(get_sample_endpoint(sample_id))

In [77]:
from jsonapi_client import Session, Modifier

def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices holding at most ``n`` items each."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

def get_run_endpoint(run_ids):
    """Fetch run records for one or more run accessions.

    Args:
        run_ids: Either a ready-made ``%2C``-joined accession string, or a
            list / numpy array of accessions, which is joined here.

    Returns:
        ``(records, True)`` on success, where ``records`` is a list with the
        ``.json`` payload of every resource yielded by the session.
        ``(bad_accession, False)`` when the request is rejected;
        ``bad_accession`` is the accession named in the API's error detail,
        or ``None`` if none could be extracted.

    Raises:
        DocumentError: re-raised when the client fails although a plain GET
            of the same URL returns status 200, i.e. the failure is not a
            rejected accession.
    """
    if isinstance(run_ids, (list, np.ndarray)):
        run_ids_chunk = "%2C".join(run_ids)
    else:
        run_ids_chunk = run_ids
    url_mgnify = "https://www.ebi.ac.uk/metagenomics/api/v1"
    run_endpoint = f'runs?accession={run_ids_chunk}'
    url = f'{url_mgnify}/{run_endpoint}'
    print(url)
    with Session(url_mgnify) as mgnify:
        try:
            # Consume the iterator inside the try so fetch errors are caught here.
            return [run.json for run in mgnify.iterate(run_endpoint)], True
        except DocumentError:
            # Repeat the request with `requests` to read the API's error payload.
            response = requests.get(url)
            if response.status_code == 200:
                # Not a validation failure: surface the original error rather
                # than silently returning None.
                raise
            # Works for string and sequence input alike.
            print(f"Error occurred with run_ids: {run_ids_chunk.replace('%2C', ', ')}.")
            payload = response.json()
            print(payload)
            errors = payload.get('errors') or [{}]
            detail = errors[0].get('detail', '')
            # Drop the fixed prefix as a substring. str.strip() would treat it
            # as a character set and eat leading letters of accessions such as
            # "SRR...".
            tokens = detail.replace('Select a valid choice.', '', 1).split()
            bad_acc = tokens[0] if tokens else None
            return bad_acc, False

run_ids = df_hits_metadata[df_hits_metadata.hit_type == "runs"].accession.unique()

# Fetch the runs in chunks of 10 accessions. When the API rejects an accession,
# drop it from the chunk and retry the remainder.
all_runs = []
for run_ids_chunk in chunks(run_ids, 10):
    pending = np.asarray(run_ids_chunk)
    while pending.size:
        # Named `payload` (not `result`) so the loaded search JSON is not overwritten.
        payload, status = get_run_endpoint(pending)
        if status:
            all_runs.extend(payload)
            break
        # On failure `payload` is the rejected accession.
        remaining = pending[pending != payload]
        if remaining.size == pending.size:
            # Nothing was removed, so retrying would loop forever.
            print(f"Could not identify the rejected accession in: {', '.join(pending)}")
            break
        pending = remaining
        print(pending)

https://www.ebi.ac.uk/metagenomics/api/v1/runs?accession=ERR1474574%2CSRR6231191%2CERR526022%2CERR1135178%2CERR1414273%2CERR525902%2CERR973872%2CERR1474565%2CERR525926%2CERR525885
Error occurred with run_ids: ERR1474574, SRR6231191, ERR526022, ERR1135178, ERR1414273, ERR525902, ERR973872, ERR1474565, ERR525926, ERR525885.
{'errors': [{'detail': 'Select a valid choice. ERR973872 is not one of the available choices.', 'status': '400', 'source': {'pointer': '/data/attributes/accession'}, 'code': 'invalid_choice'}]}


AttributeError: 'numpy.ndarray' object has no attribute 'drop'

In [48]:
import requests

def get_run_endpoint(run_ids, timeout=30):
    """Fetch the runs document for ``run_ids`` with a plain HTTP GET.

    Note: this redefines (shadows) the ``get_run_endpoint`` defined earlier in
    the notebook and returns a different shape.

    Args:
        run_ids: Value for the ``accession`` query parameter; several
            accessions are passed as one ``%2C``-joined string.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive request cannot hang the cell indefinitely.

    Returns:
        The decoded JSON body on HTTP 200. An empty list when the request
        fails or returns any other status; the error is printed.
    """
    url = f'https://www.ebi.ac.uk/metagenomics/api/v1/runs?accession={run_ids}'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network failures and timeouts get the same best-effort handling as HTTP errors.
        print(f"Error occurred with run_ids: {run_ids}. Error: {exc}")
        return []
    if response.status_code != 200:
        print(f"Error occurred with run_ids: {run_ids}. Error: {response.text}")
        return []
    return response.json()

run_ids = df_hits_metadata[df_hits_metadata.hit_type == "runs"].accession.unique()

# Split run_ids into chunks of 10 accessions and collect the run records of each chunk.
all_runs = []
for run_ids_chunk in chunks(run_ids, 10):
    accession_query = "%2C".join(run_ids_chunk)
    print(f'https://www.ebi.ac.uk/metagenomics/api/v1/runs?accession={accession_query}')
    payload = get_run_endpoint(accession_query)
    # A successful call returns the JSON document (a dict). Extending with the
    # dict itself would append its keys, so take the records under "data".
    # A failed call returns [].
    # NOTE(review): a plain GET presumably returns only the first page — confirm pagination.
    records = payload.get("data", []) if isinstance(payload, dict) else payload
    all_runs.extend(records)

https://www.ebi.ac.uk/metagenomics/api/v1/runs?accession=ERR1474574%2CSRR6231191%2CERR526022%2CERR1135178%2CERR1414273%2CERR525902%2CERR973872%2CERR1474565%2CERR525926%2CERR525885
Error occurred with run_ids: ERR1474574%2CSRR6231191%2CERR526022%2CERR1135178%2CERR1414273%2CERR525902%2CERR973872%2CERR1474565%2CERR525926%2CERR525885. Error: {"errors":[{"detail":"Select a valid choice. ERR973872 is not one of the available choices.","status":"400","source":{"pointer":"/data/attributes/accession"},"code":"invalid_choice"}]}
https://www.ebi.ac.uk/metagenomics/api/v1/runs?accession=SRR2081070%2CSRR4305420%2CERR1474580%2CERR1332590%2CERR1332594%2CERR537006%2CERR1620280%2CERR1190761%2CDRR042510%2CSRR6179360
Error occurred with run_ids: SRR2081070%2CSRR4305420%2CERR1474580%2CERR1332590%2CERR1332594%2CERR537006%2CERR1620280%2CERR1190761%2CDRR042510%2CSRR6179360. Error: {"errors":[{"detail":"Select a valid choice. ERR1332594 is not one of the available choices.","status":"400","source":{"pointer":

KeyboardInterrupt: 

In [None]:
# Number of distinct run accessions among the hits.
len(df_hits_metadata.loc[df_hits_metadata["hit_type"] == "runs", "accession"].unique())

In [None]:
# Fetch the record(s) for one study and flatten the nested JSON into a table.
study_endpoint = "studies/MGYS00006613"
with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
    # Build a real list while the session is open: json_normalize is documented
    # for a dict or a list of dicts, not for a one-shot iterator.
    study_records = [resource.json for resource in mgnify.iterate(study_endpoint)]

study = pd.json_normalize(study_records)

study

In [42]:
import jsonapi_client