Import libraries

In [143]:
import json
import os
import re

import pandas as pd
import requests

Specify API_URL

In [144]:
# Base URL that the endpoint paths used in this notebook are appended to.
# NOTE(review): "v2alpha" looks like a pre-release API version — confirm it is still served.
API_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2alpha"

Get genome dataset report

In [145]:
# Endpoint the genome dataset report request is POSTed to.
genome_report_url = API_URL + "/genome/dataset_report"

# The API serves at most 1000 reports per page; a larger value (previously
# 10000) is not honoured, so ask for the real maximum explicitly.
GENOME_REPORT_PAGE_SIZE = 1000

# Request body for the report endpoint.
# NOTE: only the first page is requested; further pages would need the
# page token returned by the API to be sent back in "page_token".
genome_report_payload = {
  "filters": {
    "exclude_paired_reports": True,
  },
  "page_size": GENOME_REPORT_PAGE_SIZE,
  # None is serialised as JSON null, i.e. no continuation token.
  "page_token": None,
  "returned_content": "COMPLETE",
  # Taxonomy ID 1 is the root of the NCBI taxonomy (no taxonomic restriction).
  "taxons": [
    "1",
  ]
}

In [146]:
# Where the raw JSON report is cached on disk. The path components are passed
# separately so os.path.join actually does the joining.
genome_report_filename = os.path.join("data", "genome_report.json")

In [147]:
# Make sure the output directory exists so open() below cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(genome_report_filename) or ".", exist_ok=True)

# A timeout keeps the notebook from hanging forever on a stalled connection.
genome_report_res = requests.post(genome_report_url, json=genome_report_payload, timeout=120)
# Fail loudly on an HTTP error instead of caching the error body as if it were a report.
genome_report_res.raise_for_status()

# Cache the decoded response so later cells can work from the file.
with open(genome_report_filename, "w") as file:
  json.dump(genome_report_res.json(), file)

Get accession name, taxon id, and organism name data

In [148]:
# Read the cached report file back and keep only its list of reports.
genome_report_json = []
with open(genome_report_filename, "r") as report_file:
  report_document = json.load(report_file)
  genome_report_json = report_document["reports"]

In [149]:
# Reduce every report to the three fields used downstream:
# accession, taxonomy ID and organism name.
genome_data = [
  {
    "accession": report["accession"],
    "tax_id": report["organism"]["tax_id"],
    "organism_name": report["organism"]["organism_name"],
  }
  for report in genome_report_json
]

In [150]:
# Tabulate the extracted records (one row per genome) and display the frame.
genome_data_df = pd.DataFrame(data=genome_data)
genome_data_df

Unnamed: 0,accession,tax_id,organism_name
0,GCF_000006945.2,99287,Salmonella enterica subsp. enterica serovar Ty...
1,GCF_000195955.2,83332,Mycobacterium tuberculosis H37Rv
2,GCF_000009045.1,224308,Bacillus subtilis subsp. subtilis str. 168
3,GCF_009858895.2,2697049,Severe acute respiratory syndrome coronavirus 2
4,GCF_000864885.1,227984,SARS coronavirus Tor2
...,...,...,...
995,GCF_000969885.1,523844,Methanosarcina thermophila TM-1
996,GCF_000550785.1,1285583,Corynebacterium casei LMG S-19264
997,GCF_000025705.1,580332,Sideroxydans lithotrophicus ES-1
998,GCF_000013705.1,265072,Methylobacillus flagellatus KT


In [151]:
# Where the genome table is stored as CSV. The path components are passed
# separately so os.path.join actually does the joining.
genome_data_filename = os.path.join("data", "genome_data.csv")

In [152]:
# Persist the table without the positional index column.
genome_data_df.to_csv(path_or_buf=genome_data_filename, index=False)

Build genome FASTA (.fna) download URLs

In [153]:
# Reload the genome table from disk.
genome_data_df = pd.read_csv(genome_data_filename)
# One plain dict per row, keyed by column name. This replaces iterating over
# the `.iloc` indexer, which only worked through Python's legacy
# __getitem__/IndexError iteration fallback.
genome_data = genome_data_df.to_dict("records")

In [154]:
def _safe_archive_stem(organism_name):
  """Turn an organism name into a string that is safe in a URL query and as a file name.

  Every character other than ASCII letters, digits, '.', '_' and '-' is
  replaced by '_'. Spaces therefore still become underscores, and characters
  such as '/', '&' or '#' can no longer break the URL or the output path.
  """
  return re.sub(r"[^A-Za-z0-9._-]", "_", organism_name)


# One download URL per genome: the accession selects the genome, the sanitised
# organism name becomes the name of the returned zip archive.
genome_download_url = [
  API_URL + f"/genome/accession/{accession}/download?include_annotation_type=GENOME_FASTA&filename={_safe_archive_stem(organism_name)}.zip"
  for accession, organism_name in zip(genome_data_df["accession"], genome_data_df["organism_name"])
]

genome_data_df["download_url"] = genome_download_url

In [155]:
# Save the table, now including download_url. index=False keeps the positional
# index out of the file; without it every save/reload cycle adds another
# "Unnamed: 0" column.
genome_data_df.to_csv(genome_data_filename, index=False)

Test-download the first genome FASTA (.fna) archive

In [156]:
# Reload the genome table (including the download URLs) from disk.
genome_data_df = pd.read_csv(filepath_or_buffer=genome_data_filename)

In [157]:
# Look the URL up once; the archive name is whatever follows "filename=" in it.
first_download_url = genome_data_df.iloc[0]["download_url"]
genome_download_filename = first_download_url.split("filename=")[1]
# Keep the archive directly inside download/ even if the name contains a slash.
genome_download_filename = os.path.join("download", genome_download_filename.replace("/", "_"))

# Create the target directory on first use.
os.makedirs("download", exist_ok=True)

# Time out on a stalled connection and fail on an HTTP error, so an error page
# is never saved under a .zip name.
genome_download_res = requests.get(first_download_url, timeout=120)
genome_download_res.raise_for_status()
with open(genome_download_filename, "wb") as file:
  file.write(genome_download_res.content)

Download a random sample of genome FASTA (.fna) archives

In [158]:
# Reload the genome table (including the download URLs) from disk.
genome_data_df = pd.read_csv(filepath_or_buffer=genome_data_filename)

In [159]:
# How many genomes to download, and a fixed seed so that a fresh
# "Restart & Run All" picks the same genomes every time.
SAMPLE_SIZE = 3
SAMPLE_SEED = 42

genome_download_url = genome_data_df["download_url"]
# Cap the sample size so a table with fewer rows does not raise ValueError.
genome_download_url = genome_download_url.sample(
  n=min(SAMPLE_SIZE, len(genome_download_url)),
  random_state=SAMPLE_SEED,
)

In [160]:
# Create the target directory on first use.
os.makedirs("download", exist_ok=True)

for download_url in genome_download_url:
  # The archive name is whatever follows "filename=" in the URL; keep it
  # directly inside download/ even if the name contains a slash.
  genome_download_filename = download_url.split("filename=")[1]
  genome_download_filename = os.path.join("download", genome_download_filename.replace("/", "_"))

  print(f"currently downloading {genome_download_filename}")
  # Time out on a stalled connection and fail on an HTTP error, so an error
  # page is never saved under a .zip name.
  genome_download_res = requests.get(download_url, timeout=120)
  genome_download_res.raise_for_status()
  with open(genome_download_filename, "wb") as file:
    file.write(genome_download_res.content)

currently downloading download/Escherichia_phage_alpha3.zip
currently downloading download/Cellulomonas_fimi_ATCC_484.zip
currently downloading download/Microbacterium_sediminis.zip
