### WGKB Gene Database Query

#### Reading data via web scraping

In [1]:
import requests

In [None]:
url = 'https://www.ncbi.nlm.nih.gov/datasets/gene/GCF_001411555.2/'

payload = {
    'accession': 'GCF_001411555.2',
    'gene_types': [],
    'locations': [],
    'page_size': 20,
    'page_token': '',
    'search_text': [],
    'sort': []
}

headers = {
    'Origin': 'https://www.ncbi.nlm.nih.gov',
    'Referer': 'https://www.ncbi.nlm.nih.gov/datasets/gene/GCF_001411555.2/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}

result = requests.post(url, headers=headers, params=payload)

#### Reading data via REST API

In [3]:
import requests

In [12]:
url = 'https://api.ncbi.nlm.nih.gov/datasets/v2alpha'

taxon = 'Juglans regia (English walnut)'
api_key = 'removed'

params = {
    'name': taxon,
    'api_key': api_key
}

response = requests.get(url, params=params)

In [13]:
response.text

# Might have to enable the NCBI API key. Don't think that will be viable if it's tied to my account.

'404 page not found\n'

#### Reading metadata with NCBI documented APIs (GenomeAPI, , GeneAPI, APIClient)

In [4]:
# !pip install ncbi-datasets-pylib

In [5]:
from ncbi.datasets import GenomeApi, GeneApi
from ncbi.datasets.openapi import ApiClient
import pandas as pd

##### Get chromosome metadata

In [None]:
def get_chrom_locations(organism_name: str) -> pd.DataFrame:

    client = ApiClient()
    genome_api = GenomeApi(client)

    # Search for genome assemblies by organism name
    genome_summaries = genome_api.assembly_descriptors_by_taxon(organism_name)

    chromosomes = []
    sizes = []

    if genome_summaries.assemblies:
        for chrom_dict in genome_summaries.assemblies[0]['assembly']['chromosomes']:
            if chrom_dict.accession_version != None:
                chromosomes.append(chrom_dict.accession_version)
                sizes.append(int(chrom_dict.length))
            
    elif (not chromosomes) or (not sizes):
        print(f"No genomes found for organism: {organism_name}")

    return pd.DataFrame({'Chromosome': chromosomes, 'Size (bp)': sizes})

In [10]:
# Walnut

get_chrom_locations('Prunus persica')

{'assemblies': [{'assembly': {'annotation_metadata': {'busco': {'busco_lineage': 'eudicots_odb10',
                                                                'busco_ver': '4.0.2',
                                                                'complete': 0.99312127,
                                                                'duplicated': 0.013757524,
                                                                'fragmented': 0.0017196905,
                                                                'missing': 0.005159071,
                                                                'single_copy': 0.97936374,
                                                                'total_count': '2326'},
                                                      'file': [{'estimated_size': '7699257',
                                                                'type': 'GENOME_GFF'},
                                                               {'estimated_size': '106292339',
  

Unnamed: 0,Chromosome,Size (bp)
0,NC_034009.1,47851208
1,NC_034010.1,30405870
2,NC_034011.1,27368013
3,NC_034012.1,25843236
4,NC_034013.1,18496696
5,NC_034014.1,30767194
6,NC_034015.1,22388614
7,NC_034016.1,22573980
8,NC_014697.1,157790


##### GeneAPI

In [5]:
gene_api = GeneApi()
help(gene_api)

Help on GeneApi in module ncbi.datasets.openapi.api.gene_api object:

class GeneApi(builtins.object)
 |  GeneApi(api_client=None)
 |  
 |  NOTE: This class is auto generated by OpenAPI Generator
 |  Ref: https://openapi-generator.tech
 |  
 |  Do not edit the class manually.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, api_client=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  download_gene_package(self, gene_ids, **kwargs)
 |      Get a gene dataset by gene ID  # noqa: E501
 |      
 |      Get a gene dataset including gene, transcript and protein fasta sequence, annotation and metadata by gene ID.  # noqa: E501
 |      This method makes a synchronous HTTP request by default. To make an
 |      asynchronous HTTP request, please pass async_req=True
 |      
 |      >>> thread = api.download_gene_package(gene_ids, async_req=True)
 |      >>> result = thread.get()
 |      
 |      Args:
 |          gene_ids ([int]): NCBI gene ids
 |      
 

In [7]:
walnut_accessions = get_chrom_locations('Juglans regia (English walnut)')
walnut_ref_seq = [chrom['RefSeq'] for chrom in walnut_accessions if chrom['RefSeq'] is not None]
# walnut_ref_seq

gene_api.gene_tax_name_query('Juglans regia (English walnut)')

{'sci_name_and_ids': [{'common_name': 'English walnut',
                       'sci_name': 'Juglans regia',
                       'tax_id': '51240'},
                      {'sci_name': 'Juglans microcarpa x Juglans regia',
                       'tax_id': '2249226'},
                      {'sci_name': 'Juglans sigillata x Juglans regia',
                       'tax_id': '1441050'},
                      {'sci_name': 'Juglans major x Juglans regia',
                       'tax_id': '1141591'},
                      {'sci_name': 'Roystonea regia', 'tax_id': '145709'},
                      {'sci_name': 'Drosera regia', 'tax_id': '4371'},
                      {'sci_name': 'Odontella regia', 'tax_id': '1335017'},
                      {'common_name': 'English oak',
                       'sci_name': 'Quercus robur',
                       'tax_id': '38942'},
                      {'common_name': 'black walnut',
                       'sci_name': 'Juglans nigra',
                       't

##### GenomeAPI

In [31]:
genome_api = GenomeApi()
help(genome_api)

Help on GenomeApi in module ncbi.datasets.openapi.api.genome_api object:

class GenomeApi(builtins.object)
 |  GenomeApi(api_client=None)
 |  
 |  NOTE: This class is auto generated by OpenAPI Generator
 |  Ref: https://openapi-generator.tech
 |  
 |  Do not edit the class manually.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, api_client=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  assembly_descriptors_by_accessions(self, accessions, **kwargs)
 |      Get genome metadata by accession  # noqa: E501
 |      
 |      Get detailed metadata for assembled genomes by accession in a JSON output format.  # noqa: E501
 |      This method makes a synchronous HTTP request by default. To make an
 |      asynchronous HTTP request, please pass async_req=True
 |      
 |      >>> thread = api.assembly_descriptors_by_accessions(accessions, async_req=True)
 |      >>> result = thread.get()
 |      
 |      Args:
 |          accessions ([str]):
 |      


In [33]:
client = ApiClient()
genome_api = GenomeApi(client)

# Search for genome assemblies by organism name
genome_summaries = genome_api.assembly_descriptors_by_taxon('Juglans regia (English walnut)')
print(genome_summaries.assemblies)

[{'assembly': {'annotation_metadata': {'busco': {'busco_lineage': 'eudicots_odb10',
                                                'busco_ver': '4.0.2',
                                                'complete': 0.99269134,
                                                'duplicated': 0.091573514,
                                                'fragmented': 0.0017196905,
                                                'missing': 0.005588994,
                                                'single_copy': 0.9011178,
                                                'total_count': '2326'},
                                      'file': [{'estimated_size': '11438918',
                                                'type': 'GENOME_GFF'},
                                               {'estimated_size': '261191555',
                                                'type': 'GENOME_GBFF'},
                                               {'estimated_size': '23490972',
                           

#### Reading Data via Entrez API (affiliated with NCBI)

In [7]:
!pip install Bio

Collecting Bio
  Downloading bio-1.7.1-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.84-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting pooch (from Bio)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.3.1-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading bio-1.7.1-py3-none-any.whl (280 kB)
Downloading biopython-1.84-cp311-cp311-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   ------------------------------------- -- 2.6/2.8 MB 13.7 MB/s eta 0:00:01
   ---------------------------------------- 2.8/2.8 MB 10.1 MB/s eta 0:00:00
Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.

In [8]:
from Bio import Entrez

In [10]:
Entrez.email = "adamhetherwick@gmail.com"
handle = Entrez.egquery(term="Juglans regia (English walnut)")
record = Entrez.read(handle)

URLError: <urlopen error [Errno 11001] getaddrinfo failed>