In [63]:
import os
import glob
from Bio import Entrez
import xml.etree.ElementTree as ET
import time
import pathlib
import pprint

# Name of the local assemblies directory.
# NOTE(review): no visible cell reads this variable; the glob cell hard-codes
# 'assemblies/complete/*.gff3' instead — confirm whether it is still needed.
assemblies = 'assemblies'
# Contact address set on Bio.Entrez before any E-utilities call is made.
Entrez.email = "mfoster11@mgh.harvard.edu"

In [20]:
def printlist(list):
    """Print each element of ``list`` on its own line.

    Nothing is printed for an empty iterable. The parameter keeps its
    original name (which shadows the built-in ``list`` inside this
    function only) so existing keyword callers continue to work.
    """
    for entry in iter(list):
        print(entry, end="\n")

In [203]:
def parse_identifiers(text):
    """Parse a ``"key: value; key: value"`` string into a dict.

    Args:
        text: Semicolon-separated ``key: value`` pairs. ``None`` or an empty
            string is accepted and yields an empty dict (an XML element with
            no text has ``element.text is None``).

    Returns:
        dict mapping each stripped key to its stripped value. Fragments
        without a ``:`` are ignored. Only the first ``:`` in a fragment
        separates key from value, so values may themselves contain colons.
        If a key repeats, the last occurrence wins.
    """
    if not text:
        return {}

    result = {}
    for pair in text.split(';'):
        key, separator, value = pair.partition(':')
        if separator:
            result[key.strip()] = value.strip()
    return result

def parse_xml(element, parent_path=""):
    """Flatten an ElementTree element into a ``{path: value}`` dict.

    Paths are built by joining tag names with ``/``; attributes are stored
    under ``<path>/@<attribute>``. Because the result is a plain dict,
    repeated paths (e.g. sibling elements with the same tag) overwrite one
    another and only the last value is kept.

    Special cases:
        * ``SampleData``: its text is parsed as a nested XML document and
          flattened beneath the current path. If the text is not well-formed
          XML, the stripped text is stored as-is.
        * ``Identifiers``: its text is split with ``parse_identifiers`` and
          each pair is stored under ``<path>/<key>``. An ``Identifiers``
          element without text contributes only its attributes/children
          instead of raising.

    Args:
        element: ``xml.etree.ElementTree.Element`` to flatten.
        parent_path: Path prefix of the enclosing element ("" for the root).

    Returns:
        dict mapping path strings to string values.
    """
    result = {}
    current_path = f"{parent_path}/{element.tag}" if parent_path else element.tag

    # Attributes of this element.
    for key, value in element.attrib.items():
        result[f"{current_path}/@{key}"] = value

    text = element.text

    if element.tag == 'SampleData' and text:
        # The text payload is itself an XML document: parse and recurse.
        try:
            inner_root = ET.fromstring(text.strip())
        except ET.ParseError:
            result[current_path] = text.strip()
        else:
            result.update(parse_xml(inner_root, current_path))
    elif element.tag == 'Identifiers':
        # element.text is None for an empty element; skip parsing in that
        # case rather than handing None to parse_identifiers.
        if text:
            for key, value in parse_identifiers(text).items():
                result[f"{current_path}/{key}"] = value
    elif text and not text.isspace() and len(element) == 0:
        # Plain leaf element: keep its stripped text.
        result[current_path] = text.strip()

    # Recurse into children that carry text, attributes or sub-elements.
    for child in element:
        if (child.text and child.text.strip()) or child.attrib or len(child):
            result.update(parse_xml(child, current_path))

    return result

def _read_entrez_xml(handle):
    """Read and decode an Entrez response, closing the handle even on error."""
    try:
        return handle.read().decode()
    finally:
        handle.close()


def _leaf_keys(flat):
    """Re-key a flattened XML dict by the last path segment, dropping '@'."""
    return {k.split("/")[-1].replace('@', ''): v for k, v in flat.items()}


def get_ncbi_metadata(gcf_id):
    """Fetch assembly and BioSample summaries from NCBI for one accession.

    Three E-utilities requests are made: an ``esearch`` on the ``assembly``
    database to resolve the accession to a UID, an ``esummary`` for that
    assembly, and an ``esummary`` for the BioSample it references.

    Args:
        gcf_id: Assembly accession used as the search term
            (e.g. ``"GCF_000008685.2"``).

    Returns:
        dict with two keys, ``'assembly'`` and ``'biosample'``. Each value is
        a flat dict keyed by the last path segment of the flattened XML
        (attribute markers removed); when several paths end in the same
        segment, the last one wins. ``'biosample'`` is an empty dict if the
        assembly summary carries no ``BioSampleId``.

    Raises:
        ValueError: if the search returns no assembly UID for ``gcf_id``.
    """
    metadata = {}

    # Resolve the accession to an assembly UID.
    record = _read_entrez_xml(Entrez.esearch(db="assembly", term=gcf_id))
    id_node = ET.fromstring(record).find(".//Id")
    if id_node is None or not id_node.text:
        raise ValueError(f"No NCBI assembly record found for {gcf_id!r}")
    asm_id = id_node.text

    # Assembly summary.
    summary = _read_entrez_xml(Entrez.esummary(db="assembly", id=asm_id))
    metadata['assembly'] = _leaf_keys(parse_xml(ET.fromstring(summary)))

    # BioSample summary, looked up via the ID reported by the assembly.
    biosample_id = metadata['assembly'].get('BioSampleId')
    if not biosample_id:
        metadata['biosample'] = {}
        return metadata

    bs_summary = _read_entrez_xml(Entrez.esummary(db="biosample", id=biosample_id))
    metadata['biosample'] = _leaf_keys(parse_xml(ET.fromstring(bs_summary)))

    return metadata

In [133]:
# File stems (name minus the final suffix) of every GFF3 under assemblies/complete/.
genomes = [gff.stem for gff in map(pathlib.Path, glob.glob('assemblies/complete/*.gff3'))]

# Keep only the stems that begin with an NCBI assembly prefix ('GCA' or 'GCF').
ids = []
for genome in genomes:
    if not genome.startswith(('GCA', 'GCF')):
        continue
    ids.append(genome)

GCF_000172295.2_ASM17229v2_genomic
GCF_003367295.1_ASM336729v1_genomic
GCF_000181575.2_ASM18157v2_genomic
GCF_000008685.2_ASM868v2_genomic
GCF_000021405.1_ASM2140v1_genomic
GCF_000181715.2_ASM18171v2_genomic
GCF_000171735.2_ASM17173v2_genomic
GCF_000172335.2_ASM17233v2_genomic
GCF_000166635.1_ASM16663v1_genomic
GCF_000166655.1_ASM16665v1_genomic
GCF_002442595.2_ASM244259v2_genomic
GCF_000181855.2_ASM18185v2_genomic
GCF_000172315.2_ASM17231v2_genomic
GCF_000181555.2_ASM18155v2_genomic


In [204]:
# Fetch NCBI metadata once per local genome, keyed by the full file stem.
metadata = {}
for asm in ids:
    # The accession is the first two underscore-separated fields of the stem,
    # e.g. 'GCF_000172295.2_ASM17229v2_genomic' -> 'GCF_000172295.2'.
    accession_fields = asm.split('_', 2)[:2]
    gcf_id = '_'.join(accession_fields)
    print(gcf_id)
    metadata[asm] = get_ncbi_metadata(gcf_id)

GCF_000172295.2
None
GCF_003367295.1
None
GCF_000181575.2
None
GCF_000008685.2
None
GCF_000021405.1
None
GCF_000181715.2
None
GCF_000171735.2
None
GCF_000172335.2
None
GCF_000166635.1
None
GCF_000166655.1
None
GCF_002442595.2
None
GCF_000181855.2
None
GCF_000172315.2
None
GCF_000181555.2
None


In [194]:
# Pretty-print the full nested metadata dict (every assembly, both sub-dicts) for inspection.
pprint.pprint(metadata)

{'GCF_000008685.2_ASM868v2_genomic': {'assembly': {'AsmReleaseDate_GenBank': '2012/01/06 '
                                                                             '00:00',
                                                   'AsmReleaseDate_RefSeq': '2012/10/16 '
                                                                            '00:00',
                                                   'AsmUpdateDate': '2020/03/06 '
                                                                    '00:00',
                                                   'AssemblyAccession': 'GCF_000008685.2',
                                                   'AssemblyName': 'ASM868v2',
                                                   'AssemblyStatus': 'Complete '
                                                                     'Genome',
                                                   'AssemblyStatusSort': '1',
                                                   'AssemblyType': 'haploid',
   

In [211]:
from urllib.request import urlopen

# Download each assembly report and print its 'Sequencing Technology' line(s).
# Descriptive loop names are used instead of `_`, which IPython reserves for
# the previous cell output, and instead of `asm`, which is a string elsewhere.
for asm_name, asm_meta in metadata.items():
    ftp_path = asm_meta['assembly'].get('FtpPath_Assembly_rpt')
    print(asm_name, ftp_path)
    if not ftp_path:
        # No report URL recorded for this assembly; nothing to download.
        continue
    # Time out rather than hang the notebook on a stalled connection.
    with urlopen(ftp_path, timeout=60) as response:
        # Tolerate stray non-UTF-8 bytes instead of aborting the whole loop.
        content = response.read().decode('utf-8', errors='replace')
    for line in content.splitlines():
        if 'Sequencing Technology' in line:
            print(line)

GCF_000172295.2_ASM17229v2_genomic ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/172/295/GCF_000172295.2_ASM17229v2/GCF_000172295.2_ASM17229v2_assembly_report.txt
GCF_003367295.1_ASM336729v1_genomic ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/367/295/GCF_003367295.1_ASM336729v1/GCF_003367295.1_ASM336729v1_assembly_report.txt
GCF_000181575.2_ASM18157v2_genomic ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/181/575/GCF_000181575.2_ASM18157v2/GCF_000181575.2_ASM18157v2_assembly_report.txt
GCF_000008685.2_ASM868v2_genomic ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/008/685/GCF_000008685.2_ASM868v2/GCF_000008685.2_ASM868v2_assembly_report.txt
GCF_000021405.1_ASM2140v1_genomic ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/021/405/GCF_000021405.1_ASM2140v1/GCF_000021405.1_ASM2140v1_assembly_report.txt
GCF_000181715.2_ASM18171v2_genomic ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/181/715/GCF_000181715.2_ASM18171v2/GCF_000181715.2_ASM18171v2_assembly_report.txt
GCF_000171735.2_ASM171

In [198]:
# Display the collected metadata for a single assembly.
metadata['GCF_003367295.1_ASM336729v1_genomic']

# GCF_003367295.1_ASM336729v1_genomic == B331 (already in dataset)
# GCF_003367295.1_ASM336729v1_genomic == MM1 (PacBio)
# NOTE(review): both lines above name the same accession but different strains.
# The record displayed for this accession reports Sub_value 'MM1', so the B331
# line presumably should cite a different accession — TODO confirm.
#

{'assembly': {'status': 'OK',
  'DbBuild': 'Build241120-0850.1',
  'uid': '1856251',
  'RsUid': '7053348',
  'GbUid': '7035338',
  'AssemblyAccession': 'GCF_003367295.1',
  'LastMajorReleaseAccession': 'GCF_003367295.1',
  'ChainId': '3367295',
  'AssemblyName': 'ASM336729v1',
  'Taxid': '139',
  'Organism': 'Borreliella burgdorferi (Lyme disease spirochete)',
  'SpeciesTaxid': '139',
  'SpeciesName': 'Borreliella burgdorferi',
  'AssemblyType': 'haploid',
  'AssemblyStatus': 'Complete Genome',
  'AssemblyStatusSort': '1',
  'BioprojectAccn': 'PRJNA224116',
  'BioprojectId': '224116',
  'BioSampleAccn': 'SAMN07452586',
  'BioSampleId': '7452586',
  'Sub_type': 'strain',
  'Sub_value': 'MM1',
  'Coverage': '800',
  'PartialGenomeRepresentation': 'false',
  'Primary': '7053338',
  'ReleaseLevel': 'Major',
  'ReleaseType': 'Major',
  'AsmReleaseDate_GenBank': '2018/08/08 00:00',
  'AsmReleaseDate_RefSeq': '2018/08/10 00:00',
  'SeqReleaseDate': '2018/08/08 00:00',
  'AsmUpdateDate': '2018