In [111]:
import Bio.SeqIO

['Sequence-Name', 'Sequence-Role', 'Assigned-Molecule', 'Assigned-Molecule-Location/Type', 'GenBank-Accn', 'Relationship', 'RefSeq-Accn', 'Assembly-Unit', 'Sequence-Length', 'UCSC-style-name']


In [162]:
import sys

def parse_location(location_str):
    return location_str.split('..')

def parse_string(location_str):
    if location_str.index('complement') == 0:
        return '-'
    else:
        return '+'

def parse_entity(feature):
    '''
    Parse a transcribed entity
    '''
    entity = {}
    # print('feature:', feature)
    
    entity['start'] = feature.location.start.position
    entity['end'] = feature.location.end.position
    entity['strand'] = '+' if feature.location.strand == 1 else '-'
    #print(dir(feature))
    #print("qulaifiers:", feature.qualifiers)
    if 'gene' in feature.qualifiers:
        entity['name'] = feature.qualifiers['gene'][0]
    
    
    for part in feature.location.parts:
        entity['location_parts'] = {
            'start': part.start.position,
            'end': part.end.position,
            'strand': '+' if part.strand == 1 else '-',
        }

    for xref in feature.qualifiers['db_xref']:
        parts = xref.split(':')
        if parts[0] == 'GeneID':
            entity['geneId'] = parts[1]
            break

    if 'transcript_id' in feature.qualifiers:
        entity['transcriptId'] = feature.qualifiers['transcript_id'][0]

    #print('entity:', entity)
    return entity

def extract_refseq_features(refseq_file):
    '''
    Extract a list of the features present in this file. This file should
    only contain sequences for one record.
    
    Parameters
    ----------
    refseq_file: string
        The filename of a refseq file for a single sequence
        
    Returns
    --------
    features: {}
        A list of features indexed by GenBank gene ids
    '''
    record_count = 0
    genes = {}
    count = 0
    
    for record in Bio.SeqIO.parse(refseq_file, 'genbank'):
        record_count += 1
        #print("dir:", dir(record))
    
    print('record:', record)
    for feature in record.features:
        # print("dir", dir(feature))
        # print(feature.type)
        # print(feature.location, type(feature.location))
        # print(dir(feature))
        try:
            curr_entity = parse_entity(feature)
        except Exception as ex:
            print("Error parsing feature: {}".format(feature), file=sys.stderr)
            print("Error: {}".format(ex))


        curr_entity['type'] = feature.type

        if feature.type == 'gene':
            curr_entity['mRNAs'] = []
            print("feature:", feature)
            print('curr_entity:', curr_entity)
            genes[curr_entity['geneId']] = curr_entity

            if 'pseudo' in feature.qualifiers:
                curr_entity['type'] = 'pseudo'

            # print("gene", curr_entity)
        elif feature.type == 'CDS':
            #print("feature:", feature)

            cds = {}
            cds['start'] = feature.location.start.position
            cds['end'] = feature.location.end.position
            genes[curr_entity['geneId']]['cds'] = cds
        elif feature.type == 'ncRNA':
            genes[curr_entity['geneId']] = curr_entity
        elif feature.type == 'mRNA':
            if 'mRNAs' not in genes[curr_entity['geneId']]:
                genes[curr_entity['geneId']]['mRNAs'] = []
            genes[curr_entity['geneId']]['mRNAs'] += [curr_entity]
        elif feature.type == 'misc_RNA':
            genes[curr_entity['geneId']] = curr_entity        
            #print(feature)
        elif feature.type == 'precursor_RNA':
            genes[curr_entity['geneId']] = curr_entity
        elif feature.type == 'tRNA':
            genes[curr_entity['geneId']] = curr_entity
        else:
            print(feature)

        count += 1
    
    return genes

In [163]:
import urllib
import os
import os.path as op

def extract_assembly_annotations(assembly_location):
    #wget -O /tmp/assembly.txt ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.33_GRCh38.p7/GCF_000001405.33_GRCh38.p7_assembly_report.txt
    !wget -O /tmp/assembly.txt {assembly_location}

    !cat /tmp/assembly.txt | grep "^#" | tail -n 1 > /tmp/assembly_headers.txt
    !cat /tmp/assembly.txt | grep -v "^#" > /tmp/assembly_processed.txt

    with open('/tmp/assembly_headers.txt', 'r') as f:
        headers = f.readlines()[0].strip().split()[1:]
        print(headers)
        
    output_dir = op.join(op.expanduser('~/data/nuccore'),
                         op.splitext(op.basename(assembly_location))[0])

    if not op.exists(output_dir):
        os.makedirs(output_dir)

    output_files = []

    with open('/tmp/assembly_processed.txt', 'r') as f:
        for line in f:
            parts = line.strip().split()
            print("parts:", parts)
            genbank_accn = parts[6]
            sequence_name = parts[0]

            output_file  = op.join(output_dir, '{}.gb'.format(genbank_accn))
            done_file = '{}.done'.format(output_file)

            print(output_file)

            if not op.exists(done_file):
                try:
                    download_file = output_file + '.orig'

                    urllib.request.urlretrieve('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={}&rettype=gbwithparts'
                                   .format(genbank_accn), output_file)

                    with open(done_file, 'w') as f1:
                        pass
                except Exception as ex:
                    continue
                    print('Ex:', ex)

            output_struct = dict(zip(headers, parts))
            output_struct['refseqFilename'] = output_file
            output_struct['gene_features'] = extract_refseq_features(output_file)
            output_files += [output_struct]
            break

    #print(output_files)
    return output_files

In [164]:
assembly_location="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/GCF_000001215.4_Release_6_plus_ISO1_MT_assembly_report.txt"
#assembly_location="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.37_GRCh38.p11/GCF_000001405.37_GRCh38.p11_assembly_report.txt"

output_files = extract_assembly_annotations(assembly_location)


--2018-01-05 14:36:24--  ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/GCF_000001215.4_Release_6_plus_ISO1_MT_assembly_report.txt
           => ‘/tmp/assembly.txt’
Resolving ftp.ncbi.nlm.nih.gov... 130.14.250.11, 2607:f220:41e:250::12
Connecting to ftp.ncbi.nlm.nih.gov|130.14.250.11|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT ... done.
==> SIZE GCF_000001215.4_Release_6_plus_ISO1_MT_assembly_report.txt ... 214085
==> PASV ... done.    ==> RETR GCF_000001215.4_Release_6_plus_ISO1_MT_assembly_report.txt ... done.
Length: 214085 (209K) (unauthoritative)


2018-01-05 14:36:24 (2.86 MB/s) - ‘/tmp/assembly.txt’ saved [214085]

['Sequence-Name', 'Sequence-Role', 'Assigned-Molecule', 'Assigned-Molecule-Location/Type', 'GenBank-Accn', 'Relationship', 'RefSeq-Accn', 'Assembly-Unit', 'Sequence-Len

Error parsing feature: type: exon
location: [476856:478543](-)
qualifiers:
    Key: gene, Value: ['CR18275']
    Key: gene_synonym, Value: ['CG18275; Dmel\\CR18275']
    Key: locus_tag, Value: ['Dmel_CR18275']
    Key: number, Value: ['3']
    Key: old_locus_tag, Value: ['Dmel_CG18275']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [478598:479026](-)
qualifiers:
    Key: gene, Value: ['CR18275']
    Key: gene_synonym, Value: ['CG18275; Dmel\\CR18275']
    Key: locus_tag, Value: ['Dmel_CR18275']
    Key: number, Value: ['2']
    Key: old_locus_tag, Value: ['Dmel_CG18275']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [479115:479309](-)
qualifiers:
    Key: gene, Value: ['CR18275']
    Key: gene_synonym, Value: ['CG18275; Dmel\\CR18275']
    Key: locus_tag, Value: ['Dmel_CR18275']
    Key: number, Value: ['1']
    Key: old_locus_tag, Value: ['Dmel_CG18275']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [4

 {'start': 2682828, 'end': 2683845, 'strand': '-', 'name': 'CR44470', 'location_parts': {'start': 2682828, 'end': 2683845, 'strand': '-'}, 'geneId': '19835100', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [2685579:2692780](+)
qualifiers:
    Key: db_xref, Value: ['FLYBASE:FBgn0003068', 'GeneID:31251']
    Key: gene, Value: ['per']
    Key: gene_synonym, Value: ['CG2647; Clk; clk-6; Dmel\\CG2647; dmper; dper; dPER; EG:155E2.4; mel_per; Per; PER']
    Key: locus_tag, Value: ['Dmel_CG2647']
    Key: map, Value: ['3B1-3B2']
    Key: note, Value: ['period']

curr_entity: {'start': 2685579, 'end': 2692780, 'strand': '+', 'name': 'per', 'location_parts': {'start': 2685579, 'end': 2692780, 'strand': '+'}, 'geneId': '31251', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [2692731:2693886](-)
qualifiers:
    Key: db_xref, Value: ['FLYBASE:FBgn0000092', 'GeneID:31252']
    Key: gene, Value: ['CG2650']
    Key: gene_synonym, Value: ['0.9; 0.9 gene; anon-3B1.2; Dmel\\CG26

Error parsing feature: type: exon
location: [3793463:3794660](+)
qualifiers:
    Key: gene, Value: ['CR42499']
    Key: gene_synonym, Value: ['CG42499; Dmel\\CR42499']
    Key: locus_tag, Value: ['Dmel_CR42499']
    Key: number, Value: ['1']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [3797496:3799028](-)
qualifiers:
    Key: gene, Value: ['CR33221']
    Key: gene_synonym, Value: ['CG12260; CG33221; Dmel\\CR33221']
    Key: locus_tag, Value: ['Dmel_CR33221']
    Key: number, Value: ['1']
    Key: old_locus_tag, Value: ['Dmel_CG33221']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [6280232:6280439](-)
qualifiers:
    Key: gene, Value: ['CR32745']
    Key: gene_synonym, Value: ['CG32745; Dmel\\CR32745']
    Key: locus_tag, Value: ['Dmel_CR32745']
    Key: number, Value: ['2']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [6280488:6280752](-)
qualifiers:
    Key: gene, Value: ['CR32745']
    Key: gene_sy


type: mobile_element
location: [7845780:7847213](-)
qualifiers:
    Key: db_xref, Value: ['FLYBASE:FBti0019601']
    Key: mobile_element_type, Value: ['transposon:hopper{}82']

feature: type: gene
location: [7852480:7854244](-)
qualifiers:
    Key: db_xref, Value: ['FLYBASE:FBgn0029963', 'GeneID:31688']
    Key: gene, Value: ['CG10920']
    Key: gene_synonym, Value: ['Dmel\\CG10920']
    Key: locus_tag, Value: ['Dmel_CG10920']
    Key: map, Value: ['7C1-7C1']

curr_entity: {'start': 7852480, 'end': 7854244, 'strand': '-', 'name': 'CG10920', 'location_parts': {'start': 7852480, 'end': 7854244, 'strand': '-'}, 'geneId': '31688', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [7858172:7858794](+)
qualifiers:
    Key: db_xref, Value: ['FLYBASE:FBgn0029964', 'GeneID:31689']
    Key: gene, Value: ['CG1409']
    Key: gene_synonym, Value: ['BcDNA:AT29287; D.M.BLPp; Dmel\\CG1409']
    Key: locus_tag, Value: ['Dmel_CG1409']
    Key: map, Value: ['7C1-7C1']

curr_entity: {'start': 78

Error parsing feature: type: exon
location: [11586227:11586643](+)
qualifiers:
    Key: gene, Value: ['CR43217']
    Key: gene_synonym, Value: ['Dmel\\CR43217; noncoding_23159; noncoding_23161']
    Key: locus_tag, Value: ['Dmel_CR43217']
    Key: number, Value: ['1']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [11586709:11586944](+)
qualifiers:
    Key: gene, Value: ['CR43217']
    Key: gene_synonym, Value: ['Dmel\\CR43217; noncoding_23159; noncoding_23161']
    Key: locus_tag, Value: ['Dmel_CR43217']
    Key: number, Value: ['2']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [11587282:11587685](+)
qualifiers:
    Key: gene, Value: ['CR43216']
    Key: gene_synonym, Value: ['Dmel\\CR43216; noncoding_23159; noncoding_23161']
    Key: locus_tag, Value: ['Dmel_CR43216']
    Key: number, Value: ['1']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [11588322:11588736](+)
qualifiers:
    Key: gene, Value: ['


feature: type: gene
location: [11877736:11887162](-)
qualifiers:
    Key: db_xref, Value: ['FLYBASE:FBgn0030349', 'GeneID:32148']
    Key: gene, Value: ['CG10353']
    Key: gene_synonym, Value: ['Dmel\\CG10353']
    Key: locus_tag, Value: ['Dmel_CG10353']
    Key: map, Value: ['10F2-10F4']

curr_entity: {'start': 11877736, 'end': 11887162, 'strand': '-', 'name': 'CG10353', 'location_parts': {'start': 11877736, 'end': 11887162, 'strand': '-'}, 'geneId': '32148', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [11887283:11888240](+)
qualifiers:
    Key: db_xref, Value: ['FLYBASE:FBgn0030350', 'GeneID:32149']
    Key: gene, Value: ['SelG']
    Key: gene_synonym, Value: ['BcDNA:GH03581; CG1844; Dmel\\CG1844; dselG; dselK; dSelK; G-rich; SelK']
    Key: locus_tag, Value: ['Dmel_CG1844']
    Key: map, Value: ['10F4-10F4']
    Key: note, Value: ['Selenoprotein G']

curr_entity: {'start': 11887283, 'end': 11888240, 'strand': '+', 'name': 'SelG', 'location_parts': {'start': 11887283

curr_entity: {'start': 16560683, 'end': 16587172, 'strand': '-', 'name': 'CanA-14F', 'location_parts': {'start': 16560683, 'end': 16587172, 'strand': '-'}, 'geneId': '8674098', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [16587872:16588930](+)
qualifiers:
    Key: db_xref, Value: ['FLYBASE:FBgn0030759', 'GeneID:32626']
    Key: gene, Value: ['CG13014']
    Key: gene_synonym, Value: ['Dmel\\CG13014']
    Key: locus_tag, Value: ['Dmel_CG13014']
    Key: map, Value: ['14F1-14F1']

curr_entity: {'start': 16587872, 'end': 16588930, 'strand': '+', 'name': 'CG13014', 'location_parts': {'start': 16587872, 'end': 16588930, 'strand': '+'}, 'geneId': '32626', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [16588995:16593439](-)
qualifiers:
    Key: db_xref, Value: ['FLYBASE:FBgn0015615', 'GeneID:32627']
    Key: gene, Value: ['SMC3']
    Key: gene_synonym, Value: ['cap; Cap; CAP; CG9802; dCAP; Dmel\\CG9802; DmSMC3; DmSMC3/Cap; dSMC3; Smc3']
    Key: locus_tag, Value: ['

Error parsing feature: type: exon
location: [15576354:15576964](+)
qualifiers:
    Key: gene, Value: ['betaNACtes5']
    Key: gene_synonym, Value: ['CR42877; Dmel\\CR42877']
    Key: locus_tag, Value: ['Dmel_CR42877']
    Key: number, Value: ['1']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [15796476:15796897](+)
qualifiers:
    Key: gene, Value: ['CR44383']
    Key: gene_synonym, Value: ['Dmel\\CR44383']
    Key: locus_tag, Value: ['Dmel_CR44383']
    Key: number, Value: ['1']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [17898642:17899278](+)
qualifiers:
    Key: gene, Value: ['CR32496']
    Key: gene_synonym, Value: ['CG32496; Dmel\\CR32496; FBpp0074273']
    Key: locus_tag, Value: ['Dmel_CR32496']
    Key: number, Value: ['1']
    Key: old_locus_tag, Value: ['Dmel_CG32496']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [17899356:17899498](+)
qualifiers:
    Key: gene, Value: ['CR32496']
    Key: 

type: exon
location: [20225136:20225193](-)
qualifiers:
    Key: gene, Value: ['CR33498']
    Key: gene_synonym, Value: ['CG33498; Dmel\\CR33498']
    Key: locus_tag, Value: ['Dmel_CR33498']
    Key: number, Value: ['3']
    Key: pseudo, Value: ['']

Error: 'db_xref'
type: exon
location: [20225585:20225767](-)
qualifiers:
    Key: gene, Value: ['CR33498']
    Key: gene_synonym, Value: ['CG33498; Dmel\\CR33498']
    Key: locus_tag, Value: ['Dmel_CR33498']
    Key: number, Value: ['2']
    Key: pseudo, Value: ['']

Error: 'db_xref'
type: exon
location: [20225849:20225979](-)
qualifiers:
    Key: gene, Value: ['CR33498']
    Key: gene_synonym, Value: ['CG33498; Dmel\\CR33498']
    Key: locus_tag, Value: ['Dmel_CR33498']
    Key: number, Value: ['1']
    Key: pseudo, Value: ['']

type: mobile_element
location: [20226131:20227857](+)
qualifiers:
    Key: db_xref, Value: ['FLYBASE:FBti0019653']
    Key: mobile_element_type, Value: ['transposon:Rt1c{}181']

feature: type: gene
location: [2022

Error parsing feature: type: exon
location: [20874940:20875303](+)
qualifiers:
    Key: gene, Value: ['CR11235']
    Key: gene_synonym, Value: ['CG11235; Dmel\\CR11235']
    Key: locus_tag, Value: ['Dmel_CR11235']
    Key: number, Value: ['1']
    Key: old_locus_tag, Value: ['Dmel_CG11235']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [21155431:21155667](-)
qualifiers:
    Key: gene, Value: ['CR45499']
    Key: gene_synonym, Value: ['Dmel\\CR45499']
    Key: locus_tag, Value: ['Dmel_CR45499']
    Key: number, Value: ['3']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [21155725:21155754](-)
qualifiers:
    Key: gene, Value: ['CR45499']
    Key: gene_synonym, Value: ['Dmel\\CR45499']
    Key: locus_tag, Value: ['Dmel_CR45499']
    Key: number, Value: ['2']
    Key: pseudo, Value: ['']

Error parsing feature: type: exon
location: [21155850:21156173](-)
qualifiers:
    Key: gene, Value: ['CR45499']
    Key: gene_synonym, Value: ['Dme

In [166]:
with open('/tmp/annotations.json', 'w') as f:
    f.write(json.dumps(output_files, indent=2))