In [1]:
import Bio.SeqIO

In [92]:
import sys

def parse_location(location_str):
    return location_str.split('..')

def parse_string(location_str):
    if location_str.index('complement') == 0:
        return '-'
    else:
        return '+'

def parse_entity(feature):
    '''
    Parse a transcribed entity
    '''
    entity = {}
    # print('feature:', feature)
    
    entity['start'] = feature.location.start.position
    entity['end'] = feature.location.end.position
    entity['strand'] = '+' if feature.location.strand == 1 else '-'
    #print(dir(feature))
    #print("qulaifiers:", feature.qualifiers)
    if 'gene' in feature.qualifiers:
        entity['name'] = feature.qualifiers['gene'][0]
    
    
    for part in feature.location.parts:
        entity['location_parts'] = {
            'start': part.start.position,
            'end': part.end.position,
            'strand': '+' if part.strand == 1 else '-',
        }

    for xref in feature.qualifiers['db_xref']:
        parts = xref.split(':')
        if parts[0] == 'GeneID':
            entity['geneId'] = parts[1]
            break

    if 'transcript_id' in feature.qualifiers:
        entity['transcriptId'] = feature.qualifiers['transcript_id'][0]

    #print('entity:', entity)
    return entity

def extract_refseq_features(refseq_file, sequence_handle=None):
    '''
    Extract a list of the features present in this file. This file should
    only contain sequences for one record.
    
    Parameters
    ----------
    refseq_file: string
        The filename of a refseq file for a single sequence
    sequence_name: string
        The name of the sequence that these refseq annotations are for
    sequence_handle File
        A file handle for the sequence file
        
    Returns
    --------
    features: {}
        A list of features indexed by GenBank gene ids
    '''
    record_count = 0
    genes = {}
    count = 0
    
    for record in Bio.SeqIO.parse(refseq_file, 'genbank'):
        record_count += 1
        #print("dir:", dir(record))
    
        if sequence_handle is not None:
            # store the sequence of this record in the provied fasta
            # file handle
            Bio.SeqIO.write(record, sequence_handle, 'fasta')
        
        print('record:', record)
        for feature in record.features:
            # print("dir", dir(feature))
            # print(feature.type)
            # print(feature.location, type(feature.location))
            # print(dir(feature))
            try:
                curr_entity = parse_entity(feature)
            except Exception as ex:
                print("Error parsing feature: {}".format(feature), file=sys.stderr)
                print("Error: {}".format(ex))


            curr_entity['type'] = feature.type

            if feature.type == 'gene':
                curr_entity['mRNAs'] = []
                print("feature:", feature)
                print('curr_entity:', curr_entity)
                genes[curr_entity['geneId']] = curr_entity

                if 'pseudo' in feature.qualifiers:
                    curr_entity['type'] = 'pseudo'

                # print("gene", curr_entity)
            elif feature.type == 'CDS':
                #print("feature:", feature)

                cds = {}
                cds['start'] = feature.location.start.position
                cds['end'] = feature.location.end.position
                genes[curr_entity['geneId']]['cds'] = cds
            elif feature.type == 'ncRNA':
                genes[curr_entity['geneId']] = curr_entity
            elif feature.type == 'mRNA':
                if 'mRNAs' not in genes[curr_entity['geneId']]:
                    genes[curr_entity['geneId']]['mRNAs'] = []
                genes[curr_entity['geneId']]['mRNAs'] += [curr_entity]
            elif feature.type == 'misc_RNA':
                genes[curr_entity['geneId']] = curr_entity        
                #print(feature)
            elif feature.type == 'precursor_RNA':
                genes[curr_entity['geneId']] = curr_entity
            elif feature.type == 'tRNA':
                genes[curr_entity['geneId']] = curr_entity
            else:
                print(feature)

            count += 1

    return genes

In [97]:
import json
import os
import os.path as op
import urllib

def extract_assembly_annotations(assembly_location):
    #wget -O /tmp/assembly.txt ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.33_GRCh38.p7/GCF_000001405.33_GRCh38.p7_assembly_report.txt
    !wget -O /tmp/assembly.txt {assembly_location}

    !cat /tmp/assembly.txt | grep "^#" | tail -n 1 > /tmp/assembly_headers.txt
    !cat /tmp/assembly.txt | grep -v "^#" > /tmp/assembly_processed.txt

    with open('/tmp/assembly_headers.txt', 'r') as f:
        headers = f.readlines()[0].strip().split()[1:]
        print(headers)
        
    output_dir = op.join(op.expanduser('~/data/nuccore'),
                         op.splitext(op.basename(assembly_location))[0])

    if not op.exists(output_dir):
        os.makedirs(output_dir)

    output_files = []
    annotations = {
        'sequences': []
    }

    with open('/tmp/assembly_processed.txt', 'r') as f:
        output_fa = op.join(output_dir, 'seq.fa')
        output_chromSizes = op.join(output_dir, 'chromSizes.tsv')
        
        if op.exists(output_fa):
            os.remove(output_fa)
        
        if op.exists(output_chromSizes):
            os.remove(output_chromSizes)
            
        f_chromsizes = open(op.join(output_dir, 'chromSizes.tsv'), 'a')
            
        for line in f:
            parts = line.strip().split()
            print("parts:", parts)
            genbank_accn = parts[6]
            sequence_name = parts[0]
            ucsc_name = parts[-1]
            refseq_name = parts[-5]
            seq_len = int(parts[-2])
            
            common_name = ucsc_name if ucsc_name != 'na' else refseq_name
            
            f_chromsizes.write("{}\t{}\n".format(common_name, seq_len))
            print('common_name:', common_name)

            output_file  = op.join(output_dir, '{}.gb'.format(genbank_accn))
            done_file = '{}.done'.format(output_file)
            print(output_file)

            if not op.exists(done_file):
                try:
                    download_file = output_file + '.orig'
                    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={}&rettype=gbwithparts&retmode=text'.format(genbank_accn)
                    #print("url:", url)
                    urllib.request.urlretrieve(url, output_file)
                    with open(done_file, 'w') as f1:
                        pass
                except Exception as ex:
                    continue
                    print('Ex:', ex)

            #output_struct = dict(zip(headers, parts))

            output_struct = {}
            output_struct['refseqFilename'] = output_file
            output_struct['sequence_name'] = sequence_name
            output_struct['ucsc_name'] = ucsc_name
            output_struct['refseq_name'] = refseq_name
            output_struct['gene_features'] = extract_refseq_features(output_file,
                                                                     sequence_handle=output_fa)
            output_files += [output_struct]
        annotations['sequences'] += [output_struct]

    with open(op.join(output_dir, 'annotations.json'), 'w') as fo:
        json.dump(annotations, fo, indent=2)
    #print(output_files)
    return output_dir

In [98]:
#
# assembly_location="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/GCF_000001215.4_Release_6_plus_ISO1_MT_assembly_report.txt"
# 
# assembly_location="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.37_GRCh38.p11/GCF_000001405.37_GRCh38.p11_assembly_report.txt"
# h37rv
assembly_location="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/195/955/GCF_000195955.2_ASM19595v2/GCF_000195955.2_ASM19595v2_assembly_report.txt"
output_dir = extract_assembly_annotations(assembly_location)


--2018-01-23 16:40:00--  ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/195/955/GCF_000195955.2_ASM19595v2/GCF_000195955.2_ASM19595v2_assembly_report.txt
           => ‘/tmp/assembly.txt’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.230
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.230|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /genomes/all/GCF/000/195/955/GCF_000195955.2_ASM19595v2 ... done.
==> SIZE GCF_000195955.2_ASM19595v2_assembly_report.txt ... 1252
==> PASV ... done.    ==> RETR GCF_000195955.2_ASM19595v2_assembly_report.txt ... done.
Length: 1252 (1.2K) (unauthoritative)


2018-01-23 16:40:01 (2.44 MB/s) - ‘/tmp/assembly.txt’ saved [1252]

['Sequence-Name', 'Sequence-Role', 'Assigned-Molecule', 'Assigned-Molecule-Location/Type', 'GenBank-Accn', 'Relationship', 'RefSeq-Accn', 'Assembly-Unit', 'Sequence-Length', 'UCSC-style-name']
parts: ['ANONYMOUS

Error parsing feature: type: repeat_region
location: [23172:23273](-)
qualifiers:
    Key: note, Value: ['101 bp Mycobacterial Interspersed Repetitive Unit,Class I. See Supply et al. (1997) Molecular Microbiology 26, 991-1003']

Error parsing feature: type: repeat_region
location: [79506:79551](+)
qualifiers:
    Key: locus_tag, Value: ['Rv0071']
    Key: note, Value: ['5 x 9 bp GTGGACCCG repeats']

Error parsing feature: type: repeat_region
location: [80235:80550](+)
qualifiers:
    Key: note, Value: ["(MTV030.15), len: 315 nt. Probable REP'-1 pseudogene fragment"]

Error parsing feature: type: repeat_region
location: [103712:105215](-)
qualifiers:
    Key: note, Value: ['REP-2, len: 1503 nt. REP251, member of REP13E12 family.']

Error parsing feature: type: repeat_region
location: [154072:154125](+)
qualifiers:
    Key: gene, Value: ['treS']
    Key: locus_tag, Value: ['Rv0126']
    Key: note, Value: ['53 bp Mycobacterial Interspersed Repetitive Unit,Class III (see Supply et al., 199

feature: type: gene
location: [505085:506582](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:886388']
    Key: gene, Value: ['lpqM']
    Key: locus_tag, Value: ['Rv0419']

curr_entity: {'start': 505085, 'end': 506582, 'strand': '+', 'name': 'lpqM', 'location_parts': {'start': 505085, 'end': 506582, 'strand': '+'}, 'geneId': '886388', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [506560:506971](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:886383']
    Key: locus_tag, Value: ['Rv0420c']

curr_entity: {'start': 506560, 'end': 506971, 'strand': '-', 'location_parts': {'start': 506560, 'end': 506971, 'strand': '-'}, 'geneId': '886383', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [507131:507761](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:886377']
    Key: locus_tag, Value: ['Rv0421c']

curr_entity: {'start': 507131, 'end': 507761, 'strand': '-', 'location_parts': {'start': 507131, 'end': 507761, 'strand': '-'}, 'geneId': '886377', 'type': 'gene', 'mRN

Error parsing feature: type: repeat_region
location: [547487:547517](-)
qualifiers:
    Key: gene, Value: ['mazE1']
    Key: locus_tag, Value: ['Rv0456B']
    Key: note, Value: ['3 copies of a 10 bp near-perfect direct repeat,ATTACTACCTATTACTACGTATTACTATCT']

Error parsing feature: type: misc_feature
location: [575032:575069](+)
qualifiers:
    Key: locus_tag, Value: ['Rv0485']
    Key: note, Value: ['mcr19, fragment of putative small regulatory RNA (See DiChiara et al., 2010), cloned from M. bovis BCG~Pasteur; ends not mapped, 66-82 nt band detected by Northern blot in M. bovis BCG Pasteur.']

Error parsing feature: type: repeat_region
location: [580577:580654](+)
qualifiers:
    Key: note, Value: ['77 bp Mycobacterial Interspersed Repetitive Unit,Class I (see Supply et al., 1997)']

Error parsing feature: type: repeat_region
location: [580654:580731](+)
qualifiers:
    Key: note, Value: ['77 bp Mycobacterial Interspersed Repetitive Unit,Class I (see Supply et al., 1997)']

Error pars

 type: gene
location: [1058259:1060575](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:885442']
    Key: gene, Value: ['uvrD1']
    Key: gene_synonym, Value: ['uvrD']
    Key: locus_tag, Value: ['Rv0949']

curr_entity: {'start': 1058259, 'end': 1060575, 'strand': '+', 'name': 'uvrD1', 'location_parts': {'start': 1058259, 'end': 1060575, 'strand': '+'}, 'geneId': '885442', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [1060655:1061654](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:885438']
    Key: locus_tag, Value: ['Rv0950c']

curr_entity: {'start': 1060655, 'end': 1061654, 'strand': '-', 'location_parts': {'start': 1060655, 'end': 1061654, 'strand': '-'}, 'geneId': '885438', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [1061963:1063127](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:885434']
    Key: gene, Value: ['sucC']
    Key: locus_tag, Value: ['Rv0951']

curr_entity: {'start': 1061963, 'end': 1063127, 'strand': '+', 'name': 'sucC', 'location_pa

Error parsing feature: type: mobile_element
location: [1158920:1160433](-)
qualifiers:
    Key: mobile_element_type, Value: ['insertion sequence:IS1560-1']
    Key: note, Value: ['IS1560-1, len: 1513 nt. Insertion sequence IS1560.']

Error parsing feature: type: mobile_element
location: [1164571:1165549](-)
qualifiers:
    Key: mobile_element_type, Value: ['insertion sequence:IS-LIKE-1']
    Key: note, Value: ['IS-LIKE-1, len: 978 nt. Insertion sequence, ISLIKE,region. This region is a possible MT-complex-specific genomic island (See Becq et al., 2007).']

Error parsing feature: type: repeat_region
location: [1164571:1164589](+)
qualifiers:
    Key: note, Value: ['18 bp inverted repeat at the left end of IS-LIKE element, CTAGGGCGTGTCTCCCAA. This region is a possible MT-complex-specific genomic island (See Becq et al.,2007).']

Error parsing feature: type: repeat_region
location: [1165531:1165549](-)
qualifiers:
    Key: note, Value: ['18 bp inverted repeat at the right end of a IS-LIKE

feature: type: gene
location: [1709643:1710603](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:886451']
    Key: locus_tag, Value: ['Rv1518']

curr_entity: {'start': 1709643, 'end': 1710603, 'strand': '+', 'location_parts': {'start': 1709643, 'end': 1710603, 'strand': '+'}, 'geneId': '886451', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [1710732:1711002](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:886453']
    Key: locus_tag, Value: ['Rv1519']

curr_entity: {'start': 1710732, 'end': 1711002, 'strand': '+', 'location_parts': {'start': 1710732, 'end': 1711002, 'strand': '+'}, 'geneId': '886453', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [1711027:1712068](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:886447']
    Key: locus_tag, Value: ['Rv1520']

curr_entity: {'start': 1711027, 'end': 1712068, 'strand': '+', 'location_parts': {'start': 1711027, 'end': 1712068, 'strand': '+'}, 'geneId': '886447', 'type': 'gene', 'mRNAs': []}
feature: type: gene
lo

Error parsing feature: type: repeat_region
location: [1779265:1779277](-)
qualifiers:
    Key: note, Value: ['12 bp direct repeat 1, ccacggccaacc, flanking phage-like element, second site at 1788514..1788525. This region is a possible MT-complex-specific genomic island (See Becq et al., 2007).']

Error parsing feature: type: repeat_region
location: [1779958:1780047](-)
qualifiers:
    Key: note, Value: ['89 bp direct repeat 2, first copy at 1780485..1780573, GGGTTGCGTTGTCGATTCGTTTGAGCCGCCGGTAGGTGCCGGCGGAGATGCCGAGGGC TG CGCCGATAGCAGTGTCTGTTTTCGTCGAA. This region is a possible MT-complex-specific genomic island (See Becq et al.,2007).']

Error parsing feature: type: repeat_region
location: [1780484:1780573](-)
qualifiers:
    Key: note, Value: ['89 bp direct repeat 1, second copy at 1779959..1780047, GGGTTGCGTTGTCGATTCGTTTGAGCCGCCGGTAGGTGCCGGCGGAGATGCCGAGGGC TG CGCCGATAGCAGTGTCTGTTTTCGTCGAA. Many repeats, both direct and inverted, in this region. This region is a possible MT-complex-spec

feature: type: gene
location: [2363390:2364107](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:888639']
    Key: locus_tag, Value: ['Rv2102']

curr_entity: {'start': 2363390, 'end': 2364107, 'strand': '+', 'location_parts': {'start': 2363390, 'end': 2364107, 'strand': '+'}, 'geneId': '888639', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [2364085:2364520](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:888003']
    Key: gene, Value: ['vapC37']
    Key: locus_tag, Value: ['Rv2103c']

curr_entity: {'start': 2364085, 'end': 2364520, 'strand': '-', 'name': 'vapC37', 'location_parts': {'start': 2364085, 'end': 2364520, 'strand': '-'}, 'geneId': '888003', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [2364526:2364781](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:888014']
    Key: gene, Value: ['vapB37']
    Key: locus_tag, Value: ['Rv2104c']

curr_entity: {'start': 2364526, 'end': 2364781, 'strand': '-', 'name': 'vapB37', 'location_parts': {'start': 2364526,

Error parsing feature: type: mobile_element
location: [2430116:2431471](-)
qualifiers:
    Key: mobile_element_type, Value: ['insertion sequence:IS6110-6']
    Key: note, Value: ['IS6110-6, len: 1355 nt. Insertion sequence IS6110.']

Error parsing feature: type: repeat_region
location: [2430116:2430144](-)
qualifiers:
    Key: note, Value: ['28 bp Inverted repeat at the left end of IS6110; GAGTCTCCGGACTCACCGGGGCGGTTCA']

Error parsing feature: type: repeat_region
location: [2431443:2431471](-)
qualifiers:
    Key: note, Value: ['28 bp Inverted repeat at the right end of IS6110,TGAACCGCCCCGGCATGTCCGGAGACTC']

Error parsing feature: type: misc_feature
location: [2437822:2437866](-)
qualifiers:
    Key: locus_tag, Value: ['Rv2175c']
    Key: note, Value: ['mcr5, fragment of putative small regulatory RNA (See DiChiara et al., 2010), cloned from M. bovis BCG Pasteur; ends not mapped, ~82 nt band detected by Northern blot.; Fragment of putative small regulatory RNA']

Error parsing feature: 

type: gene
location: [2965477:2965837](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:888594']
    Key: locus_tag, Value: ['Rv2640c']

curr_entity: {'start': 2965477, 'end': 2965837, 'strand': '-', 'location_parts': {'start': 2965477, 'end': 2965837, 'strand': '-'}, 'geneId': '888594', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [2965938:2966397](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:888629']
    Key: gene, Value: ['cadI']
    Key: locus_tag, Value: ['Rv2641']

curr_entity: {'start': 2965938, 'end': 2966397, 'strand': '+', 'name': 'cadI', 'location_parts': {'start': 2965938, 'end': 2966397, 'strand': '+'}, 'geneId': '888629', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [2966532:2966913](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:887703']
    Key: locus_tag, Value: ['Rv2642']

curr_entity: {'start': 2966532, 'end': 2966913, 'strand': '+', 'location_parts': {'start': 2966532, 'end': 2966913, 'strand': '+'}, 'geneId': '887703', 'type': 'gen

Error parsing feature: type: repeat_region
location: [2996002:2996053](+)
qualifiers:
    Key: note, Value: ['51 bp Mycobacterial Interspersed Repetitive Unit,Class II']

Error parsing feature: type: repeat_region
location: [2996053:2996104](+)
qualifiers:
    Key: note, Value: ['51 bp Mycobacterial Interspersed Repetitive Unit,Class II']

Error parsing feature: type: repeat_region
location: [2996104:2996155](+)
qualifiers:
    Key: locus_tag, Value: ['Rv2680']
    Key: note, Value: ['51 bp Mycobacterial Interspersed Repetitive Unit,Class II']

Error parsing feature: type: repeat_region
location: [3007062:3007115](-)
qualifiers:
    Key: note, Value: ['51 bp Mycobacterial Interspersed Repetitive Unit,Class II']

Error parsing feature: type: repeat_region
location: [3007115:3007168](-)
qualifiers:
    Key: note, Value: ['51 bp Mycobacterial Interspersed Repetitive Unit,Class II']

Error parsing feature: type: repeat_region
location: [3007168:3007221](-)
qualifiers:
    Key: note, Value:

Error: 'db_xref'
type: mobile_element
location: [3481398:3482722](+)
qualifiers:
    Key: mobile_element_type, Value: ['insertion sequence:IS1081-6']
    Key: note, Value: ['IS1081-6, len: 1324 nt. Insertion sequence IS1081. This region is a possible MT-complex-specific genomic island (See Becq et al., 2007).']

Error: 'db_xref'
type: repeat_region
location: [3481398:3481413](+)
qualifiers:
    Key: note, Value: ['15 bp inverted repeat at left end of IS1081: TCGCGTGATCCTTCG. This region is a possible MT-complex-specific genomic island (See Becq et al.,2007).']

feature: type: gene
location: [3481450:3482698](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:888790']
    Key: locus_tag, Value: ['Rv3115']

curr_entity: {'start': 3481450, 'end': 3482698, 'strand': '+', 'location_parts': {'start': 3481450, 'end': 3482698, 'strand': '+'}, 'geneId': '888790', 'type': 'gene', 'mRNAs': []}
Error: 'db_xref'
type: repeat_region
location: [3482707:3482722](-)
qualifiers:
    Key: note, Value: ['15


Error parsing feature: type: repeat_region
location: [3481398:3481413](+)
qualifiers:
    Key: note, Value: ['15 bp inverted repeat at left end of IS1081: TCGCGTGATCCTTCG. This region is a possible MT-complex-specific genomic island (See Becq et al.,2007).']

Error parsing feature: type: repeat_region
location: [3482707:3482722](-)
qualifiers:
    Key: note, Value: ['15 bp inverted repeat at right end of IS1081: TCGCGTGATCCTTCG. This region is a possible MT-complex-specific genomic island (See Becq et al.,2007).']

Error parsing feature: type: misc_feature
location: [3493167:3494181](-)
qualifiers:
    Key: locus_tag, Value: ['Rv3128c']
    Key: note, Value: ['This ORF corresponds to a fusion of MTCY164.38 and MTCY164.39c. Has in-frame amber stop codon']
    Key: pseudogene, Value: ['unknown']

Error parsing feature: type: repeat_region
location: [3551226:3551229](+)
qualifiers:
    Key: note, Value: ["3 bp direct repeat, cga, at 5'-end of IS6110. This region is a possible MT-complex-


curr_entity: {'start': 4133515, 'end': 4134592, 'strand': '+', 'name': 'moxR2', 'location_parts': {'start': 4133515, 'end': 4134592, 'strand': '+'}, 'geneId': '885323', 'type': 'gene', 'mRNAs': []}
Error: 'db_xref'
type: repeat_region
location: [4134600:4134725](+)
qualifiers:
    Key: note, Value: ['125 bp Mycobacterial Interspersed Repetitive Unit,Class III.']

feature: type: gene
location: [4134725:4136048](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:885059']
    Key: locus_tag, Value: ['Rv3693']

curr_entity: {'start': 4134725, 'end': 4136048, 'strand': '+', 'location_parts': {'start': 4134725, 'end': 4136048, 'strand': '+'}, 'geneId': '885059', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [4136121:4137114](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:885578']
    Key: locus_tag, Value: ['Rv3694c']

curr_entity: {'start': 4136121, 'end': 4137114, 'strand': '-', 'location_parts': {'start': 4136121, 'end': 4137114, 'strand': '-'}, 'geneId': '885578', 'type': '

Error parsing feature: type: repeat_region
location: [4134600:4134725](+)
qualifiers:
    Key: note, Value: ['125 bp Mycobacterial Interspersed Repetitive Unit,Class III.']

Error parsing feature: type: mobile_element
location: [4252992:4254324](+)
qualifiers:
    Key: mobile_element_type, Value: ['insertion sequence:IS1557-3']
    Key: note, Value: ['IS1557-3, len: 1332 nt. Insertion sequence IS1557.']

Error parsing feature: type: mobile_element
location: [4301542:4303415](-)
qualifiers:
    Key: mobile_element_type, Value: ['insertion sequence:IS1537']
    Key: note, Value: ['IS1537, len: 1873 nt. Insertion sequence IS1537.']

Error parsing feature: type: misc_feature
location: [4314797:4314891](+)
qualifiers:
    Key: note, Value: ['ncrMT3949, fragment of putative small regulatory RNA (See Pelly et al., 2012), cloned from M. tuberculosis CDC1551; supported by RNA-seq in H37Rv (unpublished data).; Fragment of putative small regulatory RNA']

Error parsing feature: type: repeat_regio

In [104]:
def annotations_to_lines(annotations_json_file):
    '''
    Convert a list of annotations in JSON format to a text file.
    
    Parameters
    ----------
    annotations_json_file: string
        The filename containing the json file with the gene annotations
    
    Returns
    -------
        A list of lines containing refGene formatted lines with the annotations
    '''
    with open(annotations_json_file, 'r') as f:
        json_file = json.load(f)
    
        for sequence in json_file['sequences']:
            common_name = sequence['ucsc_name'] if sequence['ucsc_name'] != 'na' else sequence['refseq_name']
            print('common_name', common_name)
            
            for gene in sequence['gene_features'].values():
                if 'cds' in gene:
                    print("{chr}\t{start}\t{end}".format(
                        chr=common_name,
                        start=gene['cds']['start'],
                        end=gene['cds']['end']))
                else:
                    print
                
            

In [105]:
lines = annotations_to_lines(op.join(output_dir, 'annotations.json'))

common_name NC_000962.3
NC_000962.3	0	1524
NC_000962.3	2051	3260
NC_000962.3	3279	4437
NC_000962.3	4433	4997
NC_000962.3	5239	7267
NC_000962.3	7301	9818
NC_000962.3	9913	10828


KeyError: 'cds'