In [4]:
import Bio.SeqIO

In [5]:
import sys

def parse_location(location_str):
    return location_str.split('..')

def parse_string(location_str):
    if location_str.index('complement') == 0:
        return '-'
    else:
        return '+'

def parse_entity(feature):
    '''
    Parse a transcribed entity
    '''
    entity = {}
    # print('feature:', feature)
    
    entity['start'] = feature.location.start.position
    entity['end'] = feature.location.end.position
    entity['strand'] = '+' if feature.location.strand == 1 else '-'
    #print(dir(feature))
    #print("qulaifiers:", feature.qualifiers)
    if 'gene' in feature.qualifiers:
        entity['name'] = feature.qualifiers['gene'][0]
    
    
    for part in feature.location.parts:
        entity['location_parts'] = {
            'start': part.start.position,
            'end': part.end.position,
            'strand': '+' if part.strand == 1 else '-',
        }

    for xref in feature.qualifiers['db_xref']:
        parts = xref.split(':')
        if parts[0] == 'GeneID':
            entity['geneId'] = parts[1]
            break

    if 'transcript_id' in feature.qualifiers:
        entity['transcriptId'] = feature.qualifiers['transcript_id'][0]

    #print('entity:', entity)
    return entity

def extract_refseq_features(refseq_file, sequence_handle=None):
    '''
    Extract a list of the features present in this file. This file should
    only contain sequences for one record.
    
    Parameters
    ----------
    refseq_file: string
        The filename of a refseq file for a single sequence
    sequence_name: string
        The name of the sequence that these refseq annotations are for
    sequence_handle File
        A file handle for the sequence file
        
    Returns
    --------
    features: {}
        A list of features indexed by GenBank gene ids
    '''
    record_count = 0
    genes = {}
    count = 0
    
    for record in Bio.SeqIO.parse(refseq_file, 'genbank'):
        record_count += 1
        #print("dir:", dir(record))
    
        if sequence_handle is not None:
            # store the sequence of this record in the provied fasta
            # file handle
            Bio.SeqIO.write(record, sequence_handle, 'fasta')
        
        print('record:', record)
        for feature in record.features:
            # print("dir", dir(feature))
            # print(feature.type)
            # print(feature.location, type(feature.location))
            # print(dir(feature))
            try:
                curr_entity = parse_entity(feature)
            except Exception as ex:
                print("Error parsing feature: {}".format(feature), file=sys.stderr)
                print("Error: {}".format(ex))


            curr_entity['type'] = feature.type

            if feature.type == 'gene':
                curr_entity['mRNAs'] = []
                print("feature:", feature)
                print('curr_entity:', curr_entity)
                genes[curr_entity['geneId']] = curr_entity

                if 'pseudo' in feature.qualifiers:
                    curr_entity['type'] = 'pseudo'

                # print("gene", curr_entity)
            elif feature.type == 'CDS':
                #print("feature:", feature)

                cds = {}
                cds['start'] = feature.location.start.position
                cds['end'] = feature.location.end.position
                genes[curr_entity['geneId']]['cds'] = cds
            elif feature.type == 'ncRNA':
                genes[curr_entity['geneId']] = curr_entity
            elif feature.type == 'mRNA':
                if 'mRNAs' not in genes[curr_entity['geneId']]:
                    genes[curr_entity['geneId']]['mRNAs'] = []
                genes[curr_entity['geneId']]['mRNAs'] += [curr_entity]
            elif feature.type == 'misc_RNA':
                genes[curr_entity['geneId']] = curr_entity        
                #print(feature)
            elif feature.type == 'precursor_RNA':
                genes[curr_entity['geneId']] = curr_entity
            elif feature.type == 'tRNA':
                genes[curr_entity['geneId']] = curr_entity
            else:
                print(feature)

            count += 1

    return genes

In [6]:
import json
import os
import os.path as op
import urllib

def extract_assembly_annotations(assembly_location):
    #wget -O /tmp/assembly.txt ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.33_GRCh38.p7/GCF_000001405.33_GRCh38.p7_assembly_report.txt
    !wget -O /tmp/assembly.txt {assembly_location}

    !cat /tmp/assembly.txt | grep "^#" | tail -n 1 > /tmp/assembly_headers.txt
    !cat /tmp/assembly.txt | grep -v "^#" > /tmp/assembly_processed.txt

    with open('/tmp/assembly_headers.txt', 'r') as f:
        headers = f.readlines()[0].strip().split()[1:]
        print(headers)
        
    output_dir = op.join(op.expanduser('~/data/nuccore'),
                         op.splitext(op.basename(assembly_location))[0])

    if not op.exists(output_dir):
        os.makedirs(output_dir)

    output_files = []
    annotations = {
        'sequences': []
    }

    with open('/tmp/assembly_processed.txt', 'r') as f:
        output_fa = op.join(output_dir, 'seq.fa')
        output_chromSizes = op.join(output_dir, 'chromSizes.tsv')
        
        if op.exists(output_fa):
            os.remove(output_fa)
        
        if op.exists(output_chromSizes):
            os.remove(output_chromSizes)
            
        f_chromsizes = open(op.join(output_dir, 'chromSizes.tsv'), 'a')
            
        for line in f:
            parts = line.strip().split()
            print("parts:", parts)
            genbank_accn = parts[6]
            sequence_name = parts[0]
            ucsc_name = parts[-1]
            refseq_name = parts[-5]
            seq_len = int(parts[-2])
            
            common_name = ucsc_name if ucsc_name != 'na' else refseq_name
            
            f_chromsizes.write("{}\t{}\n".format(common_name, seq_len))
            print('common_name:', common_name)

            output_file  = op.join(output_dir, '{}.gb'.format(genbank_accn))
            done_file = '{}.done'.format(output_file)
            print(output_file)

            if not op.exists(done_file):
                try:
                    download_file = output_file + '.orig'
                    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={}&rettype=gbwithparts&retmode=text'.format(genbank_accn)
                    #print("url:", url)
                    urllib.request.urlretrieve(url, output_file)
                    with open(done_file, 'w') as f1:
                        pass
                except Exception as ex:
                    continue
                    print('Ex:', ex)

            #output_struct = dict(zip(headers, parts))

            output_struct = {}
            output_struct['refseqFilename'] = output_file
            output_struct['sequence_name'] = sequence_name
            output_struct['ucsc_name'] = ucsc_name
            output_struct['refseq_name'] = refseq_name
            output_struct['gene_features'] = extract_refseq_features(output_file,
                                                                     sequence_handle=output_fa)
            output_files += [output_struct]
        annotations['sequences'] += [output_struct]

    with open(op.join(output_dir, 'annotations.json'), 'w') as fo:
        json.dump(annotations, fo, indent=2)
    #print(output_files)
    return output_dir

In [8]:
#
# assembly_location="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/GCF_000001215.4_Release_6_plus_ISO1_MT_assembly_report.txt"
# 
# assembly_location="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.37_GRCh38.p11/GCF_000001405.37_GRCh38.p11_assembly_report.txt"

# h37rv
# assembly_location="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/195/955/GCF_000195955.2_ASM19595v2/GCF_000195955.2_ASM19595v2_assembly_report.txt"

# sacCer3
assembly_location='ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_assembly_report.txt'

output_dir = extract_assembly_annotations(assembly_location)


--2018-03-12 13:14:20--  ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_assembly_report.txt
           => ‘/tmp/assembly.txt’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /genomes/all/GCF/000/146/045/GCF_000146045.2_R64 ... done.
==> SIZE GCF_000146045.2_R64_assembly_report.txt ... 2713
==> PASV ... done.    ==> RETR GCF_000146045.2_R64_assembly_report.txt ... done.
Length: 2713 (2.6K) (unauthoritative)


2018-03-12 13:14:21 (245 KB/s) - ‘/tmp/assembly.txt’ saved [2713]

['Sequence-Name', 'Sequence-Role', 'Assigned-Molecule', 'Assigned-Molecule-Location/Type', 'GenBank-Accn', 'Relationship', 'RefSeq-Accn', 'Assembly-Unit', 'Sequence-Length', 'UCSC-style-name']
parts: ['I', 'assembled-molecule', 'I', 'Chromosome', 

Error parsing feature: type: centromere
location: [151464:151474](+)
qualifiers:
    Key: note, Value: ['CEN1_CDEI of CEN1']

Error parsing feature: type: centromere
location: [151474:151557](+)
qualifiers:
    Key: note, Value: ['CEN1_CDEII of CEN1']

Error parsing feature: type: centromere
location: [151557:151582](+)
qualifiers:
    Key: note, Value: ['CEN1_CDEIII of CEN1']



record: ID: NC_001134.8
Name: NC_001134
Description: Saccharomyces cerevisiae S288c chromosome II, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 1336
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001134']
/sequence_version=8
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='Life with 6000 genes', ...), Reference(title='Complete DNA sequence of yeast chromosome II', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=REVIEWED REFSEQ: This record has been curated by SGD. The reference
sequence is identical to 

Error parsing feature: type: centromere
location: [238206:238216](+)
qualifiers:
    Key: note, Value: ['CEN2_CDEI of CEN2']

Error parsing feature: type: centromere
location: [238216:238298](+)
qualifiers:
    Key: note, Value: ['CEN2_CDEII of CEN2']

Error parsing feature: type: centromere
location: [238298:238323](+)
qualifiers:
    Key: note, Value: ['CEN2_CDEIII of CEN2']




curr_entity: {'start': 723735, 'end': 724263, 'strand': '-', 'name': 'TRS20', 'location_parts': {'start': 723735, 'end': 724263, 'strand': '-'}, 'geneId': '852556', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [<724455:>726540](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:852557']
    Key: gene, Value: ['MTC4']
    Key: locus_tag, Value: ['YBR255W']

curr_entity: {'start': 724455, 'end': 726540, 'strand': '+', 'name': 'MTC4', 'location_parts': {'start': 724455, 'end': 726540, 'strand': '+'}, 'geneId': '852557', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [<726617:>727074](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:852558']
    Key: gene, Value: ['RCF3']
    Key: locus_tag, Value: ['YBR255C-A']

curr_entity: {'start': 726617, 'end': 727074, 'strand': '-', 'name': 'RCF3', 'location_parts': {'start': 726617, 'end': 727074, 'strand': '-'}, 'geneId': '852558', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [<727385:>728102](-)
qualifiers:


Error parsing feature: type: centromere
location: [114384:114394](+)
qualifiers:
    Key: note, Value: ['CEN3_CDEI of CEN3']

Error parsing feature: type: centromere
location: [114394:114476](+)
qualifiers:
    Key: note, Value: ['CEN3_CDEII of CEN3']

Error parsing feature: type: centromere
location: [114476:114501](+)
qualifiers:
    Key: note, Value: ['CEN3_CDEIII of CEN3']



record: ID: NC_001136.10
Name: NC_001136
Description: Saccharomyces cerevisiae S288c chromosome IV, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 2458
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001136']
/sequence_version=10
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='The nucleotide sequence of Saccharomyces cerevisiae chromosome IV', ...), Reference(title='Life with 6000 genes', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission',

Error parsing feature: type: centromere
location: [449710:449720](+)
qualifiers:
    Key: note, Value: ['CEN4_CDEI of CEN4']

Error parsing feature: type: centromere
location: [449720:449796](+)
qualifiers:
    Key: note, Value: ['CEN4_CDEII of CEN4']

Error parsing feature: type: centromere
location: [449796:449821](+)
qualifiers:
    Key: note, Value: ['CEN4_CDEIII of CEN4']




feature: type: gene
location: [<755627:>763874](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:851727']
    Key: gene, Value: ['NUM1']
    Key: gene_synonym, Value: ['PAC12']
    Key: locus_tag, Value: ['YDR150W']

curr_entity: {'start': 755627, 'end': 763874, 'strand': '+', 'name': 'NUM1', 'location_parts': {'start': 755627, 'end': 763874, 'strand': '+'}, 'geneId': '851727', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [<764177:>765155](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:851729']
    Key: gene, Value: ['CTH1']
    Key: locus_tag, Value: ['YDR151C']

curr_entity: {'start': 764177, 'end': 765155, 'strand': '-', 'name': 'CTH1', 'location_parts': {'start': 764177, 'end': 765155, 'strand': '-'}, 'geneId': '851729', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [<765705:>766503](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:851730']
    Key: gene, Value: ['GIR2']
    Key: locus_tag, Value: ['YDR152W']

curr_entity: {'start': 765705, 'end': 7665

record: ID: NC_001137.3
Name: NC_001137
Description: Saccharomyces cerevisiae S288c chromosome V, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 985
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001137']
/sequence_version=3
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='The nucleotide sequence of Saccharomyces cerevisiae chromosome V', ...), Reference(title='Life with 6000 genes', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)

Error parsing feature: type: centromere
location: [151986:151996](+)
qualifiers:
    Key: note, Value: ['CEN5_CDEI of CEN5']

Error parsing feature: type: centromere
location: [151996:152079](+)
qualifiers:
    Key: note, Value: ['CEN5_CDEII of CEN5']

Error parsing feature: type: centromere
location: [152079:152104](+)
qualifiers:
    Key: note, Value: ['CEN5_CDEIII of CEN5']



record: ID: NC_001138.5
Name: NC_001138
Description: Saccharomyces cerevisiae S288c chromosome VI, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 447
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001138']
/sequence_version=5
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='Life with 6000 genes', ...), Reference(title='Analysis of the nucleotide sequence of chromosome VI from Saccharomyces cerevisiae', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=REVIEWED REFS

Error parsing feature: type: centromere
location: [148509:148519](+)
qualifiers:
    Key: note, Value: ['CEN6_CDEI of CEN6']

Error parsing feature: type: centromere
location: [148519:148602](+)
qualifiers:
    Key: note, Value: ['CEN6_CDEII of CEN6']

Error parsing feature: type: centromere
location: [148602:148627](+)
qualifiers:
    Key: note, Value: ['CEN6_CDEIII of CEN6']



record: ID: NC_001139.9
Name: NC_001139
Description: Saccharomyces cerevisiae S288c chromosome VII, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 1795
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001139']
/sequence_version=9
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='The nucleotide sequence of Saccharomyces cerevisiae chromosome VII', ...), Reference(title='Life with 6000 genes', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=REVIEWED REFSEQ: This recor

Error parsing feature: type: centromere
location: [496919:496944](-)
qualifiers:
    Key: note, Value: ['CEN7_CDEIII of CEN7']

Error parsing feature: type: centromere
location: [496944:497028](-)
qualifiers:
    Key: note, Value: ['CEN7_CDEII of CEN7']

Error parsing feature: type: centromere
location: [497028:497038](-)
qualifiers:
    Key: note, Value: ['CEN7_CDEI of CEN7']



type: gene
location: [<678694:>682135](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:852989']
    Key: gene, Value: ['ASK10']
    Key: gene_synonym, Value: ['RGC2']
    Key: locus_tag, Value: ['YGR097W']

curr_entity: {'start': 678694, 'end': 682135, 'strand': '+', 'name': 'ASK10', 'location_parts': {'start': 678694, 'end': 682135, 'strand': '+'}, 'geneId': '852989', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [<682565:>687458](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:852990']
    Key: gene, Value: ['ESP1']
    Key: locus_tag, Value: ['YGR098C']

curr_entity: {'start': 682565, 'end': 687458, 'strand': '-', 'name': 'ESP1', 'location_parts': {'start': 682565, 'end': 687458, 'strand': '-'}, 'geneId': '852990', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [<687898:>689965](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:852991']
    Key: gene, Value: ['TEL2']
    Key: locus_tag, Value: ['YGR099W']

curr_entity: {'start': 687898, 'end': 689965, 'stra

record: ID: NC_001140.6
Name: NC_001140
Description: Saccharomyces cerevisiae S288c chromosome VIII, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 950
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001140']
/sequence_version=6
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='Life with 6000 genes', ...), Reference(title='Complete nucleotide sequence of Saccharomyces cerevisiae chromosome VIII', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=REVIEWED REFSEQ: This record has been curated by SGD. The refere

Error parsing feature: type: centromere
location: [105585:105610](-)
qualifiers:
    Key: note, Value: ['CEN8_CDEIII of CEN8']

Error parsing feature: type: centromere
location: [105610:105693](-)
qualifiers:
    Key: note, Value: ['CEN8_CDEII of CEN8']

Error parsing feature: type: centromere
location: [105693:105703](-)
qualifiers:
    Key: note, Value: ['CEN8_CDEI of CEN8']



record: ID: NC_001141.2
Name: NC_001141
Description: Saccharomyces cerevisiae S288c chromosome IX, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 711
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001141']
/sequence_version=2
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='The nucleotide sequence of Saccharomyces cerevisiae chromosome IX', ...), Reference(title='Life with 6000 genes', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=REVIEWED REFSEQ: This record has been curated by SGD. The reference
seque

Error parsing feature: type: centromere
location: [355628:355638](+)
qualifiers:
    Key: note, Value: ['CEN9_CDEI of CEN9']

Error parsing feature: type: centromere
location: [355638:355720](+)
qualifiers:
    Key: note, Value: ['CEN9_CDEII of CEN9']

Error parsing feature: type: centromere
location: [355720:355745](+)
qualifiers:
    Key: note, Value: ['CEN9_CDEIII of CEN9']



record: ID: NC_001142.9
Name: NC_001142
Description: Saccharomyces cerevisiae S288c chromosome X, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 1208
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001142']
/sequence_version=9
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='Life with 6000 genes', ...), Reference(title='Complete nucleotide sequence of Saccharomyces cerevisiae chromosome X', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=REVIEWED REFSEQ: This record has been curated by SGD. The reference
s

Error parsing feature: type: centromere
location: [436306:436331](-)
qualifiers:
    Key: note, Value: ['CEN10_CDEIII of CEN10']

Error parsing feature: type: centromere
location: [436331:436415](-)
qualifiers:
    Key: note, Value: ['CEN10_CDEII of CEN10']

Error parsing feature: type: centromere
location: [436415:436425](-)
qualifiers:
    Key: note, Value: ['CEN10_CDEI of CEN10']



record: ID: NC_001143.9
Name: NC_001143
Description: Saccharomyces cerevisiae S288c chromosome XI, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 1033
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001143']
/sequence_version=9
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='Life with 6000 genes', ...), Reference(title='Complete DNA sequence of yeast chromosome XI', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=REVIEWED REFSEQ: This record has been curated by SGD. The reference
sequence is identical to 

Error parsing feature: type: centromere
location: [440128:440153](-)
qualifiers:
    Key: note, Value: ['CEN11_CDEIII of CEN11']

Error parsing feature: type: centromere
location: [440153:440236](-)
qualifiers:
    Key: note, Value: ['CEN11_CDEII of CEN11']

Error parsing feature: type: centromere
location: [440236:440246](-)
qualifiers:
    Key: note, Value: ['CEN11_CDEI of CEN11']



record: ID: NC_001144.5
Name: NC_001144
Description: Saccharomyces cerevisiae S288c chromosome XII, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 1739
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001144']
/sequence_version=5
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='The nucleotide sequence of Saccharomyces cerevisiae chromosome XII', ...), Reference(title='Life with 6000 genes', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission',

Error parsing feature: type: centromere
location: [150827:150852](-)
qualifiers:
    Key: note, Value: ['CEN12_CDEIII of CEN12']

Error parsing feature: type: centromere
location: [150852:150937](-)
qualifiers:
    Key: note, Value: ['CEN12_CDEII of CEN12']

Error parsing feature: type: centromere
location: [150937:150947](-)
qualifiers:
    Key: note, Value: ['CEN12_CDEI of CEN12']




type: repeat_region
location: [688368:688664](+)
qualifiers:
    Key: db_xref, Value: ['SGD:S000007091']
    Key: note, Value: ['Ty1 LTR']
    Key: rpt_type, Value: ['long_terminal_repeat']

feature: type: gene
location: [<689082:>691029](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:850979']
    Key: gene, Value: ['PIG1']
    Key: locus_tag, Value: ['YLR273C']

curr_entity: {'start': 689082, 'end': 691029, 'strand': '-', 'name': 'PIG1', 'location_parts': {'start': 689082, 'end': 691029, 'strand': '-'}, 'geneId': '850979', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [<691554:>693882](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:850980']
    Key: gene, Value: ['MCM5']
    Key: gene_synonym, Value: ['BOB1; CDC46']
    Key: locus_tag, Value: ['YLR274W']

curr_entity: {'start': 691554, 'end': 693882, 'strand': '+', 'name': 'MCM5', 'location_parts': {'start': 691554, 'end': 693882, 'strand': '+'}, 'geneId': '850980', 'type': 'gene', 'mRNAs': []}
feature: type: gene
lo

record: ID: NC_001145.3
Name: NC_001145
Description: Saccharomyces cerevisiae S288c chromosome XIII, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 1543
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001145']
/sequence_version=3
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='The nucleotide sequence of Saccharomyces cerevisiae chromosome XIII', ...), Reference(title='Life with 6000 genes', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=REVIEWED REFSEQ: This record has been curated by SGD. The reference


Error parsing feature: type: centromere
location: [268030:268040](+)
qualifiers:
    Key: note, Value: ['CEN13_CDEI of CEN13']

Error parsing feature: type: centromere
location: [268040:268124](+)
qualifiers:
    Key: note, Value: ['CEN13_CDEII of CEN13']

Error parsing feature: type: centromere
location: [268124:268149](+)
qualifiers:
    Key: note, Value: ['CEN13_CDEIII of CEN13']



 type: gene
location: [<610158:>610365](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:855212']
    Key: gene, Value: ['PAI3']
    Key: locus_tag, Value: ['YMR174C']

curr_entity: {'start': 610158, 'end': 610365, 'strand': '-', 'name': 'PAI3', 'location_parts': {'start': 610158, 'end': 610365, 'strand': '-'}, 'geneId': '855212', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [<611015:>611255](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:855213']
    Key: gene, Value: ['SIP18']
    Key: locus_tag, Value: ['YMR175W']

curr_entity: {'start': 611015, 'end': 611255, 'strand': '+', 'name': 'SIP18', 'location_parts': {'start': 611015, 'end': 611255, 'strand': '+'}, 'geneId': '855213', 'type': 'gene', 'mRNAs': []}
type: rep_origin
location: [611274:611489](+)
qualifiers:
    Key: db_xref, Value: ['SGD:S000118374']
    Key: note, Value: ['ARS1323; Autonomously Replicating Sequence']

feature: type: gene
location: [<611313:>611508](+)
qualifiers:
    Key: db_xref, Value: ['Gene

record: ID: NC_001146.8
Name: NC_001146
Description: Saccharomyces cerevisiae S288c chromosome XIV, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 1283
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001146']
/sequence_version=8
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='The nucleotide sequence of Saccharomyces cerevisiae chromosome XIV and its evolutionary implications', ...), Reference(title='Life with 6000 genes', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/c

Error parsing feature: type: centromere
location: [628757:628767](+)
qualifiers:
    Key: note, Value: ['CEN14_CDEI of CEN14']

Error parsing feature: type: centromere
location: [628767:628850](+)
qualifiers:
    Key: note, Value: ['CEN14_CDEII of CEN14']

Error parsing feature: type: centromere
location: [628850:628875](+)
qualifiers:
    Key: note, Value: ['CEN14_CDEIII of CEN14']



feature: type: gene
location: [<587106:>587847](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:855707']
    Key: gene, Value: ['EFM6']
    Key: locus_tag, Value: ['YNL024C']

curr_entity: {'start': 587106, 'end': 587847, 'strand': '-', 'name': 'EFM6', 'location_parts': {'start': 587106, 'end': 587847, 'strand': '-'}, 'geneId': '855707', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [<588262:>591160](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:855708']
    Key: gene, Value: ['FAP1']
    Key: locus_tag, Value: ['YNL023C']

curr_entity: {'start': 588262, 'end': 591160, 'strand': '-', 'name': 'FAP1', 'location_parts': {'start': 588262, 'end': 591160, 'strand': '-'}, 'geneId': '855708', 'type': 'gene', 'mRNAs': []}
type: STS
location: [588728:588831](+)
qualifiers:
    Key: db_xref, Value: ['UniSTS:257343']
    Key: standard_name, Value: ['px-17d12']

feature: type: gene
location: [<591426:>592899](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:855709']
    Key: gene, 

Error parsing feature: type: centromere
location: [326583:326608](-)
qualifiers:
    Key: note, Value: ['CEN15_CDEIII of CEN15']

Error parsing feature: type: centromere
location: [326608:326692](-)
qualifiers:
    Key: note, Value: ['CEN15_CDEII of CEN15']

Error parsing feature: type: centromere
location: [326692:326702](-)
qualifiers:
    Key: note, Value: ['CEN15_CDEI of CEN15']




feature: type: gene
location: [<727511:>729644](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:854381']
    Key: gene, Value: ['NOC2']
    Key: locus_tag, Value: ['YOR206W']

curr_entity: {'start': 727511, 'end': 729644, 'strand': '+', 'name': 'NOC2', 'location_parts': {'start': 727511, 'end': 729644, 'strand': '+'}, 'geneId': '854381', 'type': 'gene', 'mRNAs': []}
type: rep_origin
location: [729739:729969](+)
qualifiers:
    Key: db_xref, Value: ['SGD:S000118491']
    Key: note, Value: ['ARS1521; Autonomously Replicating Sequence']

feature: type: gene
location: [<730007:>733457](-)
qualifiers:
    Key: db_xref, Value: ['GeneID:854382']
    Key: gene, Value: ['RET1']
    Key: gene_synonym, Value: ['PDS2; RPC128; RPC2']
    Key: locus_tag, Value: ['YOR207C']

curr_entity: {'start': 730007, 'end': 733457, 'strand': '-', 'name': 'RET1', 'location_parts': {'start': 730007, 'end': 733457, 'strand': '-'}, 'geneId': '854382', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [<73

record: ID: NC_001148.4
Name: NC_001148
Description: Saccharomyces cerevisiae S288c chromosome XVI, complete sequence
Database cross-references: BioProject:PRJNA128, Assembly:GCF_000146045.2
Number of features: 1540
/molecule_type=DNA
/topology=linear
/data_file_division=CON
/date=27-MAR-2017
/accessions=['NC_001148']
/sequence_version=4
/keywords=['RefSeq']
/source=Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='The nucleotide sequence of Saccharomyces cerevisiae chromosome XVI', ...), Reference(title='Life with 6000 genes', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=REVIEWED REFSEQ: This record has been curated by SGD. The reference
se

Error parsing feature: type: centromere
location: [555956:555966](+)
qualifiers:
    Key: note, Value: ['CEN16_CDEI of CEN16']

Error parsing feature: type: centromere
location: [555966:556048](+)
qualifiers:
    Key: note, Value: ['CEN16_CDEII of CEN16']

Error parsing feature: type: centromere
location: [556048:556073](+)
qualifiers:
    Key: note, Value: ['CEN16_CDEIII of CEN16']




curr_entity: {'start': 627879, 'end': 631245, 'strand': '+', 'name': 'CSR2', 'location_parts': {'start': 627879, 'end': 631245, 'strand': '+'}, 'geneId': '856142', 'type': 'gene', 'mRNAs': []}
feature: type: gene
location: [<631514:>633761](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:856143']
    Key: gene, Value: ['NTO1']
    Key: locus_tag, Value: ['YPR031W']

curr_entity: {'start': 631514, 'end': 633761, 'strand': '+', 'name': 'NTO1', 'location_parts': {'start': 631514, 'end': 633761, 'strand': '+'}, 'geneId': '856143', 'type': 'gene', 'mRNAs': []}
type: rep_origin
location: [633873:634122](+)
qualifiers:
    Key: db_xref, Value: ['SGD:S000118419']
    Key: note, Value: ['ARS1623; Autonomously Replicating Sequence']

feature: type: gene
location: [<634122:>637224](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:856144']
    Key: gene, Value: ['SRO7']
    Key: gene_synonym, Value: ['SNI1; SOP1']
    Key: locus_tag, Value: ['YPR032W']

curr_entity: {'start': 634122, 'end': 6372

record: ID: NC_001224.1
Name: NC_001224
Description: Saccharomyces cerevisiae S288c mitochondrion, complete genome
Database cross-references: BioProject:PRJNA128
Number of features: 102
/molecule_type=DNA
/topology=circular
/data_file_division=PLN
/date=05-JUN-2017
/accessions=['NC_001224']
/sequence_version=1
/keywords=['RefSeq']
/source=mitochondrion Saccharomyces cerevisiae S288C
/organism=Saccharomyces cerevisiae S288C
/taxonomy=['Eukaryota', 'Fungi', 'Dikarya', 'Ascomycota', 'Saccharomycotina', 'Saccharomycetes', 'Saccharomycetales', 'Saccharomycetaceae', 'Saccharomyces']
/references=[Reference(title='The complete sequence of the mitochondrial genome of Saccharomyces cerevisiae', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/comment=PROVISIONAL REFSEQ: This record has not yet been subject to final
NCBI review. The reference sequence is identical to KP263414.
COMPLETENESS: full length.
Seq('TT

In [104]:
def annotations_to_lines(annotations_json_file):
    '''
    Convert a list of annotations in JSON format to a text file.
    
    Parameters
    ----------
    annotations_json_file: string
        The filename containing the json file with the gene annotations
    
    Returns
    -------
        A list of lines containing refGene formatted lines with the annotations
    '''
    with open(annotations_json_file, 'r') as f:
        json_file = json.load(f)
    
        for sequence in json_file['sequences']:
            common_name = sequence['ucsc_name'] if sequence['ucsc_name'] != 'na' else sequence['refseq_name']
            print('common_name', common_name)
            
            for gene in sequence['gene_features'].values():
                if 'cds' in gene:
                    print("{chr}\t{start}\t{end}".format(
                        chr=common_name,
                        start=gene['cds']['start'],
                        end=gene['cds']['end']))
                else:
                    print
                
            

In [105]:
lines = annotations_to_lines(op.join(output_dir, 'annotations.json'))

common_name NC_000962.3
NC_000962.3	0	1524
NC_000962.3	2051	3260
NC_000962.3	3279	4437
NC_000962.3	4433	4997
NC_000962.3	5239	7267
NC_000962.3	7301	9818
NC_000962.3	9913	10828


KeyError: 'cds'