In [134]:
import json

import gffutils
# Build the gffutils database file 'Gb.db' from the GFF3 annotation file and
# keep a handle to it in `db` for the cells below.
# NOTE(review): per the gffutils documentation, force=True overwrites an
# existing database file and merge_strategy='merge' merges the attributes of
# features that share an ID -- confirm this is the intended handling here.
db = gffutils.create_db("genome.genes.gff3", 
                        dbfn='Gb.db', 
                        force=True, 
                        keep_order=True,
                        merge_strategy='merge', 
                        sort_attribute_values=True)

def match_mRNAs_and_CDSs(mRNA_list, CDS_list):
    """Pair every mRNA with the CDS features that name it as a parent.

    Args:
        mRNA_list: Iterable of feature objects exposing ``id``.
        CDS_list: Iterable of feature objects whose
            ``attributes.get("Parent")`` returns a list of parent IDs, or a
            falsy value when the attribute is absent.

    Returns:
        A list of ``(mRNA, [cds, ...])`` tuples, one per entry of
        ``mRNA_list`` and in the same order. Each CDS list keeps the order of
        ``CDS_list`` and is empty when no CDS references that mRNA.
    """
    # Index the CDSs by parent ID in a single pass instead of rescanning the
    # whole CDS list for every mRNA.
    cds_by_parent = {}
    for cds in CDS_list:
        # A CDS without a Parent attribute simply matches no mRNA.
        parents = cds.attributes.get("Parent") or ()
        # Every listed parent is honoured (a CDS may belong to several
        # transcripts); dict.fromkeys drops repeated IDs so a CDS is never
        # attached twice to the same mRNA.
        for parent_id in dict.fromkeys(parents):
            cds_by_parent.setdefault(parent_id, []).append(cds)
    # Copy each list so that tuples for mRNAs sharing an ID stay independent.
    return [(mRNA, list(cds_by_parent.get(mRNA.id, ()))) for mRNA in mRNA_list]

def collect_all_mRNAs_and_CDSs(db, gene):
    """
    Sort the features yielded by ``db.children(gene)`` into mRNAs and CDSs.

    Features whose ``featuretype`` is neither "mRNA" nor "CDS" are ignored.
    Returns a tuple ``(mRNAs, CDSs)`` of two lists, each in the order the
    features were yielded.
    """
    by_type = {"mRNA": [], "CDS": []}
    for child in db.children(gene):
        bucket = by_type.get(child.featuretype)
        if bucket is not None:
            bucket.append(child)
    return (by_type["mRNA"], by_type["CDS"])

def remove_dublicates(list_of_products):
    """Remove duplicate gene product names from a list.

    Returns a new list holding each distinct item once, in order of first
    appearance. Going through a dict instead of a set keeps the result
    deterministic, so repeated runs produce the same ordering.
    """
    return list(dict.fromkeys(list_of_products))

def get_products_from_cds_list(cds_list):
    """Collect the distinct gene product names carried by a list of CDS objects.

    For each CDS, ``attributes.get("Product")`` is looked up; when it yields a
    non-empty value only its first element is kept. The collected names are
    de-duplicated with ``remove_dublicates``. An empty list is returned when
    no CDS carries a product.
    """
    product_values = (cds.attributes.get("Product") for cds in cds_list)
    # A present product arrives wrapped in a list; keep its first entry.
    first_names = [values[0] for values in product_values if values]
    if not first_names:
        return []
    return remove_dublicates(first_names)
        
def get_IDs_from_cds_list(cds_list):
    """Return the ``id`` of every CDS object in ``cds_list``, in input order."""
    cds_ids = []
    for cds in cds_list:
        cds_ids.append(cds.id)
    return cds_ids

In [135]:
# Populate the gene_annotations dict: each gene ID maps to a list of dicts,
# one dict per transcript, holding the annotations of that transcript and of
# its CDSs, e.g.
# {'gbgene20309':
#  [
#   {'Gene_name': ['HIBN'],
#    'Transcript_ID': 'gbgene20309.t1',
#    'Ontology_term': ['GO:0005634', 'GO:0061676'],
#    'Dbxref': ['UniProtKB/Swiss-Prot:P06180', 'GeneID:397901'],
#    'CDS_ID': ['gbgene20309.t1.CDS1'],
#    'CDS_Product': ['Histone-binding protein N1/N2']},
#   {'Gene_name': ['HIBN'],
#    'Transcript_ID': 'gbgene20309.t2',
#    'Ontology_term': ['GO:0005634', 'GO:0061676'],
#    'Dbxref': ['UniProtKB/Swiss-Prot:P06180', 'GeneID:397901'],
#    'CDS_ID': ['gbgene20309.t2.CDS1'],
#    'CDS_Product': ['Histone-binding protein N1/N2']},
#   ...
#  ],
#  ...
# }
gene_annotations = {}
for gene in db.features_of_type('gene'):
    # Gather all of the gene's mRNAs and CDSs into their own lists
    mRNAs, CDSs = collect_all_mRNAs_and_CDSs(db, gene)
    # Build one annotation record per transcript from the mRNA and its CDSs.
    # A gene without any mRNA gets no entry in gene_annotations.
    for mRNA, all_CDSs_of_mRNA in match_mRNAs_and_CDSs(mRNAs, CDSs):
        cds_products = get_products_from_cds_list(all_CDSs_of_mRNA)
        cds_IDs = get_IDs_from_cds_list(all_CDSs_of_mRNA)
        # setdefault creates the gene's list on first use, so the record
        # layout is written in exactly one place.
        gene_annotations.setdefault(gene.id, []).append({
            "Gene_name" : gene.attributes.get("Name"),
            "Transcript_ID" : mRNA.id,
            "Ontology_term" : mRNA.attributes.get("Ontology_term"),
            "Dbxref" : mRNA.attributes.get("Dbxref"),
            "CDS_ID" : cds_IDs,
            "CDS_Product" : cds_products
            })

**feature**
 'astuple',
 'attributes',
 'bin',
 'calc_bin',
 'chrom',
 'dialect',
 'end',
 'extra',
 'featuretype',
 'file_order',
 'frame',
 'id',
 'keep_order',
 'score',
 'seqid',
 'sequence',
 'sort_attribute_values',
 'source',
 'start',
 'stop',
 'strand'

**db**
'add_relation',
 'all_features',
 'analyze',
 'bed12',
 'children',
 'children_bp',
 'conn',
 'count_features_of_type',
 'create_introns',
 'dbfn',
 'default_encoding',
 'delete',
 'dialect',
 'directives',
 'execute',
 'features_of_type',
 'featuretypes',
 'interfeatures',
 'iter_by_parent_childs',
 'keep_order',
 'merge',
 'merge_all',
 'method',
 'parents',
 'pragmas',
 'region',
 'schema',
 'set_pragmas',
 'sort_attribute_values',
 'update',
 'version'

 **attributes**
 'clear',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values'

In [137]:
# `gene` here is the loop variable left over from the annotation-building
# loop, so this prints the records of the last gene that loop visited.
print(gene_annotations[gene.id])

[{'Gene_name': None, 'Transcript_ID': 'gbgene20353.t1', 'Ontology_term': None, 'Dbxref': None, 'CDS_ID': ['gbgene20353.t1.CDS1'], 'CDS_Product': []}]


In [136]:
# Spot-check one gene by ID; its entry is a list with one record per transcript.
print(gene_annotations["gbgene20309"])

[{'Gene_name': ['HIBN'], 'Transcript_ID': 'gbgene20309.t1', 'Ontology_term': ['GO:0005634', 'GO:0061676'], 'Dbxref': ['UniProtKB/Swiss-Prot:P06180', 'GeneID:397901', 'KEGG:xla:397901', 'PFAM:PF10516', 'PFAM:PF13181', 'InterPro:IPR019544', 'InterPro:IPR013026', 'InterPro:IPR011990', 'InterPro:IPR019734', 'EMBL:X04712'], 'CDS_ID': ['gbgene20309.t1.CDS1'], 'CDS_Product': ['Histone-binding protein N1/N2']}, {'Gene_name': ['HIBN'], 'Transcript_ID': 'gbgene20309.t2', 'Ontology_term': ['GO:0005634', 'GO:0061676'], 'Dbxref': ['UniProtKB/Swiss-Prot:P06180', 'GeneID:397901', 'KEGG:xla:397901', 'PFAM:PF10516', 'PFAM:PF13181', 'InterPro:IPR019544', 'InterPro:IPR013026', 'InterPro:IPR011990', 'InterPro:IPR019734', 'EMBL:X04712'], 'CDS_ID': ['gbgene20309.t2.CDS1'], 'CDS_Product': ['Histone-binding protein N1/N2']}]


In [141]:
# Print the raw attributes of the first mRNA child of gbgene20309.
gbgene20309_ch = list(db.children("gbgene20309"))
for feature in gbgene20309_ch:
    if(feature.featuretype == "mRNA"):
        print(feature.attributes)
        # Stop only once an mRNA has been printed; breaking at loop level
        # would give up after the first child even if it is not an mRNA.
        break

ID: ['gbgene20309.t1']
Parent: ['gbgene20309']
Ontology_term: ['GO:0005634', 'GO:0061676']
Dbxref: ['UniProtKB/Swiss-Prot:P06180', 'GeneID:397901', 'KEGG:xla:397901', 'PFAM:PF10516', 'PFAM:PF13181', 'InterPro:IPR019544', 'InterPro:IPR013026', 'InterPro:IPR011990', 'InterPro:IPR019734', 'EMBL:X04712']


In [143]:
# Write the annotations as real JSON. The previous approach read `cap.stdout`
# inside the very cell that `%%capture cap` was capturing, but `cap` is only
# bound after that cell finishes (NameError on a fresh kernel, stale text
# otherwise), and the printed dict repr is not valid JSON anyway.
with open('output.json', 'w') as f:
    json.dump(gene_annotations, f, indent=2)