In [1]:
import click
import pandas as pd
from pathlib import Path
import gffutils
import csv
from collections import namedtuple
import numpy as np





def match_mRNAs_and_CDSs(mRNA_list, CDS_list):
    """Pair every mRNA with the CDS features that name it as a parent.

    Args:
        mRNA_list: features exposing an ``id`` attribute.
        CDS_list: features exposing a dict-like ``attributes`` whose
            "Parent" entry, when present, is a list of parent IDs.

    Returns:
        A list of ``(mRNA, [cds, ...])`` tuples in ``mRNA_list`` order. The
        CDS lists keep ``CDS_list`` order and are empty for an mRNA without
        any CDS.
    """
    # Group the CDSs by parent ID in one pass instead of rescanning the whole
    # CDS list for every mRNA.
    cds_by_parent = {}
    for cds in CDS_list:
        # A CDS may list several parents, or none at all; a missing "Parent"
        # attribute is skipped rather than raising on None[0].
        parent_ids = cds.attributes.get("Parent") or []
        for parent_id in dict.fromkeys(parent_ids):
            cds_by_parent.setdefault(parent_id, []).append(cds)
    # Copy each list so that mRNAs sharing an ID never share one list object.
    return [(mRNA, list(cds_by_parent.get(mRNA.id, []))) for mRNA in mRNA_list]

def collect_all_mRNAs_and_CDSs(db, gene):
    """
    Sort the features returned by ``db.children(gene)`` into mRNAs and CDSs.

    Returns a ``(mRNAs, CDSs)`` tuple of lists; features of any other type
    are ignored.
    """
    by_type = {"mRNA": [], "CDS": []}
    for child in db.children(gene):
        bucket = by_type.get(child.featuretype)
        if bucket is not None:
            bucket.append(child)
    return (by_type["mRNA"], by_type["CDS"])

def remove_dublicates(list_of_products):
    """Return the unique items of a list, keeping first-seen order.

    ``dict.fromkeys`` is used instead of ``set`` so that the result order is
    deterministic between runs (set order of strings varies with hash
    randomisation). Items must be hashable.
    """
    return list(dict.fromkeys(list_of_products))

def get_products_from_cds_list(cds_list):
    """Collect the unique "Product" strings carried by a list of CDS objects.

    Only the first value of each CDS's "Product" attribute is used. Returns
    an empty list when no CDS has a product.
    """
    products = []
    for cds in cds_list:
        # The attribute value, when present, is a list of strings.
        values = cds.attributes.get("Product")
        if values:
            products.append(values[0])
    if not products:
        return list()
    return remove_dublicates(products)
        
def get_IDs_from_cds_list(cds_list):
    """Return the ``id`` of every CDS object, in input order."""
    cds_ids = []
    for cds in cds_list:
        cds_ids.append(cds.id)
    return cds_ids

def create_gene_annotations(db):
    """Build an in-memory dictionary of per-transcript annotations.

    Returns a dict mapping each gene ID to a list holding one annotation
    dict per mRNA of that gene. Genes without any mRNA get no entry.
    """
    click.echo("Creation of annotions dictionary started...")
    annotations_by_gene = {}
    for gene in db.features_of_type('gene'):
        mRNAs, CDSs = collect_all_mRNAs_and_CDSs(db, gene)
        for mRNA, cds_of_mRNA in match_mRNAs_and_CDSs(mRNAs, CDSs):
            cds_products = get_products_from_cds_list(cds_of_mRNA)
            cds_IDs = get_IDs_from_cds_list(cds_of_mRNA)
            annotation = {
                "Gene_name" : gene.attributes.get("Name"),
                "Transcript_ID" : mRNA.id,
                "Ontology_term" : mRNA.attributes.get("Ontology_term"),
                "Dbxref" : mRNA.attributes.get("Dbxref"),
                "CDS_ID" : cds_IDs,
                "CDS_Product" : cds_products
            }
            # The first transcript of a gene creates the list; later ones extend it.
            annotations_by_gene.setdefault(gene.id, []).append(annotation)
    click.echo("Gene annotions dictionary finished")
    return annotations_by_gene


def parse_refs(ref):
    """Return the part of a "db:id" reference after the first colon.

    Later colons stay in the ID; a reference without a colon gives "".
    """
    _db_name, _colon, db_id = ref.partition(":")
    return db_id

def add_ref(ref_dict, reference_db_name, unparsed_ref_element):
    """Append the parsed ID of a reference to its database's list.

    ``ref_dict`` is modified in place (the list is created on first use) and
    also returned.
    """
    db_id = parse_refs(unparsed_ref_element)
    ref_dict.setdefault(reference_db_name, []).append(db_id)
    return ref_dict

def parse_dbxref_list(dbxref_list, all_found):
    """
    Sort a list of unparsed "db:id" references into a dict of ID lists.

    Args:
        dbxref_list: iterable of reference strings.
        all_found: dict of database name -> bool; updated in place, with the
            entry of every database seen set to True.

    Returns:
        ``(dbxref, all_found)`` where ``dbxref`` maps a database name to the
        list of its parsed IDs. References matching no known prefix are
        collected, unparsed, in the list under "Unknown".
    """
    # (accepted prefixes, key used in the result); checked in this order.
    known_databases = (
        (("UniProtKB", "Swiss-Prot"), "UniProtKB"),
        (("GeneID",), "GeneID"),
        (("KEGG",), "KEGG"),
        (("PFAM",), "PFAM"),
        (("InterPro",), "InterPro"),
        (("EMBL",), "EMBL"),
    )
    dbxref = {}
    for unparsed_reference in dbxref_list:
        for prefixes, db_name in known_databases:
            if unparsed_reference.startswith(prefixes):
                dbxref = add_ref(dbxref, db_name, unparsed_reference)
                all_found[db_name] = True
                break
        else:
            # Keep every unknown reference in a list, like all other keys,
            # rather than overwriting one bare string each time.
            dbxref.setdefault("Unknown", []).append(unparsed_reference)
            all_found['Unknown'] = True
            click.echo(f"Unknown reference added: {unparsed_reference}")
    return (dbxref, all_found)

def ntuple_to_list(ntuple):
    """Return the values of a named tuple as a plain list."""
    return [*ntuple]

def parse_Dbxref(dbxref_list):
    """Parse a list of dbxrefs into a dict of database name -> list of IDs.

    Every path returns the same set of keys (the six known databases plus
    "Unknown"); a database without any reference maps to ``[""]``.

    Args:
        dbxref_list: list of "db:id" strings, or None when the feature has
            no Dbxref attribute at all.
    """
    db_names = ("UniProtKB", "GeneID", "KEGG", "PFAM", "InterPro", "EMBL", "Unknown")

    # Completely missing Dbxref data: every database is empty. "Unknown" is
    # included so the key set matches the one produced below.
    if dbxref_list is None:
        return {db_name: [""] for db_name in db_names}

    # Track which databases the list actually references.
    all_found = dict.fromkeys(db_names, False)
    dbxref, all_found = parse_dbxref_list(dbxref_list, all_found)

    # Fill in the databases that never appeared.
    for db_name, found in all_found.items():
        if not found:
            dbxref[db_name] = [""]
    return dbxref

def remove_duplicate_values(dbxref_dict):
    """De-duplicate the ID list of every database, in place.

    The same dict object is returned.
    """
    for db_name in list(dbxref_dict):
        dbxref_dict[db_name] = remove_dublicates(dbxref_dict[db_name])
    return dbxref_dict

def convert_nones_to_list(element):
    """Return an empty list for None; return anything else unchanged."""
    return [] if element is None else element


def concat_list(in_list,concat_char = ","):
    """Join the strings of a list into one string, wrapped in a list.

    Anything that is not a ``list`` (None or a tuple, for example) yields
    ``[""]``.
    """
    if not isinstance(in_list, list):
        return [""]
    joined = concat_char.join(in_list)
    return [joined]

In [2]:
# GFF3 annotation file and the gffutils SQLite database file used below.
filename = "genome.genes.flybasewithcurated.gff3"
database_filename = "genome.genes.sqlite"

In [3]:
#db = gffutils.create_db(filename, database_filename)

In [4]:
#type(db)

In [5]:
# Open the already existing database file; the create_db call that builds it
# is commented out in an earlier cell.
db = gffutils.FeatureDB(database_filename)

In [7]:
from dataclasses import dataclass, field
from typing import ClassVar
from collections import Counter


    
    #@staticmethod
    #def gather_db_names(cls, dbnames: dict) -> None:
    #    cls.all_dbxs.update(dbnames.keys())
    
    def __post_init__(self):
        """Build ``self.dbxref`` from ``self.dbxref_lst`` and register its keys.

        Each entry of ``dbxref_lst`` is split by ``get_db_name`` into a
        (database name, ID) pair. A name that occurs once is used as the key
        unchanged. A name that occurs k times gets the keys ``name_k``,
        ``name_(k-1)``, ..., ``name_2`` and finally plain ``name`` for its
        last occurrence. When ``dbxref_lst`` is empty or None nothing is
        assigned here.
        """
        if self.dbxref_lst:
            # How many times each database name occurs in the list.
            db_names: list[str] = [get_db_name(db_ref)[0] for db_ref in self.dbxref_lst]
            db_name_counts: dict = dict(Counter(db_names))
            dbxref_dict = {}
            for db_name_ref in self.dbxref_lst:
                db_name, db_id = get_db_name(db_name_ref)
                # Occurrences of this name not yet stored (counts down below).
                num_db_names = db_name_counts.get(db_name)
                # Add to dict if only one else update counter dict and add
                # unique ending to the dict key
                if num_db_names == 1:
                    dbxref_dict.update({db_name:db_id})
                else:
                    db_names_left: int = num_db_names - 1
                    db_name_counts.update({db_name:db_names_left})
                    # The suffix is the count BEFORE decrementing, so the
                    # first duplicate seen gets the highest number.
                    new_db_name: str = f"{db_name}_{num_db_names}"
                    dbxref_dict.update({new_db_name:db_id})
            self.dbxref = dbxref_dict
            # Record every key in the class-level collection of all keys seen.
            Transcript.all_dbxs.update(dbxref_dict.keys())
            #Transcript.all_dbxs.update(dbxref.keys())
    
    def to_list(self):
        """Return the transcript's annotation fields as a list.

        The order is: id, flybase, flybase_revhits, flybase_name, gbue11,
        gbue11_revhits, orthodb, orthodb_revhits, ontology_term. Values are
        returned as stored, without any conversion.
        """
        # The list was previously built and discarded, so the method always
        # returned None.
        return [
            self.id,
            self.flybase,
            self.flybase_revhits,
            self.flybase_name,
            self.gbue11,
            self.gbue11_revhits,
            self.orthodb,
            self.orthodb_revhits,
            self.ontology_term,
        ]
    
#if index == 0:
#    print(element.attributes)

In [8]:
# Build one Transcript per mRNA feature in the database. Every field is read
# with element.attributes.get(...), so an attribute the feature lacks is
# passed on as None.
transcripts = []
for index,element in enumerate(db.features_of_type("mRNA")):
    transcripts.append(Transcript(
        id = element.attributes.get("ID"),
        parent = element.attributes.get("Parent"),
        flybase = element.attributes.get("flybase"),
        flybase_revhits = element.attributes.get("flybase_revhits"),
        flybase_name = element.attributes.get("flybase_name"),
        gbue11 = element.attributes.get("gbue11"),
        gbue11_revhits = element.attributes.get("gbue11_revhits"),
        orthodb = element.attributes.get("orthodb"),
        orthodb_revhits = element.attributes.get("orthodb_revhits"),
        ontology_term = element.attributes.get("Ontology_term"),
        dbxref_lst = element.attributes.get("Dbxref"),
        #element.attributes.get("dbxref"),
    ))
    # Debugging aid: stop after the first 20 features and print the last one.
    #if index == 19:
    #    print(element.attributes)
    #    break


In [9]:
transcripts[4]

Transcript(id=['gbgene5.t1'], parent=['gbgene5'], flybase=['FBgn0033354'], flybase_revhits=None, flybase_name=['FANCI-PA'], gbue11=['GBUE012419-PA'], gbue11_revhits=None, orthodb=['56086_0_003082'], orthodb_revhits=None, ontology_term=['GO:0006281', 'GO:0070182'], dbxref={'FlyBase_Annotation_IDs': 'CG13745-PA', 'FlyBase': 'FBpp0087756', 'FlyMine': 'FBpp0087756', 'GB_protein_2': 'AAF59016', 'GB_protein': 'AAF59016.1', 'modMine': 'FBpp0087756', 'REFSEQ': 'NP_610429', 'UniProt/TrEMBL': 'A1Z7L1'})

In [10]:
import csv
from collections import namedtuple
from dataclasses import dataclass, asdict

# Annotation columns appended to every differential-expression (DE) row, in
# output order. Each name is an attribute of Transcript.
transcript_annotation_header = ["id","parent","flybase","flybase_revhits","flybase_name","gbue11","gbue11_revhits","orthodb","orthodb_revhits","ontology_term"]
# One extra column per database-reference key seen on any transcript.
dbx_refs_sorted = sorted(Transcript.all_dbxs)

df_data = []
header = []
diff_exp_data = 'shrinked_padj-filtered.tsv'

# Index the transcripts by parent gene ID once, so each DE row is matched
# with a dict lookup instead of a scan over every transcript. Transcripts
# without a Parent cannot belong to a DE gene and are skipped (indexing
# parent[0] on them used to raise).
transcripts_by_gene = {}
for transcript in transcripts:
    if transcript.parent:
        transcripts_by_gene.setdefault(transcript.parent[0], []).append(transcript)

with open(diff_exp_data, 'r') as reader:
    # Handle DE data with column names so it's easier to access each element
    diff_exp_reader = csv.reader(reader, delimiter="\t")
    de_header = next(diff_exp_reader)
    header = de_header + transcript_annotation_header + dbx_refs_sorted
    De_data = namedtuple("DE_data", de_header)
    for data in map(De_data._make, diff_exp_reader):
        # Capture all data from our current DE data row
        current_de_data = list(data)
        # One output row per transcript of the DE gene; a gene without
        # transcripts produces no row.
        for transcript in transcripts_by_gene.get(data.Genes, []):
            transcript_annotation = []
            for column in transcript_annotation_header:
                # Read the attribute directly; asdict() deep-copied the whole
                # transcript once per column.
                column_value = getattr(transcript, column, None)
                if column_value:
                    # Multi-valued attributes become one comma-separated string
                    transcript_annotation.append(','.join(column_value))
                else:
                    # Missing values are stored as NaN
                    transcript_annotation.append(np.nan)
            # Database references, in the same column order for every row
            transcript_dbxref = transcript.dbxref or {}
            for dbx_ref in dbx_refs_sorted:
                dbx_ref_fetched = transcript_dbxref.get(dbx_ref)
                # Missing or empty references are stored as NaN
                transcript_annotation.append(dbx_ref_fetched if dbx_ref_fetched else np.nan)
            # Append the data row as a list
            df_data.append(current_de_data + transcript_annotation)


In [19]:
df = pd.DataFrame(df_data, columns = header)
# NOTE(review): this output path is defined but nothing in this cell writes to it.
annotated_diff_exp_data = 'shrinked_padj-filtered_annotated.tsv'
# Drop all columns where all values are np.nan
df = df.dropna(axis=1, how='all')
# The parent gene ID duplicates the DE table's own gene column.
df = df.drop("parent", axis=1)
# Readable names for the output columns. The opening "renamings = {" of this
# literal was missing, which made the cell a syntax error. The 'Genes' entry
# is restored from the recorded output, whose header shows "gene_ID".
renamings = {
    'Genes':'gene_ID',
    'id':'transcript_ID',
    'flybase':'FlyBase_ID',
    'flybase_revhits':'FlyBase_reverse_hits_IDs',
    'flybase_name':'FlyBase_symbol_name',
    'ontology_term':'Gene_Ontology_terms',
    'FlyBase':'FlyBase_reference_ID1',
    'FlyBase_2':'FlyBase_reference_ID2',
    'FlyBase_Annotation_IDs':'FlyBase_Annotation_Symbol_ID1',
    'FlyBase_Annotation_IDs_2':'FlyBase_Annotation_Symbol_ID2',
    'FlyMine':'FlyMine_ID1',
    'FlyMine_2':'FlyMine_ID2',
    'GB_protein':'GB_protein_ID1',
    'GB_protein_2':'GB_protein_ID2',
    'GB_protein_3':'GB_protein_ID3',
    'GB_protein_4':'GB_protein_ID4',
    'GB_protein_5':'GB_protein_ID5',
    'REFSEQ':'NCBI_Reference_Sequence_ID1',
    'REFSEQ_2':'NCBI_Reference_Sequence_ID2',
    'UniProt/Swiss-Prot':'UniProt/Swiss-Prot_ID',
    'UniProt/TrEMBL':'UniProt/TrEMBL_ID1',
    'UniProt/TrEMBL_2':'UniProt/TrEMBL_ID2',
}

# Displayed as the cell output; df itself keeps the original column names.
df.rename(columns=renamings)

Unnamed: 0,Genes,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,id,flybase,flybase_revhits,...,GB_protein_3,GB_protein_4,GB_protein_5,REFSEQ,REFSEQ_2,UniProt/Swiss-Prot,UniProt/TrEMBL,UniProt/TrEMBL_2,modMine,modMine_2
0,gbgene898,50282.2933422622,3.18665147135842,0.375224271502857,8.49281446843104,2.01692141069787e-17,2.71699483235111e-13,gbgene898.t1,FBgn0053126,"gbgene898.t1,gbgene11898.t1,gbgene13186.t1,gbg...",...,,,,NP_001285560,,,Q8SXR1,,,
1,gbgene18399,4719.41064222729,5.47217063433783,0.660615318009431,8.30708810916151,9.80791214533117e-17,1.32122384509756e-12,gbgene18399.t1,FBgn0000042,"gbgene1363.t1,gbgene7972.t1,gbgene9345.t1,gbge...",...,,,,NP_001284915,,P10987,,,,
2,gbgene7672,57572.1989547611,-4.53703656570831,0.952828618088298,-7.38306254249907,1.54689315293287e-13,2.08381976631587e-09,gbgene7672.t1,,,...,,,,,,,,,,
3,gbgene2487,98.9639623042805,-1.0610887648853,0.868372946758628,-7.24566816785309,4.30312483601809e-13,5.79673946659996e-09,gbgene2487.t1,FBgn0030592,"gbgene2487.t1,gbgene14616.t1,gbgene14625.t2",...,,,,NP_572981,,,Q9VY06,,FBpp0073797,
4,gbgene15915,36591.4534399807,-4.50257993023928,0.953317548342053,-7.17849580463953,7.04825532781262e-13,9.49470475209638e-09,gbgene15915.t1,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,gbgene14914,36.3481070818882,2.97653895221461,0.590945529286848,4.99788164664735,5.79635409312971e-07,0.00780826859885504,gbgene14914.t1,FBgn0030589,,...,,,,NP_572978,,,Q9VY09,,FBpp0073801,
102,gbgene6556,276.704688600132,-3.99124773622068,0.806298033937006,-4.98640752957921,6.15123261304436e-07,0.00828632545303206,gbgene6556.t1,FBgn0013812,"gbgene6556.t1,gbgene6556.t2,gbgene7853.t1,gbge...",...,,,,NP_524424,,,Q9VDG0,,FBpp0271744,
103,gbgene6556,276.704688600132,-3.99124773622068,0.806298033937006,-4.98640752957921,6.15123261304436e-07,0.00828632545303206,gbgene6556.t2,FBgn0013812,"gbgene6556.t1,gbgene6556.t2,gbgene7853.t1,gbge...",...,,,,NP_524424,,,Q9VDG0,,FBpp0271744,
104,gbgene883,135.764437148795,3.09428117848639,0.619886170429192,4.9815493950296,6.30771784056506e-07,0.00849712670302519,gbgene883.t1,FBgn0086687,"gbgene883.t1,gbgene1586.t1,gbgene1587.t1,gbgen...",...,,,,NP_731711,,,Q7K4Y0,,,


Unnamed: 0,gene_ID,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,transcript_ID,FlyBase_ID,FlyBase_reverse_hits_IDs,...,GB_protein_ID3,GB_protein_ID4,GB_protein_ID5,NCBI_Reference_Sequence_ID1,NCBI_Reference_Sequence_ID2,UniProt/Swiss-Prot_ID,UniProt/TrEMBL_ID1,UniProt/TrEMBL_ID2,modMine,modMine_2
0,gbgene898,50282.2933422622,3.18665147135842,0.375224271502857,8.49281446843104,2.01692141069787e-17,2.71699483235111e-13,gbgene898.t1,FBgn0053126,"gbgene898.t1,gbgene11898.t1,gbgene13186.t1,gbg...",...,,,,NP_001285560,,,Q8SXR1,,,
1,gbgene18399,4719.41064222729,5.47217063433783,0.660615318009431,8.30708810916151,9.80791214533117e-17,1.32122384509756e-12,gbgene18399.t1,FBgn0000042,"gbgene1363.t1,gbgene7972.t1,gbgene9345.t1,gbge...",...,,,,NP_001284915,,P10987,,,,
2,gbgene7672,57572.1989547611,-4.53703656570831,0.952828618088298,-7.38306254249907,1.54689315293287e-13,2.08381976631587e-09,gbgene7672.t1,,,...,,,,,,,,,,
3,gbgene2487,98.9639623042805,-1.0610887648853,0.868372946758628,-7.24566816785309,4.30312483601809e-13,5.79673946659996e-09,gbgene2487.t1,FBgn0030592,"gbgene2487.t1,gbgene14616.t1,gbgene14625.t2",...,,,,NP_572981,,,Q9VY06,,FBpp0073797,
4,gbgene15915,36591.4534399807,-4.50257993023928,0.953317548342053,-7.17849580463953,7.04825532781262e-13,9.49470475209638e-09,gbgene15915.t1,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,gbgene14914,36.3481070818882,2.97653895221461,0.590945529286848,4.99788164664735,5.79635409312971e-07,0.00780826859885504,gbgene14914.t1,FBgn0030589,,...,,,,NP_572978,,,Q9VY09,,FBpp0073801,
102,gbgene6556,276.704688600132,-3.99124773622068,0.806298033937006,-4.98640752957921,6.15123261304436e-07,0.00828632545303206,gbgene6556.t1,FBgn0013812,"gbgene6556.t1,gbgene6556.t2,gbgene7853.t1,gbge...",...,,,,NP_524424,,,Q9VDG0,,FBpp0271744,
103,gbgene6556,276.704688600132,-3.99124773622068,0.806298033937006,-4.98640752957921,6.15123261304436e-07,0.00828632545303206,gbgene6556.t2,FBgn0013812,"gbgene6556.t1,gbgene6556.t2,gbgene7853.t1,gbge...",...,,,,NP_524424,,,Q9VDG0,,FBpp0271744,
104,gbgene883,135.764437148795,3.09428117848639,0.619886170429192,4.9815493950296,6.30771784056506e-07,0.00849712670302519,gbgene883.t1,FBgn0086687,"gbgene883.t1,gbgene1586.t1,gbgene1587.t1,gbgen...",...,,,,NP_731711,,,Q7K4Y0,,,


TODO: Turn each of these sections into a function and expose the values they need as arguments.

['FlyBase',
 'FlyBase_2',
 'FlyBase_3',
 'FlyBase_4',
 'FlyBase_Annotation_IDs',
 'FlyBase_Annotation_IDs_2',
 'FlyBase_Annotation_IDs_3',
 'FlyBase_Annotation_IDs_4',
 'FlyMine',
 'FlyMine_2',
 'GB_protein',
 'GB_protein_2',
 'GB_protein_3',
 'GB_protein_4',
 'GB_protein_5',
 'GB_protein_6',
 'GI',
 'REFSEQ',
 'REFSEQ_2',
 'REFSEQ_3',
 'UniProt/Swiss-Prot',
 'UniProt/Swiss-Prot_2',
 'UniProt/TrEMBL',
 'UniProt/TrEMBL_2',
 'modMine',
 'modMine_2']

In [40]:
dbxref_dict

{'FlyBase_Annotation_IDs': 'CG13175-PA',
 'FlyBase': 'FBpp0099493',
 'FlyMine': 'FBpp0099493',
 'GB_protein_2': 'AAF58560',
 'GB_protein': 'AAF58560.2',
 'modMine': 'FBpp0099493',
 'REFSEQ': 'NP_610740',
 'UniProt/TrEMBL': 'A1Z8U7'}

In [69]:
gb

{'ID': ['gbgene5.t1'],
 'Parent': ['gbgene5'],
 'flybase': ['FBgn0033354'],
 'flybase_name': ['FANCI-PA'],
 'gbue11': ['GBUE012419-PA'],
 'orthodb': ['56086_0_003082'],
 'Ontology_term': ['GO:0006281', 'GO:0070182'],
 'Dbxref': ['FlyBase_Annotation_IDs:CG13745-PA',
  'FlyBase:FBpp0087756',
  'FlyMine:FBpp0087756',
  'GB_protein:AAF59016',
  'GB_protein:AAF59016.1',
  'modMine:FBpp0087756',
  'REFSEQ:NP_610429',
  'UniProt/TrEMBL:A1Z7L1']}

In [82]:
# Split each "db:id" reference at the FIRST colon only, so IDs that contain
# colons themselves (e.g. "KEGG:xla:397901") are kept whole. A database name
# that occurs more than once keeps only its last ID.
a = {db_name: db_id for db_name, _, db_id in (ref.partition(":") for ref in dbxref)}

In [84]:
a

{'FlyBase_Annotation_IDs': 'CG13175-PA',
 'FlyBase': 'FBpp0099493',
 'FlyMine': 'FBpp0099493',
 'GB_protein': 'AAF58560.2',
 'modMine': 'FBpp0099493',
 'REFSEQ': 'NP_610740',
 'UniProt/TrEMBL': 'A1Z8U7'}

In [83]:
a["FlyMine"]

'FBpp0099493'

In [59]:
print(element.attributes)

ID: ['gbgene4.t1']
Parent: ['gbgene4']
flybase: ['FBgn0053964']
flybase_name: ['CG33964-PA']
gbue11: ['GBUE012418-PA']
orthodb: ['56086_0_003081']
Ontology_term: ['GO:0008168']
Dbxref: ['FlyBase_Annotation_IDs:CG33964-PA', 'FlyBase:FBpp0099380', 'FlyMine:FBpp0099380', 'GB_protein:AAF58558', 'GB_protein:AAF58558.4', 'modMine:FBpp0099380', 'REFSEQ:NP_001033941', 'UniProt/TrEMBL:Q86NX3']


In [134]:
import gffutils
# Build the gffutils database file 'Gb.db' from genome.genes.gff3. This
# rebinds `db`, which an earlier cell pointed at genome.genes.sqlite.
# NOTE(review): see the gffutils create_db documentation for the exact effect
# of force, keep_order, merge_strategy and sort_attribute_values.
db = gffutils.create_db("genome.genes.gff3", 
                        dbfn='Gb.db', 
                        force=True, 
                        keep_order=True,
                        merge_strategy='merge', 
                        sort_attribute_values=True)

def match_mRNAs_and_CDSs(mRNA_list, CDS_list):
    """Pair every mRNA with the CDS features that name it as a parent.

    Args:
        mRNA_list: features exposing an ``id`` attribute.
        CDS_list: features exposing a dict-like ``attributes`` whose
            "Parent" entry, when present, is a list of parent IDs.

    Returns:
        A list of ``(mRNA, [cds, ...])`` tuples in ``mRNA_list`` order. The
        CDS lists keep ``CDS_list`` order and are empty for an mRNA without
        any CDS.
    """
    # Group the CDSs by parent ID in one pass instead of rescanning the whole
    # CDS list for every mRNA.
    cds_by_parent = {}
    for cds in CDS_list:
        # A CDS may list several parents, or none at all; a missing "Parent"
        # attribute is skipped rather than raising on None[0].
        parent_ids = cds.attributes.get("Parent") or []
        for parent_id in dict.fromkeys(parent_ids):
            cds_by_parent.setdefault(parent_id, []).append(cds)
    # Copy each list so that mRNAs sharing an ID never share one list object.
    return [(mRNA, list(cds_by_parent.get(mRNA.id, []))) for mRNA in mRNA_list]

def collect_all_mRNAs_and_CDSs(db, gene):
    """
    Sort the features returned by ``db.children(gene)`` into mRNAs and CDSs.

    Returns a ``(mRNAs, CDSs)`` tuple of lists; features of any other type
    are ignored.
    """
    by_type = {"mRNA": [], "CDS": []}
    for child in db.children(gene):
        bucket = by_type.get(child.featuretype)
        if bucket is not None:
            bucket.append(child)
    return (by_type["mRNA"], by_type["CDS"])

def remove_dublicates(list_of_products):
    """Return the unique items of a list, keeping first-seen order.

    ``dict.fromkeys`` is used instead of ``set`` so that the result order is
    deterministic between runs (set order of strings varies with hash
    randomisation). Items must be hashable.
    """
    return list(dict.fromkeys(list_of_products))

def get_products_from_cds_list(cds_list):
    """Collect the unique "Product" strings carried by a list of CDS objects.

    Only the first value of each CDS's "Product" attribute is used. Returns
    an empty list when no CDS has a product.
    """
    products = []
    for cds in cds_list:
        # The attribute value, when present, is a list of strings.
        values = cds.attributes.get("Product")
        if values:
            products.append(values[0])
    if not products:
        return list()
    return remove_dublicates(products)
        
def get_IDs_from_cds_list(cds_list):
    """Return the ``id`` of every CDS object, in input order."""
    cds_ids = []
    for cds in cds_list:
        cds_ids.append(cds.id)
    return cds_ids

In [135]:
# Populate gene dict with all transcript and cds annotations 
# in a list of dicts, e.g.
# {gene1_ID:
#[
#{'Gene_name': ['HIBN'],
# 'Transcript_ID': 'gbgene20309.t1', 
# 'Ontology_term': ['GO:0005634', 'GO:0061676'], 
# 'Dbxref': ['UniProtKB/Swiss-Prot:P06180', 'GeneID:397901'], 
# 'CDS_ID': 'gbgene5187.t2.CDS7', 
# 'CDS_Product': ['Histone-binding protein N1/N2']}, 
#{'Gene_name': ['HIBN'],
# 'Transcript_ID': 'gbgene20309.t2', 
# 'Ontology_term': ['GO:0005634', 'GO:0061676'], 
# 'Dbxref': ['UniProtKB/Swiss-Prot:P06180', 'GeneID:397901'], 
# 'CDS_ID': 'gbgene5187.t2.CDS7', 
# 'CDS_Product': ['Histone-binding protein N1/N2']
# ...
#],
# ...
#}
gene_annotations = {}
for gene in db.features_of_type('gene'):
    # Split the gene's child features into its mRNAs and its CDSs
    mRNAs, CDSs = collect_all_mRNAs_and_CDSs(db, gene)
    # One annotation record per mRNA, carrying the CDSs that belong to it
    for tx_cds in match_mRNAs_and_CDSs(mRNAs, CDSs):
        mRNA, all_CDSs_of_mRNA = tx_cds
        cds_products = get_products_from_cds_list(all_CDSs_of_mRNA)
        cds_IDs = get_IDs_from_cds_list(all_CDSs_of_mRNA)
        annotation = {
            "Gene_name" : gene.attributes.get("Name"),
            "Transcript_ID" : mRNA.id,
            "Ontology_term" : mRNA.attributes.get("Ontology_term"),
            "Dbxref" : mRNA.attributes.get("Dbxref"),
            "CDS_ID" : cds_IDs,
            "CDS_Product" : cds_products
        }
        # The first transcript of a gene creates the list; later ones extend it
        gene_annotations.setdefault(gene.id, []).append(annotation)

**feature**
 'astuple',
 'attributes',
 'bin',
 'calc_bin',
 'chrom',
 'dialect',
 'end',
 'extra',
 'featuretype',
 'file_order',
 'frame',
 'id',
 'keep_order',
 'score',
 'seqid',
 'sequence',
 'sort_attribute_values',
 'source',
 'start',
 'stop',
 'strand'

**db**
'add_relation',
 'all_features',
 'analyze',
 'bed12',
 'children',
 'children_bp',
 'conn',
 'count_features_of_type',
 'create_introns',
 'dbfn',
 'default_encoding',
 'delete',
 'dialect',
 'directives',
 'execute',
 'features_of_type',
 'featuretypes',
 'interfeatures',
 'iter_by_parent_childs',
 'keep_order',
 'merge',
 'merge_all',
 'method',
 'parents',
 'pragmas',
 'region',
 'schema',
 'set_pragmas',
 'sort_attribute_values',
 'update',
 'version'

 **attributes**
 'clear',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values'

In [137]:
print(gene_annotations[gene.id])

[{'Gene_name': None, 'Transcript_ID': 'gbgene20353.t1', 'Ontology_term': None, 'Dbxref': None, 'CDS_ID': ['gbgene20353.t1.CDS1'], 'CDS_Product': []}]


In [136]:
print(gene_annotations["gbgene20309"])

[{'Gene_name': ['HIBN'], 'Transcript_ID': 'gbgene20309.t1', 'Ontology_term': ['GO:0005634', 'GO:0061676'], 'Dbxref': ['UniProtKB/Swiss-Prot:P06180', 'GeneID:397901', 'KEGG:xla:397901', 'PFAM:PF10516', 'PFAM:PF13181', 'InterPro:IPR019544', 'InterPro:IPR013026', 'InterPro:IPR011990', 'InterPro:IPR019734', 'EMBL:X04712'], 'CDS_ID': ['gbgene20309.t1.CDS1'], 'CDS_Product': ['Histone-binding protein N1/N2']}, {'Gene_name': ['HIBN'], 'Transcript_ID': 'gbgene20309.t2', 'Ontology_term': ['GO:0005634', 'GO:0061676'], 'Dbxref': ['UniProtKB/Swiss-Prot:P06180', 'GeneID:397901', 'KEGG:xla:397901', 'PFAM:PF10516', 'PFAM:PF13181', 'InterPro:IPR019544', 'InterPro:IPR013026', 'InterPro:IPR011990', 'InterPro:IPR019734', 'EMBL:X04712'], 'CDS_ID': ['gbgene20309.t2.CDS1'], 'CDS_Product': ['Histone-binding protein N1/N2']}]


In [141]:
# Print the attributes of the first mRNA among the features returned for
# gene "gbgene20309".
gbgene20309_ch = list(db.children("gbgene20309"))
for feature in gbgene20309_ch:
    if(feature.featuretype == "mRNA"):
        print(feature.attributes)
        # Stop only once an mRNA has been printed. The break used to sit at
        # loop level, so nothing was printed unless the very first feature
        # happened to be an mRNA.
        break

ID: ['gbgene20309.t1']
Parent: ['gbgene20309']
Ontology_term: ['GO:0005634', 'GO:0061676']
Dbxref: ['UniProtKB/Swiss-Prot:P06180', 'GeneID:397901', 'KEGG:xla:397901', 'PFAM:PF10516', 'PFAM:PF13181', 'InterPro:IPR019544', 'InterPro:IPR013026', 'InterPro:IPR011990', 'InterPro:IPR019734', 'EMBL:X04712']


In [143]:
# Write the annotations as real JSON. The previous version read `cap.stdout`
# inside the same `%%capture cap` cell that defines `cap`; that name is only
# bound after the cell has finished, so it was undefined (or left over from an
# earlier run), and the text written was a Python repr rather than JSON.
import json

with open('output.json', 'w') as f:
    json.dump(gene_annotations, f, indent=2)