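This notebook uses gffutils more directly to handle annotations: it opens an existing gffutils SQLite database and collects the GO terms annotated on each gene's mRNAs.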

In [1]:
import click
import pandas as pd
from pathlib import Path
import gffutils
import csv
from collections import namedtuple

In [2]:
def read_gff_db(db_file):
    """
    Open an annotation sqlite database with gffutils.

    A progress message naming ``db_file`` is printed through ``click.echo``
    before the database is opened.

    Parameters
    ----------
    db_file
        Location of the sqlite database; handed unchanged to
        ``gffutils.FeatureDB``.

    Returns
    -------
    gffutils.FeatureDB
        Handle on the opened database.
    """
    message = f"Reading {db_file} database..."
    click.echo(message)
    feature_db = gffutils.FeatureDB(db_file)
    return feature_db

def remove_dublicates(list_of_GOs):
    """
    Remove duplicate GO terms from a list.

    The first occurrence of every term is kept and the input order is
    preserved, so the result is the same on every run. (A ``set``
    round-trip returns string elements in an arbitrary order that can
    change between interpreter sessions.)

    Parameters
    ----------
    list_of_GOs
        Iterable of hashable GO-term identifiers.

    Returns
    -------
    list
        New list containing each term exactly once, in first-seen order.
    """
    # dict keys are unique and keep insertion order
    return list(dict.fromkeys(list_of_GOs))

def join_dict_value_lists(all_genes):
    """
    Flatten every dict value from a list of lists into one deduplicated list.

    Each value is expected to be a list of GO-term lists. The sub-lists are
    concatenated and the result is deduplicated with ``remove_dublicates``.
    Values that are ``None`` are left untouched.

    Parameters
    ----------
    all_genes : dict
        Mapping whose values are lists of lists (or ``None``).

    Returns
    -------
    dict
        The same dict object, updated in place, with flattened values.
    """
    click.echo("Joining list of duplicated GO-terms into one and removing duplicates...")
    for gene_id, term_lists in all_genes.items():
        if term_lists is None:
            continue
        # Flatten in one linear pass. sum(value, []) would copy the growing
        # accumulator for every sub-list, which is quadratic.
        flattened = [term for terms in term_lists for term in terms]
        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over .items().
        all_genes[gene_id] = remove_dublicates(flattened)
    click.echo("Lists concatenated and GO-terms deduplicated")
    return all_genes

def collect_all_mRNAs(db, gene):
    """
    Gather all mRNA features below a gene and return them as a list.

    The feature-type filter is passed to ``db.children`` so the database
    query selects the mRNAs itself, rather than every child feature being
    fetched and then filtered in Python.

    Parameters
    ----------
    db
        Object exposing ``children(id, featuretype=...)`` (a
        ``gffutils.FeatureDB`` in this notebook).
    gene
        Gene feature (or its ID) whose children are looked up.

    Returns
    -------
    list
        The child features of type ``"mRNA"``; empty if there are none.
    """
    return list(db.children(gene, featuretype="mRNA"))

def populate_GO_dict(db):
    """
    Build a dict that maps gene IDs to the unique GO terms of each gene.

    For every feature of type ``gene`` in ``db`` the ``Ontology_term``
    attribute of each of its mRNAs is collected. An mRNA without that
    attribute contributes an empty list. The per-mRNA lists are then merged
    and deduplicated by ``join_dict_value_lists``. A gene for which no mRNA
    is found gets no entry in the result.
    """
    click.echo("Populating GO-terms dictionary...")
    terms_by_gene = {}
    for gene in db.features_of_type('gene'):
        for transcript in collect_all_mRNAs(db, gene):
            ontology_terms = transcript.attributes.get("Ontology_term")
            # One sub-list per mRNA; missing annotation becomes an empty list.
            terms_by_gene.setdefault(gene.id, []).append(
                [] if ontology_terms is None else ontology_terms
            )
    click.echo("GO-terms dictionary populated")
    return join_dict_value_lists(terms_by_gene)

In [3]:
# Open the annotation database; the path is relative, so it is resolved
# against the kernel's working directory. `db` is queried by the cells below.
db = read_gff_db(Path("genome.genes.sqlite"))

Reading genome.genes.sqlite database...


In [4]:
# Peek at a single mRNA record: print the first one and stop iterating.
# The loop variable `mrna` stays bound afterwards and is inspected by
# later cells.
for mrna in db.features_of_type('mRNA'):
    print(mrna)
    break

gbscaf00001	.	mRNA	10755	50930	.	+	.	ID=gbgene1.t1;Parent=gbgene1;


In [9]:
# Print every attribute of `mrna` as a (key, values) pair.
# Relies on `mrna` left bound by the peek loop in an earlier cell.
for attribute in mrna.attributes.items():
    print(attribute)

('ID', ['gbgene1.t1'])
('Parent', ['gbgene1'])


In [5]:
# Exploratory: show the built-in help for the feature object held in `mrna`
# (bound by the peek loop in an earlier cell).
help(mrna)

Help on Feature in module gffutils.feature object:

class Feature(builtins.object)
 |  Feature(seqid='.', source='.', featuretype='.', start='.', end='.', score='.', strand='.', frame='.', attributes=None, extra=None, bin=None, id=None, dialect=None, file_order=None, keep_order=False, sort_attribute_values=False)
 |  
 |  Methods defined here:
 |  
 |  __eq__(self, other)
 |      Return self==value.
 |  
 |  __getitem__(self, key)
 |  
 |  __hash__(self)
 |      Return hash(self).
 |  
 |  __init__(self, seqid='.', source='.', featuretype='.', start='.', end='.', score='.', strand='.', frame='.', attributes=None, extra=None, bin=None, id=None, dialect=None, file_order=None, keep_order=False, sort_attribute_values=False)
 |      Represents a feature from the database.
 |      
 |      Usually you won't want to use this directly, since it has various
 |      implementation details needed for operating in the context of FeatureDB
 |      objects.  Instead, try the :func:`gffutils.feature.