In [25]:
import gffutils
import pandas as pd
def create_db(gff_file, dbfn='Gb.db'):
    """Build a gffutils feature database from a GFF/GFF3 annotation file.

    Args:
        gff_file: Annotation input handed straight to ``gffutils.create_db``.
        dbfn: Where the database is written. Defaults to ``'Gb.db'`` (the
            value that used to be hard-coded), so existing calls behave the
            same; pass another path to avoid clobbering that file.

    Returns:
        Whatever ``gffutils.create_db`` returns for the new database.
    """
    return gffutils.create_db(
        gff_file,
        dbfn=dbfn,
        # force=True: an existing file at ``dbfn`` is replaced (gffutils docs).
        force=True,
        # keep_order / sort_attribute_values are requested so attribute output
        # is stable between runs.
        keep_order=True,
        merge_strategy='merge',
        sort_attribute_values=True,
    )


def collect_all_mRNAs(db, gene):
    """
    Return, as a list, every feature yielded by ``db.children(gene)`` whose
    ``featuretype`` is exactly "mRNA". The order of ``db.children`` is kept.
    """
    return [child for child in db.children(gene) if child.featuretype == "mRNA"]

def remove_dublicates(list_of_GOs):
    """Remove duplicate GO terms from a list, keeping first-seen order.

    A plain ``set`` round-trip yields the terms in arbitrary order, which can
    differ between interpreter runs and makes the written output
    non-reproducible. ``dict.fromkeys`` de-duplicates while preserving the
    order in which each term first appears.

    Args:
        list_of_GOs: Iterable of hashable GO terms (may be empty).

    Returns:
        A new list containing each term once.
    """
    return list(dict.fromkeys(list_of_GOs))

def join_dict_value_lists(all_genes):
    """Flatten each value (a list of lists) into one de-duplicated list.

    The dict is updated in place and the same object is returned. Entries
    whose value is ``None`` are left untouched.

    Args:
        all_genes: Mapping whose values are lists of term lists, or ``None``.

    Returns:
        ``all_genes`` itself, with every non-``None`` value replaced by the
        result of ``remove_dublicates`` on its flattened terms.
    """
    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for key, nested_terms in all_genes.items():
        if nested_terms is None:
            continue
        # Single-pass flatten; ``sum(lists, [])`` re-copies the accumulator
        # on every addition and is quadratic in the number of terms.
        flat_terms = [term for terms in nested_terms for term in terms]
        all_genes[key] = remove_dublicates(flat_terms)
    return all_genes

def populate_GO_dict(db):
    """Map each gene id to the GO terms found on its mRNAs.

    For every 'gene' feature in ``db``, the "Ontology_term" attribute of each
    mRNA returned by ``collect_all_mRNAs`` is collected (an empty list when
    the attribute is absent). Genes for which no mRNA is returned get no
    entry. The per-gene lists are then passed through
    ``join_dict_value_lists`` and that result is returned.
    """
    terms_by_gene = {}
    for gene in db.features_of_type('gene'):
        for transcript in collect_all_mRNAs(db, gene):
            terms = transcript.attributes.get("Ontology_term")
            # One entry per mRNA, even when it carries no GO annotation.
            terms_by_gene.setdefault(gene.id, []).append(
                [] if terms is None else terms
            )
    return join_dict_value_lists(terms_by_gene)
    
def fill_Nones(GO_dict, fill_value):
    """Replace every falsy value in ``GO_dict`` with ``fill_value``, in place.

    Despite the name, this covers any falsy value (``None``, empty list,
    empty string, ...), not only ``None``. Returns the same dict object.
    """
    empty_keys = [key for key, value in GO_dict.items() if not value]
    for key in empty_keys:
        GO_dict[key] = fill_value
    return GO_dict

def concatenate_unique_GO_list(GO_dict):
    """Turn every list value of ``GO_dict`` into one comma-separated string.

    Values that are not lists are left as they are. The dict is modified in
    place and returned.
    """
    for key, terms in GO_dict.items():
        if not isinstance(terms, list):
            continue
        GO_dict[key] = ",".join(terms)
    return GO_dict

def convert_dict_to_df(GO_dict):
    """Build a one-column DataFrame from ``GO_dict``.

    Dict keys become the index (named "transcripts") and dict values fill
    the single 'GO-term' column.
    """
    # NOTE(review): the index is labelled "transcripts" although the caller
    # appears to key the dict by gene id - confirm which label is intended.
    frame = pd.DataFrame.from_dict(GO_dict, orient='index', columns=['GO-term'])
    return frame.rename_axis("transcripts")

def write_tsv(filename, df):
    """Save the pandas DataFrame ``df`` to ``filename`` as tab-separated text."""
    df.to_csv(path_or_buf=filename, sep="\t")

In [26]:
# Run configuration: output table path and the placeholder written for
# entries that end up without any GO term.
output = "GO_universe.tsv"
fill_value = ""

# Build the feature database from the annotation file.
db = create_db("genome.genes.gff3")

# Collect GO terms, fill empty entries, then join each term list into a
# comma-separated string. Every step updates and returns the same dict.
GO_dict = concatenate_unique_GO_list(
    fill_Nones(populate_GO_dict(db), fill_value)
)

# Tabulate and write the result.
GO_df = convert_dict_to_df(GO_dict)
write_tsv(output, GO_df)

In [None]:
# Show a bounded preview instead of dumping the whole mapping into the
# notebook output; GO_df holds the same data as GO_dict in tabular form.
GO_df.head(10)