In [20]:
import json
import gzip
from os import path as ospath
import pandas as pd
import numpy as np
import random
import pprint
from src.utils import write_to_json
# Input file locations, relative to the notebook's working directory.
terms_fp = './downloads/input/go_term_info.json'
genes_fp = './downloads/input/bp_module_gene_info.json'
term_dispositions_fp = './downloads/input/term_dispositions.json'
ibd_modules_fp = './downloads/input/ibd_modules_organized.json'

# NOTE(review): 'modukess' in the name and 'inout' in the path look like typos
# (possibly 'modules' and 'input'/'output') — confirm the intended spelling.
clean_ibd_modukess_fp = './downloads/inout/clean_ibd_modules.json'


# Term ids that term_type() reports as 'unknown'.
unknown_terms =['UNKNOWN:0001', 'UNKNOWN:0002', 'UNKNOWN:0003']

In [2]:
def get_pd_row(df, k):
    """Return the row labelled ``k`` of ``df`` as a dict without its missing values.

    A label that is absent from the index propagates the ``KeyError`` raised by ``df.loc``.
    """
    row = df.loc[k]
    present = row.dropna()
    return dict(present)

def spread_terms(df, terms):
    """Look up each id in ``terms`` with ``get_pd_row`` and return the row dicts in the same order."""
    rows = []
    for term_id in terms:
        rows.append(get_pd_row(df, term_id))
    return rows

def get_pd_row_key(df, k):
    """Return the row labelled ``k`` of ``df`` as a dict without its missing values.

    Returns ``None`` instead of raising when the lookup raises ``KeyError``.
    """
    try:
        row = df.loc[k]
        present = row.dropna()
        result = dict(present)
    except KeyError:
        result = None
    return result

def get_aspect(df, k):
    """Return the 'aspect' value of the row labelled ``k``.

    Returns ``None`` when the row has no 'aspect' entry, and the string
    ``'no' + str(k)`` when ``get_pd_row_key`` finds no such row.
    """
    row = get_pd_row_key(df, k)
    if row is not None:
        return row.get('aspect', None)
    return 'no' + str(k)

def get_evidence(df, genes_df, row):
    """Expand the ids inside ``row['evidence']`` into full records.

    For each evidence item, every entry of ``references`` is looked up in ``df``
    and ``with_gene_id`` is looked up in ``genes_df`` via ``get_pd_row_key``;
    ids that are not found become ``None``. ``groups`` is passed through as is.
    """
    def _expand(evidence):
        references = [get_pd_row_key(df, reference) for reference in evidence['references']]
        gene_row = get_pd_row_key(genes_df, evidence['with_gene_id'])
        return {
            'with_gene_id': gene_row,
            'groups': evidence['groups'],
            'references': references,
        }

    return [_expand(evidence) for evidence in row['evidence']]

def get_evidence_type(row):
    """Classify an annotation row as 'direct', 'n/a' or 'homology'.

    Evidence items are examined in order and the first one matching a rule
    decides the result:

    - 'direct' when the evidence's ``with_gene_id`` record names the row's gene;
    - 'n/a' when the evidence has no references.

    A row where no evidence matches (including a row with no evidence) is 'homology'.

    ``with_gene_id`` may be ``None`` — ``get_evidence`` stores ``None`` for genes it
    cannot find — and such evidence can never be 'direct'.
    """
    for evidence in row['evidence']:
        with_gene = evidence["with_gene_id"]
        # Guard against a missing gene record before subscripting it.
        if with_gene is not None and with_gene['gene'] == row['gene']:
            return 'direct'
        if len(evidence['references']) == 0:
            return 'n/a'

    return 'homology'



def count_unique_refs_row(evidences):
    """Return how many distinct references occur across the given evidence items."""
    distinct = {ref for evidence in evidences for ref in evidence['references']}
    return len(distinct)

def get_other(row):
    """Return the ``ASPECT_OTHER_MAP`` entry for the row's term aspect when the row has no slim terms.

    Returns ``None`` when ``row['slim_terms']`` is non-empty.

    NOTE(review): ``ASPECT_OTHER_MAP`` is not defined in the cells shown here —
    confirm it is defined before this function is called.
    """
    if len(row['slim_terms']) == 0:
        # The key was spelled 'apsect'; 'aspect' is the key get_aspect() reads from term rows.
        return ASPECT_OTHER_MAP[row['term']['aspect']]
    return None
    
    
def count_unique_slims(evidences):
    """Return the number of distinct values found in the 'references' lists of ``evidences``.

    Despite the name, this reads ``evidence['references']`` and therefore gives
    the same result as ``count_unique_refs_row``.
    """
    seen = set()
    for evidence in evidences:
        seen.update(evidence['references'])
    return len(seen)


def term_type(term):
    """Return 'unknown' when ``term['id']`` is listed in ``unknown_terms``, otherwise 'known'."""
    if term['id'] in unknown_terms:
        return 'unknown'
    return 'known'
            
    


In [3]:
def count_unique_refs(df):
    """Return the distinct references found under ``df['evidence']`` as a list.

    Each entry of ``df['evidence']`` is a list of evidence items carrying a
    'references' list. The order of the returned list is unspecified.
    """
    unique = {
        ref
        for evidences in list(df['evidence'])
        for evidence in evidences
        for ref in evidence['references']
    }
    return list(unique)

# NOTE(review): count_unique_refs requires a `df` argument; calling it with none
# raises the TypeError recorded in this cell's output.
# NOTE(review): filtered_refs_fp is not assigned in any cell shown here — confirm where it is defined.
count_uniq = count_unique_refs()
write_to_json(count_uniq, filtered_refs_fp)
print(len(count_uniq))

TypeError: count_unique_refs() missing 1 required positional argument: 'df'

In [28]:
# Load GO term info, index it by term id (kept as a column too) and shorten the column names.
terms_df = (
    pd.read_json(terms_fp)
    .set_index('term_id', drop=False)
    .rename(columns={'term_id': 'id', 'term_label': 'label'})
)
terms_df

Unnamed: 0_level_0,id,label
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
GO:0015854,GO:0015854,guanine transport
GO:0061842,GO:0061842,microtubule organizing center localization
GO:2001271,GO:2001271,negative regulation of cysteine-type endopepti...
GO:0070782,GO:0070782,phosphatidylserine exposure on apoptotic cell ...
GO:0006407,GO:0006407,rRNA export from nucleus
...,...,...
GO:0009098,GO:0009098,leucine biosynthetic process
GO:0007608,GO:0007608,sensory perception of smell
GO:0006253,GO:0006253,dCTP catabolic process
GO:0038098,GO:0038098,sequestering of BMP from receptor via BMP binding


In [10]:
# Load term dispositions, rename the id columns and index by the renamed 'id' (kept as a column too).
term_dispositions_df = (
    pd.read_json(term_dispositions_fp)
    .rename(columns={'term_id': 'id', 'affected_term_id': 'target_id'})
    .set_index('id', drop=False)
)

term_dispositions_df

Unnamed: 0_level_0,id,disposition,target_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GO:0001818,GO:0001818,negative,GO:0001816
GO:0001915,GO:0001915,negative,GO:0001913
GO:0001916,GO:0001916,positive,GO:0001913
GO:0001938,GO:0001938,positive,GO:0001935
GO:0001953,GO:0001953,negative,GO:0007160
...,...,...,...
GO:1902808,GO:1902808,positive,GO:0044843
GO:1902902,GO:1902902,negative,GO:0000045
GO:1903036,GO:1903036,positive,GO:0009611
GO:1903070,GO:1903070,negative,GO:0030433


In [15]:
# Load gene info with taxon_id read as a string, indexed by gene id (kept as a column too).
genes_df = (
    pd.read_json(genes_fp, dtype={'taxon_id':str})
    .set_index('gene', drop=False)
)
genes_df

Unnamed: 0_level_0,gene,gene_symbol,gene_name,taxon_id,panther_family,long_id
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UniProtKB:Q8IWV8,UniProtKB:Q8IWV8,UBR2,E3 ubiquitin-protein ligase UBR2,9606,,
UniProtKB:Q96D42,UniProtKB:Q96D42,HAVCR1,Hepatitis A virus cellular receptor 1,9606,,
UniProtKB:Q96RW7,UniProtKB:Q96RW7,HMCN1,Hemicentin-1,9606,,
UniProtKB:P37059,UniProtKB:P37059,HSD17B2,17-beta-hydroxysteroid dehydrogenase type 2,9606,,
UniProtKB:Q8WXG6,UniProtKB:Q8WXG6,MADD,MAP kinase-activating death domain protein,9606,,
...,...,...,...,...,...,...
UniProtKB:Q9H3U5,UniProtKB:Q9H3U5,MFSD1,Major facilitator superfamily domain-containin...,9606,,
UniProtKB:O94989,UniProtKB:O94989,ARHGEF15,Rho guanine nucleotide exchange factor 15,9606,,
UniProtKB:O00458,UniProtKB:O00458,IFRD1,Interferon-related developmental regulator 1,9606,,
UniProtKB:Q96I25,UniProtKB:Q96I25,RBM17,Splicing factor 45,9606,,


In [24]:
# Load the organised IBD modules file; the preview below shows an 'id' column and a nested 'categories' column.
ibd_modules_df = pd.read_json(ibd_modules_fp)

ibd_modules_df

Unnamed: 0,id,categories
0,GO:0071840,"[{'id': 'GO:0034330', 'modules': [{'module_ter..."
1,GO:0051179,"[{'id': 'other localization', 'modules': [{'mo..."
2,GO:0050896,"[{'id': 'GO:0007165', 'modules': [{'module_ter..."
3,GO:0002376,"[{'id': 'other immune system process', 'module..."
4,GO:0010817,"[{'id': 'other regulation of hormone levels', ..."
5,GO:0042592,"[{'id': 'other homeostatic process', 'modules'..."
6,GO:0007623,"[{'id': 'other circadian rhythm', 'modules': [..."
7,GO:0008152,"[{'id': 'GO:0006259', 'modules': [{'module_ter..."
8,GO:0003008,"[{'id': 'GO:0003013', 'modules': [{'module_ter..."
9,GO:0009987,"[{'id': 'GO:0007155', 'modules': [{'module_ter..."


In [30]:
import pandas as pd
from pandas import json_normalize

# Assuming ibd_modules_df is already loaded with your JSON data
# Flatten the JSON structure into one record per module.
# NOTE(review): in the rendered output every label/module column is empty. The data
# preview shows 'module_term' holding a string id (e.g. 'GO:0120192') and category
# dicts keyed by 'id' and 'modules', so the .get('label') / .get('id') lookups and the
# isinstance(module_term, dict) checks below appear not to match the data — confirm the schema.
flattened_data = []

# Iterate through each entry in the DataFrame
for entry in ibd_modules_df.itertuples():
    # This is the id of the top-level entry, not the 'id' of the nested category dict.
    category_id = entry.id
    # Check if categories is a list and iterate
    if isinstance(entry.categories, list):
        for category in entry.categories:
            category_label = category.get('label') if isinstance(category, dict) else None
            # Check if modules is a list and iterate
            # NOTE(review): category.get is called unguarded here, so a non-dict category
            # would raise AttributeError despite the isinstance guard on the line above.
            if isinstance(category.get('modules'), list):
                for module in category['modules']:
                    module_id = module.get('id') if isinstance(module, dict) else None
                    module_label = module.get('label') if isinstance(module, dict) else None
                    module_term = module.get('module_term') if isinstance(module, dict) else None
                    module_term_id = module_term.get('id') if isinstance(module_term, dict) else None
                    module_term_label = module_term.get('label') if isinstance(module_term, dict) else None
                    flattened_data.append({
                        'category_id': category_id,
                        'category_label': category_label,
                        'module_id': module_id,
                        'module_label': module_label,
                        'module_term_id': module_term_id,
                        'module_term_label': module_term_label,
                        # Stores the category's whole modules list on every row, not just this module.
                        'modules': category.get('modules') if isinstance(category, dict) else None
                    })

# Convert to DataFrame
flat_df = pd.DataFrame(flattened_data)

# Assuming terms_df is loaded and structured appropriately for merging
# Merge with terms_df to get labels for IDs
# Adjust these merges based on the actual structure of terms_df
# NOTE(review): the columns these three merges add are not among those selected into final_df below.
flat_df = flat_df.merge(terms_df, left_on='category_id', right_on='term_id', how='left')
flat_df = flat_df.merge(terms_df, left_on='module_id', right_on='id', how='left')
flat_df = flat_df.merge(terms_df, left_on='module_term_id', right_on='id', how='left')

# Select and rename columns as needed
final_df = flat_df[['category_id', 'category_label', 'module_id', 'module_label', 'module_term_id', 'module_term_label', 'modules']]

final_df

Unnamed: 0,category_id,category_label,module_id,module_label,module_term_id,module_term_label,modules
0,GO:0071840,,,,,,"[{'module_term': 'GO:0120192', 'nodes': [{'ptn..."
1,GO:0071840,,,,,,"[{'module_term': 'GO:0120192', 'nodes': [{'ptn..."
2,GO:0071840,,,,,,"[{'module_term': 'GO:0120192', 'nodes': [{'ptn..."
3,GO:0071840,,,,,,"[{'module_term': 'GO:0120192', 'nodes': [{'ptn..."
4,GO:0071840,,,,,,"[{'module_term': 'GO:0120192', 'nodes': [{'ptn..."
...,...,...,...,...,...,...,...
773,other biological process,,,,,,"[{'module_term': 'GO:0010985', 'nodes': [{'ptn..."
774,other biological process,,,,,,"[{'module_term': 'GO:0007631', 'nodes': [{'ptn..."
775,other biological process,,,,,,"[{'module_term': 'GO:0099623', 'nodes': [{'ptn..."
776,other biological process,,,,,,"[{'module_term': 'GO:0042698', 'nodes': [{'ptn..."


In [None]:
# Load the first 10 annotations and left-join gene attributes on the gene id.
# NOTE(review): annotations_fp is not assigned in any cell shown here — confirm where it is defined.
# NOTE(review): the genes_df preview earlier lists gene, gene_symbol, gene_name, taxon_id,
# panther_family and long_id; the taxon_label/taxon_abbr/coordinates_* columns selected
# below are not among them — verify they exist in the loaded table.
annos_df = pd.read_json(annotations_fp)[:10]
annos_df = annos_df.merge(genes_df[
    ['taxon_id', 
     'taxon_label', 
     'taxon_abbr' , 
     'coordinates_chr_num',
     'coordinates_start',
     'coordinates_end',
     'coordinates_strand']], how='left', left_on="gene", right_index=True)
#anno_json = annos_df.to_json(orient="records", default_handler=None)
#json_str = json.loads(anno_json)
#write_to_json(json_str, ospath.join(sample_annotations_fp))
#pprint.pprint(json_str)
annos_df

In [None]:
%%time

# Load the first 100 annotations and left-join gene attributes on the gene id.
# NOTE(review): annotations_fp is not assigned in any cell shown here, and several of the
# selected genes_df columns are absent from the genes_df preview — verify both.
annos_df = pd.read_json(annotations_fp)[:100]
annos_df = annos_df.merge(genes_df[['gene_symbol',
     'gene_name','taxon_id', 'taxon_label',
                                    'taxon_abbr', 
                                    'coordinates_chr_num','coordinates_start','coordinates_end','coordinates_strand']], how='left', left_on="gene", right_index=True)
#annos_df['aspect'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x)['aspect'])
annos_df



In [None]:
%%time

def count_evidence(evidences):
    """Return the number of evidence items in ``evidences``."""
    total = len(evidences)
    return total

def generate_id(row):
    """Return the row's 'gene' and '_id' values joined by an underscore."""
    return '{}_{}'.format(row["gene"], row["_id"])

def get_groups(evidences):
    """Return the distinct group values across all evidence items as a list (order unspecified)."""
    distinct = {group for evidence in evidences for group in evidence['groups']}
    return list(distinct)
    
# Build enriched annotation records from the first 5 annotations and write them out as JSON.
# NOTE(review): annotations_fp, articles_df and clean_annotations_fp are not assigned in any
# cell shown here — confirm where they are defined.
# NOTE(review): several genes_df columns selected below are absent from the genes_df preview
# shown earlier — verify they exist.
annos_df = pd.read_json(annotations_fp)[:5]
annos_df = annos_df.merge(genes_df[
    ['gene_symbol',
     'gene_name',
     'taxon_id', 
     'taxon_label', 
     'taxon_abbr' , 
     'coordinates_chr_num',
     'coordinates_start',
     'coordinates_end',
     'coordinates_strand']], how='left', left_on="gene", right_index=True)
# 'aspect' must be derived before 'term' is replaced: both lines look the term id up in terms_df.
# NOTE(review): the terms_df preview shows only 'id' and 'label' columns — confirm 'aspect' exists.
annos_df['aspect'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x)['aspect'])
annos_df['term'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x))
annos_df['term_type'] = annos_df['term'].apply(lambda x: term_type(x))
annos_df['slim_terms'] = annos_df['slim_terms'].apply(lambda x: spread_terms(terms_df, x))
annos_df['qualifier'] = annos_df['qualifier'].str.replace('_', ' ')
# 'evidence' must be expanded before evidence_type/groups/evidence_count, which read the expanded form.
annos_df['evidence'] = annos_df.apply(lambda x: get_evidence(articles_df, genes_df, x),axis=1)
annos_df['evidence_type'] = annos_df.apply(lambda x: get_evidence_type(x), axis=1)
annos_df['groups'] = annos_df['evidence'].apply(lambda x: get_groups(x))
annos_df['evidence_count'] = annos_df['evidence'].apply(lambda x: count_evidence(x))


   
# Round-trip through to_json/json.loads to get plain Python records for write_to_json.
anno_json = annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', clean_annotations_fp))
annos_df

In [None]:
# Seed '_id' from the frame's index, then replace it with the id built by generate_id.
annos_df['_id'] = annos_df.index
annos_df['_id'] = annos_df.apply(generate_id, axis=1)
annos_df


In [None]:
%%time

def term_display_id(term):
    """Return ``term['id']`` when it starts with "GO", otherwise an empty string."""
    identifier = term['id']
    if identifier.startswith("GO"):
        return identifier
    return ''

# Export a slim gene/term table for the first 5 annotations as zipped CSV and as JSON.
# NOTE(review): annotations_fp, export_annotations_csv_fp and export_annotations_json_fp are
# not assigned in any cell shown here — confirm where they are defined.
fields=['gene', 'term']
export_annos_df = pd.read_json(annotations_fp)[:5]
export_annos_df = export_annos_df[fields]
export_annos_df = export_annos_df.merge(genes_df[
    ['gene_symbol',
     'gene_name']], how='left', left_on="gene", right_index=True)
# Expand the term id into its terms_df row, pull out the display fields, then drop the dict column.
export_annos_df['term'] = export_annos_df['term'].apply(lambda x: get_pd_row(terms_df, x))
export_annos_df['term_id'] = export_annos_df['term'].apply(lambda x: term_display_id(x))
export_annos_df['term_label'] = export_annos_df['term'].apply(lambda x: x['label'])
export_annos_df = export_annos_df.drop(columns=['term'])

# Write the CSV inside a zip archive under the member name 'annotations.csv'.
compression_opts = dict(method='zip',
                        archive_name='annotations.csv')  
export_annos_df.to_csv(ospath.join('.', export_annotations_csv_fp), index=False, compression=compression_opts)  
   
    
export_anno_json = export_annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(export_anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', export_annotations_json_fp), zip=True)


    
export_annos_df

In [None]:
%%time
    
# Load the old annotations, left-join gene attributes on the gene id and write the result as JSON.
# NOTE(review): old_annotations_fp and old_clean_annotations_fp are not assigned in any cell
# shown here, and several selected genes_df columns are absent from the genes_df preview — verify.
old_annos_df = pd.read_json(old_annotations_fp)
old_annos_df = old_annos_df.merge(genes_df[
    [ 'taxon_id', 
     'taxon_label', 
     'taxon_abbr' , 
     'coordinates_chr_num',
     'coordinates_start',
     'coordinates_end',
     'coordinates_strand']], how='left', left_on="gene", right_index=True)

   
anno_json = old_annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', old_clean_annotations_fp))
old_annos_df

In [None]:
# Recompute evidence_type for every row and show only the rows classified as 'direct'.
annos_df['evidence_type'] = annos_df.apply(lambda x: get_evidence_type(x), axis=1)
annos_df[annos_df['evidence_type']=='direct']

In [None]:
# Inspect the second annotation row (position 1) as a plain dict.
print(dict(annos_df.iloc[1]))

In [None]:
# Reload the cleaned annotations from disk.
# NOTE(review): clean_annotations_fp is not assigned in any cell shown here — confirm where it is defined.
clean_annos_df = pd.read_json(clean_annotations_fp)
clean_annos_df

In [None]:
# Reload the old cleaned annotations from disk.
# NOTE(review): old_clean_annotations_fp is not assigned in any cell shown here — confirm where it is defined.
old_clean_annos_df = pd.read_json(old_clean_annotations_fp)

In [None]:
# Keep only the terms whose is_goslim flag is True.
# NOTE(review): the terms_df preview earlier shows only 'id' and 'label' columns — confirm 'is_goslim' exists.
filtered_terms = terms_df[terms_df['is_goslim']==True]
filtered_terms

In [None]:
def add_terms():
    """Return ``len(human_df) - 1`` randomly chosen rows of ``filtered_terms`` as dicts.

    Rows are drawn with replacement using ``random.randrange``; both frames are
    read from the notebook's global namespace.
    """
    return [
        dict(filtered_terms.iloc[random.randrange(0, len(filtered_terms))])
        for _ in range(len(human_df) - 1)
    ]

# Position of the human_df row that add_gene() reads.
count = 1

def add_gene():
    """Return the 'gene' value of the ``human_df`` row at position ``count``.

    ``count`` is a module-level value that this function never changes, so
    every call returns the same gene.
    """
    selected = human_df.iloc[count]
    return selected['gene']

    
                            
#add_terms()

# NOTE(review): the only assignment to ann2_df in this cell is the commented-out line below,
# and human_df is not assigned in any cell shown here — as written this cell relies on
# names left over from an earlier kernel state.
#ann2_df = pd.DataFrame.from_dict(add_terms())
ann2_df

# ann3 is not read again in the cells shown here.
ann3 = human_df.agg('-'.join, axis=1)
ann2_df

add_gene()
# add_gene() ignores the id it is applied to, so every row receives the same gene value.
ann2_df['gene'] = ann2_df['id'].apply(lambda x: add_gene())
ann2_df

json_chunk = ann2_df.to_json(orient="records", default_handler=None)
json_str = json.loads(json_chunk)

write_to_json(json_str, 'out.json')
                     

In [None]:
class CustomJSONizer(json.JSONEncoder):
    """JSON encoder that serialises ``numpy.bool_`` values as JSON booleans."""

    def default(self, obj):
        """Convert objects the base encoder cannot serialise.

        ``default`` has to return a serialisable object, which the encoder then
        encodes itself. Returning an already-encoded string (as the previous
        ``super().encode(bool(obj))`` did) makes the value come out as the JSON
        string "true"/"false" instead of a boolean.

        Raises:
            TypeError: from the base class, for any other unsupported type.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        return super().default(obj)
    
def count_unique_terms(annos_df):
    """Return the distinct ``term['id']`` values in ``annos_df['term']`` as a list (order unspecified)."""
    term_ids = {term['id'] for term in list(annos_df['term'])}
    return list(term_ids)

#count_uniq = count_unique_terms()
#write_to_json(count_uniq, filtered_terms_fp, cls=CustomJSONizer)
#print(len(count_uniq))

def count_unique_slim_terms(annos_df):
    """Return the distinct term ids found in the lists under ``annos_df['slim_terms']`` (order unspecified)."""
    slim_ids = {
        term['id']
        for s_terms in list(annos_df['slim_terms'])
        for term in s_terms
    }
    return list(slim_ids)

#count_uniq = count_unique_slim_terms()
#write_to_json(count_uniq, filtered_slim_terms_fp, cls=CustomJSONizer)
#print(count_uniq)
def count_unique_refs(annos_df):
    """Return the distinct 'pmid' values of all non-``None`` references under ``annos_df['evidence']``.

    The result is a list in unspecified order. This definition replaces the
    earlier ``count_unique_refs`` in the notebook, which collected the raw
    reference values instead of their 'pmid'.
    """
    pmids = {
        ref['pmid']
        for evidences in list(annos_df['evidence'])
        for evidence in evidences
        for ref in evidence['references']
        if ref is not None
    }
    return list(pmids)

def count_unique_withs(annos_df):
    """Return the distinct 'gene' ids of the ``with_gene_id`` records under ``annos_df['evidence']``.

    Evidence whose ``with_gene_id`` is ``None`` is skipped, in the same way that
    ``count_unique_refs`` skips ``None`` references. The result is a list in
    unspecified order.
    """
    genes = set()

    for evidences in list(annos_df['evidence']):
        for evidence in evidences:
            with_gene = evidence['with_gene_id']
            # A missing gene record has no 'gene' key to read.
            if with_gene is not None:
                genes.add(with_gene['gene'])

    return list(genes)

# Number of distinct reference pmids in the cleaned annotations.
len(count_unique_refs(clean_annos_df))

In [None]:
# Columns holding one scalar value per annotation row.
# A list (rather than a set) keeps the rows of the summary table in the same order on
# every run; the iteration order of a set of strings can change between sessions.
scalar_cols = [
    'gene',
    'gene_symbol',
    'gene_name',
    'taxon_id',
    'taxon_label',
    'taxon_abbr',
    'coordinates_chr_num',
    'coordinates_start',
    'coordinates_end',
    'aspect',
    'evidence_type',
]


# Distinct-value count per scalar column.
stats = {k: len(clean_annos_df[k].unique()) for k in scalar_cols }

# Nested columns need the dedicated counters defined earlier.
stats['terms']=len(count_unique_terms(clean_annos_df))
stats['slim_terms']=len(count_unique_slim_terms(clean_annos_df))
stats['references']=len(count_unique_refs(clean_annos_df))
stats['with_gene']=len(count_unique_withs(clean_annos_df))
stats_df = pd.DataFrame.from_dict(stats, orient ='index', columns=['Unique Count'])
stats_df
#clean_annos_df['evidence_type'].unique()

In [None]:
# Columns holding one scalar value per row of the old annotations.
# A list (rather than a set) keeps the rows of the summary table in the same order on
# every run; the iteration order of a set of strings can change between sessions.
scalar_cols = [
    'gene',
    'gene_symbol',
    'gene_name',
    'taxon_id',
    'taxon_label',
    'taxon_abbr',
    'coordinates_chr_num',
    'coordinates_start',
    'coordinates_end',
]


# Distinct-value count per scalar column.
stats = {k: len(old_clean_annos_df[k].unique()) for k in scalar_cols }

stats_df = pd.DataFrame.from_dict(stats, orient ='index', columns=['Unique Count'])
stats_df
#clean_annos_df['evidence_type'].unique()

In [None]:
# Find gene ids that still appear on more than one row after de-duplicating on
# (gene, gene_symbol, gene_name) and then keeping the first row per gene_symbol.
# NOTE(review): duplicate_gene_fp is not assigned in any cell shown here — confirm where it is defined.
unique_genes = clean_annos_df.drop_duplicates(subset=['gene', 'gene_symbol', 'gene_name'])
unique_genes = unique_genes[['gene', 'gene_symbol', 'gene_name']]
unique_genes = unique_genes[~unique_genes.duplicated(subset=['gene_symbol'])]
# duplicated() marks the second and later occurrences of each gene id.
duplicate_genes = unique_genes[unique_genes.duplicated(subset=['gene'])]
duplicate_genes = duplicate_genes[['gene']]
# Pull every row (including the first occurrence) for those gene ids.
res = unique_genes[unique_genes['gene'].isin(list(duplicate_genes['gene']))]
res = res.sort_values(by=['gene'])
res.to_csv(duplicate_gene_fp)
res

In [None]:
# One row per gene id, sorted by name; then list the repeated gene_name rows whose
# name is 'Uncharacterized protein (Fragment)'.
unique_genes = clean_annos_df.drop_duplicates(subset=['gene'])
unique_genes = unique_genes.sort_values(by=['gene_name'])
#unique_genes.to_csv(duplicate_gene_fp)
unique_genes = unique_genes[['gene', 'gene_symbol', 'gene_name']]
# duplicated() marks the second and later occurrences of each gene_name.
duplicate_genes = unique_genes[unique_genes.duplicated(subset=['gene_name'])]
duplicate_genes[duplicate_genes['gene_name']=='Uncharacterized protein (Fragment)']
#unique_genes.to_csv('./downloads/genes_symb.csv')

In [None]:
# Same check on the old annotations, but for repeated gene_symbol values.
# This rebinds unique_genes, which the previous cells derived from clean_annos_df.
unique_genes = old_clean_annos_df.drop_duplicates(subset=['gene'])
unique_genes = unique_genes.sort_values(by=['gene_name'])
#unique_genes.to_csv(duplicate_gene_fp)
unique_genes = unique_genes[['gene', 'gene_symbol', 'gene_name']]
duplicate_genes_2 = unique_genes[unique_genes.duplicated(subset=['gene_symbol'])]
duplicate_genes_2
#unique_genes.to_csv('./downloads/genes_symb.csv')

In [None]:
# Rows of duplicate_genes whose gene_symbol does not occur in duplicate_genes_2.
# Depends on whichever earlier cell last assigned duplicate_genes.
res = duplicate_genes[~duplicate_genes['gene_symbol'].isin(list(duplicate_genes_2['gene_symbol']))]
res

In [None]:
# Look up one gene record by its id in the gene table.
# `.loc` is an indexer and takes square brackets; calling it with parentheses passes
# the label as an `axis` argument instead of selecting a row.
# NOTE(review): the original referenced `gene_df`, which no cell shown here defines;
# `genes_df` (indexed by 'gene') is assumed to be the intended table — confirm.
genes_df.loc['UniProtKB:X6R8D5']

In [None]:
# Add a term_label column (mutating clean_annos_df in place), then list one row per
# gene_name among annotations labelled 'Unknown cellular component'.
clean_annos_df['term_label'] = clean_annos_df['term'].apply(lambda x: x['label'])
unknown_cc_df = clean_annos_df[clean_annos_df['term_label']=='Unknown cellular component']
unique_genes = unknown_cc_df.drop_duplicates(subset=['gene_name'])
unique_genes

In [None]:
# Reload the first 100 cleaned annotations without the 'evidence' column.
clean_annos_df = (
    pd.read_json(clean_annotations_fp)[:100]
    .drop(columns=['evidence'])
)
clean_annos_df

In [None]:
def uniquify_term(series):
    """Return one term dict per distinct 'id' from ``series``, skipping non-dict items.

    When an id repeats, the last dict seen is kept, in the position of the first occurrence.
    """
    by_id = {item['id']: item for item in series if isinstance(item, dict)}
    return list(by_id.values())

def uniquify_slim_terms(series):
    """Return one slim-term dict per distinct 'id' across the lists in ``series``.

    Entries of ``series`` that are not lists are skipped. When an id repeats,
    the last dict seen is kept, in the position of the first occurrence.
    """
    by_id = {
        item['id']: item
        for item_list in series
        if isinstance(item_list, list)
        for item in item_list
    }
    return list(by_id.values())

# Collapse annotations to one row per gene with de-duplicated terms and slim terms.
# NOTE(review): this rebinds genes_df, which earlier cells use for the gene table indexed
# by 'gene'; re-running those cells after this one would see this grouped frame instead.
# NOTE(review): genes_annotations_fp is not assigned in any cell shown here — confirm where it is defined.
genes_df = (
    clean_annos_df.groupby('gene')
    .agg({
        'term': uniquify_term,
        'slim_terms': uniquify_slim_terms
    })
    .reset_index()
)

# Round-trip through to_json/json.loads to get plain Python records for write_to_json.
anno_json = genes_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', genes_annotations_fp), indent=2)

genes_df

In [None]:
def uniquify_term(series, evidence_series):
    """De-duplicate term dicts by 'id' and report the repeats.

    Each dict in ``series`` is copied, loses its 'is_goslim' key and gains an
    'evidence_type' taken from the same position of ``evidence_series``.
    Non-dict items are skipped.

    Returns a ``(unique, duplicates)`` tuple: ``unique`` holds the last copy
    seen for every id, ``duplicates`` holds every copy whose id had already
    been seen.
    """
    latest_by_id = {}
    seen_ids = set()
    repeats = []

    for position, item in enumerate(series):
        if not isinstance(item, dict):
            continue

        term_id = item['id']
        annotated = item.copy()
        annotated.pop('is_goslim', None)
        annotated['evidence_type'] = evidence_series.iloc[position]

        if term_id in seen_ids:
            repeats.append(annotated)
        seen_ids.add(term_id)
        latest_by_id[term_id] = annotated

    return (list(latest_by_id.values()), repeats)


def uniquify_slim_terms(series, evidence_series):
    """De-duplicate slim-term dicts by 'id' across the lists in ``series``.

    Each dict is copied, loses its 'is_goslim' key and gains the 'evidence_type'
    found at the position of its parent list in ``evidence_series``. Entries of
    ``series`` that are not lists are skipped; the last copy seen per id is kept.
    """
    latest_by_id = {}
    for position, slim_list in enumerate(series):
        if not isinstance(slim_list, list):
            continue
        for slim in slim_list:
            annotated = slim.copy()
            annotated.pop('is_goslim', None)
            annotated['evidence_type'] = evidence_series.iloc[position]
            latest_by_id[annotated['id']] = annotated
    return list(latest_by_id.values())


def _collect_gene_terms(group):
    """Summarise one gene's annotations: unique terms, repeated terms and unique slim terms."""
    # uniquify_term returns both lists from one pass, so call it once per group
    # instead of once for each output column.
    terms, duplicated_terms = uniquify_term(group['term'], group['evidence_type'])
    return pd.Series({
        'terms': terms,
        'duplicated_terms': duplicated_terms,
        'slim_terms': uniquify_slim_terms(group['slim_terms'], group['evidence_type'])
    })


# One row per gene. Note that this rebinds genes_df, which earlier cells use for the gene table.
genes_df = clean_annos_df.groupby('gene').apply(_collect_gene_terms).reset_index()

# Round-trip through to_json/json.loads to get plain Python records for write_to_json.
anno_json = genes_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', genes_annotations_fp), indent=2)

genes_df


In [None]:
def uniquify_term(series, evidence_series):
    """Return annotated copies of the term dicts in ``series``, requiring unique ids.

    Each dict is copied, loses its 'is_goslim' key and gains an 'evidence_type'
    taken from the same position of ``evidence_series``. Non-dict items are skipped.

    Raises:
        ValueError: when a term id occurs more than once.
    """
    collected = {}
    seen_ids = set()

    for position, item in enumerate(series):
        if not isinstance(item, dict):
            continue

        term_id = item['id']
        annotated = item.copy()
        annotated.pop('is_goslim', None)
        annotated['evidence_type'] = evidence_series.iloc[position]

        if term_id in seen_ids:
            raise ValueError(f"Duplicate term found: {annotated}")
        seen_ids.add(term_id)
        collected[term_id] = annotated

    return list(collected.values())


def uniquify_slim_terms(series, evidence_series):
    """Return one annotated slim-term dict per distinct 'id' across the lists in ``series``.

    Every dict is copied, stripped of 'is_goslim' and tagged with the
    'evidence_type' at its parent list's position in ``evidence_series``.
    Non-list entries are ignored and later copies replace earlier ones.
    """
    by_id = {}
    for row_pos, candidates in enumerate(series):
        if isinstance(candidates, list):
            for candidate in candidates:
                tagged = candidate.copy()
                tagged.pop('is_goslim', None)
                tagged['evidence_type'] = evidence_series.iloc[row_pos]
                by_id[tagged['id']] = tagged
    return list(by_id.values())

# Per-gene columns copied from the first row of each group.
columns_to_extract = [
    'gene_name',
    'taxon_id',
    'taxon_label',
    'taxon_abbr',
    'panther_family',
    'long_id',
    'coordinates_chr_num',
    'coordinates_start',
    'coordinates_end',
    'gene_symbol'
]

def group_terms(group):
    """Summarise one gene's annotation rows as a Series.

    The Series carries the first-row value of every column in
    ``columns_to_extract``, followed by 'terms', 'slim_terms' and 'terms_count'
    (the number of unique terms).
    """
    unique_terms = uniquify_term(group['term'], group['evidence_type'])
    slim_terms = uniquify_slim_terms(group['slim_terms'], group['evidence_type'])

    summary = {col: group[col].iloc[0] for col in columns_to_extract}
    summary['terms'] = unique_terms
    summary['slim_terms'] = slim_terms
    summary['terms_count'] = len(unique_terms)
    return pd.Series(summary)

# One summary row per gene, ordered from most to fewest unique terms.
genes_df = (
    clean_annos_df.groupby('gene')
    .apply(group_terms)
    .reset_index()
    .sort_values(by='terms_count', ascending=False)
    .reset_index(drop=True)
)


# Round-trip through to_json/json.loads to get plain Python records for write_to_json.
anno_json = genes_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', genes_annotations_fp), indent=2)

genes_df