In [1]:
import json
import gzip
from os import path as ospath
import pandas as pd
import numpy as np
import random
import pprint
from src.utils import write_to_json
# Input files read by the loading cells below.
terms_fp = './downloads/input/go_term_info.json'
genes_fp = './downloads/input/bp_module_gene_info.json'
term_dispositions_fp = './downloads/input/term_dispositions.json'
ibd_modules_fp = './downloads/input/ibd_modules_organized.json'

# Output file. It is passed to write_to_json twice in this notebook: once with
# the enriched nested structure and once with the flattened records.
clean_ibd_modules_fp = './downloads/input/clean_ibd_modules.json'


In [None]:
def get_pd_row(df, k):
    """Return the row of ``df`` labelled ``k`` as a dict, leaving out missing values.

    Raises ``KeyError`` when ``k`` is not present in the index.
    """
    row = df.loc[k]
    present = row.dropna()
    return dict(present)

def spread_terms(df, terms):
    """Expand every label in ``terms`` into its row dict from ``df`` (see ``get_pd_row``).

    The result keeps the order of ``terms``; an unknown label raises ``KeyError``.
    """
    rows = []
    for term in terms:
        rows.append(get_pd_row(df, term))
    return rows

def get_pd_row_key(df, k):
    """Like ``get_pd_row`` but return ``None`` instead of raising for an unknown label.

    Returns the row labelled ``k`` as a dict without missing values, or ``None``
    when the lookup raises ``KeyError``.
    """
    try:
        row = df.loc[k]
        return dict(row.dropna())
    except KeyError:
        return None


In [2]:
# Load the term table, keep `term_id` both as index and as a column, then expose
# the columns under the shorter names `id` / `label` used by the rest of the notebook.
terms_df = (
    pd.read_json(terms_fp)
    .set_index('term_id', drop=False)
    .rename(columns={'term_id': 'id', 'term_label': 'label'})
)
terms_df

Unnamed: 0_level_0,id,label
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
GO:0042552,GO:0042552,myelination
GO:0045065,GO:0045065,cytotoxic T cell differentiation
GO:0001992,GO:0001992,regulation of systemic arterial blood pressure...
GO:0048247,GO:0048247,lymphocyte chemotaxis
GO:0045954,GO:0045954,positive regulation of natural killer cell med...
...,...,...
OTHER:0008,OTHER:0008,other circadian rhythm
OTHER:0009,OTHER:0009,other metabolic process
OTHER:0010,OTHER:0010,other system process
OTHER:0011,OTHER:0011,other cellular process


In [3]:
# Load the disposition table indexed by `term_id`; `affected_term_id` becomes
# `disposition_target_id`, which is the column later matched against module ids.
term_dispositions_df = (
    pd.read_json(term_dispositions_fp)
    .set_index('term_id', drop=False)
    .rename(columns={'term_id': 'id', 'affected_term_id': 'disposition_target_id'})
)

term_dispositions_df

Unnamed: 0_level_0,id,disposition,disposition_target_id
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GO:0001818,GO:0001818,negative,GO:0001816
GO:0001915,GO:0001915,negative,GO:0001913
GO:0001916,GO:0001916,positive,GO:0001913
GO:0001938,GO:0001938,positive,GO:0001935
GO:0001953,GO:0001953,negative,GO:0007160
...,...,...,...
GO:1904262,GO:1904262,negative,GO:0038202
GO:1904263,GO:1904263,positive,GO:0038202
GO:1903036,GO:1903036,positive,GO:0009611
GO:1903070,GO:1903070,negative,GO:0030433


In [4]:
# Load gene metadata indexed by gene id; `taxon_id` is forced to str so pandas
# does not convert it to a number.
genes_df = (
    pd.read_json(genes_fp, dtype={'taxon_id':str})
    .set_index('gene', drop=False)
)
genes_df

Unnamed: 0_level_0,gene,gene_symbol,gene_name,taxon_id,panther_family,long_id
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
UniProtKB:Q8IWV8,UniProtKB:Q8IWV8,UBR2,E3 ubiquitin-protein ligase UBR2,9606,,
UniProtKB:Q96D42,UniProtKB:Q96D42,HAVCR1,Hepatitis A virus cellular receptor 1,9606,,
UniProtKB:Q96RW7,UniProtKB:Q96RW7,HMCN1,Hemicentin-1,9606,,
UniProtKB:P37059,UniProtKB:P37059,HSD17B2,17-beta-hydroxysteroid dehydrogenase type 2,9606,,
UniProtKB:Q8WXG6,UniProtKB:Q8WXG6,MADD,MAP kinase-activating death domain protein,9606,,
...,...,...,...,...,...,...
UniProtKB:Q9H3U5,UniProtKB:Q9H3U5,MFSD1,Major facilitator superfamily domain-containin...,9606,,
UniProtKB:O94989,UniProtKB:O94989,ARHGEF15,Rho guanine nucleotide exchange factor 15,9606,,
UniProtKB:O00458,UniProtKB:O00458,IFRD1,Interferon-related developmental regulator 1,9606,,
UniProtKB:Q96I25,UniProtKB:Q96I25,RBM17,Splicing factor 45,9606,,


In [5]:
# Load the module hierarchy: one row per section `id`, with the nested
# categories -> modules structure kept as Python objects in `categories`.
ibd_modules_df = pd.read_json(ibd_modules_fp)

ibd_modules_df

Unnamed: 0,id,categories
0,GO:0071840,"[{'id': 'GO:0034330', 'modules': [{'module_ter..."
1,GO:0051179,"[{'id': 'OTHER:0003', 'modules': [{'module_ter..."
2,GO:0050896,"[{'id': 'GO:0007165', 'modules': [{'module_ter..."
3,GO:0002376,"[{'id': 'OTHER:0005', 'modules': [{'module_ter..."
4,GO:0010817,"[{'id': 'OTHER:0006', 'modules': [{'module_ter..."
5,GO:0042592,"[{'id': 'OTHER:0007', 'modules': [{'module_ter..."
6,GO:0007623,"[{'id': 'OTHER:0008', 'modules': [{'module_ter..."
7,GO:0008152,"[{'id': 'GO:0006259', 'modules': [{'module_ter..."
8,GO:0003008,"[{'id': 'GO:0003013', 'modules': [{'module_ter..."
9,GO:0009987,"[{'id': 'GO:0007155', 'modules': [{'module_ter..."


In [None]:
def enrich_json_data(data, terms_df, genes_df):
    """Add labels, child counts and gene records to the nested module structure, in place.

    Args:
        data: list of section dicts shaped like
            ``{'id', 'categories': [{'id', 'modules': [{'module_term', 'nodes': [...]}]}]}``.
        terms_df: DataFrame indexed by term id with a ``label`` column.
        genes_df: DataFrame indexed by gene id.

    Returns:
        None. ``data`` is mutated: every section, category and module gains
        ``label`` and ``count``; each node's ``terms`` (ids) is replaced by
        ``{'term_id', 'term_label'}`` dicts and ``leaf_genes`` (ids) by full
        gene records. Ids missing from the lookup frames are dropped from those
        lists, and a missing section/category/module id yields ``label = None``.

    Note:
        Because ``terms`` / ``leaf_genes`` are overwritten, run this on freshly
        loaded data only, not on data that has already been enriched.
    """
    def _label_for(term_id):
        # One lookup rule for all three levels: unknown ids give None instead of
        # raising, matching how module labels were already resolved.
        return terms_df.loc[term_id, 'label'] if term_id in terms_df.index else None

    for entry in data:
        categories = entry.get("categories", [])
        entry['label'] = _label_for(entry.get('id'))
        entry['count'] = len(categories)

        for category in categories:
            modules = category.get("modules", [])
            category["label"] = _label_for(category.get("id"))
            category['count'] = len(modules)

            for module in modules:
                module["label"] = _label_for(module.get("module_term"))
                nodes = module.get("nodes", [])
                module['count'] = len(nodes)

                for node in nodes:
                    node['terms'] = [
                        {'term_id': term, 'term_label': terms_df.loc[term, 'label']}
                        for term in node.get('terms', [])
                        if term in terms_df.index
                    ]
                    node['leaf_genes'] = [
                        genes_df.loc[gene].to_dict()
                        for gene in node.get('leaf_genes', [])
                        if gene in genes_df.index
                    ]
                   

# Re-read the raw module file as plain Python objects. JSON is decoded as UTF-8
# explicitly so the result does not depend on the platform's locale encoding.
with open(ibd_modules_fp, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Enrich in place (labels, counts, term and gene records).
enrich_json_data(json_data, terms_df, genes_df)

# Preview only the first section.
print(json.dumps(json_data[:1], indent=2))

# NOTE(review): a later cell writes the flattened records to this same path,
# replacing this nested export - confirm which of the two outputs is wanted.
write_to_json(json_data, clean_ibd_modules_fp, indent=2)

In [6]:
def find_disposition_sources(module_id, df):
    """List the terms in ``df`` whose ``disposition_target_id`` equals ``module_id``.

    Returns one ``{'term_id', 'disposition'}`` dict per matching row, in row
    order; an empty list when nothing targets ``module_id``.
    """
    targets_module = df['disposition_target_id'] == module_id
    hits = df[targets_module]
    return [
        {'term_id': term_id, 'disposition': disposition}
        for term_id, disposition in zip(hits['id'], hits['disposition'])
    ]

# Flatten section -> category -> module -> node into one record per node.
flattened_data = []

for section in ibd_modules_df.itertuples(index=False):
    section_id, categories = section.id, section.categories
    category_count = len(categories)

    for category in categories:
        category_id = category['id']
        modules = category['modules']
        module_count = len(modules)

        for module in modules:
            module_id = module['module_term']
            # Looked up once per module; the same list is shared by all of its node rows.
            disposition_sources = find_disposition_sources(module_id, term_dispositions_df)

            nodes = module.get('nodes', [])
            node_count = len(nodes)

            for node in nodes:
                node_id = node.get('ptn_id')
                node_label = node.get('label')

                # Ids that are absent from the lookup frames are skipped.
                term_info = [
                    {'term_id': term, 'term_label': terms_df.loc[term, 'label']}
                    for term in node.get('terms', [])
                    if term in terms_df.index
                ]
                gene_info = [
                    genes_df.loc[gene].to_dict()
                    for gene in node.get('leaf_genes', [])
                    if gene in genes_df.index
                ]

                flattened_data.append(dict(
                    section_id=section_id,
                    category_id=category_id,
                    module_id=module_id,
                    disposition_sources=disposition_sources,
                    node_id=node_id,
                    node_label=node_label,
                    terms=term_info,
                    leaf_genes=gene_info,
                    category_count=category_count,
                    module_count=module_count,
                    node_count=node_count,
                ))

# One row per node, built from the records collected above.
flat_df = pd.DataFrame(flattened_data)

# Attach a label for each hierarchy level by left-joining terms_df three times
# (on section, category and module id). Each merge adds a `label` column that is
# renamed immediately, so the next merge can add a fresh `label`; the suffixes
# only disambiguate the repeated `id` column coming from terms_df.
flat_df = flat_df.merge(terms_df, left_on='section_id', right_on='id', how='left', suffixes=('', '_section')).rename(columns={'label': 'section_label'})
flat_df = flat_df.merge(terms_df, left_on='category_id', right_on='id', how='left', suffixes=('', '_category')).rename(columns={'label': 'category_label'})
flat_df = flat_df.merge(terms_df, left_on='module_id', right_on='id', how='left', suffixes=('', '_module_term')).rename(columns={'label': 'module_label'})
# Attach the module term's own disposition row, if it has one (left join).
# NOTE(review): if term_dispositions_df had several rows for one id this would
# duplicate node rows - confirm ids are unique there.
flat_df = flat_df.merge(term_dispositions_df, left_on='module_id', right_on='id', how='left')


# Keep only the reporting columns, in display order; the helper `id*` columns
# produced by the merges are left out.
final_df = flat_df[['section_id',
                    'section_label',
                    'category_id', 
                    'category_label',                     
                    'module_label',
                    'module_id', 
                    'disposition_sources',
                    'disposition',
                    'disposition_target_id',
                    'node_id', 
                    'node_label', 
                    'terms', 
                    'leaf_genes',
                    'category_count',
                    'module_count',
                    'node_count']]



final_df

Unnamed: 0,section_id,section_label,category_id,category_label,module_label,module_id,disposition_sources,disposition,disposition_target_id,node_id,node_label,terms,leaf_genes,category_count,module_count,node_count
0,GO:0071840,cellular component organization or biogenesis,GO:0034330,cell junction organization,tight junction assembly,GO:0120192,[],,,PTN007679700,PATJ HOMOLOG,"[{'term_id': 'GO:0120192', 'term_label': 'tigh...","[{'gene': 'UniProtKB:O75970', 'gene_symbol': '...",7,7,9
1,GO:0071840,cellular component organization or biogenesis,GO:0034330,cell junction organization,tight junction assembly,GO:0120192,[],,,PTN002698193,LIPOLYSIS-STIMULATED LIPOPROTEIN RECEPTOR,"[{'term_id': 'GO:1904274', 'term_label': 'tric...","[{'gene': 'UniProtKB:Q86X29', 'gene_symbol': '...",7,7,9
2,GO:0071840,cellular component organization or biogenesis,GO:0034330,cell junction organization,tight junction assembly,GO:0120192,[],,,PTN002731222,PROTEIN POF1B,"[{'term_id': 'GO:0070830', 'term_label': 'bice...","[{'gene': 'UniProtKB:Q8WVV4', 'gene_symbol': '...",7,7,9
3,GO:0071840,cellular component organization or biogenesis,GO:0034330,cell junction organization,tight junction assembly,GO:0120192,[],,,PTN000462700,FERM AND PDZ DOMAIN-CONTAINING PROTEIN 2,"[{'term_id': 'GO:0070830', 'term_label': 'bice...","[{'gene': 'UniProtKB:Q68DX3', 'gene_symbol': '...",7,7,9
4,GO:0071840,cellular component organization or biogenesis,GO:0034330,cell junction organization,tight junction assembly,GO:0120192,[],,,PTN004555956,MARVEL DOMAIN-CONTAINING PROTEIN 3,"[{'term_id': 'GO:0070830', 'term_label': 'bice...","[{'gene': 'UniProtKB:Q96A59', 'gene_symbol': '...",7,7,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5564,OTHER:0001,other biological process,GO:0060078,regulation of postsynaptic membrane potential,regulation of postsynaptic membrane potential,GO:0060078,[],,,PTN001806145,PRO-FMRFAMIDE-RELATED NEUROPEPTIDE FF,"[{'term_id': 'GO:0060079', 'term_label': 'exci...","[{'gene': 'UniProtKB:O15130', 'gene_symbol': '...",12,1,5
5565,OTHER:0001,other biological process,GO:0060078,regulation of postsynaptic membrane potential,regulation of postsynaptic membrane potential,GO:0060078,[],,,PTN000436343,GAMMA-AMINOBUTYRIC ACID RECEPTOR ALPHA-LIKE,"[{'term_id': 'GO:0060078', 'term_label': 'regu...","[{'gene': 'UniProtKB:P47869', 'gene_symbol': '...",12,1,5
5566,OTHER:0001,other biological process,GO:0060078,regulation of postsynaptic membrane potential,regulation of postsynaptic membrane potential,GO:0060078,[],,,PTN000751216,"NMDA RECEPTOR 2, ISOFORM C","[{'term_id': 'GO:0060079', 'term_label': 'exci...","[{'gene': 'UniProtKB:Q14957', 'gene_symbol': '...",12,1,5
5567,OTHER:0001,other biological process,GO:0060078,regulation of postsynaptic membrane potential,regulation of postsynaptic membrane potential,GO:0060078,[],,,PTN002823877,INHIBITORY SYNAPTIC FACTOR 2A,"[{'term_id': 'GO:0060080', 'term_label': 'inhi...","[{'gene': 'UniProtKB:A6NMK8', 'gene_symbol': '...",12,1,5


In [7]:
# Serialise through pandas' JSON encoder and parse back, which yields plain
# Python records (a list of dicts, despite the name `json_str`).
ibd_json = final_df.to_json(orient="records", default_handler=None)
json_str = json.loads(ibd_json)

# NOTE(review): same output path as the enriched nested export earlier in the
# notebook; whichever cell runs last determines the file content.
write_to_json(json_str, clean_ibd_modules_fp, indent=2)

In [None]:
# Keep the node rows whose module is the target of more than one disposition source.
has_multiple_sources = final_df['disposition_sources'].map(len) > 1
filtered_df = final_df[has_multiple_sources]

filtered_df

In [None]:
# First 10 annotation rows, left-joined with gene metadata on the gene-id index.
# NOTE(review): `annotations_fp` is not defined in the visible part of this
# notebook, and the genes_df preview earlier lists only gene, gene_symbol,
# gene_name, taxon_id, panther_family and long_id - the taxon_label / taxon_abbr /
# coordinates_* columns selected below are not among them. Confirm this cell is
# still meant to run here.
annos_df = pd.read_json(annotations_fp)[:10]
annos_df = annos_df.merge(genes_df[
    ['taxon_id', 
     'taxon_label', 
     'taxon_abbr' , 
     'coordinates_chr_num',
     'coordinates_start',
     'coordinates_end',
     'coordinates_strand']], how='left', left_on="gene", right_index=True)
#anno_json = annos_df.to_json(orient="records", default_handler=None)
#json_str = json.loads(anno_json)
#write_to_json(json_str, ospath.join(sample_annotations_fp))
#pprint.pprint(json_str)
annos_df

In [None]:
%%time

# Same join as the previous cell, on the first 100 rows and with
# gene_symbol / gene_name added to the selected gene columns.
# NOTE(review): `annotations_fp` is not defined in the visible notebook, and
# several selected columns do not appear in the genes_df preview shown earlier.
annos_df = pd.read_json(annotations_fp)[:100]
annos_df = annos_df.merge(genes_df[['gene_symbol',
     'gene_name','taxon_id', 'taxon_label',
                                    'taxon_abbr', 
                                    'coordinates_chr_num','coordinates_start','coordinates_end','coordinates_strand']], how='left', left_on="gene", right_index=True)
#annos_df['aspect'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x)['aspect'])
annos_df



In [None]:
%%time

def count_evidence(evidences):
    """Return how many evidence entries ``evidences`` holds."""
    total = len(evidences)
    return total

def generate_id(row):
    """Build the record id ``"<gene>_<_id>"`` from a row's ``gene`` and ``_id`` values."""
    gene = row["gene"]
    local_id = row["_id"]
    return f'{gene}_{local_id}'

def get_groups(evidences):
    """Collect the distinct values found in the ``groups`` lists of all evidences.

    Returns a list without duplicates; its order is that of a set and therefore
    not meaningful.
    """
    groups = {group for evidence in evidences for group in evidence['groups']}
    return list(groups)
    
# Build the cleaned annotation records from the first 5 raw annotation rows.
# NOTE(review): annotations_fp, term_type, get_evidence, articles_df,
# get_evidence_type and clean_annotations_fp are not defined in the visible part
# of this notebook; this cell cannot run from a fresh kernel as shown.
annos_df = pd.read_json(annotations_fp)[:5]
# Left-join gene metadata on the gene-id index of genes_df.
annos_df = annos_df.merge(genes_df[
    ['gene_symbol',
     'gene_name',
     'taxon_id', 
     'taxon_label', 
     'taxon_abbr' , 
     'coordinates_chr_num',
     'coordinates_start',
     'coordinates_end',
     'coordinates_strand']], how='left', left_on="gene", right_index=True)
# Order matters: `aspect` is read while `term` still holds the term id; the next
# line replaces `term` with the full term row dict.
# NOTE(review): the terms_df preview earlier shows only `id` and `label`, so the
# 'aspect' lookup would raise KeyError with that table - confirm.
annos_df['aspect'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x)['aspect'])
annos_df['term'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x))
annos_df['term_type'] = annos_df['term'].apply(lambda x: term_type(x))
annos_df['slim_terms'] = annos_df['slim_terms'].apply(lambda x: spread_terms(terms_df, x))
annos_df['qualifier'] = annos_df['qualifier'].str.replace('_', ' ')
# `evidence` must be built before `groups` and `evidence_count`, which read it.
annos_df['evidence'] = annos_df.apply(lambda x: get_evidence(articles_df, genes_df, x),axis=1)
annos_df['evidence_type'] = annos_df.apply(lambda x: get_evidence_type(x), axis=1)
annos_df['groups'] = annos_df['evidence'].apply(lambda x: get_groups(x))
annos_df['evidence_count'] = annos_df['evidence'].apply(lambda x: count_evidence(x))


   
# Round-trip through pandas' JSON encoder to get plain records, then write them out.
anno_json = annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', clean_annotations_fp))
annos_df

In [None]:
# Seed `_id` with the row index, then turn it into "<gene>_<index>" via generate_id.
annos_df['_id'] = annos_df.index
annos_df['_id'] = annos_df.apply(generate_id, axis=1)
annos_df


In [None]:
%%time

def term_display_id(term):
    """Return ``term['id']`` when it starts with ``"GO"``, otherwise an empty string."""
    term_id = term['id']
    if term_id.startswith("GO"):
        return term_id
    return ''

# Build a slim export (gene, gene_symbol, gene_name, term_id, term_label) from
# the first 5 raw annotation rows and write it as zipped CSV and as JSON.
# NOTE(review): annotations_fp, export_annotations_csv_fp and
# export_annotations_json_fp are not defined in the visible part of this notebook.
fields=['gene', 'term']
export_annos_df = pd.read_json(annotations_fp)[:5]
export_annos_df = export_annos_df[fields]
export_annos_df = export_annos_df.merge(genes_df[
    ['gene_symbol',
     'gene_name']], how='left', left_on="gene", right_index=True)
# Replace the term id with its row dict, derive the two flat columns from it,
# then drop the dict column again.
export_annos_df['term'] = export_annos_df['term'].apply(lambda x: get_pd_row(terms_df, x))
export_annos_df['term_id'] = export_annos_df['term'].apply(lambda x: term_display_id(x))
export_annos_df['term_label'] = export_annos_df['term'].apply(lambda x: x['label'])
export_annos_df = export_annos_df.drop(columns=['term'])

# CSV goes into a zip archive whose single member is named annotations.csv.
compression_opts = dict(method='zip',
                        archive_name='annotations.csv')  
export_annos_df.to_csv(ospath.join('.', export_annotations_csv_fp), index=False, compression=compression_opts)  
   
    
export_anno_json = export_annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(export_anno_json)
#pprint.pprint(json_str)
# `zip=True` is forwarded to the project helper write_to_json; presumably it
# compresses the output - verify against src.utils.
write_to_json(json_str, ospath.join('.', export_annotations_json_fp), zip=True)


    
export_annos_df

In [None]:
%%time
    
# Load the older annotation file, left-join gene metadata on the gene-id index,
# and write the result out as records.
# NOTE(review): old_annotations_fp and old_clean_annotations_fp are not defined
# in the visible part of this notebook, and most of the selected columns do not
# appear in the genes_df preview shown earlier.
old_annos_df = pd.read_json(old_annotations_fp)
old_annos_df = old_annos_df.merge(genes_df[
    [ 'taxon_id', 
     'taxon_label', 
     'taxon_abbr' , 
     'coordinates_chr_num',
     'coordinates_start',
     'coordinates_end',
     'coordinates_strand']], how='left', left_on="gene", right_index=True)

   
anno_json = old_annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', old_clean_annotations_fp))
old_annos_df

In [None]:
# Recompute `evidence_type` per row, then show only the rows classified 'direct'.
# NOTE(review): get_evidence_type is not defined in the visible notebook.
annos_df['evidence_type'] = annos_df.apply(lambda x: get_evidence_type(x), axis=1)
annos_df[annos_df['evidence_type']=='direct']

In [None]:
# Inspect the second row (position 1) of annos_df as a plain dict.
print(dict(annos_df.iloc[1]))

In [None]:
# Reload the cleaned annotation records from disk.
# NOTE(review): clean_annotations_fp is not defined in the visible notebook.
clean_annos_df = pd.read_json(clean_annotations_fp)
clean_annos_df

In [None]:
# Reload the older cleaned annotation records for comparison.
# NOTE(review): old_clean_annotations_fp is not defined in the visible notebook.
old_clean_annos_df = pd.read_json(old_clean_annotations_fp)

In [None]:
# Terms flagged as GO slim. The comparison with True is kept (rather than using
# the column directly as a mask) so non-boolean or missing values count as False.
# NOTE(review): the terms_df preview earlier shows no `is_goslim` column.
is_slim = terms_df['is_goslim'] == True
filtered_terms = terms_df.loc[is_slim]
filtered_terms

In [None]:
def add_terms():
    """Draw ``len(human_df) - 1`` random rows of ``filtered_terms`` as dicts.

    Rows are drawn with replacement using ``random.randrange``; the result is
    not reproducible unless the caller seeds ``random``.
    NOTE(review): the ``- 1`` yields one row fewer than ``human_df`` has -
    confirm that is intended.
    """
    return [
        dict(filtered_terms.iloc[random.randrange(0, len(filtered_terms))])
        for _ in range(0, len(human_df) - 1)
    ]

# Row position read by add_gene(); it is never changed in this cell, so every
# call returns the same gene.
count = 1

def add_gene():
    """Return the ``gene`` value of the ``human_df`` row at position ``count``."""
    row = human_df.iloc[count]
    return row['gene']

    
                            
#add_terms()

# NOTE(review): the only assignment of ann2_df is the commented-out line below,
# so every use of ann2_df in this cell depends on a value left over in the
# kernel; human_df is likewise not defined in the visible notebook.
#ann2_df = pd.DataFrame.from_dict(add_terms())
ann2_df

# ann3 is assigned but not used anywhere in the visible notebook.
ann3 = human_df.agg('-'.join, axis=1)
ann2_df

add_gene()
# add_gene() ignores its input, so every row receives the same gene value.
ann2_df['gene'] = ann2_df['id'].apply(lambda x: add_gene())
ann2_df

# Round-trip through pandas' JSON encoder to get plain records, then write them.
json_chunk = ann2_df.to_json(orient="records", default_handler=None)
json_str = json.loads(json_chunk)

write_to_json(json_str, 'out.json')
                     

In [None]:
class CustomJSONizer(json.JSONEncoder):
    """JSON encoder that serialises ``numpy.bool_`` values as JSON booleans."""

    def default(self, obj):
        """Convert objects the base encoder cannot handle.

        Returns a plain ``bool`` for ``numpy.bool_``; anything else is passed
        to ``JSONEncoder.default``, which raises ``TypeError``.
        """
        if isinstance(obj, np.bool_):
            # `default` must return a serialisable object, not encoded text:
            # returning super().encode(...) produced the string 'true', which
            # was then emitted as the JSON string "true" instead of true.
            return bool(obj)
        return super().default(obj)
    
def count_unique_terms(annos_df):
    """Return the distinct ``id`` values of the dicts in ``annos_df['term']``.

    Despite the name this returns the ids as a list (in set order), not a
    count; callers take ``len()`` of it.
    """
    term_ids = {term['id'] for term in list(annos_df['term'])}
    return list(term_ids)

#count_uniq = count_unique_terms()
#write_to_json(count_uniq, filtered_terms_fp, cls=CustomJSONizer)
#print(len(count_uniq))

def count_unique_slim_terms(annos_df):
    """Return the distinct term ids found across the lists in ``annos_df['slim_terms']``.

    The result is a list in set order, not a count.
    """
    slim_ids = {
        term['id']
        for s_terms in list(annos_df['slim_terms'])
        for term in s_terms
    }
    return list(slim_ids)

#count_uniq = count_unique_slim_terms()
#write_to_json(count_uniq, filtered_slim_terms_fp, cls=CustomJSONizer)
#print(count_uniq)
def count_unique_refs(annos_df):
    """Return the distinct ``pmid`` values referenced by any evidence in ``annos_df``.

    Walks ``annos_df['evidence']`` (a list of evidence dicts per row) and each
    evidence's ``references``; ``None`` references are skipped. The result is a
    list in set order, not a count.
    """
    pmids = {
        ref['pmid']
        for evidences in list(annos_df['evidence'])
        for evidence in evidences
        for ref in evidence['references']
        if ref is not None
    }
    return list(pmids)

def count_unique_withs(annos_df):
    """Return the distinct ``with_gene_id['gene']`` values across all evidences.

    Walks ``annos_df['evidence']`` (a list of evidence dicts per row). The
    result is a list in set order, not a count.
    NOTE(review): unlike count_unique_refs there is no ``None`` guard here, so
    every evidence is expected to carry a ``with_gene_id`` dict - confirm.
    """
    # The unused `gene_name` accumulator of the original was removed.
    genes = {
        evidence['with_gene_id']['gene']
        for evidences in list(annos_df['evidence'])
        for evidence in evidences
    }
    return list(genes)

# Number of distinct PMIDs referenced by the cleaned annotations.
len(count_unique_refs(clean_annos_df))

In [None]:
# Columns whose distinct values are counted directly. This is a list rather than
# a set so the rows of stats_df come out in the same order on every run; set
# iteration order for strings varies between interpreter sessions.
scalar_cols = [
    'gene',
    'gene_symbol',
    'gene_name',
    'taxon_id',
    'taxon_label',
    'taxon_abbr',
    'coordinates_chr_num',
    'coordinates_start',
    'coordinates_end',
    'aspect',
    'evidence_type',
]

# Distinct-value count per scalar column (a missing value counts as one value).
stats = {k: len(clean_annos_df[k].unique()) for k in scalar_cols}

# Nested columns need the dedicated helpers.
stats['terms'] = len(count_unique_terms(clean_annos_df))
stats['slim_terms'] = len(count_unique_slim_terms(clean_annos_df))
stats['references'] = len(count_unique_refs(clean_annos_df))
stats['with_gene'] = len(count_unique_withs(clean_annos_df))

stats_df = pd.DataFrame.from_dict(stats, orient='index', columns=['Unique Count'])
stats_df

In [None]:
# Same distinct-value statistics for the older cleaned annotations. The columns
# are a list rather than a set so the rows of stats_df come out in the same
# order on every run; set iteration order for strings varies between sessions.
scalar_cols = [
    'gene',
    'gene_symbol',
    'gene_name',
    'taxon_id',
    'taxon_label',
    'taxon_abbr',
    'coordinates_chr_num',
    'coordinates_start',
    'coordinates_end',
]

# Distinct-value count per column (a missing value counts as one value).
stats = {k: len(old_clean_annos_df[k].unique()) for k in scalar_cols}

stats_df = pd.DataFrame.from_dict(stats, orient='index', columns=['Unique Count'])
stats_df

In [None]:
# Find gene ids that occur with more than one symbol.
# Step 1: one row per distinct (gene, gene_symbol, gene_name) combination.
unique_genes = clean_annos_df.drop_duplicates(subset=['gene', 'gene_symbol', 'gene_name'])
unique_genes = unique_genes[['gene', 'gene_symbol', 'gene_name']]
# Step 2: keep only the first row for each gene_symbol.
unique_genes = unique_genes[~unique_genes.duplicated(subset=['gene_symbol'])]
# Step 3: gene ids that still appear more than once after step 2.
duplicate_genes = unique_genes[unique_genes.duplicated(subset=['gene'])]
duplicate_genes = duplicate_genes[['gene']]
# Step 4: all rows for those gene ids, sorted so the variants sit together.
res = unique_genes[unique_genes['gene'].isin(list(duplicate_genes['gene']))]
res = res.sort_values(by=['gene'])
# NOTE(review): duplicate_gene_fp is not defined in the visible notebook.
res.to_csv(duplicate_gene_fp)
res

In [None]:
# One row per gene id, then look for gene names shared by several gene ids.
# This reassigns unique_genes and duplicate_genes from the previous cell.
unique_genes = clean_annos_df.drop_duplicates(subset=['gene'])
unique_genes = unique_genes.sort_values(by=['gene_name'])
#unique_genes.to_csv(duplicate_gene_fp)
unique_genes = unique_genes[['gene', 'gene_symbol', 'gene_name']]
# Rows whose gene_name already occurred earlier (the first occurrence is not included).
duplicate_genes = unique_genes[unique_genes.duplicated(subset=['gene_name'])]
# Show only the repeats carrying this particular placeholder name.
duplicate_genes[duplicate_genes['gene_name']=='Uncharacterized protein (Fragment)']
#unique_genes.to_csv('./downloads/genes_symb.csv')

In [None]:
# Same reduction on the older annotations, but flagging repeated gene symbols.
# This reassigns unique_genes again; the result is kept in duplicate_genes_2.
unique_genes = old_clean_annos_df.drop_duplicates(subset=['gene'])
unique_genes = unique_genes.sort_values(by=['gene_name'])
#unique_genes.to_csv(duplicate_gene_fp)
unique_genes = unique_genes[['gene', 'gene_symbol', 'gene_name']]
# Rows whose gene_symbol already occurred earlier (the first occurrence is not included).
duplicate_genes_2 = unique_genes[unique_genes.duplicated(subset=['gene_symbol'])]
duplicate_genes_2
#unique_genes.to_csv('./downloads/genes_symb.csv')

In [None]:
# Rows of duplicate_genes whose symbol is not among the repeated symbols found
# in the older annotations.
# NOTE(review): this needs the duplicate_genes version that still has a
# `gene_symbol` column (the gene-name cell); the earlier gene-symbol cell leaves
# only a `gene` column, so the result depends on which cell ran last.
res = duplicate_genes[~duplicate_genes['gene_symbol'].isin(list(duplicate_genes_2['gene_symbol']))]
res

In [None]:
# Look up one gene record by its index label. The frame is `genes_df` (there is
# no `gene_df`), and `.loc` is indexed with brackets, not called.
# NOTE(review): later cells reassign genes_df to an aggregated frame without the
# gene-id index; run this against the gene table loaded from genes_fp.
genes_df.loc['UniProtKB:X6R8D5']

In [None]:
# Adds a flat `term_label` column to clean_annos_df (the frame is modified in
# place), then lists one row per gene_name among the annotations labelled
# 'Unknown cellular component'. Reassigns unique_genes once more.
clean_annos_df['term_label'] = clean_annos_df['term'].apply(lambda x: x['label'])
unknown_cc_df = clean_annos_df[clean_annos_df['term_label']=='Unknown cellular component']
unique_genes = unknown_cc_df.drop_duplicates(subset=['gene_name'])
unique_genes

In [None]:
# Replace clean_annos_df with the first 100 rows reloaded from disk, without the
# `evidence` column. After this cell, count_unique_refs / count_unique_withs
# (which read `evidence`) no longer work on clean_annos_df.
# NOTE(review): clean_annotations_fp is not defined in the visible notebook.
clean_annos_df = pd.read_json(clean_annotations_fp)[:100]
clean_annos_df =  clean_annos_df.drop(['evidence'], axis=1)
clean_annos_df

In [None]:
def uniquify_term(series):
    """Return the dict items of ``series`` de-duplicated by their ``id``.

    Non-dict items are ignored. When an id repeats, the last dict wins but keeps
    the position of the first occurrence. This name is redefined with a
    two-argument signature further down the notebook.
    """
    by_id = {item['id']: item for item in series if isinstance(item, dict)}
    return list(by_id.values())

def uniquify_slim_terms(series):
    """Flatten the lists in ``series`` and de-duplicate their term dicts by ``id``.

    Non-list entries are ignored. When an id repeats, the last dict wins but
    keeps the position of the first occurrence. This name is redefined with a
    two-argument signature further down the notebook.
    """
    by_id = {
        item['id']: item
        for item_list in series
        if isinstance(item_list, list)
        for item in item_list
    }
    return list(by_id.values())

# One row per gene with its de-duplicated terms and slim terms.
# NOTE(review): this reuses the name genes_df, replacing the gene-info table
# loaded from genes_fp; earlier cells that join on genes_df's gene-id index will
# no longer work if re-run after this cell.
genes_df = (
    clean_annos_df.groupby('gene')
    .agg({
        'term': uniquify_term,
        'slim_terms': uniquify_slim_terms
    })
    .reset_index()
)

# Round-trip through pandas' JSON encoder to get plain records, then write them.
# NOTE(review): genes_annotations_fp is not defined in the visible notebook.
anno_json = genes_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', genes_annotations_fp), indent=2)

genes_df

In [None]:
def uniquify_term(series, evidence_series):
    """De-duplicate term dicts by ``id`` and report the repeated ones.

    Each dict in ``series`` is copied, stripped of ``is_goslim`` and tagged with
    the ``evidence_type`` found at the same position in ``evidence_series``.
    Non-dict items are skipped.

    Returns:
        ``(unique, duplicates)``: ``unique`` holds the last copy seen per id, in
        first-seen order; ``duplicates`` holds every copy whose id had already
        been seen. This definition shadows the earlier one-argument version.
    """
    latest_by_id = {}
    duplicates = []

    for position, item in enumerate(series):
        if not isinstance(item, dict):
            continue

        term_id = item['id']
        term = {key: value for key, value in item.items() if key != 'is_goslim'}
        term['evidence_type'] = evidence_series.iloc[position]

        if term_id in latest_by_id:
            duplicates.append(term)
        latest_by_id[term_id] = term

    return (list(latest_by_id.values()), duplicates)


def uniquify_slim_terms(series, evidence_series):
    """Flatten the slim-term lists and de-duplicate them by ``id``.

    Each term dict is copied, stripped of ``is_goslim`` and tagged with the
    ``evidence_type`` found in ``evidence_series`` at the position of the list it
    came from. Non-list entries are skipped; for a repeated id the last copy
    wins, in first-seen order. Shadows the earlier one-argument version.
    """
    latest_by_id = {}
    for position, item_list in enumerate(series):
        if not isinstance(item_list, list):
            continue
        for item in item_list:
            term = {key: value for key, value in item.items() if key != 'is_goslim'}
            term['evidence_type'] = evidence_series.iloc[position]
            latest_by_id[term['id']] = term
    return list(latest_by_id.values())


def _summarise_gene_terms(group):
    """Collapse one gene's annotation rows into unique, duplicated and slim term lists."""
    # uniquify_term returns (unique, duplicates); call it once and unpack both
    # parts instead of running the whole pass twice per group.
    terms, duplicated_terms = uniquify_term(group['term'], group['evidence_type'])
    return pd.Series({
        'terms': terms,
        'duplicated_terms': duplicated_terms,
        'slim_terms': uniquify_slim_terms(group['slim_terms'], group['evidence_type']),
    })


# NOTE(review): this reuses the name genes_df, replacing the gene-info table
# loaded from genes_fp; earlier cells that join on its gene-id index will no
# longer work if re-run after this cell.
genes_df = clean_annos_df.groupby('gene').apply(_summarise_gene_terms).reset_index()

# Round-trip through pandas' JSON encoder to get plain records, then write them.
# NOTE(review): genes_annotations_fp is not defined in the visible notebook.
anno_json = genes_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
write_to_json(json_str, ospath.join('.', genes_annotations_fp), indent=2)

genes_df


In [None]:
def uniquify_term(series, evidence_series):
    """Return the term dicts of ``series`` keyed by unique ``id``, failing on repeats.

    Each dict is copied, stripped of ``is_goslim`` and tagged with the
    ``evidence_type`` found at the same position in ``evidence_series``.
    Non-dict items are skipped. This definition shadows the earlier versions of
    the same name and returns a plain list (no duplicates list).

    Raises:
        ValueError: if the same term id occurs more than once.
    """
    seen = {}

    for position, item in enumerate(series):
        if not isinstance(item, dict):
            continue

        term_id = item['id']
        term = {key: value for key, value in item.items() if key != 'is_goslim'}
        term['evidence_type'] = evidence_series.iloc[position]

        if term_id in seen:
            raise ValueError(f"Duplicate term found: {term}")
        seen[term_id] = term

    return list(seen.values())


def uniquify_slim_terms(series, evidence_series):
    """Flatten the slim-term lists and de-duplicate them by ``id``.

    Each term dict is copied, stripped of ``is_goslim`` and tagged with the
    ``evidence_type`` found in ``evidence_series`` at the position of the list it
    came from. Non-list entries are skipped; for a repeated id the last copy
    wins, in first-seen order. Same behaviour as the previous definition of this
    name, which it shadows.
    """
    collected = {}
    for row_pos, slim_list in enumerate(series):
        if not isinstance(slim_list, list):
            continue
        evidence_type = evidence_series.iloc[row_pos]
        for item in slim_list:
            term = {key: value for key, value in item.items() if key != 'is_goslim'}
            term['evidence_type'] = evidence_type
            collected[term['id']] = term
    return list(collected.values())

# Per-gene scalar columns copied into each aggregated gene record by
# group_terms, which takes the value from the first row of every gene group.
columns_to_extract = [
    'gene_name',
    'taxon_id',
    'taxon_label',
    'taxon_abbr',
    'panther_family',
    'long_id',
    'coordinates_chr_num',
    'coordinates_start',
    'coordinates_end',
    'gene_symbol'
]

def group_terms(group):
    """Summarise one gene's annotation rows as a single record.

    The record holds the ``columns_to_extract`` values from the group's first
    row, followed by ``terms`` (unique terms, raising ``ValueError`` on a
    repeated id via ``uniquify_term``), ``slim_terms`` and ``terms_count``.
    """
    terms = uniquify_term(group['term'], group['evidence_type'])
    slim = uniquify_slim_terms(group['slim_terms'], group['evidence_type'])

    record = {col: group[col].iloc[0] for col in columns_to_extract}
    record['terms'] = terms
    record['slim_terms'] = slim
    record['terms_count'] = len(terms)
    return pd.Series(record)

# One record per gene, ordered by number of unique terms (most first).
# NOTE(review): this reuses the name genes_df, replacing the gene-info table
# loaded from genes_fp; earlier cells that join on its gene-id index will no
# longer work if re-run after this cell.
genes_df = clean_annos_df.groupby('gene').apply(group_terms).reset_index()
genes_df = genes_df.sort_values(by='terms_count', ascending=False).reset_index(drop=True)


# Round-trip through pandas' JSON encoder to get plain records, then write them.
# NOTE(review): genes_annotations_fp is not defined in the visible notebook.
anno_json = genes_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', genes_annotations_fp), indent=2)

genes_df