In [15]:
import json
import gzip
from os import path as ospath
import pandas as pd
import numpy as np
import random
import pprint
from src.utils import write_to_json
# --- Input files read by the loading cells of this notebook -------------------
terms_fp = './downloads/input/full_go_annotated.json'
annotations_fp = './downloads/input/human_iba_annotations.json'
# Only referenced from commented-out code in the visible notebook.
sample_annotations_fp = './data/sample_human_iba_annotations.json'
articles_fp = './downloads/clean-articles.json'
gene_info_fp = './downloads/input/human_iba_gene_info.json'
taxon_fp = './downloads/input/taxon_lkp.json'
# --- Output files written by the export cells ---------------------------------
clean_annotations_fp = './downloads/human_iba_annotations_clean.json'
genes_annotations_fp = './downloads/genes_annotations_clean.json'
export_annotations_csv_fp = './downloads/export_annotations_csv.zip'
export_annotations_json_fp = './downloads/export_annotations.json.gz'
filtered_terms_fp = './data/filtered_terms.json'
filtered_slim_terms_fp = './data/filtered_slim_terms.json'
filtered_refs_fp = './data/filtered_refs.json'
filtered_evidences_fp = './data/filtered_evidences.json'
duplicate_gene_fp = './downloads/duplicate_gene.csv'
duplicate_symbol_fp = './downloads/duplicate_symbol.csv'



# --- Previous release of the annotations, loaded for comparison ---------------
old_annotations_fp = './downloads/input-old/human_iba_annotations.json'
old_clean_annotations_fp = './downloads/old_human_iba_annotations_clean.json'

# Term ids that term_type() reports as 'unknown'.
unknown_terms =['UNKNOWN:0001', 'UNKNOWN:0002', 'UNKNOWN:0003']

In [7]:
def get_pd_row(df, k):
    """Return the row labelled ``k`` of ``df`` as a dict, leaving out missing values.

    Raises ``KeyError`` when ``k`` is not in the index of ``df``.
    """
    row = df.loc[k]
    present = row.dropna()
    return dict(present)

def spread_terms(df, terms):
    """Look up every id in ``terms`` in ``df`` and return the rows as a list of dicts.

    The output keeps the order of ``terms``; a missing id raises ``KeyError``
    (propagated from ``get_pd_row``).
    """
    expanded = []
    for term in terms:
        expanded.append(get_pd_row(df, term))
    return expanded

def get_pd_row_key(df, k):
    """Like ``get_pd_row`` but tolerant of unknown labels.

    Returns the row labelled ``k`` as a dict without missing values, or
    ``None`` when the lookup raises ``KeyError``.
    """
    try:
        row = df.loc[k]
    except KeyError:
        return None
    else:
        return dict(row.dropna())

def get_aspect(df, k):
    """Return the ``'aspect'`` value of row ``k`` in ``df``.

    When ``k`` is not in the index, the marker string ``'no' + str(k)`` is
    returned instead. A row that exists but has no aspect yields ``None``.
    """
    row = get_pd_row_key(df, k)
    if row is not None:
        return row.get('aspect', None)
    return 'no' + str(k)

def get_evidence(df, genes_df, row):
    """Expand the evidence entries of an annotation row into full records.

    For each entry in ``row['evidence']`` the reference ids are looked up in
    ``df`` and the ``with_gene_id`` is looked up in ``genes_df``. Lookups that
    fail yield ``None`` (see ``get_pd_row_key``). ``groups`` is passed through.
    """
    expanded = []
    for evidence in row['evidence']:
        references = [get_pd_row_key(df, ref_id) for ref_id in evidence['references']]
        with_gene = get_pd_row_key(genes_df, evidence['with_gene_id'])
        expanded.append(dict(
            with_gene_id=with_gene,
            groups=evidence['groups'],
            references=references,
        ))
    return expanded

def get_evidence_type(row):
    """Classify an annotation row by inspecting its evidence entries in order.

    Returns:
        'direct'   as soon as an entry's ``with_gene_id`` record is the row's own gene;
        'n/a'      as soon as an entry has no references;
        'homology' when no entry triggers either rule (including no evidence at all).

    ``with_gene_id`` may be ``None``: ``get_evidence`` stores ``None`` when the
    gene id is not found in the gene table. Such an entry can never be
    'direct', but it is still checked for missing references.
    """
    for evidence in row['evidence']:
        with_gene = evidence["with_gene_id"]
        # Guard against unresolved genes instead of failing on None['gene'].
        if with_gene is not None and with_gene['gene'] == row['gene']:
            return 'direct'
        if len(evidence['references']) == 0:
            return 'n/a'

    return 'homology'



def count_unique_refs_row(evidences):
    """Return the number of distinct references across all ``evidences`` entries."""
    distinct = {
        ref
        for evidence in evidences
        for ref in evidence['references']
    }
    return len(distinct)

def get_other(row):
    """Return the fallback "other" entry for a row that has no slim terms.

    The entry is looked up in ``ASPECT_OTHER_MAP`` by the aspect of the row's
    term. Rows that do have slim terms yield ``None``.

    NOTE(review): ``ASPECT_OTHER_MAP`` is not defined in the visible notebook;
    it must exist before this function is called on a row without slim terms.
    """
    if len(row['slim_terms']) == 0:
        # Term records carry the key 'aspect'; the former 'apsect' spelling
        # could only ever raise KeyError.
        return ASPECT_OTHER_MAP[row['term']['aspect']]
    return None
    
    
def count_unique_slims(evidences):
    """Return the number of distinct values under ``'references'`` in ``evidences``.

    NOTE(review): despite its name this counts references, exactly like
    ``count_unique_refs_row`` - confirm whether slim terms were intended.
    """
    seen = set()
    for evidence in evidences:
        seen.update(evidence['references'])
    return len(seen)


def term_type(term):
    """Return 'unknown' when the term's id is listed in ``unknown_terms``, else 'known'."""
    if term['id'] in unknown_terms:
        return 'unknown'
    return 'known'
            
    


In [None]:
def count_unique_refs(df):
    """Collect the distinct references found in the ``'evidence'`` column of ``df``.

    Each cell of the column is iterated as a list of evidence entries, each
    with a ``'references'`` list. The result is a list in arbitrary order.
    """
    unique_refs = {
        ref
        for evidences in df['evidence']
        for evidence in evidences
        for ref in evidence['references']
    }
    return list(unique_refs)

# count_unique_refs() needs the annotations frame; calling it without one raised
# TypeError. The raw annotations are used because their evidence still holds
# plain reference ids, which this version of the function collects in a set.
# NOTE(review): confirm the raw file (not a sampled annos_df) is the intended input.
count_uniq = count_unique_refs(pd.read_json(annotations_fp))
write_to_json(count_uniq, filtered_refs_fp)
print(len(count_uniq))

In [10]:
# GO term table: indexed by term id (the id is also kept as a column) with
# the source column names mapped to id / label / aspect.
terms_df = (
    pd.read_json(terms_fp, dtype={'is_goslim': bool})
    .set_index('ID', drop=False)
    .rename(columns={'ID': 'id', 'LABEL': 'label', 'hasOBONamespace': 'aspect'})
)
# Show aspects with spaces instead of underscores.
terms_df['aspect'] = terms_df['aspect'].str.replace('_', ' ')

In [11]:
# Article table indexed by pmid; the pmid stays available as a column too.
articles_df = (
    pd.read_json(articles_fp)
    .set_index('pmid', drop=False)
)

In [12]:
# Taxon lookup table; taxon_id is read as str so it can be merged with the
# string taxon ids of the gene table.
taxon_df = pd.read_json(taxon_fp, dtype={'taxon_id':str})

In [13]:
# Gene info table: strip the 'taxon:' prefix, attach the taxon lookup columns,
# then index by gene id (the id is also kept as a column).
genes_df = (
    pd.read_json(gene_info_fp, dtype={'taxon_id': str})
    .assign(taxon_id=lambda frame: frame['taxon_id'].str.replace('taxon:', ''))
    .merge(taxon_df, how='left', on='taxon_id')
    .set_index('gene', drop=False)
)

In [None]:
# Preview: first 10 raw annotations with taxon and coordinate columns of their gene.
annos_df = (
    pd.read_json(annotations_fp)
    .head(10)
    .merge(
        genes_df[[
            'taxon_id',
            'taxon_label',
            'taxon_abbr',
            'coordinates_chr_num',
            'coordinates_start',
            'coordinates_end',
            'coordinates_strand',
        ]],
        how='left',
        left_on="gene",
        right_index=True,
    )
)
# Disabled: export of this sample to sample_annotations_fp.
#anno_json = annos_df.to_json(orient="records", default_handler=None)
#json_str = json.loads(anno_json)
#write_to_json(json_str, ospath.join(sample_annotations_fp))
#pprint.pprint(json_str)
annos_df

In [None]:
%%time

# Preview: first 100 raw annotations with symbol, name, taxon and coordinates of their gene.
annos_df = (
    pd.read_json(annotations_fp)
    .head(100)
    .merge(
        genes_df[[
            'gene_symbol',
            'gene_name',
            'taxon_id',
            'taxon_label',
            'taxon_abbr',
            'coordinates_chr_num',
            'coordinates_start',
            'coordinates_end',
            'coordinates_strand',
        ]],
        how='left',
        left_on="gene",
        right_index=True,
    )
)
#annos_df['aspect'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x)['aspect'])
annos_df



In [50]:
%%time

def count_evidence(evidences):
    """Return how many evidence entries ``evidences`` contains."""
    total = len(evidences)
    return total

def generate_id(row):
    """Build a row identifier of the form ``<gene>_<_id>`` from ``row``."""
    return '{}_{}'.format(row["gene"], row["_id"])

def get_groups(evidences):
    """Return the distinct group names over all evidence entries (arbitrary order)."""
    distinct = {
        group
        for evidence in evidences
        for group in evidence['groups']
    }
    return list(distinct)
    
# Sample of 5 raw annotations, each joined with the info of its gene.
annos_df = pd.read_json(annotations_fp).head(5)
annos_df = annos_df.merge(
    genes_df[[
        'gene_symbol',
        'gene_name',
        'taxon_id',
        'taxon_label',
        'taxon_abbr',
        'coordinates_chr_num',
        'coordinates_start',
        'coordinates_end',
        'coordinates_strand',
    ]],
    how='left',
    left_on="gene",
    right_index=True,
)

# Replace the term id with its full record, then derive aspect / type from that record.
annos_df['term'] = annos_df['term'].apply(lambda term_id: get_pd_row(terms_df, term_id))
annos_df['aspect'] = annos_df['term'].apply(lambda term: term['aspect'])
annos_df['term_type'] = annos_df['term'].apply(term_type)
annos_df['slim_terms'] = annos_df['slim_terms'].apply(lambda ids: spread_terms(terms_df, ids))
annos_df['qualifier'] = annos_df['qualifier'].str.replace('_', ' ')

# Expand evidence first: the classification below reads the expanded gene records.
annos_df['evidence'] = annos_df.apply(lambda row: get_evidence(articles_df, genes_df, row), axis=1)
annos_df['evidence_type'] = annos_df.apply(get_evidence_type, axis=1)
annos_df['groups'] = annos_df['evidence'].apply(get_groups)
annos_df['evidence_count'] = annos_df['evidence'].apply(count_evidence)

# Round-trip through JSON text to get plain records for write_to_json.
anno_json = annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', clean_annotations_fp))
annos_df

CPU times: total: 2.3 s
Wall time: 4.18 s


Unnamed: 0,gene,term,slim_terms,qualifier,evidence,group,evidence_type,gene_symbol,gene_name,taxon_id,taxon_label,taxon_abbr,coordinates_chr_num,coordinates_start,coordinates_end,coordinates_strand,aspect,term_type,groups,evidence_count
0,UniProtKB:O15520,"{'id': 'GO:0030154', 'label': 'cell differenti...","[{'id': 'GO:0030154', 'label': 'cell different...",,[{'with_gene_id': {'gene': 'ZFIN:ZDB-GENE-9805...,GO_Central,homology,FGF10,Fibroblast growth factor 10,9606,Homo sapiens,Hsa,5,44388485.0,44388797.0,-1.0,biological process,known,"[RGD, MGI, BHF-UCL, FlyBase, AgBase, UniProt, ...",21
1,UniProtKB:O15520,"{'id': 'GO:0008083', 'label': 'growth factor a...","[{'id': 'GO:0048018', 'label': 'receptor ligan...",,"[{'with_gene_id': {'gene': 'UniProtKB:O15520',...",GO_Central,direct,FGF10,Fibroblast growth factor 10,9606,Homo sapiens,Hsa,5,44388485.0,44388797.0,-1.0,molecular function,known,"[RGD, ParkinsonsUK-UCL,RGD, MGI, WB, BHF-UCL, ...",12
2,UniProtKB:O15520,"{'id': 'GO:0009887', 'label': 'animal organ mo...","[{'id': 'GO:0048856', 'label': 'anatomical str...",,"[{'with_gene_id': {'gene': 'UniProtKB:O15520',...",GO_Central,direct,FGF10,Fibroblast growth factor 10,9606,Homo sapiens,Hsa,5,44388485.0,44388797.0,-1.0,biological process,known,"[RGD, MGI, BHF-UCL, FlyBase, UniProt, ZFIN]",18
3,UniProtKB:O15520,"{'id': 'GO:0005737', 'label': 'cytoplasm', 'as...","[{'id': 'GO:0005737', 'label': 'cytoplasm', 'a...",,"[{'with_gene_id': {'gene': 'UniProtKB:P48800',...",GO_Central,homology,FGF10,Fibroblast growth factor 10,9606,Homo sapiens,Hsa,5,44388485.0,44388797.0,-1.0,cellular component,known,"[RGD, MGI, WB, BHF-UCL, UniProt, AgBase, ZFIN]",14
4,UniProtKB:O15520,"{'id': 'GO:0001934', 'label': 'positive regula...","[{'id': 'OTHER:0002', 'label': 'Other biologic...",,"[{'with_gene_id': {'gene': 'UniProtKB:O15520',...",GO_Central,direct,FGF10,Fibroblast growth factor 10,9606,Homo sapiens,Hsa,5,44388485.0,44388797.0,-1.0,biological process,known,"[UniProt, MGI]",10


In [52]:
# Seed '_id' with the row index, then turn it into '<gene>_<index>' via generate_id.
annos_df['_id'] = annos_df.index
annos_df['_id'] = annos_df.apply(generate_id, axis=1)
annos_df


Unnamed: 0,gene,term,slim_terms,qualifier,evidence,group,evidence_type,gene_symbol,gene_name,taxon_id,...,taxon_abbr,coordinates_chr_num,coordinates_start,coordinates_end,coordinates_strand,aspect,term_type,groups,evidence_count,_id
0,UniProtKB:O15520,"{'id': 'GO:0030154', 'label': 'cell differenti...","[{'id': 'GO:0030154', 'label': 'cell different...",,[{'with_gene_id': {'gene': 'ZFIN:ZDB-GENE-9805...,GO_Central,homology,FGF10,Fibroblast growth factor 10,9606,...,Hsa,5,44388485.0,44388797.0,-1.0,biological process,known,"[RGD, MGI, BHF-UCL, FlyBase, AgBase, UniProt, ...",21,UniProtKB:O15520_0
1,UniProtKB:O15520,"{'id': 'GO:0008083', 'label': 'growth factor a...","[{'id': 'GO:0048018', 'label': 'receptor ligan...",,"[{'with_gene_id': {'gene': 'UniProtKB:O15520',...",GO_Central,direct,FGF10,Fibroblast growth factor 10,9606,...,Hsa,5,44388485.0,44388797.0,-1.0,molecular function,known,"[RGD, ParkinsonsUK-UCL,RGD, MGI, WB, BHF-UCL, ...",12,UniProtKB:O15520_1
2,UniProtKB:O15520,"{'id': 'GO:0009887', 'label': 'animal organ mo...","[{'id': 'GO:0048856', 'label': 'anatomical str...",,"[{'with_gene_id': {'gene': 'UniProtKB:O15520',...",GO_Central,direct,FGF10,Fibroblast growth factor 10,9606,...,Hsa,5,44388485.0,44388797.0,-1.0,biological process,known,"[RGD, MGI, BHF-UCL, FlyBase, UniProt, ZFIN]",18,UniProtKB:O15520_2
3,UniProtKB:O15520,"{'id': 'GO:0005737', 'label': 'cytoplasm', 'as...","[{'id': 'GO:0005737', 'label': 'cytoplasm', 'a...",,"[{'with_gene_id': {'gene': 'UniProtKB:P48800',...",GO_Central,homology,FGF10,Fibroblast growth factor 10,9606,...,Hsa,5,44388485.0,44388797.0,-1.0,cellular component,known,"[RGD, MGI, WB, BHF-UCL, UniProt, AgBase, ZFIN]",14,UniProtKB:O15520_3
4,UniProtKB:O15520,"{'id': 'GO:0001934', 'label': 'positive regula...","[{'id': 'OTHER:0002', 'label': 'Other biologic...",,"[{'with_gene_id': {'gene': 'UniProtKB:O15520',...",GO_Central,direct,FGF10,Fibroblast growth factor 10,9606,...,Hsa,5,44388485.0,44388797.0,-1.0,biological process,known,"[UniProt, MGI]",10,UniProtKB:O15520_4


In [14]:
%%time

def term_display_id(term):
    """Return the term's id when it starts with "GO", otherwise an empty string."""
    identifier = term['id']
    if identifier.startswith("GO"):
        return identifier
    return ''

# Flat export: gene id, symbol, name plus the display id and label of the term.
fields = ['gene', 'term']
export_annos_df = pd.read_json(annotations_fp).head(5)[fields]
export_annos_df = export_annos_df.merge(
    genes_df[['gene_symbol', 'gene_name']],
    how='left',
    left_on="gene",
    right_index=True,
)
# Resolve the term record only to extract its id / label, then drop it again.
export_annos_df['term'] = export_annos_df['term'].apply(lambda term_id: get_pd_row(terms_df, term_id))
export_annos_df['term_id'] = export_annos_df['term'].apply(term_display_id)
export_annos_df['term_label'] = export_annos_df['term'].apply(lambda term: term['label'])
export_annos_df = export_annos_df.drop(columns=['term'])

# CSV export, zipped into a single member named annotations.csv.
compression_opts = {'method': 'zip', 'archive_name': 'annotations.csv'}
export_annos_df.to_csv(
    ospath.join('.', export_annotations_csv_fp),
    index=False,
    compression=compression_opts,
)

# JSON export of the same records.
export_anno_json = export_annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(export_anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', export_annotations_json_fp), zip=True)

export_annos_df

CPU times: total: 2.84 s
Wall time: 4.32 s


Unnamed: 0,gene,gene_symbol,gene_name,term_id,term_label
0,UniProtKB:O15520,FGF10,Fibroblast growth factor 10,GO:0030154,cell differentiation
1,UniProtKB:O15520,FGF10,Fibroblast growth factor 10,GO:0008083,growth factor activity
2,UniProtKB:O15520,FGF10,Fibroblast growth factor 10,GO:0009887,animal organ morphogenesis
3,UniProtKB:O15520,FGF10,Fibroblast growth factor 10,GO:0005737,cytoplasm
4,UniProtKB:O15520,FGF10,Fibroblast growth factor 10,GO:0001934,positive regulation of protein phosphorylation


In [None]:
%%time
    
# Previous annotation release joined with taxon and coordinate columns of each gene.
old_annos_df = pd.read_json(old_annotations_fp).merge(
    genes_df[[
        'taxon_id',
        'taxon_label',
        'taxon_abbr',
        'coordinates_chr_num',
        'coordinates_start',
        'coordinates_end',
        'coordinates_strand',
    ]],
    how='left',
    left_on="gene",
    right_index=True,
)

# Round-trip through JSON text to get plain records for write_to_json.
anno_json = old_annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', old_clean_annotations_fp))
old_annos_df

In [None]:
# Recompute the evidence classification and show only the 'direct' rows.
annos_df['evidence_type'] = annos_df.apply(get_evidence_type, axis=1)
annos_df.loc[annos_df['evidence_type'].eq('direct')]

In [None]:
# Dump the annotation at row position 1 as a plain dict for inspection.
print(dict(annos_df.iloc[1]))

In [None]:
# Reload the cleaned annotations from the JSON file at clean_annotations_fp.
clean_annos_df = pd.read_json(clean_annotations_fp)
clean_annos_df

In [None]:
# Reload the cleaned annotations of the previous release from old_clean_annotations_fp.
old_clean_annos_df = pd.read_json(old_clean_annotations_fp)

In [None]:
# Keep only the terms flagged as GO slim.
filtered_terms = terms_df.loc[terms_df['is_goslim'].eq(True)]
filtered_terms

In [None]:
def add_terms():
    """Draw random rows of ``filtered_terms`` and return them as a list of dicts.

    ``len(human_df) - 1`` rows are drawn, with replacement.
    NOTE(review): ``human_df`` is not defined in the visible notebook.
    """
    picked = []
    for _ in range(len(human_df) - 1):
        position = random.randrange(len(filtered_terms))
        picked.append(dict(filtered_terms.iloc[position]))

    return picked

# Row position read by add_gene(); nothing in the visible notebook changes it,
# so add_gene() returns the same gene on every call.
count = 1

def add_gene():
    """Return the 'gene' value of the row at position ``count`` in ``human_df``.

    NOTE(review): ``human_df`` is not defined in the visible notebook.
    """
    return human_df.iloc[count]['gene']

    
                            
# Scratch cell: attach a gene to the randomly drawn terms and dump them to out.json.
# NOTE(review): ann2_df is never assigned in the visible notebook - its only
# construction is the commented-out line below - so on a fresh kernel this cell
# fails with NameError. human_df is likewise not defined in the visible notebook.
#add_terms()

#ann2_df = pd.DataFrame.from_dict(add_terms())
ann2_df

# ann3 is not used again in the visible notebook.
ann3 = human_df.agg('-'.join, axis=1)
ann2_df

add_gene()
# add_gene() ignores the term id, so every row receives the same gene value.
ann2_df['gene'] = ann2_df['id'].apply(lambda x: add_gene())
ann2_df

json_chunk = ann2_df.to_json(orient="records", default_handler=None)
json_str = json.loads(json_chunk)

write_to_json(json_str, 'out.json')
                     

In [None]:
class CustomJSONizer(json.JSONEncoder):
    """JSON encoder that serialises numpy booleans as JSON ``true`` / ``false``."""

    def default(self, obj):
        """Convert ``np.bool_`` to a Python ``bool``; defer everything else.

        ``default`` must return a serialisable object, not encoded text.
        Returning ``self.encode(...)`` here would emit the *string* "true"
        rather than the boolean ``true``.

        Raises:
            TypeError: for any other unsupported type (from the base class).
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        return super().default(obj)
    
def count_unique_terms(annos_df):
    """Return the distinct term ids in the ``'term'`` column (arbitrary order)."""
    term_ids = {term['id'] for term in annos_df['term']}
    return list(term_ids)

#count_uniq = count_unique_terms()
#write_to_json(count_uniq, filtered_terms_fp, cls=CustomJSONizer)
#print(len(count_uniq))

def count_unique_slim_terms(annos_df):
    """Return the distinct slim-term ids in the ``'slim_terms'`` column (arbitrary order)."""
    slim_ids = set()
    for s_terms in annos_df['slim_terms']:
        slim_ids.update(term['id'] for term in s_terms)

    return list(slim_ids)

#count_uniq = count_unique_slim_terms()
#write_to_json(count_uniq, filtered_slim_terms_fp, cls=CustomJSONizer)
#print(count_uniq)
def count_unique_refs(annos_df):
    """Return the distinct pmids referenced by the ``'evidence'`` column.

    References that are ``None`` are skipped. The result is in arbitrary order.
    """
    pmids = {
        ref['pmid']
        for evidences in annos_df['evidence']
        for evidence in evidences
        for ref in evidence['references']
        if ref is not None
    }
    return list(pmids)

def count_unique_withs(annos_df):
    """Return the distinct gene ids of the ``with_gene_id`` records in the evidence.

    Evidence entries whose ``with_gene_id`` is ``None`` are skipped, in the
    same way ``count_unique_refs`` skips ``None`` references. The result is
    in arbitrary order.
    """
    genes = set()

    for evidences in annos_df['evidence']:
        for evidence in evidences:
            with_gene = evidence['with_gene_id']
            # Unresolved genes are stored as None and have no 'gene' to count.
            if with_gene is not None:
                genes.add(with_gene['gene'])

    return list(genes)

# Number of distinct reference pmids in the evidence of clean_annos_df.
len(count_unique_refs(clean_annos_df))

In [None]:
# Columns holding plain values, whose distinct values can be counted directly.
scalar_cols = {
    'gene',
    'gene_symbol',
    'gene_name',
    'taxon_id',
    'taxon_label',
    'taxon_abbr',
    'coordinates_chr_num',
    'coordinates_start',
    'coordinates_end',
    'aspect',
    'evidence_type'
}

stats = {column: len(clean_annos_df[column].unique()) for column in scalar_cols}

# Nested columns need the dedicated collectors.
stats.update({
    'terms': len(count_unique_terms(clean_annos_df)),
    'slim_terms': len(count_unique_slim_terms(clean_annos_df)),
    'references': len(count_unique_refs(clean_annos_df)),
    'with_gene': len(count_unique_withs(clean_annos_df)),
})
stats_df = pd.DataFrame.from_dict(stats, orient='index', columns=['Unique Count'])
stats_df
#clean_annos_df['evidence_type'].unique()

In [None]:
# Same distinct-value count as for the clean annotations, on the previous release.
scalar_cols = {
    'gene',
    'gene_symbol',
    'gene_name',
    'taxon_id',
    'taxon_label',
    'taxon_abbr',
    'coordinates_chr_num',
    'coordinates_start',
    'coordinates_end',
}

stats = {column: len(old_clean_annos_df[column].unique()) for column in scalar_cols}

stats_df = pd.DataFrame.from_dict(stats, orient='index', columns=['Unique Count'])
stats_df
#clean_annos_df['evidence_type'].unique()

In [None]:
# One row per (gene, symbol, name), keeping only the first row of every symbol.
unique_genes = (
    clean_annos_df
    .drop_duplicates(subset=['gene', 'gene_symbol', 'gene_name'])
    .loc[:, ['gene', 'gene_symbol', 'gene_name']]
    .drop_duplicates(subset=['gene_symbol'])
)
# Gene ids that still occur more than once.
duplicate_genes = unique_genes.loc[unique_genes.duplicated(subset=['gene']), ['gene']]
res = (
    unique_genes[unique_genes['gene'].isin(duplicate_genes['gene'].tolist())]
    .sort_values(by=['gene'])
)
res.to_csv(duplicate_gene_fp)
res

In [None]:
# One row per gene, ordered by gene name.
unique_genes = (
    clean_annos_df
    .drop_duplicates(subset=['gene'])
    .sort_values(by=['gene_name'])
)
#unique_genes.to_csv(duplicate_gene_fp)
unique_genes = unique_genes.loc[:, ['gene', 'gene_symbol', 'gene_name']]
# Genes whose name was already used by an earlier gene.
duplicate_genes = unique_genes.loc[unique_genes.duplicated(subset=['gene_name'])]
duplicate_genes.loc[duplicate_genes['gene_name'].eq('Uncharacterized protein (Fragment)')]
#unique_genes.to_csv('./downloads/genes_symb.csv')

In [None]:
# Previous release: one row per gene, ordered by gene name.
unique_genes = (
    old_clean_annos_df
    .drop_duplicates(subset=['gene'])
    .sort_values(by=['gene_name'])
)
#unique_genes.to_csv(duplicate_gene_fp)
unique_genes = unique_genes.loc[:, ['gene', 'gene_symbol', 'gene_name']]
# Genes whose symbol was already used by an earlier gene.
duplicate_genes_2 = unique_genes.loc[unique_genes.duplicated(subset=['gene_symbol'])]
duplicate_genes_2
#unique_genes.to_csv('./downloads/genes_symb.csv')

In [None]:
# Rows of duplicate_genes whose symbol is absent from duplicate_genes_2.
known_symbols = duplicate_genes_2['gene_symbol'].tolist()
res = duplicate_genes.loc[~duplicate_genes['gene_symbol'].isin(known_symbols)]
res

In [None]:
# Look up one gene in the gene table, which is indexed by gene id.
# The table is named genes_df, and .loc is indexed with [], not called.
genes_df.loc['UniProtKB:X6R8D5']

In [None]:
# Expose the term label as a column and keep one row per gene name among the
# annotations labelled 'Unknown cellular component'.
clean_annos_df['term_label'] = clean_annos_df['term'].apply(lambda term: term['label'])
unknown_cc_df = clean_annos_df.loc[clean_annos_df['term_label'].eq('Unknown cellular component')]
unique_genes = unknown_cc_df.drop_duplicates(subset=['gene_name'])
unique_genes

In [10]:
# First 100 cleaned annotations, without the evidence column.
clean_annos_df = (
    pd.read_json(clean_annotations_fp)
    .head(100)
    .drop(columns=['evidence'])
)
clean_annos_df

Unnamed: 0,gene,term,slim_terms,group,evidence_type,gene_symbol,gene_name,taxon_id,taxon_label,taxon_abbr,panther_family,long_id,coordinates_chr_num,coordinates_start,coordinates_end,aspect,term_type,groups,evidence_count
0,UniProtKB:O15520,"{'id': 'GO:0030154', 'label': 'cell differenti...","[{'id': 'GO:0030154', 'label': 'cell different...",GO_Central,homology,FGF10,Fibroblast growth factor 10,9606,Homo sapiens,Hsa,PTHR11486,HUMAN|HGNC=3666|UniProtKB=O15520,5,44388485.0,44388797.0,biological process,known,"[AgBase, ZFIN, MGI, UniProt, FlyBase, RGD, BHF...",21
1,UniProtKB:O15520,"{'id': 'GO:0008083', 'label': 'growth factor a...","[{'id': 'GO:0048018', 'label': 'receptor ligan...",GO_Central,direct,FGF10,Fibroblast growth factor 10,9606,Homo sapiens,Hsa,PTHR11486,HUMAN|HGNC=3666|UniProtKB=O15520,5,44388485.0,44388797.0,molecular function,known,"[ParkinsonsUK-UCL,RGD, MGI, WB, UniProt, RGD, ...",12
2,UniProtKB:O15520,"{'id': 'GO:0009887', 'label': 'animal organ mo...","[{'id': 'GO:0048856', 'label': 'anatomical str...",GO_Central,direct,FGF10,Fibroblast growth factor 10,9606,Homo sapiens,Hsa,PTHR11486,HUMAN|HGNC=3666|UniProtKB=O15520,5,44388485.0,44388797.0,biological process,known,"[ZFIN, MGI, UniProt, FlyBase, RGD, BHF-UCL]",18
3,UniProtKB:O15520,"{'id': 'GO:0005737', 'label': 'cytoplasm', 'as...","[{'id': 'GO:0005737', 'label': 'cytoplasm', 'a...",GO_Central,homology,FGF10,Fibroblast growth factor 10,9606,Homo sapiens,Hsa,PTHR11486,HUMAN|HGNC=3666|UniProtKB=O15520,5,44388485.0,44388797.0,cellular component,known,"[ZFIN, AgBase, MGI, WB, UniProt, RGD, BHF-UCL]",14
4,UniProtKB:O15520,"{'id': 'GO:0001934', 'label': 'positive regula...","[{'id': 'OTHER:0002', 'label': 'Other biologic...",GO_Central,direct,FGF10,Fibroblast growth factor 10,9606,Homo sapiens,Hsa,PTHR11486,HUMAN|HGNC=3666|UniProtKB=O15520,5,44388485.0,44388797.0,biological process,known,"[UniProt, MGI]",10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,UniProtKB:Q9Y536,"{'id': 'GO:0016018', 'label': 'cyclosporin A b...","[{'id': 'OTHER:0001', 'label': 'Other molecula...",GO_Central,homology,PPIAL4A,Peptidyl-prolyl cis-trans isomerase A-like 4A,9606,Homo sapiens,Hsa,PTHR11071,HUMAN|HGNC=24369|UniProtKB=Q9Y536,1,120889746.0,120890405.0,molecular function,known,"[AgBase, UniProt, SGD, FlyBase, CAFA, RGD]",15
96,UniProtKB:Q9Y536,"{'id': 'GO:0005737', 'label': 'cytoplasm', 'as...","[{'id': 'GO:0005737', 'label': 'cytoplasm', 'a...",GO_Central,homology,PPIAL4A,Peptidyl-prolyl cis-trans isomerase A-like 4A,9606,Homo sapiens,Hsa,PTHR11071,HUMAN|HGNC=24369|UniProtKB=Q9Y536,1,120889746.0,120890405.0,cellular component,known,"[, dictyBase, AgBase, ParkinsonsUK-UCL, WB, Un...",28
97,UniProtKB:Q9Y536,"{'id': 'GO:0043231', 'label': 'intracellular m...","[{'id': 'GO:0043226', 'label': 'organelle', 'a...",GO_Central,homology,PPIAL4A,Peptidyl-prolyl cis-trans isomerase A-like 4A,9606,Homo sapiens,Hsa,PTHR11071,HUMAN|HGNC=24369|UniProtKB=Q9Y536,1,120889746.0,120890405.0,cellular component,known,"[, dictyBase, WB, UniProt, FlyBase, RGD]",10
98,UniProtKB:Q9Y536,"{'id': 'GO:0006457', 'label': 'protein folding...","[{'id': 'GO:0006457', 'label': 'protein foldin...",GO_Central,homology,PPIAL4A,Peptidyl-prolyl cis-trans isomerase A-like 4A,9606,Homo sapiens,Hsa,PTHR11071,HUMAN|HGNC=24369|UniProtKB=Q9Y536,1,120889746.0,120890405.0,biological process,known,"[SGD, UniProt]",6


In [16]:
def uniquify_term(series):
    """Return the term dicts of ``series`` de-duplicated by their ``'id'``.

    Non-dict items are ignored. For a repeated id the last record wins, but
    it keeps the position of the first occurrence.
    """
    by_id = {item['id']: item for item in series if isinstance(item, dict)}
    return list(by_id.values())

def uniquify_slim_terms(series):
    """Flatten the lists of slim-term dicts in ``series``, de-duplicated by ``'id'``.

    Non-list items are ignored. For a repeated id the last record wins, but
    it keeps the position of the first occurrence.
    """
    by_id = {}
    for item_list in series:
        if not isinstance(item_list, list):
            continue
        by_id.update((item['id'], item) for item in item_list)
    return list(by_id.values())

# One row per gene with its de-duplicated terms and slim terms.
# NOTE(review): this rebinds genes_df, the name the loading cell uses for the
# gene-info table; cells that merge against genes_df will not work if re-run
# after this one.
genes_df = (
    clean_annos_df.groupby('gene')
    .agg({
        'term': uniquify_term,
        'slim_terms': uniquify_slim_terms
    })
    .reset_index()
)

# Round-trip through JSON text to get plain records for write_to_json.
anno_json = genes_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', genes_annotations_fp), indent=2)

genes_df

Unnamed: 0,gene,term,slim_terms
0,UniProtKB:A0PJY2,"[{'id': 'GO:0006357', 'label': 'regulation of ...","[{'id': 'GO:0006355', 'label': 'regulation of ..."
1,UniProtKB:H3BQ33,"[{'id': 'GO:0007605', 'label': 'sensory percep...","[{'id': 'GO:0050877', 'label': 'nervous system..."
2,UniProtKB:O15520,"[{'id': 'GO:0030154', 'label': 'cell different...","[{'id': 'GO:0030154', 'label': 'cell different..."
3,UniProtKB:P05014,"[{'id': 'GO:0005615', 'label': 'extracellular ...","[{'id': 'GO:0005576', 'label': 'extracellular ..."
4,UniProtKB:P09661,"[{'id': 'GO:0000398', 'label': 'mRNA splicing,...","[{'id': 'GO:0016071', 'label': 'mRNA metabolic..."
5,UniProtKB:Q12996,"[{'id': 'GO:0005634', 'label': 'nucleus', 'asp...","[{'id': 'GO:0005634', 'label': 'nucleus', 'asp..."
6,UniProtKB:Q13402,"[{'id': 'GO:0031982', 'label': 'vesicle', 'asp...","[{'id': 'GO:0043226', 'label': 'organelle', 'a..."
7,UniProtKB:Q5VST9,"[{'id': 'GO:0005911', 'label': 'cell-cell junc...","[{'id': 'GO:0030054', 'label': 'cell junction'..."
8,UniProtKB:Q6AZW8,"[{'id': 'GO:0003700', 'label': 'DNA-binding tr...","[{'id': 'GO:0003700', 'label': 'DNA-binding tr..."
9,UniProtKB:Q6MZP7,"[{'id': 'GO:0005634', 'label': 'nucleus', 'asp...","[{'id': 'GO:0005634', 'label': 'nucleus', 'asp..."


In [21]:
def uniquify_term(series, evidence_series):
    """De-duplicate term dicts by id and report the repeated ones.

    Each dict in ``series`` is copied without ``'is_goslim'`` and tagged with
    the ``evidence_type`` at the same position of ``evidence_series``.
    Non-dict items are ignored.

    Returns:
        ``(unique_terms, duplicates)``. ``unique_terms`` holds one record per
        id (the last occurrence, at the position of the first). ``duplicates``
        holds every record whose id had been seen before.
    """
    latest_by_id = {}
    repeats = []

    for position, item in enumerate(series):
        if not isinstance(item, dict):
            continue
        term_id = item['id']
        term = {key: value for key, value in item.items() if key != 'is_goslim'}
        term['evidence_type'] = evidence_series.iloc[position]

        if term_id in latest_by_id:
            repeats.append(term)
        latest_by_id[term_id] = term

    return (list(latest_by_id.values()), repeats)


def uniquify_slim_terms(series, evidence_series):
    """Flatten and de-duplicate slim terms, tagging each with an evidence type.

    Each dict in the lists of ``series`` is copied without ``'is_goslim'`` and
    tagged with the ``evidence_type`` at the position of its list in
    ``evidence_series``. Non-list items are ignored. For a repeated id the
    last record wins.
    """
    latest_by_id = {}
    for position, item_list in enumerate(series):
        if not isinstance(item_list, list):
            continue
        for item in item_list:
            term = {key: value for key, value in item.items() if key != 'is_goslim'}
            term['evidence_type'] = evidence_series.iloc[position]
            latest_by_id[term['id']] = term
    return list(latest_by_id.values())


def _summarise_gene_terms(group):
    """Build the per-gene record: unique terms, repeated terms and slim terms.

    ``uniquify_term`` is run once per group and its two results unpacked,
    instead of being run twice to pick element [0] and element [1].
    """
    terms, duplicated_terms = uniquify_term(group['term'], group['evidence_type'])
    return pd.Series({
        'terms': terms,
        'duplicated_terms': duplicated_terms,
        'slim_terms': uniquify_slim_terms(group['slim_terms'], group['evidence_type'])
    })


# One row per gene. NOTE(review): this rebinds genes_df, the name the loading
# cell uses for the gene-info table.
genes_df = clean_annos_df.groupby('gene').apply(_summarise_gene_terms).reset_index()

# Round-trip through JSON text to get plain records for write_to_json.
anno_json = genes_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', genes_annotations_fp), indent=2)

genes_df


Unnamed: 0,gene,terms,slim_terms
0,UniProtKB:A0PJY2,"([{'id': 'GO:0006357', 'label': 'regulation of...","[{'id': 'GO:0006355', 'label': 'regulation of ..."
1,UniProtKB:H3BQ33,"([{'id': 'GO:0007605', 'label': 'sensory perce...","[{'id': 'GO:0050877', 'label': 'nervous system..."
2,UniProtKB:O15520,"([{'id': 'GO:0030154', 'label': 'cell differen...","[{'id': 'GO:0030154', 'label': 'cell different..."
3,UniProtKB:P05014,"([{'id': 'GO:0005615', 'label': 'extracellular...","[{'id': 'GO:0005576', 'label': 'extracellular ..."
4,UniProtKB:P09661,"([{'id': 'GO:0000398', 'label': 'mRNA splicing...","[{'id': 'GO:0016071', 'label': 'mRNA metabolic..."
5,UniProtKB:Q12996,"([{'id': 'GO:0005634', 'label': 'nucleus', 'as...","[{'id': 'GO:0005634', 'label': 'nucleus', 'asp..."
6,UniProtKB:Q13402,"([{'id': 'GO:0031982', 'label': 'vesicle', 'as...","[{'id': 'GO:0043226', 'label': 'organelle', 'a..."
7,UniProtKB:Q5VST9,"([{'id': 'GO:0005911', 'label': 'cell-cell jun...","[{'id': 'GO:0030054', 'label': 'cell junction'..."
8,UniProtKB:Q6AZW8,"([{'id': 'GO:0003700', 'label': 'DNA-binding t...","[{'id': 'GO:0003700', 'label': 'DNA-binding tr..."
9,UniProtKB:Q6MZP7,"([{'id': 'GO:0005634', 'label': 'nucleus', 'as...","[{'id': 'GO:0005634', 'label': 'nucleus', 'asp..."


In [21]:
def uniquify_term(series, evidence_series):
    """Return the terms of ``series`` tagged with their evidence type.

    Each dict is copied without ``'is_goslim'`` and given the
    ``evidence_type`` at the same position of ``evidence_series``.
    Non-dict items are ignored.

    Raises:
        ValueError: when a term id occurs more than once.
    """
    tagged_by_id = {}

    for position, item in enumerate(series):
        if not isinstance(item, dict):
            continue
        term_id = item['id']
        term = dict(item)
        term.pop('is_goslim', None)
        term['evidence_type'] = evidence_series.iloc[position]

        if term_id in tagged_by_id:
            raise ValueError(f"Duplicate term found: {term}")
        tagged_by_id[term_id] = term

    return list(tagged_by_id.values())


def uniquify_slim_terms(series, evidence_series):
    """Flatten the slim-term lists of ``series`` into one record per id.

    Records are copies without ``'is_goslim'``, tagged with the
    ``evidence_type`` found at the position of their list in
    ``evidence_series``. Non-list items are ignored; the last record of a
    repeated id wins.
    """
    tagged_by_id = {}
    for position, item_list in enumerate(series):
        if not isinstance(item_list, list):
            continue
        for item in item_list:
            term = dict(item)
            term.pop('is_goslim', None)
            term.update(evidence_type=evidence_series.iloc[position])
            tagged_by_id[term['id']] = term
    return list(tagged_by_id.values())

# Gene-level columns that group_terms() copies from the first row of each gene group.
columns_to_extract = [
    'gene_name',
    'taxon_id',
    'taxon_label',
    'taxon_abbr',
    'panther_family',
    'long_id',
    'coordinates_chr_num',
    'coordinates_start',
    'coordinates_end',
    'gene_symbol'
]

def group_terms(group):
    """Collapse the annotation rows of one gene into a single summary record.

    The record carries the ``columns_to_extract`` values of the group's first
    row, the unique terms and slim terms (each tagged with an evidence type)
    and the number of unique terms.
    """
    evidence_types = group['evidence_type']
    unique_terms = uniquify_term(group['term'], evidence_types)
    slim_terms = uniquify_slim_terms(group['slim_terms'], evidence_types)

    summary = {col: group[col].iloc[0] for col in columns_to_extract}
    summary['terms'] = unique_terms
    summary['slim_terms'] = slim_terms
    summary['terms_count'] = len(unique_terms)
    return pd.Series(summary)

# One summary row per gene, genes with the most unique terms first.
# NOTE(review): this rebinds genes_df, the name the loading cell uses for the
# gene-info table.
genes_df = (
    clean_annos_df.groupby('gene')
    .apply(group_terms)
    .reset_index()
    .sort_values(by='terms_count', ascending=False)
    .reset_index(drop=True)
)

# Round-trip through JSON text to get plain records for write_to_json.
anno_json = genes_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', genes_annotations_fp), indent=2)

genes_df

Unnamed: 0,gene,gene_name,taxon_id,taxon_label,taxon_abbr,panther_family,long_id,coordinates_chr_num,coordinates_start,coordinates_end,gene_symbol,terms,slim_terms,terms_count
0,UniProtKB:O15520,Fibroblast growth factor 10,9606,Homo sapiens,Hsa,PTHR11486,HUMAN|HGNC=3666|UniProtKB=O15520,5,44388485.0,44388797.0,FGF10,"[{'id': 'GO:0030154', 'label': 'cell different...","[{'id': 'GO:0030154', 'label': 'cell different...",18
1,UniProtKB:P05014,Interferon alpha-4,9606,Homo sapiens,Hsa,PTHR11691,HUMAN|HGNC=5425|UniProtKB=P05014,9,21186965.0,21187671.0,IFNA4,"[{'id': 'GO:0005615', 'label': 'extracellular ...","[{'id': 'GO:0005576', 'label': 'extracellular ...",12
2,UniProtKB:Q8IUX4,DNA dC-dU-editing enzyme APOBEC-3F,9606,Homo sapiens,Hsa,PTHR13857,HUMAN|HGNC=17356|UniProtKB=Q8IUX4,22,39040857.0,39053910.0,APOBEC3F,"[{'id': 'GO:0080111', 'label': 'DNA demethylat...","[{'id': 'GO:0006259', 'label': 'DNA metabolic ...",12
3,UniProtKB:Q13402,Unconventional myosin-VIIa,9606,Homo sapiens,Hsa,PTHR13140,HUMAN|HGNC=7606|UniProtKB=Q13402,11,77128302.0,77215238.0,MYO7A,"[{'id': 'GO:0031982', 'label': 'vesicle', 'asp...","[{'id': 'GO:0043226', 'label': 'organelle', 'a...",10
4,UniProtKB:Q9Y536,Peptidyl-prolyl cis-trans isomerase A-like 4A,9606,Homo sapiens,Hsa,PTHR11071,HUMAN|HGNC=24369|UniProtKB=Q9Y536,1,120889746.0,120890405.0,PPIAL4A,"[{'id': 'GO:0000413', 'label': 'protein peptid...","[{'id': 'GO:0036211', 'label': 'protein modifi...",6
5,UniProtKB:Q8IUG5,Unconventional myosin-XVIIIb,9606,Homo sapiens,Hsa,PTHR45615,HUMAN|HGNC=18150|UniProtKB=Q8IUG5,22,25761034.0,26031041.0,MYO18B,"[{'id': 'GO:0005737', 'label': 'cytoplasm', 'a...","[{'id': 'GO:0005737', 'label': 'cytoplasm', 'a...",6
6,UniProtKB:H3BQ33,Uncharacterized protein (Fragment),9606,Homo sapiens,Hsa,PTHR31548,HUMAN|Gene=H3BQ33_HUMAN|UniProtKB=H3BQ33,3,150882997.0,150941743.0,H3BQ33,"[{'id': 'GO:0007605', 'label': 'sensory percep...","[{'id': 'GO:0050877', 'label': 'nervous system...",4
7,UniProtKB:Q9UKW6,ETS-related transcription factor Elf-5,9606,Homo sapiens,Hsa,PTHR11849,HUMAN|HGNC=3320|UniProtKB=Q9UKW6,11,34491597.0,34513805.0,ELF5,"[{'id': 'GO:0005634', 'label': 'nucleus', 'asp...","[{'id': 'GO:0005634', 'label': 'nucleus', 'asp...",4
8,UniProtKB:A0PJY2,Fez family zinc finger protein 1,9606,Homo sapiens,Hsa,PTHR24390,HUMAN|HGNC=22788|UniProtKB=A0PJY2,7,122304331.0,122304505.0,FEZF1,"[{'id': 'GO:0006357', 'label': 'regulation of ...","[{'id': 'GO:0006355', 'label': 'regulation of ...",4
9,UniProtKB:Q5VST9,Obscurin,9606,Homo sapiens,Hsa,PTHR11640,HUMAN|HGNC=15719|UniProtKB=Q5VST9,1,228208130.0,228378876.0,OBSCN,"[{'id': 'GO:0005911', 'label': 'cell-cell junc...","[{'id': 'GO:0030054', 'label': 'cell junction'...",4
