In [1]:
import json
from os import path as ospath
import pandas as pd
import numpy as np
import random
from src.utils import write_to_json
# Input files read with pd.read_json by the loading cells of this notebook.
terms_fp = './data/terms.json'
annotations_fp = './downloads/human_iba_annotations.json'
articles_fp = './downloads/clean-articles.json'
gene_info_fp = './downloads/human_iba_gene_info.json'
# Output files written with write_to_json by the cells of this notebook.
clean_annotations_fp = './downloads/human_iba_annotations_clean.json'
filtered_terms_fp = './data/filtered_terms.json'
filtered_slim_terms_fp = './data/filtered_slim_terms.json'
filtered_refs_fp = './data/filtered_refs.json'
# Not referenced by any cell visible in this notebook.
filtered_evidences_fp = './data/filtered_evidences.json'

In [2]:
def get_pd_row(df, k):
    """Return the row labelled ``k`` of ``df`` as a dict without its missing values.

    Args:
        df: DataFrame indexed by the labels being looked up.
        k: Index label of the wanted row.

    Returns:
        A dict mapping column name to value for every non-NA cell of the row.

    Raises:
        KeyError: If ``k`` is not a label of ``df``'s index.
    """
    row = df.loc[k]
    present = row.dropna()
    return dict(present)

def denormalize_terms(df, terms):
    """Replace each label in ``terms`` with its row record from ``df``.

    Args:
        df: DataFrame indexed by the labels found in ``terms``.
        terms: Iterable of index labels.

    Returns:
        A list with one ``get_pd_row`` dict per label, in input order.
    """
    expanded = []
    for term_id in terms:
        expanded.append(get_pd_row(df, term_id))
    return expanded

def get_pd_row_key(df, k):
    """Look up row ``k`` of ``df`` as a dict, tolerating unknown labels.

    Args:
        df: DataFrame indexed by the labels being looked up.
        k: Index label of the wanted row.

    Returns:
        A dict of the row's non-NA cells, or ``None`` when the lookup
        raises ``KeyError``.
    """
    try:
        row = df.loc[k]
        found = dict(row.dropna())
    except KeyError:
        found = None
    return found

def get_aspect(df, k):
    """Return the ``'aspect'`` value stored for label ``k`` in ``df``.

    Args:
        df: DataFrame indexed by the labels being looked up.
        k: Index label of the wanted row.

    Returns:
        The row's ``'aspect'`` value, ``None`` when the row has no such
        non-NA value, or the string ``'no' + str(k)`` when ``k`` is not
        found in ``df``.
    """
    record = get_pd_row_key(df, k)
    return 'no' + str(k) if record is None else record.get('aspect', None)

def get_evidence(df, gene_df, evidences):
    """Expand the IDs inside each evidence entry into full row records.

    Args:
        df: DataFrame indexed by the values found in ``references``.
        gene_df: DataFrame indexed by the values found in ``with_gene_id``.
        evidences: Iterable of dicts carrying ``'references'`` (an iterable
            of labels) and ``'with_gene_id'`` (a single label).

    Returns:
        A list with one dict per evidence entry: ``'with_gene_id'`` holds the
        matching ``gene_df`` record and ``'references'`` the list of matching
        ``df`` records. Any label that cannot be found becomes ``None``.
    """
    expanded = []
    for item in evidences:
        # References are resolved before the gene, as in the original loop.
        ref_rows = [get_pd_row_key(df, ref_id) for ref_id in item['references']]
        gene_record = get_pd_row_key(gene_df, item['with_gene_id'])
        expanded.append(dict(with_gene_id=gene_record, references=ref_rows))
    return expanded


In [10]:
# Load the term table, index it by term ID (keeping the ID as a column too),
# and normalise the column names; 'is_goslim' is read as bool.
terms_df = (
    pd.read_json(terms_fp, dtype={'is_goslim': bool})
    .set_index('ID', drop=False)
    .rename(columns={'ID': 'id', 'LABEL': 'label', 'hasOBONamespace': 'aspect'})
)
# Aspect names use underscores in the file; show them with spaces instead.
terms_df['aspect'] = terms_df['aspect'].str.replace('_', ' ')
terms_df


Unnamed: 0_level_0,id,label,aspect,is_goslim
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GO:0000001,GO:0000001,mitochondrion inheritance,biological process,False
GO:0000002,GO:0000002,mitochondrial genome maintenance,biological process,False
GO:0000003,GO:0000003,reproduction,biological process,False
GO:0000005,GO:0000005,obsolete ribosomal chaperone activity,molecular function,False
GO:0000006,GO:0000006,high-affinity zinc transmembrane transporter a...,molecular function,False
...,...,...,...,...
GO:2001313,GO:2001313,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological process,False
GO:2001314,GO:2001314,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological process,False
GO:2001315,GO:2001315,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological process,False
GO:2001316,GO:2001316,kojic acid metabolic process,biological process,False


In [4]:
# Load the article table and index it by 'pmid', keeping 'pmid' as a column too.
article_df = pd.read_json(articles_fp).set_index('pmid', drop=False)
article_df

Unnamed: 0_level_0,pmid,title,date,authors
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PMID:24361620,PMID:24361620,"Identification, modeling and ligand affinity o...",2014 Jun,"[Morrison AM, Goldstone JV, Lamb DC, Kubota A,..."
PMID:15451575,PMID:15451575,"FLRG, member of the follistatin family, a new ...",2004 Oct 15,"[Maguer-Satta V, Rimokh R]"
PMID:10772789,PMID:10772789,The L63 gene is necessary for the ecdysone-ind...,2000 May 1,"[Stowers RS, Garza D, Rascle A, Hogness DS]"
PMID:21670503,PMID:21670503,Phosphodiesterase 4B in the cardiac L-type Ca²...,2011 Jul,"[Leroy J, Richter W, Mika D, Castro LR, Abi-Ge..."
PMID:7011377,PMID:7011377,Kinetics and protein subunit interactions of E...,1981 Feb 17,[Rizzolo LJ]
...,...,...,...,...
PMID:15465819,PMID:15465819,Ubiquitin ligase activity of c-Cbl guides the ...,2004 Dec 31,"[de Melker AA, van der Horst G, Borst J]"
PMID:12376551,PMID:12376551,RhoG signals in parallel with Rac1 and Cdc42.,2002 Dec 6,"[Wennerberg K, Ellerbroek SM, Liu RY, Karnoub ..."
PMID:15603737,PMID:15603737,BAG5 inhibits parkin and enhances dopaminergic...,2004 Dec 16,"[Kalia SK, Lee S, Smith PD, Liu L, Crocker SJ,..."
PMID:12591913,PMID:12591913,The molecular function of Ase1p: evidence for ...,2003 Feb 17,"[Schuyler SC, Liu JY, Pellman D]"


In [5]:
# Load the gene table and index it by 'gene', keeping 'gene' as a column too.
gene_df = pd.read_json(gene_info_fp).set_index('gene', drop=False)
gene_df

Unnamed: 0_level_0,gene,gene_symbol,gene_name
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UniProtKB:P15036,UniProtKB:P15036,ETS2,Protein C-ets-2
MGI:MGI:99423,MGI:MGI:99423,Etv4,ETS translocation variant 4
UniProtKB:Q9HBH0,UniProtKB:Q9HBH0,RHOF,Rho-related GTP-binding protein RhoF
PomBase:SPAC1F7.04,PomBase:SPAC1F7.04,rho1,GTP-binding protein rho1
UniProtKB:P43681,UniProtKB:P43681,CHRNA4,Neuronal acetylcholine receptor subunit alpha-4
...,...,...,...
TAIR:locus:2166449,TAIR:locus:2166449,RPN8A,26S proteasome non-ATPase regulatory subunit 7...
UniProtKB:Q96B45,UniProtKB:Q96B45,BORCS7,BLOC-1-related complex subunit 7
ZFIN:ZDB-GENE-020814-2,ZFIN:ZDB-GENE-020814-2,cxadr,Coxsackievirus and adenovirus receptor homolog
TAIR:locus:2152916,TAIR:locus:2152916,PGL5,Probable 6-phosphogluconolactonase 5


In [13]:
%%time
ann_df = pd.read_json(annotations_fp)
ann_df['aspect'] = ann_df['term'].apply(lambda x: get_pd_row(terms_df, x)['aspect'])
ann_df['term'] = ann_df['term'].apply(lambda x: get_pd_row(terms_df, x))
ann_df['slim_terms'] = ann_df['slim_terms'].apply(lambda x: denormalize_terms(terms_df, x))
ann_df['qualifier'] = ann_df['qualifiers'].apply(lambda x: x[0])
ann_df['qualifier'] = ann_df['qualifier'].str.replace('_', ' ')

ann_df['evidence']=ann_df['evidence'].apply(lambda x: get_evidence(article_df, gene_df, x))
ann_df = ann_df.drop(columns=['qualifiers'])
anno_json = ann_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)

write_to_json(json_str, ospath.join('.', clean_annotations_fp))

CPU times: total: 2min 52s
Wall time: 4min 43s


In [12]:
ann_df

Unnamed: 0,gene,gene_symbol,gene_name,term,slim_terms,evidence,group,aspect,qualifier
0,UniProtKB:Q7L0Q8,RHOU,Rho-related GTP-binding protein RhoU,"{'id': 'GO:0003924', 'label': 'GTPase activity...","[{'id': 'GO:0003924', 'label': 'GTPase activit...","[{'with_gene_id': {'gene': 'SGD:S000000046', '...",GO_Central,molecular function,
1,UniProtKB:Q7L0Q8,RHOU,Rho-related GTP-binding protein RhoU,"{'id': 'GO:0030036', 'label': 'actin cytoskele...","[{'id': 'GO:0007010', 'label': 'cytoskeleton o...","[{'with_gene_id': {'gene': 'UniProtKB:P60953',...",GO_Central,biological process,
2,UniProtKB:Q7L0Q8,RHOU,Rho-related GTP-binding protein RhoU,"{'id': 'GO:0005525', 'label': 'GTP binding', '...",[],[{'with_gene_id': {'gene': 'CGD:CAL0000192659'...,GO_Central,molecular function,
3,UniProtKB:Q7L0Q8,RHOU,Rho-related GTP-binding protein RhoU,"{'id': 'GO:0007163', 'label': 'establishment o...","[{'id': 'GO:0007163', 'label': 'establishment ...","[{'with_gene_id': {'gene': 'MGI:MGI:106211', '...",GO_Central,biological process,
4,UniProtKB:Q7L0Q8,RHOU,Rho-related GTP-binding protein RhoU,"{'id': 'GO:0032488', 'label': 'Cdc42 protein s...","[{'id': 'GO:0023052', 'label': 'signaling', 'a...","[{'with_gene_id': {'gene': 'RGD:71043', 'gene_...",GO_Central,biological process,
...,...,...,...,...,...,...,...,...,...
68553,UniProtKB:O75077,ADAM23,Disintegrin and metalloproteinase domain-conta...,"{'id': 'GO:0099056', 'label': 'integral compon...","[{'id': 'GO:0005886', 'label': 'plasma membran...","[{'with_gene_id': {'gene': 'MGI:MGI:1345162', ...",GO_Central,cellular component,
68554,UniProtKB:Q8TBY9,WDR66,Cilia- and flagella-associated protein 251,"{'id': 'GO:0036126', 'label': 'sperm flagellum...","[{'id': 'GO:0005929', 'label': 'cilium', 'aspe...","[{'with_gene_id': {'gene': 'UniProtKB:Q8TBY9',...",GO_Central,cellular component,
68555,UniProtKB:O60279,SUSD5,Sushi domain-containing protein 5,"{'id': 'GO:0007219', 'label': 'Notch signaling...","[{'id': 'GO:0023052', 'label': 'signaling', 'a...","[{'with_gene_id': {'gene': 'MGI:MGI:2685972', ...",GO_Central,biological process,
68556,UniProtKB:Q6IPT4,CYB5RL,NADH-cytochrome b5 reductase-like,"{'id': 'GO:0004128', 'label': 'cytochrome-b5 r...","[{'id': 'GO:0016491', 'label': 'oxidoreductase...","[{'with_gene_id': {'gene': 'SGD:S000001633', '...",GO_Central,molecular function,


In [None]:
# Print every column of the annotation row at position 1 as a plain dict.
print(dict(ann_df.iloc[1]))

In [None]:
# Read back the file at clean_annotations_fp (the path the annotation-cleaning
# cell writes to) and display it.
clean_ann_df = pd.read_json(clean_annotations_fp)
clean_ann_df

In [None]:
# Rows of terms_df whose 'is_goslim' flag equals True.
filtered_terms = terms_df.loc[terms_df['is_goslim'] == True]
filtered_terms

In [None]:
# Annotation rows whose ('gene', 'gene_symbol') pair already appeared in an
# earlier row (the first occurrence of each pair is not included).
temp_df = ann_df.loc[ann_df.duplicated(subset=['gene', 'gene_symbol'])]
temp_df

In [None]:
class CustomJSONizer(json.JSONEncoder):
    """JSON encoder that serializes ``numpy.bool_`` values as JSON booleans."""

    def default(self, obj):
        """Convert objects the base encoder cannot serialize.

        Args:
            obj: The value the encoder could not handle on its own.

        Returns:
            A built-in ``bool`` when ``obj`` is a ``numpy.bool_``.

        Raises:
            TypeError: For every other unsupported type (raised by the base
                class).
        """
        if isinstance(obj, np.bool_):
            # default() must return a serializable object, not encoded text:
            # returning encode(...) would emit the string "true" / "false"
            # instead of a JSON boolean.
            return bool(obj)
        return super().default(obj)
    
def count_unique_terms():
    """Collect the ``terms_df`` record of every distinct term in ``ann_df``.

    Reads the notebook-level ``ann_df`` (whose ``'term'`` cells are expected
    to be dicts with an ``'id'`` key) and ``terms_df``.

    Returns:
        A list with one ``get_pd_row`` dict per distinct term ID; the order
        follows set iteration and is therefore not defined.
    """
    unique_ids = {term_record['id'] for term_record in ann_df['term']}
    records = []
    for term_id in unique_ids:
        records.append(get_pd_row(terms_df, term_id))
    return records

# Write the record of every distinct annotated term to filtered_terms_fp and
# print how many there are. CustomJSONizer is passed through as `cls`.
count_uniq = count_unique_terms()
write_to_json(count_uniq, filtered_terms_fp, cls=CustomJSONizer)
print(len(count_uniq))

def count_unique_slim_terms():
    """Collect the ``terms_df`` record of every distinct slim term in ``ann_df``.

    Reads the notebook-level ``ann_df`` (whose ``'slim_terms'`` cells are
    expected to be lists of dicts with an ``'id'`` key) and ``terms_df``.

    Returns:
        A list with one ``get_pd_row`` dict per distinct slim-term ID; the
        order follows set iteration and is therefore not defined.
    """
    unique_ids = {
        slim_record['id']
        for slim_group in ann_df['slim_terms']
        for slim_record in slim_group
    }
    records = []
    for term_id in unique_ids:
        records.append(get_pd_row(terms_df, term_id))
    return records

# Write the record of every distinct slim term to filtered_slim_terms_fp and
# print how many there are. This reuses (overwrites) the count_uniq name.
count_uniq = count_unique_slim_terms()
write_to_json(count_uniq, filtered_slim_terms_fp, cls=CustomJSONizer)
print(len(count_uniq))

In [None]:
def count_unique_refs(annotations=None):
    """Collect the distinct reference IDs cited by the annotation evidence.

    The annotation-cleaning cell replaces each reference ID with its article
    record (a dict) or ``None`` when the article is unknown. Dicts are not
    hashable, so they cannot be put in a set directly; the ``'pmid'`` value of
    each record is collected instead. Raw (not yet expanded) reference IDs
    are still accepted as-is.

    Args:
        annotations: DataFrame with an ``'evidence'`` column holding lists of
            dicts that carry a ``'references'`` list. Defaults to the
            notebook-level ``ann_df``.

    Returns:
        A list of the distinct reference IDs, in no particular order.
    """
    frame = ann_df if annotations is None else annotations
    refs = set()

    for evidences in frame['evidence']:
        for evidence in evidences:
            for ref in evidence['references']:
                if ref is None:
                    # Reference that could not be resolved to an article.
                    continue
                ref_id = ref.get('pmid') if isinstance(ref, dict) else ref
                if ref_id is not None:
                    refs.add(ref_id)

    return list(refs)

# Write the distinct references returned by count_unique_refs() to
# filtered_refs_fp and print how many there are.
count_uniq = count_unique_refs()
write_to_json(count_uniq, filtered_refs_fp)
print(len(count_uniq))

In [None]:
def add_terms():
    """Draw random rows of ``filtered_terms`` as dicts.

    Produces ``len(human_df) - 1`` entries, each a uniformly random row of the
    notebook-level ``filtered_terms`` (rows may repeat).

    NOTE(review): ``human_df`` is not defined in any visible cell of this
    notebook, and the ``- 1`` yields one entry fewer than ``human_df`` has
    rows; confirm both are intended.

    Returns:
        A list of dicts, one per drawn row.
    """
    return [
        dict(filtered_terms.iloc[random.randrange(0, len(filtered_terms))])
        for _ in range(0, len(human_df) - 1)
    ]

# Row position of human_df that add_gene() reads.
count = 1


def add_gene():
    """Return the ``'gene'`` value of the ``human_df`` row at position ``count``.

    NOTE(review): ``human_df`` is not defined in any visible cell of this
    notebook; confirm where it comes from.
    """
    selected_row = human_df.iloc[count]
    return selected_row['gene']

    
                            
#add_terms()

# NOTE(review): the only assignment to ann2_df is the commented-out line below,
# and human_df is not defined in any visible cell, so the statements that follow
# depend on kernel state from elsewhere; confirm before relying on
# Restart & Run All.
#ann2_df = pd.DataFrame.from_dict(add_terms())
ann2_df

# Join the values of each human_df row with '-'. The result (ann3) is not used
# by any later statement in this cell.
ann3 = human_df.agg('-'.join, axis=1)
ann2_df

add_gene()
# Fill a 'gene' column by calling add_gene() once per 'id' value. `count` is
# not changed in the visible cells, so every call reads the same human_df row.
ann2_df['gene'] = ann2_df['id'].apply(lambda x: add_gene())
ann2_df

# Round-trip through JSON text to obtain plain Python objects, then write them.
json_chunk = ann2_df.to_json(orient="records", default_handler=None)
json_str = json.loads(json_chunk)

write_to_json(json_str, 'out.json')
                     

In [None]:
# Display the annotation frame in its current state.
ann_df