In [3]:
import json
from os import path as ospath
import pandas as pd
import numpy as np
import random
from src.utils import write_to_json
terms_fp = './data/terms.json'
annotations_fp = './downloads/human_iba_annotations.json'
sample_annotations_fp = './data/sample_human_iba_annotations.json'
articles_fp = './downloads/clean-articles.json'
gene_info_fp = './downloads/human_iba_gene_info.json'
clean_annotations_fp = './downloads/human_iba_annotations_clean.json'
filtered_terms_fp = './data/filtered_terms.json'
filtered_slim_terms_fp = './data/filtered_slim_terms.json'
filtered_refs_fp = './data/filtered_refs.json'
filtered_evidences_fp = './data/filtered_evidences.json'

In [19]:
def get_pd_row(df, k):
    return dict(df.loc[k].dropna())

def denormalize_terms(df, terms):
    return [get_pd_row(df, term) for term in terms]

def get_pd_row_key(df, k):
    try:
        return dict(df.loc[k].dropna())
    except KeyError:
        return None

def get_aspect(df, k):
    row = get_pd_row_key(df, k)
    if row is None:
        return 'no' + str(k)
    aspect = row.get('aspect', None)
    return aspect

def get_evidence(df, gene_df, evidences):
    result = []
    for evidence in evidences:
        result_ref =[]
        for reference in evidence['references']:
            row = get_pd_row_key(df, reference)
            result_ref.append(row)
        gene_row = get_pd_row_key(gene_df, evidence['with_gene_id'])
        result.append({
            'with_gene_id':gene_row,
            'references':result_ref
        })
            
    return result

def get_evidence_type(row):
    for evidence in row['evidence']:
        if evidence["with_gene_id"]['gene'] == row['gene']:
            return 'direct'
            
    return 'homology'

def get_slim(row):
    for evidence in row['evidence']:
        if evidence["with_gene_id"]['gene'] == row['gene']:
            return 'direct'
            
    return 'homology'

def count_unique_refs_row(evidences):
    refs = set()
        
    for evidence in evidences:
        for ref in evidence['references']:
            refs.add(ref)
            
    return len(list(refs))


In [25]:
def count_unique_refs(df):
    refs = set()
        
    for evidences in list(df['evidence']):
        for evidence in evidences:
            for ref in evidence['references']:
                refs.add(ref)
            
    return list(refs)

count_uniq = count_unique_refs()
write_to_json(count_uniq, filtered_refs_fp)
print(len(count_uniq))

TypeError: count_unique_refs() missing 1 required positional argument: 'df'

In [23]:
%%time
ann_df = pd.read_json(annotations_fp)
#ann_df = ann_df.loc[2000:2010];
ann_df['ref_count'] = ann_df['evidence'].apply(lambda x: count_unique_refs_row(x))
ann_df = ann_df.sort_values(by='ref_count')

ann_df

CPU times: total: 719 ms
Wall time: 1.47 s


Unnamed: 0,gene,gene_symbol,gene_name,term,slim_terms,qualifiers,evidence,group,ref_count
68557,UniProtKB:Q8N302,AGGF1,Angiogenic factor with G patch and FHA domains 1,GO:0045766,[],[involved_in],"[{'with_gene_id': 'UniProtKB:Q8N302', 'referen...",GO_Central,1
17944,UniProtKB:Q15811,ITSN1,Intersectin-1,GO:0042734,[GO:0005886],[is_active_in],"[{'with_gene_id': 'WB:Y116A8C.36d', 'reference...",GO_Central,1
17943,UniProtKB:Q15811,ITSN1,Intersectin-1,GO:0150007,[GO:0016192],[involved_in],"[{'with_gene_id': 'FB:FBgn0023388', 'reference...",GO_Central,1
36204,UniProtKB:Q96A49,SYAP1,Synapse-associated protein 1,GO:0048172,[],[involved_in],"[{'with_gene_id': 'FB:FBgn0013334', 'reference...",GO_Central,1
17938,UniProtKB:Q6DKJ4,NXN,Nucleoredoxin,GO:0004791,"[GO:0016491, GO:0140096, GO:0016209]",[enables],"[{'with_gene_id': 'MGI:MGI:109331', 'reference...",GO_Central,1
...,...,...,...,...,...,...,...,...,...
2826,UniProtKB:P35590,TIE1,Tyrosine-protein kinase receptor Tie-1,GO:0007275,[GO:0048856],[involved_in],"[{'with_gene_id': 'ZFIN:ZDB-GENE-990415-208', ...",GO_Central,546
1098,UniProtKB:Q12866,MERTK,Tyrosine-protein kinase Mer,GO:0007275,[GO:0048856],[involved_in],"[{'with_gene_id': 'RGD:2556', 'references': ['...",GO_Central,546
3186,UniProtKB:Q01973,ROR1,Inactive tyrosine-protein kinase transmembrane...,GO:0007275,[GO:0048856],[involved_in],"[{'with_gene_id': 'FB:FBgn0020391', 'reference...",GO_Central,546
9013,UniProtKB:P14616,INSRR,Insulin receptor-related protein,GO:0007275,[GO:0048856],[involved_in],"[{'with_gene_id': 'UniProtKB:P10721', 'referen...",GO_Central,546


In [53]:
ann_df2 = ann_df.head(100);
count_uniq = count_unique_refs(ann_df2)
print(len(count_uniq))

anno_json = ann_df2.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)

write_to_json(json_str, ospath.join('.', sample_annotations_fp))

76


In [10]:
terms_df = pd.read_json(terms_fp, dtype={'is_goslim':bool})
terms_df = terms_df.set_index('ID', drop=False)
terms_df = terms_df.rename(columns={'ID': 'id', 'LABEL': 'label', 'hasOBONamespace':'aspect'})
terms_df['aspect'] = terms_df['aspect'].str.replace('_', ' ')
terms_df

#write_to_json(json_str, ospath.join('out.json'))


Unnamed: 0_level_0,id,label,aspect,is_goslim
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GO:0000001,GO:0000001,mitochondrion inheritance,biological process,False
GO:0000002,GO:0000002,mitochondrial genome maintenance,biological process,False
GO:0000003,GO:0000003,reproduction,biological process,False
GO:0000005,GO:0000005,obsolete ribosomal chaperone activity,molecular function,False
GO:0000006,GO:0000006,high-affinity zinc transmembrane transporter a...,molecular function,False
...,...,...,...,...
GO:2001313,GO:2001313,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological process,False
GO:2001314,GO:2001314,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological process,False
GO:2001315,GO:2001315,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,biological process,False
GO:2001316,GO:2001316,kojic acid metabolic process,biological process,False


In [4]:
article_df = pd.read_json(articles_fp)
article_df = article_df.set_index('pmid', drop=False)
article_df

Unnamed: 0_level_0,pmid,title,date,authors
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PMID:24361620,PMID:24361620,"Identification, modeling and ligand affinity o...",2014 Jun,"[Morrison AM, Goldstone JV, Lamb DC, Kubota A,..."
PMID:15451575,PMID:15451575,"FLRG, member of the follistatin family, a new ...",2004 Oct 15,"[Maguer-Satta V, Rimokh R]"
PMID:10772789,PMID:10772789,The L63 gene is necessary for the ecdysone-ind...,2000 May 1,"[Stowers RS, Garza D, Rascle A, Hogness DS]"
PMID:21670503,PMID:21670503,Phosphodiesterase 4B in the cardiac L-type Ca²...,2011 Jul,"[Leroy J, Richter W, Mika D, Castro LR, Abi-Ge..."
PMID:7011377,PMID:7011377,Kinetics and protein subunit interactions of E...,1981 Feb 17,[Rizzolo LJ]
...,...,...,...,...
PMID:15465819,PMID:15465819,Ubiquitin ligase activity of c-Cbl guides the ...,2004 Dec 31,"[de Melker AA, van der Horst G, Borst J]"
PMID:12376551,PMID:12376551,RhoG signals in parallel with Rac1 and Cdc42.,2002 Dec 6,"[Wennerberg K, Ellerbroek SM, Liu RY, Karnoub ..."
PMID:15603737,PMID:15603737,BAG5 inhibits parkin and enhances dopaminergic...,2004 Dec 16,"[Kalia SK, Lee S, Smith PD, Liu L, Crocker SJ,..."
PMID:12591913,PMID:12591913,The molecular function of Ase1p: evidence for ...,2003 Feb 17,"[Schuyler SC, Liu JY, Pellman D]"


In [5]:
gene_df = pd.read_json(gene_info_fp)
gene_df = gene_df.set_index('gene', drop=False)
gene_df

Unnamed: 0_level_0,gene,gene_symbol,gene_name
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
UniProtKB:P15036,UniProtKB:P15036,ETS2,Protein C-ets-2
MGI:MGI:99423,MGI:MGI:99423,Etv4,ETS translocation variant 4
UniProtKB:Q9HBH0,UniProtKB:Q9HBH0,RHOF,Rho-related GTP-binding protein RhoF
PomBase:SPAC1F7.04,PomBase:SPAC1F7.04,rho1,GTP-binding protein rho1
UniProtKB:P43681,UniProtKB:P43681,CHRNA4,Neuronal acetylcholine receptor subunit alpha-4
...,...,...,...
TAIR:locus:2166449,TAIR:locus:2166449,RPN8A,26S proteasome non-ATPase regulatory subunit 7...
UniProtKB:Q96B45,UniProtKB:Q96B45,BORCS7,BLOC-1-related complex subunit 7
ZFIN:ZDB-GENE-020814-2,ZFIN:ZDB-GENE-020814-2,cxadr,Coxsackievirus and adenovirus receptor homolog
TAIR:locus:2152916,TAIR:locus:2152916,PGL5,Probable 6-phosphogluconolactonase 5


In [36]:
%%time
ann_df = pd.read_json(annotations_fp)
ann_df['aspect'] = ann_df['term'].apply(lambda x: get_pd_row(terms_df, x)['aspect'])
ann_df['term'] = ann_df['term'].apply(lambda x: get_pd_row(terms_df, x))
ann_df['slim_terms'] = ann_df['slim_terms'].apply(lambda x: denormalize_terms(terms_df, x))
ann_df['relation'] = ann_df['qualifiers'].apply(lambda x: x[0])
ann_df['relation'] = ann_df['relation'].str.replace('_', ' ')
ann_df['evidence'] = ann_df['evidence'].apply(lambda x: get_evidence(article_df, gene_df, x))
ann_df['evidence_type'] = ann_df.apply(lambda x: get_evidence_type(x), axis=1)
ann_df = ann_df.drop(columns=['qualifiers'])
anno_json = ann_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)

write_to_json(json_str, ospath.join('.', clean_annotations_fp))

CPU times: total: 2min 54s
Wall time: 4min 37s


In [34]:
ann_df['evidence_type'] = ann_df.apply(lambda x: get_evidence_type(x), axis=1)
ann_df[ann_df['evidence_type']=='direct']

Unnamed: 0,gene,gene_symbol,gene_name,term,slim_terms,evidence,group,aspect,qualifier,relationship,evidence_type
6,UniProtKB:P51168,SCNN1B,Amiloride-sensitive sodium channel subunit beta,"{'id': 'GO:0035725', 'label': 'sodium ion tran...","[{'id': 'GO:0055085', 'label': 'transmembrane ...","[{'with_gene_id': {'gene': 'UniProtKB:P51168',...",GO_Central,biological process,involved_in,involved in,direct
7,UniProtKB:P51168,SCNN1B,Amiloride-sensitive sodium channel subunit beta,"{'id': 'GO:0005887', 'label': 'integral compon...","[{'id': 'GO:0005886', 'label': 'plasma membran...",[{'with_gene_id': {'gene': 'ZFIN:ZDB-GENE-0405...,GO_Central,cellular component,is_active_in,is active in,direct
8,UniProtKB:P51168,SCNN1B,Amiloride-sensitive sodium channel subunit beta,"{'id': 'GO:0034706', 'label': 'sodium channel ...",[],"[{'with_gene_id': {'gene': 'MGI:MGI:104696', '...",GO_Central,cellular component,part_of,part of,direct
9,UniProtKB:Q00653,NFKB2,Nuclear factor NF-kappa-B p100 subunit,"{'id': 'GO:0000981', 'label': 'DNA-binding tra...",[],"[{'with_gene_id': {'gene': 'FB:FBgn0011274', '...",GO_Central,molecular function,enables,enables,direct
10,UniProtKB:Q00653,NFKB2,Nuclear factor NF-kappa-B p100 subunit,"{'id': 'GO:0006357', 'label': 'regulation of t...","[{'id': 'GO:0006355', 'label': 'regulation of ...","[{'with_gene_id': {'gene': 'UniProtKB:Q00653',...",GO_Central,biological process,involved_in,involved in,direct
...,...,...,...,...,...,...,...,...,...,...,...
68544,UniProtKB:Q8NAV1,PRPF38A,Pre-mRNA-splicing factor 38A,"{'id': 'GO:0071011', 'label': 'precatalytic sp...","[{'id': 'GO:0005634', 'label': 'nucleus', 'asp...","[{'with_gene_id': {'gene': 'UniProtKB:Q8NAV1',...",GO_Central,cellular component,part_of,part of,direct
68549,UniProtKB:Q9BXK5,BCL2L13,Bcl-2-like protein 13,"{'id': 'GO:0016021', 'label': 'integral compon...",[],"[{'with_gene_id': {'gene': 'UniProtKB:Q9BXK5',...",GO_Central,cellular component,is_active_in,is active in,direct
68551,UniProtKB:Q9BSH3,NICN1,Nicolin-1,"{'id': 'GO:0005654', 'label': 'nucleoplasm', '...","[{'id': 'GO:0005654', 'label': 'nucleoplasm', ...","[{'with_gene_id': {'gene': 'UniProtKB:Q9BSH3',...",GO_Central,cellular component,is_active_in,is active in,direct
68554,UniProtKB:Q8TBY9,WDR66,Cilia- and flagella-associated protein 251,"{'id': 'GO:0036126', 'label': 'sperm flagellum...","[{'id': 'GO:0005929', 'label': 'cilium', 'aspe...","[{'with_gene_id': {'gene': 'UniProtKB:Q8TBY9',...",GO_Central,cellular component,is_active_in,is active in,direct


In [None]:
print(dict(ann_df.iloc[1]))

In [None]:
clean_ann_df = pd.read_json(clean_annotations_fp)
clean_ann_df

In [15]:
filtered_terms = terms_df[terms_df['is_goslim']==True]
filtered_terms

Unnamed: 0_level_0,id,label,aspect,is_goslim
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GO:0000228,GO:0000228,nuclear chromosome,cellular component,True
GO:0000278,GO:0000278,mitotic cell cycle,biological process,True
GO:0000910,GO:0000910,cytokinesis,biological process,True
GO:0002181,GO:0002181,cytoplasmic translation,biological process,True
GO:0002376,GO:0002376,immune system process,biological process,True
...,...,...,...,...
GO:0140299,GO:0140299,small molecule sensor activity,molecular function,True
GO:0140313,GO:0140313,molecular sequestering activity,molecular function,True
GO:0140657,GO:0140657,ATP-dependent activity,molecular function,True
GO:0140691,GO:0140691,RNA folding chaperone,molecular function,True


In [None]:
temp_df = ann_df[ann_df.duplicated(subset=['gene', 'gene_symbol'])]
temp_df

In [None]:
class CustomJSONizer(json.JSONEncoder):
    def default(self, obj):
        return super().encode(bool(obj)) \
            if isinstance(obj, np.bool_) \
            else super().default(obj)
    
def count_unique_terms():
    terms = set()
    for term in list(ann_df['term']):
        terms.add(term['id'])
                    
    return [get_pd_row(terms_df, term) for term in terms]

count_uniq = count_unique_terms()
write_to_json(count_uniq, filtered_terms_fp, cls=CustomJSONizer)
print(len(count_uniq))

def count_unique_slim_terms():
    terms = set()
        
    for s_terms in list(ann_df['slim_terms']):
        for term in s_terms:
            terms.add(term['id'])
            
    return [get_pd_row(terms_df, term) for term in terms]

count_uniq = count_unique_slim_terms()
write_to_json(count_uniq, filtered_slim_terms_fp, cls=CustomJSONizer)
print(len(count_uniq))

In [None]:
def count_unique_refs(gene):
    refs = set()
        
    for evidences in list(ann_df['evidence']):
        for evidence in evidences:
            for ref in evidence['references']:
                refs.add(ref)
        if gene == ann
            
    return list(refs)

In [None]:
def add_terms():
    results = list()
    for n in range(0, len(human_df) -1):
        index = random.randrange(0, len(filtered_terms))
        results.append(dict(filtered_terms.iloc[index]))
    
    return results

count = 1

def add_gene():
    return human_df.iloc[count]['gene']

    
                            
#add_terms()

#ann2_df = pd.DataFrame.from_dict(add_terms())
ann2_df

ann3 = human_df.agg('-'.join, axis=1)
ann2_df

add_gene()
ann2_df['gene'] = ann2_df['id'].apply(lambda x: add_gene())
ann2_df

json_chunk = ann2_df.to_json(orient="records", default_handler=None)
json_str = json.loads(json_chunk)

write_to_json(json_str, 'out.json')
                     

In [None]:
ann_df