In [1]:
import json
from os import path as ospath
import pandas as pd
import numpy as np
import random
import pprint
from src.utils import write_to_json
# --- Input files under ./downloads (read with pd.read_json in later cells) ---
terms_fp = './downloads/full_go_annotated.json'
annotations_fp = './downloads/human_iba_annotations.json'
# Sample of the annotations; written by the sampling cells below.
sample_annotations_fp = './data/sample_human_iba_annotations.json'
articles_fp = './downloads/clean-articles.json'
gene_info_fp = './downloads/human_iba_gene_info.json'
taxon_fp = './downloads/taxon_lkp.json'
# Enriched annotations; written by the full pipeline cell and read back afterwards.
clean_annotations_fp = './downloads/human_iba_annotations_clean.json'
# --- Output files under ./data (unique terms / slim terms / references) ---
filtered_terms_fp = './data/filtered_terms.json'
filtered_slim_terms_fp = './data/filtered_slim_terms.json'
filtered_refs_fp = './data/filtered_refs.json'
# NOTE(review): not referenced anywhere in the visible notebook -- confirm it is still needed.
filtered_evidences_fp = './data/filtered_evidences.json'

In [33]:
def get_pd_row(df, k):
    """Return the row labelled ``k`` of ``df`` as a dict, omitting missing fields.

    Raises:
        KeyError: propagated from ``df.loc`` when ``k`` is not in the index.
    """
    record = df.loc[k]
    present_fields = record.dropna()
    return dict(present_fields)

def spread_terms(df, terms):
    """Look up each key in ``terms`` with ``get_pd_row`` and return the rows in input order."""
    rows = []
    for term_key in terms:
        rows.append(get_pd_row(df, term_key))
    return rows

def get_pd_row_key(df, k):
    """Like ``get_pd_row`` but tolerant of unknown keys.

    Returns the row labelled ``k`` as a dict without missing fields, or
    ``None`` when the lookup raises ``KeyError``.
    """
    try:
        record = df.loc[k]
        present_fields = record.dropna()
        return dict(present_fields)
    except KeyError:
        return None

def get_aspect(df, k):
    """Return the ``'aspect'`` field of row ``k``.

    Returns ``None`` when the row exists but has no ``'aspect'`` entry, and the
    marker string ``'no' + str(k)`` when ``get_pd_row_key`` finds no row at all.
    """
    record = get_pd_row_key(df, k)
    if record is not None:
        return record.get('aspect', None)
    return 'no' + str(k)

def get_evidence(df, genes_df, evidences):
    """Replace the IDs inside each evidence entry with looked-up rows.

    Args:
        df: table searched (via ``get_pd_row_key``) for every value in an
            entry's ``'references'`` list.
        genes_df: table searched for the entry's ``'with_gene_id'`` value.
        evidences: iterable of dicts with ``'references'``, ``'with_gene_id'``
            and ``'group'`` keys.

    Returns:
        A list of dicts with keys ``'with_gene_id'`` (row dict or ``None``),
        ``'group'`` (copied unchanged) and ``'references'`` (list of row dicts,
        ``None`` for any reference that was not found).
    """
    enriched = []
    for item in evidences:
        reference_rows = [get_pd_row_key(df, ref_id) for ref_id in item['references']]
        gene_info = get_pd_row_key(genes_df, item['with_gene_id'])
        enriched.append(dict(
            with_gene_id=gene_info,
            group=item['group'],
            references=reference_rows,
        ))
    return enriched

def get_evidence_type(row):
    """Classify an annotation row as ``'direct'`` or ``'homology'``.

    Args:
        row: mapping (dict or DataFrame row) with a ``'gene'`` value and an
            ``'evidence'`` list whose entries carry a ``'with_gene_id'`` record.

    Returns:
        ``'direct'`` when at least one evidence entry's ``with_gene_id['gene']``
        equals ``row['gene']``; otherwise ``'homology'`` (including when the
        evidence list is empty).
    """
    for evidence in row['evidence']:
        with_gene = evidence["with_gene_id"]
        # get_evidence stores None here when the gene is missing from the gene
        # table; such entries cannot match and must not crash the classification.
        if with_gene is None:
            continue
        if with_gene['gene'] == row['gene']:
            return 'direct'

    return 'homology'

def get_slim(row):
    """Return ``'direct'`` if any evidence entry names the row's own gene, else ``'homology'``.

    NOTE(review): this applies exactly the same rule as ``get_evidence_type``
    and does not touch slim terms despite its name -- presumably an unfinished
    copy; confirm the intended behaviour.
    """
    names_own_gene = any(
        item["with_gene_id"]['gene'] == row['gene']
        for item in row['evidence']
    )
    return 'direct' if names_own_gene else 'homology'

def count_unique_refs_row(evidences):
    """Count the distinct values found in the ``'references'`` lists of ``evidences``."""
    distinct_refs = {
        ref
        for item in evidences
        for ref in item['references']
    }
    return len(distinct_refs)

def get_other(row):
    """Return the fallback "other" entry for a row that has no slim terms.

    Args:
        row: mapping with a ``'slim_terms'`` list and a ``'term'`` record that
            carries an ``'aspect'`` value.

    Returns:
        ``ASPECT_OTHER_MAP[aspect]`` when ``row['slim_terms']`` is empty,
        otherwise ``None``.

    NOTE(review): ``ASPECT_OTHER_MAP`` is not defined in the visible part of
    the notebook -- it must exist before this function is called.
    """
    if len(row['slim_terms']) == 0:
        # Term records expose the key 'aspect'; the previous lookup used the
        # misspelling 'apsect' and therefore always raised KeyError.
        return ASPECT_OTHER_MAP[row['term']['aspect']]
    return None
            
    


In [None]:
def count_unique_refs(df):
    """Collect the distinct reference values across every row of ``df['evidence']``.

    Each element of ``df['evidence']`` is iterated as a list of evidence dicts,
    and every value in their ``'references'`` lists is gathered.

    Returns:
        A list of the unique references (order not defined).
    """
    distinct_refs = {
        ref
        for row_evidences in df['evidence']
        for item in row_evidences
        for ref in item['references']
    }
    return list(distinct_refs)

# count_unique_refs(df) needs the annotations frame; calling it with no
# argument raised TypeError. Run this after a cell that loads annos_df.
# NOTE(review): references must still be hashable IDs here, i.e. the raw
# annotations rather than the enriched records -- confirm which frame is meant.
count_uniq = count_unique_refs(annos_df)
write_to_json(count_uniq, filtered_refs_fp)
print(len(count_uniq))

In [None]:
%%time
# Load the annotations, add a per-row count of distinct references, and order
# the rows by that count (ascending).
# For a quick run, slice the frame first, e.g. annos_df.loc[2000:2010].
annos_df = (
    pd.read_json(annotations_fp)
    .assign(ref_count=lambda frame: frame['evidence'].apply(count_unique_refs_row))
    .sort_values(by='ref_count')
)

annos_df

In [None]:
# Take the first 100 rows of annos_df. If annos_df comes from the preceding
# cell (sorted ascending by ref_count), these are the rows with the fewest
# distinct references.
annos_df2 = annos_df.head(100);
count_uniq = count_unique_refs(annos_df2)
print(len(count_uniq))

# Round-trip through to_json/json.loads to obtain plain Python records.
anno_json = annos_df2.to_json(orient="records", default_handler=None)
# Despite its name, json_str holds the parsed list of records, not a string.
json_str = json.loads(anno_json)

# NOTE(review): another cell writes the first 1000 unsorted rows to this same
# path, so whichever runs last wins -- confirm which sample is wanted.
write_to_json(json_str, ospath.join('.', sample_annotations_fp))

In [3]:
# GO term table: indexed by the original 'ID' column (kept as a column too),
# with columns renamed to id / label / aspect.
terms_df = (
    pd.read_json(terms_fp, dtype={'is_goslim': bool})
    .set_index('ID', drop=False)
    .rename(columns={'ID': 'id', 'LABEL': 'label', 'hasOBONamespace': 'aspect'})
)
# Aspect values use underscores in the source; show them with spaces instead.
terms_df['aspect'] = terms_df['aspect'].str.replace('_', ' ')


#write_to_json(json_str, ospath.join('out.json'))


In [4]:
# Article table indexed by 'pmid'; the column is kept alongside the index.
articles_df = pd.read_json(articles_fp).set_index('pmid', drop=False)
#articles_df

In [15]:
# Taxon lookup table. taxon_id is read as str, the same dtype requested for
# taxon_id when the gene table is loaded, so the two can be merged on it.
taxon_df = pd.read_json(taxon_fp, dtype={'taxon_id':str})
taxon_df

Unnamed: 0,taxon_id,taxon_label,taxon_abbr
0,3055,Chlamydomonas reinhardtii,Cre
1,3218,Physcomitrella patens,Ppa
2,3562,Spinacia oleracea,Sol
3,3635,Gossypium hirsutum,Ghi
4,3702,Arabidopsis thaliana,Ath
...,...,...,...
68,273057,Sulfolobus solfataricus,Sso
69,284812,Schizosaccharomyces pombe,Spo
70,367110,Neurospora crassa,Ncr
71,559292,Saccharomyces cerevisiae,Sce


In [6]:
# Gene table: left-join the taxon lookup on taxon_id, then index by 'gene'
# while keeping 'gene' as a regular column.
genes_df = (
    pd.read_json(gene_info_fp, dtype={'taxon_id': str})
    .merge(taxon_df, how='left', on='taxon_id')
    .set_index('gene', drop=False)
)

In [25]:
# Write the first 1000 annotation rows out as a sample file.
annos_df = pd.read_json(annotations_fp)[:1000]
anno_json = annos_df.to_json(orient="records", default_handler=None)
# Despite its name, json_str holds the parsed list of records, not a string.
json_str = json.loads(anno_json)
# ospath.join with a single argument returns that path unchanged.
write_to_json(json_str, ospath.join(sample_annotations_fp))
#pprint.pprint(json_str)

In [13]:
# Display the current annotations frame (its contents depend on which cell last assigned annos_df).
annos_df

Unnamed: 0,gene_x,gene_symbol_x,gene_name_x,term,slim_terms,qualifier,evidence,group,gene_y,gene_symbol_y,gene_name_y,taxon_id,taxon_label,taxon_abbr,aspect
0,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0008134', 'label': 'transcription f...",[],,"[{'with_gene_id': {'gene': 'UniProtKB:P05412',...",GO_Central,UniProtKB:P17535,JUND,Transcription factor jun-D,9606,Homo sapiens,Hsa,molecular function
1,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0000978', 'label': 'RNA polymerase ...","[{'id': 'GO:0003677', 'label': 'DNA binding', ...",,"[{'with_gene_id': {'gene': 'UniProtKB:P05412',...",GO_Central,UniProtKB:P17535,JUND,Transcription factor jun-D,9606,Homo sapiens,Hsa,molecular function
2,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0042127', 'label': 'regulation of c...",[],,"[{'with_gene_id': {'gene': 'RGD:2943', 'gene_s...",GO_Central,UniProtKB:P17535,JUND,Transcription factor jun-D,9606,Homo sapiens,Hsa,biological process
3,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0005667', 'label': 'transcription r...",[],,"[{'with_gene_id': {'gene': 'FB:FBgn0001291', '...",GO_Central,UniProtKB:P17535,JUND,Transcription factor jun-D,9606,Homo sapiens,Hsa,cellular component
4,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0006357', 'label': 'regulation of t...","[{'id': 'GO:0006355', 'label': 'regulation of ...",,"[{'with_gene_id': {'gene': 'MGI:MGI:96647', 'g...",GO_Central,UniProtKB:P17535,JUND,Transcription factor jun-D,9606,Homo sapiens,Hsa,biological process
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,UniProtKB:P14867,GABRA1,Gamma-aminobutyric acid receptor subunit alpha-1,"{'id': 'GO:0043005', 'label': 'neuron projecti...",[],,"[{'with_gene_id': {'gene': 'RGD:2345', 'gene_s...",GO_Central,UniProtKB:P14867,GABRA1,Gamma-aminobutyric acid receptor subunit alpha-1,9606,Homo sapiens,Hsa,cellular component
996,UniProtKB:P14867,GABRA1,Gamma-aminobutyric acid receptor subunit alpha-1,"{'id': 'GO:0050877', 'label': 'nervous system ...","[{'id': 'GO:0050877', 'label': 'nervous system...",,"[{'with_gene_id': {'gene': 'MGI:MGI:87887', 'g...",GO_Central,UniProtKB:P14867,GABRA1,Gamma-aminobutyric acid receptor subunit alpha-1,9606,Homo sapiens,Hsa,biological process
997,UniProtKB:P14867,GABRA1,Gamma-aminobutyric acid receptor subunit alpha-1,"{'id': 'GO:0008503', 'label': 'benzodiazepine ...","[{'id': 'GO:0060089', 'label': 'molecular tran...",contributes to,"[{'with_gene_id': {'gene': 'RGD:61861', 'gene_...",GO_Central,UniProtKB:P14867,GABRA1,Gamma-aminobutyric acid receptor subunit alpha-1,9606,Homo sapiens,Hsa,molecular function
998,UniProtKB:P14867,GABRA1,Gamma-aminobutyric acid receptor subunit alpha-1,"{'id': 'GO:0005254', 'label': 'chloride channe...","[{'id': 'GO:0005215', 'label': 'transporter ac...",contributes to,"[{'with_gene_id': {'gene': 'UniProtKB:P18507',...",GO_Central,UniProtKB:P14867,GABRA1,Gamma-aminobutyric acid receptor subunit alpha-1,9606,Homo sapiens,Hsa,molecular function


In [35]:
%%time

# Small trial run on the first 100 annotations: attach the taxon columns of
# the annotated gene and derive each row's aspect from its term.
annos_df = pd.read_json(annotations_fp)[:100]
# Left join on the 'gene' column against genes_df's index.
annos_df = annos_df.merge(genes_df[['taxon_id', 'taxon_label', 'taxon_abbr']], how='left', left_on="gene", right_index=True)
# get_pd_row raises KeyError for a term missing from terms_df, and the
# ['aspect'] lookup raises KeyError when the term row has no aspect value.
annos_df['aspect'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x)['aspect'])



CPU times: total: 2.06 s
Wall time: 3.63 s


In [41]:
%%time

def count_evidence(evidences):
    """Return the number of entries in ``evidences``."""
    total = len(evidences)
    return total

def get_groups(evidences):
    """Return the distinct ``'group'`` values of ``evidences`` as a list (order not defined)."""
    distinct_groups = {item['group'] for item in evidences}
    return list(distinct_groups)
    
# Full enrichment pipeline: load every annotation, attach the taxon columns of
# the annotated gene, expand term / slim-term / evidence IDs into full records,
# derive summary columns, and write the result to clean_annotations_fp.
annos_df = pd.read_json(annotations_fp)
annos_df = annos_df.merge(genes_df[['taxon_id', 'taxon_label', 'taxon_abbr']], how='left', left_on="gene", right_index=True)

# Expand each term ID into its record once and read the aspect from that
# record, instead of looking every term up in terms_df twice. 'term' is
# overwritten in place and 'aspect' is appended, so column order is unchanged.
annos_df['term'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x))
annos_df['aspect'] = annos_df['term'].apply(lambda term: term['aspect'])
annos_df['slim_terms'] = annos_df['slim_terms'].apply(lambda x: spread_terms(terms_df, x))
annos_df['qualifier'] = annos_df['qualifier'].str.replace('_', ' ')
annos_df['evidence'] = annos_df['evidence'].apply(lambda x: get_evidence(articles_df, genes_df, x))

# Summary columns derived from the enriched evidence records.
annos_df['evidence_type'] = annos_df.apply(get_evidence_type, axis=1)
annos_df['groups'] = annos_df['evidence'].apply(get_groups)
annos_df['evidence_count'] = annos_df['evidence'].apply(count_evidence)

# Round-trip through to_json/json.loads to obtain plain Python records;
# despite its name, json_str holds the parsed list, not a string.
anno_json = annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', clean_annotations_fp), indent=2)
annos_df

CPU times: total: 3min 42s
Wall time: 6min 46s


Unnamed: 0,gene,gene_symbol,gene_name,term,slim_terms,qualifier,evidence,group,taxon_id,taxon_label,taxon_abbr,aspect,evidence_type,groups,evidence_count
0,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0008134', 'label': 'transcription f...","[{'id': 'OTHER:0001', 'label': 'Other molecula...",,"[{'with_gene_id': {'gene': 'UniProtKB:P05412',...",GO_Central,9606,Homo sapiens,Hsa,molecular function,homology,"[RGD, UniProtKB, WB, MGI]",4
1,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0000978', 'label': 'RNA polymerase ...","[{'id': 'GO:0003677', 'label': 'DNA binding', ...",,"[{'with_gene_id': {'gene': 'UniProtKB:P05412',...",GO_Central,9606,Homo sapiens,Hsa,molecular function,homology,"[UniProtKB, MGI]",3
2,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0042127', 'label': 'regulation of c...","[{'id': 'OTHER:0002', 'label': 'Other biologic...",,"[{'with_gene_id': {'gene': 'RGD:2943', 'gene_s...",GO_Central,9606,Homo sapiens,Hsa,biological process,homology,"[RGD, UniProtKB, MGI]",3
3,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0005667', 'label': 'transcription r...","[{'id': 'OTHER:0003', 'label': 'Other cellular...",,"[{'with_gene_id': {'gene': 'FB:FBgn0001291', '...",GO_Central,9606,Homo sapiens,Hsa,cellular component,direct,"[UniProtKB, ZFIN, FB, MGI]",6
4,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0006357', 'label': 'regulation of t...","[{'id': 'GO:0006355', 'label': 'regulation of ...",,"[{'with_gene_id': {'gene': 'MGI:MGI:96647', 'g...",GO_Central,9606,Homo sapiens,Hsa,biological process,direct,"[MGI, RGD, SGD, ZFIN, CGD, UniProtKB, FB]",11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90901,UniProtKB:V9GZ38,Uncharacterized protein (Fragment),,"{'id': 'UNKNOWN:0002', 'label': 'Unknown biolo...","[{'id': 'OTHER:0002', 'label': 'Other biologic...",,[],GO_Central,,,,biological process,homology,[],0
90902,UniProtKB:V9GZ38,Uncharacterized protein (Fragment),,"{'id': 'UNKNOWN:0003', 'label': 'Unknown cellu...","[{'id': 'OTHER:0003', 'label': 'Other cellular...",,[],GO_Central,,,,cellular component,homology,[],0
90903,UniProtKB:X6R8D5,Uncharacterized protein,,"{'id': 'UNKNOWN:0001', 'label': 'Unknown molec...","[{'id': 'OTHER:0001', 'label': 'Other molecula...",,[],GO_Central,,,,molecular function,homology,[],0
90904,UniProtKB:X6R8D5,Uncharacterized protein,,"{'id': 'UNKNOWN:0002', 'label': 'Unknown biolo...","[{'id': 'OTHER:0002', 'label': 'Other biologic...",,[],GO_Central,,,,biological process,homology,[],0


In [None]:
# Recompute the evidence type for every row and show only the 'direct' ones.
annos_df['evidence_type'] = annos_df.apply(get_evidence_type, axis=1)
annos_df.loc[annos_df['evidence_type'] == 'direct']

In [None]:
# Inspect the row at position 1 (the second row) of annos_df as a plain dict.
print(dict(annos_df.iloc[1]))

In [None]:
# Read the enriched annotations back from disk to check what was written.
clean_annos_df = pd.read_json(clean_annotations_fp)
clean_annos_df

In [None]:
# Keep only the terms whose is_goslim flag equals True.
is_slim_mask = terms_df['is_goslim'].eq(True)
filtered_terms = terms_df.loc[is_slim_mask]
filtered_terms

In [None]:
# Rows whose (gene, gene_symbol) pair already appeared in an earlier row;
# duplicated() leaves the first occurrence of each pair unmarked by default.
temp_df = annos_df[annos_df.duplicated(subset=['gene', 'gene_symbol'])]
temp_df

In [None]:
class CustomJSONizer(json.JSONEncoder):
    """JSON encoder that also accepts NumPy booleans (``np.bool_``)."""

    def default(self, obj):
        """Convert ``np.bool_`` to a built-in ``bool``; defer everything else.

        ``default`` must return a JSON-serialisable *object*. Returning the
        already-encoded text (``encode(...)``) made the encoder emit the quoted
        strings "true"/"false" instead of real JSON booleans.

        Raises:
            TypeError: from the base class for any other unsupported type.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        return super().default(obj)
    
def count_unique_terms():
    """Return the ``terms_df`` row for every distinct term ID in ``annos_df['term']``.

    Reads the notebook globals ``annos_df`` (whose ``'term'`` values are
    records with an ``'id'`` key) and ``terms_df``. Result order is not defined.
    """
    unique_ids = {record['id'] for record in annos_df['term']}
    return [get_pd_row(terms_df, term_id) for term_id in unique_ids]

# Write the unique term records to filtered_terms_fp and report how many there are.
# cls=CustomJSONizer is handed to write_to_json -- presumably forwarded to the
# JSON encoder so NumPy booleans can be serialised; confirm in src.utils.
count_uniq = count_unique_terms()
write_to_json(count_uniq, filtered_terms_fp, cls=CustomJSONizer)
print(len(count_uniq))

def count_unique_slim_terms():
    """Return the ``terms_df`` row for every distinct ID in ``annos_df['slim_terms']``.

    Reads the notebook globals ``annos_df`` (whose ``'slim_terms'`` values are
    lists of records with an ``'id'`` key) and ``terms_df``. Result order is
    not defined.
    """
    unique_ids = {
        record['id']
        for slim_list in annos_df['slim_terms']
        for record in slim_list
    }
    return [get_pd_row(terms_df, term_id) for term_id in unique_ids]

# Write the unique slim-term records to filtered_slim_terms_fp and report how many there are.
count_uniq = count_unique_slim_terms()
write_to_json(count_uniq, filtered_slim_terms_fp, cls=CustomJSONizer)
print(len(count_uniq))

In [None]:
def count_unique_refs(gene):
    """Collect the distinct references cited by the annotations of one gene.

    The previous body ended in an unfinished ``if gene == ann`` line (a syntax
    error) and never used ``gene``; the filter is now applied up front.

    Args:
        gene: value compared against ``annos_df['gene']``.

    Returns:
        A list of the unique reference values (order not defined); empty when
        no annotation row matches ``gene``.

    NOTE(review): this reuses the name of the earlier ``count_unique_refs(df)``
    and replaces it once this cell runs. References must be hashable here, so
    this presumably targets the raw annotations rather than the enriched
    records -- confirm.
    """
    refs = set()

    # Only the evidence lists of rows annotated to the requested gene.
    gene_evidences = annos_df.loc[annos_df['gene'] == gene, 'evidence']
    for evidences in gene_evidences:
        for evidence in evidences:
            for ref in evidence['references']:
                refs.add(ref)

    return list(refs)

In [None]:
def add_terms():
    """Draw random rows from ``filtered_terms`` and return them as dicts.

    Produces ``len(human_df) - 1`` draws (with replacement) using the
    ``random`` module.

    NOTE(review): one fewer draw than ``human_df`` has rows -- confirm the
    ``- 1`` is intended.
    """
    draws = len(human_df) - 1
    return [
        dict(filtered_terms.iloc[random.randrange(len(filtered_terms))])
        for _ in range(draws)
    ]

# Row position used by add_gene; nothing in this cell changes it.
count = 1

def add_gene():
    """Return the ``'gene'`` value of the ``human_df`` row at position ``count``."""
    selected_row = human_df.iloc[count]
    return selected_row['gene']

    
                            
# Scratch cell: builds a random term-to-gene table and dumps it to out.json.
# NOTE(review): ann2_df is only created by the commented-out line below and
# human_df is not defined in the visible notebook, so this cell fails on a
# fresh kernel unless both exist from elsewhere -- confirm or remove.
#add_terms()

#ann2_df = pd.DataFrame.from_dict(add_terms())
ann2_df

# Joins the values of each human_df row with '-'; ann3 is not used afterwards in this cell.
ann3 = human_df.agg('-'.join, axis=1)
ann2_df

add_gene()
# add_gene() ignores its row and returns the same gene every time, so every
# row of ann2_df receives an identical 'gene' value.
ann2_df['gene'] = ann2_df['id'].apply(lambda x: add_gene())
ann2_df

json_chunk = ann2_df.to_json(orient="records", default_handler=None)
# Despite its name, json_str holds the parsed list of records, not a string.
json_str = json.loads(json_chunk)

write_to_json(json_str, 'out.json')
                     

In [None]:
# Display the annotations frame in its final state.
annos_df