In [1]:
import json
from os import path as ospath
import pandas as pd
import numpy as np
import random
import pprint
from src.utils import write_to_json
terms_fp = './downloads/input/full_go_annotated.json'
annotations_fp = './downloads/input/human_iba_annotations.json'
sample_annotations_fp = './data/sample_human_iba_annotations.json'
articles_fp = './downloads/clean-articles.json'
gene_info_fp = './downloads/input/human_iba_gene_info.json'
taxon_fp = './downloads/input/taxon_lkp.json'
clean_annotations_fp = './downloads/human_iba_annotations_clean.json'
filtered_terms_fp = './data/filtered_terms.json'
filtered_slim_terms_fp = './data/filtered_slim_terms.json'
filtered_refs_fp = './data/filtered_refs.json'
filtered_evidences_fp = './data/filtered_evidences.json'
duplicate_gene_fp = './downloads/duplicate_gene.csv'
duplicate_symbol_fp = './downloads/duplicate_symbol.csv'

In [2]:
def get_pd_row(df, k):
    return dict(df.loc[k].dropna())

def spread_terms(df, terms):
    return [get_pd_row(df, term) for term in terms]

def get_pd_row_key(df, k):
    try:
        return dict(df.loc[k].dropna())
    except KeyError:
        return None

def get_aspect(df, k):
    row = get_pd_row_key(df, k)
    if row is None:
        return 'no' + str(k)
    aspect = row.get('aspect', None)
    return aspect

def get_evidence(df, genes_df, row):
    result = []
    for evidence in row['evidence']:
        result_ref =[]
        for reference in evidence['references']:
            result_ref.append(get_pd_row_key(df, reference))
        gene_row = get_pd_row_key(genes_df, evidence['with_gene_id'])
       
        evidence_item = {
          'with_gene_id':gene_row,
          'group': evidence['group'],
          'references':result_ref
        }
        result.append(evidence_item)
            
    return result

def get_evidence_type(row):
    for evidence in row['evidence']:
        if evidence["with_gene_id"]['gene'] == row['gene']:
            return 'direct'
        if len(evidence['references']) == 0:
            return 'n/a' 
            
    return 'homology'



def count_unique_refs_row(evidences):
    refs = set()
        
    for evidence in evidences:
        for ref in evidence['references']:
            refs.add(ref)
            
    return len(list(refs))

def get_other(row):
    if len(row['slim_terms'])==0:
        return ASPECT_OTHER_MAP[row['term']['apsect']]
    
    
def count_unique_slims(evidences):
    refs = set()
        
    for evidence in evidences:
        for ref in evidence['references']:
            refs.add(ref)
            
    return len(list(refs))
            
    


In [None]:
def count_unique_refs(df):
    refs = set()
        
    for evidences in list(df['evidence']):
        for evidence in evidences:
            for ref in evidence['references']:
                refs.add(ref)
            
    return list(refs)

count_uniq = count_unique_refs()
write_to_json(count_uniq, filtered_refs_fp)
print(len(count_uniq))

In [None]:
%%time
annos_df = pd.read_json(annotations_fp)
#annos_df = annos_df.loc[2000:2010];
annos_df['ref_count'] = annos_df['evidence'].apply(lambda x: count_unique_refs_row(x))
annos_df = annos_df.sort_values(by='ref_count')

annos_df

In [None]:
annos_df2 = annos_df.head(100);
count_uniq = count_unique_refs(annos_df2)
print(len(count_uniq))

anno_json = annos_df2.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)

write_to_json(json_str, ospath.join('.', sample_annotations_fp))

In [3]:
terms_df = pd.read_json(terms_fp, dtype={'is_goslim':bool})
terms_df = terms_df.set_index('ID', drop=False)
terms_df = terms_df.rename(columns={'ID': 'id', 'LABEL': 'label', 'hasOBONamespace':'aspect'})
terms_df['aspect'] = terms_df['aspect'].str.replace('_', ' ')


#write_to_json(json_str, ospath.join('out.json'))


In [4]:
articles_df = pd.read_json(articles_fp)
articles_df = articles_df.set_index('pmid', drop=False)
#articles_df

In [5]:
taxon_df = pd.read_json(taxon_fp, dtype={'taxon_id':str})
#taxon_df

In [6]:
genes_df = pd.read_json(gene_info_fp, dtype={'taxon_id':str})
genes_df = genes_df.merge(taxon_df, how='left', on='taxon_id')
genes_df = genes_df.set_index('gene', drop=False)
#genes_df = genes_df['coordinates'].apply(lambda x: x).explode()
genes_df

Unnamed: 0_level_0,gene,gene_symbol,gene_name,taxon_id,coordinates_chr_num,coordinates_start,coordinates_end,coordinates_strand,taxon_label,taxon_abbr
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
UniProtKB:P17535,UniProtKB:P17535,JUND,Transcription factor jun-D,9606,19,18279760.0,18280929.0,-1.0,Homo sapiens,Hsa
UniProtKB:P05412,UniProtKB:P05412,JUN,Transcription factor AP-1,9606,1,58780788.0,58784327.0,-1.0,Homo sapiens,Hsa
UniProtKB:Q92973,UniProtKB:Q92973,TNPO1,Transportin-1,9606,5,72893676.0,72905429.0,1.0,Homo sapiens,Hsa
TAIR:locus:2168586,TAIR:locus:2168586,KPNB1,Importin subunit beta-1,3702,,,,,Arabidopsis thaliana,Ath
UniProtKB:Q03052,UniProtKB:Q03052,POU3F1,"POU domain, class 3, transcription factor 1",9606,1,38043827.0,38046794.0,-1.0,Homo sapiens,Hsa
...,...,...,...,...,...,...,...,...,...,...
UniProtKB:Q99550,UniProtKB:Q99550,MPHOSPH9,M-phase phosphoprotein 9,9606,12,123152324.0,123176716.0,-1.0,Homo sapiens,Hsa
TAIR:locus:2039802,TAIR:locus:2039802,RPL35B,60S ribosomal protein L35-2,3702,,,,,Arabidopsis thaliana,Ath
CGD:CAL0000178448,CGD:CAL0000178448,URA9,"Dihydroorotate dehydrogenase (quinone), mitoch...",237561,,,,,Candida albicans,Cal
SGD:S000005159,SGD:S000005159,IES2,Ino eighty subunit 2,559292,,,,,Saccharomyces cerevisiae,Sce


In [None]:
annos_df = pd.read_json(annotations_fp)[:1000]
anno_json = annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
write_to_json(json_str, ospath.join(sample_annotations_fp))
#pprint.pprint(json_str)

In [None]:
annos_df

In [None]:
%%time

annos_df = pd.read_json(annotations_fp)[:100]
annos_df = annos_df.merge(genes_df[['taxon_id', 'taxon_label',
                                    'taxon_abbr', 
                                    'coordinates_chr_num','coordinates_start','coordinates_end','coordinates_strand']], how='left', left_on="gene", right_index=True)
#annos_df['aspect'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x)['aspect'])
annos_df



In [7]:
%%time

def count_evidence(evidences):
    return len(evidences)

def get_groups(evidences):
    groups = set()
    for evidence in evidences:
        groups.add(evidence['group'])
    return list(groups)
    
annos_df = pd.read_json(annotations_fp)
annos_df = annos_df.merge(genes_df[
    ['taxon_id', 
     'taxon_label', 
     'taxon_abbr' , 
     'coordinates_chr_num',
     'coordinates_start',
     'coordinates_end',
     'coordinates_strand']], how='left', left_on="gene", right_index=True)
annos_df['aspect'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x)['aspect'])
annos_df['term'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x))
annos_df['slim_terms'] = annos_df['slim_terms'].apply(lambda x: spread_terms(terms_df, x))
annos_df['qualifier'] = annos_df['qualifier'].str.replace('_', ' ')
annos_df['evidence'] = annos_df.apply(lambda x: get_evidence(articles_df, genes_df, x),axis=1)
annos_df['evidence_type'] = annos_df.apply(lambda x: get_evidence_type(x), axis=1)
annos_df['groups'] = annos_df['evidence'].apply(lambda x: get_groups(x))
annos_df['evidence_count'] = annos_df['evidence'].apply(lambda x: count_evidence(x))

   
anno_json = annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', clean_annotations_fp), indent=2)
annos_df

CPU times: total: 3min 25s
Wall time: 4min 43s


Unnamed: 0,gene,gene_symbol,gene_name,term,slim_terms,qualifier,evidence,group,taxon_id,taxon_label,taxon_abbr,coordinates_chr_num,coordinates_start,coordinates_end,coordinates_strand,aspect,evidence_type,groups,evidence_count
0,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0008134', 'label': 'transcription f...","[{'id': 'OTHER:0001', 'label': 'Other molecula...",,"[{'with_gene_id': {'gene': 'UniProtKB:P05412',...",GO_Central,9606,Homo sapiens,Hsa,19,18279760.0,18280929.0,-1.0,molecular function,homology,"[UniProtKB, RGD, WB, MGI]",4
1,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0000978', 'label': 'RNA polymerase ...","[{'id': 'GO:0003677', 'label': 'DNA binding', ...",,"[{'with_gene_id': {'gene': 'UniProtKB:P05412',...",GO_Central,9606,Homo sapiens,Hsa,19,18279760.0,18280929.0,-1.0,molecular function,homology,"[UniProtKB, MGI]",3
2,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0042127', 'label': 'regulation of c...","[{'id': 'OTHER:0002', 'label': 'Other biologic...",,"[{'with_gene_id': {'gene': 'RGD:2943', 'gene_s...",GO_Central,9606,Homo sapiens,Hsa,19,18279760.0,18280929.0,-1.0,biological process,homology,"[MGI, UniProtKB, RGD]",3
3,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0005667', 'label': 'transcription r...","[{'id': 'OTHER:0003', 'label': 'Other cellular...",,"[{'with_gene_id': {'gene': 'UniProtKB:P17535',...",GO_Central,9606,Homo sapiens,Hsa,19,18279760.0,18280929.0,-1.0,cellular component,direct,"[MGI, UniProtKB, ZFIN, FB]",6
4,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0006357', 'label': 'regulation of t...","[{'id': 'GO:0006355', 'label': 'regulation of ...",,"[{'with_gene_id': {'gene': 'UniProtKB:P17535',...",GO_Central,9606,Homo sapiens,Hsa,19,18279760.0,18280929.0,-1.0,biological process,direct,"[UniProtKB, MGI, CGD, ZFIN, SGD, RGD, FB]",11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90901,UniProtKB:V9GZ38,,Uncharacterized protein (Fragment),"{'id': 'UNKNOWN:0002', 'label': 'Unknown biolo...","[{'id': 'UNKNOWN:0002', 'label': 'Unknown biol...",,[],GO_Central,,,,,,,,biological process,homology,[],0
90902,UniProtKB:V9GZ38,,Uncharacterized protein (Fragment),"{'id': 'UNKNOWN:0003', 'label': 'Unknown cellu...","[{'id': 'UNKNOWN:0003', 'label': 'Unknown cell...",,[],GO_Central,,,,,,,,cellular component,homology,[],0
90903,UniProtKB:X6R8D5,,Uncharacterized protein,"{'id': 'UNKNOWN:0001', 'label': 'Unknown molec...","[{'id': 'UNKNOWN:0001', 'label': 'Unknown mole...",,[],GO_Central,,,,,,,,molecular function,homology,[],0
90904,UniProtKB:X6R8D5,,Uncharacterized protein,"{'id': 'UNKNOWN:0002', 'label': 'Unknown biolo...","[{'id': 'UNKNOWN:0002', 'label': 'Unknown biol...",,[],GO_Central,,,,,,,,biological process,homology,[],0


In [None]:
annos_df['evidence_type'] = annos_df.apply(lambda x: get_evidence_type(x), axis=1)
annos_df[annos_df['evidence_type']=='direct']

In [None]:
print(dict(annos_df.iloc[1]))

In [None]:
clean_annos_df = pd.read_json(clean_annotations_fp)
clean_annos_df

In [None]:
filtered_terms = terms_df[terms_df['is_goslim']==True]
filtered_terms

In [None]:
temp_df = annos_df[annos_df.duplicated(subset=['gene', 'gene_symbol'])]
temp_df

In [None]:
def add_terms():
    results = list()
    for n in range(0, len(human_df) -1):
        index = random.randrange(0, len(filtered_terms))
        results.append(dict(filtered_terms.iloc[index]))
    
    return results

count = 1

def add_gene():
    return human_df.iloc[count]['gene']

    
                            
#add_terms()

#ann2_df = pd.DataFrame.from_dict(add_terms())
ann2_df

ann3 = human_df.agg('-'.join, axis=1)
ann2_df

add_gene()
ann2_df['gene'] = ann2_df['id'].apply(lambda x: add_gene())
ann2_df

json_chunk = ann2_df.to_json(orient="records", default_handler=None)
json_str = json.loads(json_chunk)

write_to_json(json_str, 'out.json')
                     

In [9]:
clean_annos_df = pd.read_json(clean_annotations_fp)
clean_annos_df

Unnamed: 0,gene,gene_symbol,gene_name,term,slim_terms,qualifier,evidence,group,taxon_id,taxon_label,taxon_abbr,coordinates_chr_num,coordinates_start,coordinates_end,coordinates_strand,aspect,evidence_type,groups,evidence_count
0,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0008134', 'label': 'transcription f...","[{'id': 'OTHER:0001', 'label': 'Other molecula...",,"[{'with_gene_id': {'gene': 'UniProtKB:P05412',...",GO_Central,9606.0,Homo sapiens,Hsa,19,18279760.0,18280929.0,-1.0,molecular function,homology,"[UniProtKB, RGD, WB, MGI]",4
1,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0000978', 'label': 'RNA polymerase ...","[{'id': 'GO:0003677', 'label': 'DNA binding', ...",,"[{'with_gene_id': {'gene': 'UniProtKB:P05412',...",GO_Central,9606.0,Homo sapiens,Hsa,19,18279760.0,18280929.0,-1.0,molecular function,homology,"[UniProtKB, MGI]",3
2,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0042127', 'label': 'regulation of c...","[{'id': 'OTHER:0002', 'label': 'Other biologic...",,"[{'with_gene_id': {'gene': 'RGD:2943', 'gene_s...",GO_Central,9606.0,Homo sapiens,Hsa,19,18279760.0,18280929.0,-1.0,biological process,homology,"[MGI, UniProtKB, RGD]",3
3,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0005667', 'label': 'transcription r...","[{'id': 'OTHER:0003', 'label': 'Other cellular...",,"[{'with_gene_id': {'gene': 'UniProtKB:P17535',...",GO_Central,9606.0,Homo sapiens,Hsa,19,18279760.0,18280929.0,-1.0,cellular component,direct,"[MGI, UniProtKB, ZFIN, FB]",6
4,UniProtKB:P17535,JUND,Transcription factor jun-D,"{'id': 'GO:0006357', 'label': 'regulation of t...","[{'id': 'GO:0006355', 'label': 'regulation of ...",,"[{'with_gene_id': {'gene': 'UniProtKB:P17535',...",GO_Central,9606.0,Homo sapiens,Hsa,19,18279760.0,18280929.0,-1.0,biological process,direct,"[UniProtKB, MGI, CGD, ZFIN, SGD, RGD, FB]",11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90901,UniProtKB:V9GZ38,,Uncharacterized protein (Fragment),"{'id': 'UNKNOWN:0002', 'label': 'Unknown biolo...","[{'id': 'UNKNOWN:0002', 'label': 'Unknown biol...",,[],GO_Central,,,,,,,,biological process,homology,[],0
90902,UniProtKB:V9GZ38,,Uncharacterized protein (Fragment),"{'id': 'UNKNOWN:0003', 'label': 'Unknown cellu...","[{'id': 'UNKNOWN:0003', 'label': 'Unknown cell...",,[],GO_Central,,,,,,,,cellular component,homology,[],0
90903,UniProtKB:X6R8D5,,Uncharacterized protein,"{'id': 'UNKNOWN:0001', 'label': 'Unknown molec...","[{'id': 'UNKNOWN:0001', 'label': 'Unknown mole...",,[],GO_Central,,,,,,,,molecular function,homology,[],0
90904,UniProtKB:X6R8D5,,Uncharacterized protein,"{'id': 'UNKNOWN:0002', 'label': 'Unknown biolo...","[{'id': 'UNKNOWN:0002', 'label': 'Unknown biol...",,[],GO_Central,,,,,,,,biological process,homology,[],0


In [10]:
class CustomJSONizer(json.JSONEncoder):
    def default(self, obj):
        return super().encode(bool(obj)) \
            if isinstance(obj, np.bool_) \
            else super().default(obj)
    
def count_unique_terms(annos_df):
    terms = set()
    for term in list(annos_df['term']):
        terms.add(term['id'])
    
    return list(terms)

#count_uniq = count_unique_terms()
#write_to_json(count_uniq, filtered_terms_fp, cls=CustomJSONizer)
#print(len(count_uniq))

def count_unique_slim_terms(annos_df):
    terms = set()
        
    for s_terms in list(annos_df['slim_terms']):
        for term in s_terms:
            terms.add(term['id'])
            
    return list(terms)

#count_uniq = count_unique_slim_terms()
#write_to_json(count_uniq, filtered_slim_terms_fp, cls=CustomJSONizer)
#print(count_uniq)
def count_unique_refs(annos_df):
    refs = set()
        
    for evidences in list(annos_df['evidence']):
        for evidence in evidences:
            for ref in evidence['references']:
                if ref is not None:
                    refs.add(ref['pmid'])
            
    return list(refs)

def count_unique_withs(annos_df):
    gene = set()
    gene_name=set()
        
    for evidences in list(annos_df['evidence']):
        for evidence in evidences:
            gene.add(evidence['with_gene_id']['gene'])
            
    return list(gene)

len(count_unique_refs(clean_annos_df))

56454

In [11]:
scalar_cols = {
    'gene', 
    'gene_symbol',
    'gene_name',
    'taxon_id', 
    'taxon_label', 
    'taxon_abbr', 
    'coordinates_chr_num', 
    'coordinates_start', 
    'coordinates_end', 
    'coordinates_strand', 
    'aspect', 
    'evidence_type'
}
    

stats = {k: len(clean_annos_df[k].unique()) for k in scalar_cols }

stats['terms']=len(count_unique_terms(clean_annos_df))
stats['slim_terms']=len(count_unique_slim_terms(clean_annos_df))
stats['references']=len(count_unique_refs(clean_annos_df))
stats['with_gene']=len(count_unique_withs(clean_annos_df))
stats_df = pd.DataFrame.from_dict(stats, orient ='index', columns=['Unique Count'])
stats_df


Unnamed: 0,Unique Count
gene_symbol,20541
coordinates_start,16446
evidence_type,2
taxon_label,2
gene_name,20460
coordinates_chr_num,26
aspect,3
taxon_id,2
taxon_abbr,2
coordinates_end,16432


In [12]:
unique_genes = clean_annos_df.drop_duplicates(subset=['gene', 'gene_symbol', 'gene_name'])
unique_genes = unique_genes[['gene', 'gene_symbol', 'gene_name']]
unique_genes = unique_genes[~unique_genes.duplicated(subset=['gene_symbol'])]
duplicate_genes = unique_genes[unique_genes.duplicated(subset=['gene'])]
duplicate_genes = duplicate_genes[['gene']]
res = unique_genes[unique_genes['gene'].isin(list(duplicate_genes['gene']))]
res = res.sort_values(by=['gene'])
res.to_csv(duplicate_gene_fp)
res

Unnamed: 0,gene,gene_symbol,gene_name
1750,UniProtKB:A0A1W2PRP0,A0A1W2PRP0,Uncharacterized protein
1755,UniProtKB:A0A1W2PRP0,,Uncharacterized protein
