In [1]:
import json
from os import path as ospath
import pandas as pd
import numpy as np
import random
import pprint
from src.utils import write_to_json
# File locations used by the cells below; every path is relative to the
# directory the notebook is run from.
terms_fp = './downloads/input/full_go_annotated.json'
annotations_fp = './downloads/input/human_iba_annotations.json'
# Written by the sampling cell (first 100 annotations by reference count).
sample_annotations_fp = './data/sample_human_iba_annotations.json'
articles_fp = './downloads/clean-articles.json'
gene_info_fp = './downloads/input/human_iba_gene_info.json'
taxon_fp = './downloads/input/taxon_lkp.json'
# Written by the main cleaning cell and read back into clean_annos_df.
clean_annotations_fp = './downloads/human_iba_annotations_clean.json'
filtered_terms_fp = './data/filtered_terms.json'
filtered_slim_terms_fp = './data/filtered_slim_terms.json'
filtered_refs_fp = './data/filtered_refs.json'
filtered_evidences_fp = './data/filtered_evidences.json'
# CSV reports; only duplicate_gene_fp is written by code that is not commented out.
duplicate_gene_fp = './downloads/duplicate_gene.csv'
duplicate_symbol_fp = './downloads/duplicate_symbol.csv'



# Same pipeline applied to the annotations kept under input-old.
old_annotations_fp = './downloads/input-old/human_iba_annotations.json'
old_clean_annotations_fp = './downloads/old_human_iba_annotations_clean.json'

In [2]:
def get_pd_row(df, k):
    """Return the row labelled ``k`` of ``df`` as a dict, omitting NaN fields.

    Raises:
        KeyError: if ``k`` is not a label in ``df``'s index.
    """
    selected = df.loc[k]
    populated = selected.dropna()
    return dict(populated)

def spread_terms(df, terms):
    """Expand each label in ``terms`` into its row dict from ``df``.

    Lookups go through ``get_pd_row``, so an unknown label raises KeyError.
    The result preserves the order of ``terms``.
    """
    expanded = []
    for term_id in terms:
        expanded.append(get_pd_row(df, term_id))
    return expanded

def get_pd_row_key(df, k):
    """Like ``get_pd_row`` but tolerant of unknown labels.

    Returns:
        A dict of the non-NaN fields of row ``k``, or ``None`` when the
        lookup raises KeyError.
    """
    try:
        found = df.loc[k]
        result = dict(found.dropna())
    except KeyError:
        result = None
    return result

def get_aspect(df, k):
    """Return the ``'aspect'`` field of row ``k`` in ``df``.

    Returns:
        The aspect value, ``None`` when the row exists but has no
        ``'aspect'`` field, or the string ``'no' + str(k)`` when ``k`` is
        not found at all.
    """
    record = get_pd_row_key(df, k)
    return record.get('aspect', None) if record is not None else 'no' + str(k)

def get_evidence(df, genes_df, row):
    """Expand the evidence entries of one annotation row.

    Args:
        df: frame whose index is used to resolve each reference id.
        genes_df: frame whose index is used to resolve ``with_gene_id``.
        row: mapping with an ``'evidence'`` iterable; each entry provides
            ``'references'``, ``'with_gene_id'`` and ``'group'``.

    Returns:
        A list with one dict per evidence entry, holding the resolved gene
        record under ``'with_gene_id'``, the original ``'group'`` and the
        list of resolved reference records. Ids that cannot be resolved
        become ``None`` (see ``get_pd_row_key``).
    """
    def _expand(entry):
        # References are resolved before the gene, as in the original loop.
        resolved_refs = [get_pd_row_key(df, ref_id) for ref_id in entry['references']]
        with_gene = get_pd_row_key(genes_df, entry['with_gene_id'])
        return {
          'with_gene_id': with_gene,
          'group': entry['group'],
          'references': resolved_refs
        }

    return [_expand(entry) for entry in row['evidence']]

def get_evidence_type(row):
    """Classify an annotation row by inspecting its expanded evidence.

    Evidence entries are scanned in order and the first decisive entry wins:

    * ``'direct'``   - the entry's ``with_gene_id`` record is the annotated
      gene itself (``row['gene']``);
    * ``'n/a'``      - the entry carries no references;
    * ``'homology'`` - no entry was decisive (including an empty evidence list).

    Args:
        row: mapping with ``'gene'`` and ``'evidence'``; each evidence entry
            has ``'with_gene_id'`` (a gene record dict, or ``None`` when the
            gene could not be resolved) and ``'references'``.

    Returns:
        One of ``'direct'``, ``'n/a'`` or ``'homology'``.
    """
    for evidence in row['evidence']:
        with_gene = evidence["with_gene_id"]
        # get_evidence stores None for genes missing from the gene table;
        # such an entry can never be a direct match, so skip the comparison
        # instead of failing on None['gene'].
        if with_gene is not None and with_gene['gene'] == row['gene']:
            return 'direct'
        if len(evidence['references']) == 0:
            return 'n/a'

    return 'homology'



def count_unique_refs_row(evidences):
    """Count the distinct references across one annotation's evidence list.

    Args:
        evidences: iterable of entries, each with a ``'references'`` iterable
            of hashable reference ids.

    Returns:
        The number of distinct reference ids.
    """
    distinct = {ref for entry in evidences for ref in entry['references']}
    return len(distinct)

def get_other(row, aspect_other_map=None):
    """Return the fallback ("other") entry for a row that has no slim terms.

    Args:
        row: mapping with ``'slim_terms'`` (a sized collection) and ``'term'``
            (a term record that includes its ``'aspect'``).
        aspect_other_map: optional mapping from aspect to the fallback value.
            Defaults to the module-level ``ASPECT_OTHER_MAP``.
            NOTE(review): ``ASPECT_OTHER_MAP`` is not defined in the visible
            notebook - define it before calling without this argument.

    Returns:
        The mapped fallback for the term's aspect when ``row['slim_terms']``
        is empty, otherwise ``None``.
    """
    if len(row['slim_terms']) != 0:
        return None
    if aspect_other_map is None:
        aspect_other_map = ASPECT_OTHER_MAP
    # The key was previously misspelled 'apsect', which always raised
    # KeyError because term records carry 'aspect'.
    return aspect_other_map[row['term']['aspect']]
    
    
def count_unique_slims(evidences):
    """Count the distinct ``'references'`` values across ``evidences``.

    NOTE(review): despite its name this does not look at slim terms; the
    body is the same computation as ``count_unique_refs_row``. Confirm the
    intended behaviour before relying on it.
    """
    seen = set()
    for entry in evidences:
        seen.update(entry['references'])
    return len(seen)
            
    


In [None]:
def count_unique_refs(df):
    """Collect the distinct raw reference ids used anywhere in ``df``.

    Args:
        df: frame (or mapping) whose ``'evidence'`` column holds, per row, a
            list of entries with a ``'references'`` list of hashable ids.

    Returns:
        A list of the distinct reference ids, in no particular order.

    Note: a later cell defines another ``count_unique_refs`` that reads
    ``ref['pmid']`` from expanded references; whichever ran last is in effect.
    """
    distinct = {
        ref
        for evidences in list(df['evidence'])
        for entry in evidences
        for ref in entry['references']
    }
    return list(distinct)

# count_unique_refs requires the annotation frame; calling it with no
# argument raised TypeError. The raw annotations are loaded here so the cell
# runs on a fresh kernel (annos_df is only created in a later cell).
count_uniq = count_unique_refs(pd.read_json(annotations_fp))
write_to_json(count_uniq, filtered_refs_fp)
print(len(count_uniq))

In [None]:
%%time
# Load the raw annotations, attach the number of distinct references per row
# and order the rows by that count.
annos_df = (
    pd.read_json(annotations_fp)
    .assign(ref_count=lambda frame: frame['evidence'].apply(count_unique_refs_row))
    .sort_values(by='ref_count')
)
#annos_df = annos_df.loc[2000:2010];
# Missing evidence types are shown as the literal 'n/a'.
annos_df['evidence_type'] = annos_df['evidence_type'].replace(np.nan, 'n/a')

annos_df

In [None]:
# Take the first 100 rows of the current annos_df as a small sample.
annos_df2 = annos_df.head(100)
count_uniq = count_unique_refs(annos_df2)
print(len(count_uniq))

# Round-trip through pandas' JSON writer so the records are plain Python
# objects (json_str is the parsed list of records, not a string).
anno_json = annos_df2.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)

sample_out_fp = ospath.join('.', sample_annotations_fp)
write_to_json(json_str, sample_out_fp)

In [3]:
# GO terms indexed by their ID; the ID stays available as the 'id' column.
terms_df = (
    pd.read_json(terms_fp, dtype={'is_goslim': bool})
    .set_index('ID', drop=False)
    .rename(columns={'ID': 'id', 'LABEL': 'label', 'hasOBONamespace': 'aspect'})
)
# Underscores in the aspect values are replaced with spaces.
terms_df['aspect'] = terms_df['aspect'].str.replace('_', ' ')


#write_to_json(json_str, ospath.join('out.json'))


In [4]:
# Articles indexed by PMID; 'pmid' is also kept as a regular column.
articles_df = pd.read_json(articles_fp).set_index('pmid', drop=False)

In [5]:
# Taxon lookup table; taxon_id is read as a string so it can be joined with
# the string taxon ids of genes_df in the next cell.
taxon_df = pd.read_json(taxon_fp, dtype={'taxon_id':str})
#taxon_df

In [7]:
# Gene info joined with the taxon lookup and indexed by gene id.
genes_df = pd.read_json(gene_info_fp, dtype={'taxon_id': str})
genes_df = (
    genes_df
    # Strip the 'taxon:' prefix so the ids match taxon_df['taxon_id'].
    .assign(taxon_id=genes_df['taxon_id'].str.replace('taxon:', ''))
    .merge(taxon_df, how='left', on='taxon_id')
    .set_index('gene', drop=False)
)
#genes_df = genes_df['coordinates'].apply(lambda x: x).explode()
#genes_df['taxon_abbr'].unique()

In [None]:
# Preview: first 10 raw annotations with taxon and coordinate columns
# attached from genes_df (left join on the annotation's gene id).
annos_df = pd.read_json(annotations_fp).iloc[:10].merge(
    genes_df[[
        'taxon_id',
        'taxon_label',
        'taxon_abbr',
        'coordinates_chr_num',
        'coordinates_start',
        'coordinates_end',
        'coordinates_strand',
    ]],
    how='left',
    left_on="gene",
    right_index=True,
)
#anno_json = annos_df.to_json(orient="records", default_handler=None)
#json_str = json.loads(anno_json)
#write_to_json(json_str, ospath.join(sample_annotations_fp))
#pprint.pprint(json_str)
annos_df

In [None]:
annos_df

In [None]:
%%time

# Preview: first 100 raw annotations with gene symbol/name, taxon and
# coordinate columns attached from genes_df.
annos_df = pd.read_json(annotations_fp).iloc[:100].merge(
    genes_df[[
        'gene_symbol',
        'gene_name',
        'taxon_id',
        'taxon_label',
        'taxon_abbr',
        'coordinates_chr_num',
        'coordinates_start',
        'coordinates_end',
        'coordinates_strand',
    ]],
    how='left',
    left_on="gene",
    right_index=True,
)
#annos_df['aspect'] = annos_df['term'].apply(lambda x: get_pd_row(terms_df, x)['aspect'])
annos_df



In [None]:
%%time

def count_evidence(evidences):
    """Return the number of evidence entries in ``evidences``."""
    total = len(evidences)
    return total

def get_groups(evidences):
    """Return the distinct ``'group'`` values of ``evidences`` as a list.

    The order of the returned list is not defined.
    """
    return list({entry['group'] for entry in evidences})
    
# Build the cleaned annotation table from the raw annotations and write it
# to clean_annotations_fp.
annos_df = pd.read_json(annotations_fp)
# Attach gene-level columns (left join on the annotation's gene id).
annos_df = annos_df.merge(genes_df[
    ['gene_symbol',
     'gene_name',
     'taxon_id', 
     'taxon_label', 
     'taxon_abbr' , 
     'coordinates_chr_num',
     'coordinates_start',
     'coordinates_end',
     'coordinates_strand']], how='left', left_on="gene", right_index=True)
# Expand each term id into its full record once, then read the aspect from
# that record; this avoids a second terms_df lookup per annotation.
annos_df['term'] = annos_df['term'].apply(lambda term_id: get_pd_row(terms_df, term_id))
annos_df['aspect'] = annos_df['term'].apply(lambda term: term['aspect'])
annos_df['slim_terms'] = annos_df['slim_terms'].apply(lambda slim_ids: spread_terms(terms_df, slim_ids))
annos_df['qualifier'] = annos_df['qualifier'].str.replace('_', ' ')
# Evidence must be expanded before get_evidence_type, which reads the
# expanded with_gene_id records.
annos_df['evidence'] = annos_df.apply(lambda anno: get_evidence(articles_df, genes_df, anno), axis=1)
annos_df['evidence_type'] = annos_df.apply(get_evidence_type, axis=1)
annos_df['groups'] = annos_df['evidence'].apply(get_groups)
annos_df['evidence_count'] = annos_df['evidence'].apply(count_evidence)


# Round-trip through pandas' JSON writer so the records are plain Python
# objects (json_str is the parsed list of records, not a string).
anno_json = annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
write_to_json(json_str, ospath.join('.', clean_annotations_fp))
annos_df

In [None]:
%%time
    
# Same enrichment for the older annotation file: attach taxon and
# coordinate columns from genes_df, then write the result out.
old_annos_df = pd.read_json(old_annotations_fp).merge(
    genes_df[[
        'taxon_id',
        'taxon_label',
        'taxon_abbr',
        'coordinates_chr_num',
        'coordinates_start',
        'coordinates_end',
        'coordinates_strand',
    ]],
    how='left',
    left_on="gene",
    right_index=True,
)

anno_json = old_annos_df.to_json(orient="records", default_handler=None)
json_str = json.loads(anno_json)
#pprint.pprint(json_str)
old_out_fp = ospath.join('.', old_clean_annotations_fp)
write_to_json(json_str, old_out_fp)
old_annos_df

In [None]:
# Recompute the evidence type per row and show only the 'direct' rows.
annos_df['evidence_type'] = annos_df.apply(get_evidence_type, axis=1)
annos_df.loc[annos_df['evidence_type'].eq('direct')]

In [None]:
# Dump the second row (position 1) of annos_df as a plain dict for inspection.
print(dict(annos_df.iloc[1]))

In [8]:
# Reload the cleaned annotations from disk (the file written by the main
# cleaning cell) so the cells below do not need the cleaning step re-run.
clean_annos_df = pd.read_json(clean_annotations_fp)
clean_annos_df

Unnamed: 0,gene,term,slim_terms,qualifier,evidence,group,evidence_type,gene_symbol,gene_name,taxon_id,taxon_label,taxon_abbr,panther_family,long_id,coordinates_chr_num,coordinates_start,coordinates_end,aspect,groups,evidence_count
0,UniProtKB:P17535,"{'id': 'GO:0008134', 'label': 'transcription f...","[{'id': 'OTHER:0001', 'label': 'Other molecula...",,"[{'with_gene_id': {'gene': 'UniProtKB:P05412',...",GO_Central,homology,JUND,Transcription factor jun-D,9606,Homo sapiens,Hsa,PTHR11462,HUMAN|HGNC=6206|UniProtKB=P17535,19,18279760.0,18280929.0,molecular function,"[MGI, UniProtKB, WB, RGD]",4
1,UniProtKB:P17535,"{'id': 'GO:0000978', 'label': 'RNA polymerase ...","[{'id': 'GO:0003677', 'label': 'DNA binding', ...",,"[{'with_gene_id': {'gene': 'UniProtKB:P05412',...",GO_Central,homology,JUND,Transcription factor jun-D,9606,Homo sapiens,Hsa,PTHR11462,HUMAN|HGNC=6206|UniProtKB=P17535,19,18279760.0,18280929.0,molecular function,"[MGI, UniProtKB]",3
2,UniProtKB:P17535,"{'id': 'GO:0042127', 'label': 'regulation of c...","[{'id': 'GO:0042127', 'label': 'regulation of ...",,"[{'with_gene_id': {'gene': 'RGD:2943', 'gene_s...",GO_Central,homology,JUND,Transcription factor jun-D,9606,Homo sapiens,Hsa,PTHR11462,HUMAN|HGNC=6206|UniProtKB=P17535,19,18279760.0,18280929.0,biological process,"[RGD, MGI, UniProtKB]",3
3,UniProtKB:P17535,"{'id': 'GO:0005667', 'label': 'transcription r...","[{'id': 'OTHER:0003', 'label': 'Other cellular...",,"[{'with_gene_id': {'gene': 'UniProtKB:P17535',...",GO_Central,direct,JUND,Transcription factor jun-D,9606,Homo sapiens,Hsa,PTHR11462,HUMAN|HGNC=6206|UniProtKB=P17535,19,18279760.0,18280929.0,cellular component,"[MGI, FB, UniProtKB, ZFIN]",6
4,UniProtKB:P17535,"{'id': 'GO:0006357', 'label': 'regulation of t...","[{'id': 'GO:0006355', 'label': 'regulation of ...",,"[{'with_gene_id': {'gene': 'UniProtKB:P17535',...",GO_Central,direct,JUND,Transcription factor jun-D,9606,Homo sapiens,Hsa,PTHR11462,HUMAN|HGNC=6206|UniProtKB=P17535,19,18279760.0,18280929.0,biological process,"[RGD, FB, UniProtKB, CGD, MGI, SGD, ZFIN]",11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90910,UniProtKB:V9GZ38,"{'id': 'UNKNOWN:0002', 'label': 'Unknown biolo...","[{'id': 'UNKNOWN:0002', 'label': 'Unknown biol...",,[],GO_Central,,V9GZ38,Uncharacterized protein (Fragment),9606,Homo sapiens,Hsa,,HUMAN|Ensembl=ENSG00000273088|UniProtKB=V9GZ38,1,155169409.0,155187272.0,biological process,[],0
90911,UniProtKB:V9GZ38,"{'id': 'UNKNOWN:0003', 'label': 'Unknown cellu...","[{'id': 'UNKNOWN:0003', 'label': 'Unknown cell...",,[],GO_Central,,V9GZ38,Uncharacterized protein (Fragment),9606,Homo sapiens,Hsa,,HUMAN|Ensembl=ENSG00000273088|UniProtKB=V9GZ38,1,155169409.0,155187272.0,cellular component,[],0
90912,UniProtKB:X6R8D5,"{'id': 'UNKNOWN:0001', 'label': 'Unknown molec...","[{'id': 'UNKNOWN:0001', 'label': 'Unknown mole...",,[],GO_Central,,X6R8D5,Uncharacterized protein,9606,Homo sapiens,Hsa,PTHR35444,HUMAN|Ensembl=ENSG00000214732|UniProtKB=X6R8D5,6,42155446.0,42163439.0,molecular function,[],0
90913,UniProtKB:X6R8D5,"{'id': 'UNKNOWN:0002', 'label': 'Unknown biolo...","[{'id': 'UNKNOWN:0002', 'label': 'Unknown biol...",,[],GO_Central,,X6R8D5,Uncharacterized protein,9606,Homo sapiens,Hsa,PTHR35444,HUMAN|Ensembl=ENSG00000214732|UniProtKB=X6R8D5,6,42155446.0,42163439.0,biological process,[],0


In [None]:
# Reload the enriched older annotations written to old_clean_annotations_fp.
old_clean_annos_df = pd.read_json(old_clean_annotations_fp)

In [None]:
# Keep only the terms flagged as GO slim.
filtered_terms = terms_df.loc[terms_df['is_goslim'].eq(True)]
filtered_terms

In [None]:
# Rows whose (gene, gene_symbol) pair already appeared in an earlier row;
# the first occurrence of each pair is not included (duplicated() default).
temp_df = annos_df[annos_df.duplicated(subset=['gene', 'gene_symbol'])]
temp_df

In [None]:
def add_terms():
    """Draw random rows from ``filtered_terms`` and return them as dicts.

    Produces ``len(human_df) - 1`` records, each picked independently with
    ``random.randrange`` (so repeats are possible).

    NOTE(review): ``human_df`` is not defined anywhere in the visible
    notebook; this function fails with NameError until it is.
    """
    return [
        dict(filtered_terms.iloc[random.randrange(0, len(filtered_terms))])
        for _ in range(0, len(human_df) - 1)
    ]

# Row position read by add_gene(); nothing in the visible notebook changes it.
count = 1

def add_gene():
    """Return the ``'gene'`` value of ``human_df`` at position ``count``.

    NOTE(review): ``human_df`` is not defined in the visible notebook, and
    because ``count`` stays at 1 every call returns the same gene - confirm
    whether a running counter was intended.
    """
    return human_df.iloc[count]['gene']

    
                            
# NOTE(review): scratch cell. The only assignment of ann2_df is the
# commented-out line below and human_df is never defined in the visible
# notebook, so on a fresh kernel this cell stops with NameError at the
# first bare `ann2_df`.
#add_terms()

#ann2_df = pd.DataFrame.from_dict(add_terms())
ann2_df

# ann3 is not used again in the visible notebook.
ann3 = human_df.agg('-'.join, axis=1)
ann2_df

add_gene()
# add_gene() ignores the 'id' value, so every row receives the same gene.
ann2_df['gene'] = ann2_df['id'].apply(lambda x: add_gene())
ann2_df

# json_str is the parsed list of records, not a string.
json_chunk = ann2_df.to_json(orient="records", default_handler=None)
json_str = json.loads(json_chunk)

write_to_json(json_str, 'out.json')
                     

In [10]:
class CustomJSONizer(json.JSONEncoder):
    """JSON encoder that also accepts NumPy booleans (``np.bool_``)."""

    def default(self, obj):
        """Convert ``np.bool_`` to a built-in ``bool``; defer everything else.

        ``default`` must return a JSON-serializable *object*, not encoded
        text. Returning ``super().encode(...)`` here produced the string
        ``'true'``, which the encoder then emitted as ``"true"`` (a JSON
        string) instead of the boolean ``true``.

        Raises:
            TypeError: for any other unsupported type (from the base class).
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        return super().default(obj)
    
def count_unique_terms(annos_df):
    """Return the distinct term ids found in ``annos_df['term']``.

    Each value of the ``'term'`` column must be a mapping with an ``'id'``.
    The order of the returned list is not defined.
    """
    unique_ids = {record['id'] for record in list(annos_df['term'])}
    return list(unique_ids)

#count_uniq = count_unique_terms()
#write_to_json(count_uniq, filtered_terms_fp, cls=CustomJSONizer)
#print(len(count_uniq))

def count_unique_slim_terms(annos_df):
    """Return the distinct slim-term ids found in ``annos_df['slim_terms']``.

    Each value of the ``'slim_terms'`` column must be an iterable of mappings
    with an ``'id'``. The order of the returned list is not defined.
    """
    unique_ids = {
        record['id']
        for slim_records in list(annos_df['slim_terms'])
        for record in slim_records
    }
    return list(unique_ids)

#count_uniq = count_unique_slim_terms()
#write_to_json(count_uniq, filtered_slim_terms_fp, cls=CustomJSONizer)
#print(count_uniq)
def count_unique_refs(annos_df):
    """Return the distinct PMIDs referenced by the evidence in ``annos_df``.

    Works on expanded evidence: every reference is either a mapping with a
    ``'pmid'`` or ``None`` (unresolved), and ``None`` entries are skipped.
    The order of the returned list is not defined.

    Note: this replaces the earlier ``count_unique_refs`` defined in this
    notebook, which collected raw reference ids instead.
    """
    pmids = {
        ref['pmid']
        for evidences in list(annos_df['evidence'])
        for entry in evidences
        for ref in entry['references']
        if ref is not None
    }
    return list(pmids)

def count_unique_withs(annos_df):
    """Return the distinct ``with`` gene ids used by the evidence in ``annos_df``.

    Works on expanded evidence: each entry's ``'with_gene_id'`` is either a
    gene record with a ``'gene'`` id or ``None`` when the gene could not be
    resolved. ``None`` records are skipped, matching how
    ``count_unique_refs`` skips unresolved references.

    Returns:
        A list of distinct gene ids, in no particular order.
    """
    genes = set()

    for evidences in list(annos_df['evidence']):
        for evidence in evidences:
            with_gene = evidence['with_gene_id']
            # Unresolved genes are stored as None; indexing them would raise.
            if with_gene is not None:
                genes.add(with_gene['gene'])

    return list(genes)

# Number of distinct PMIDs referenced by the cleaned annotations.
len(count_unique_refs(clean_annos_df))

56498

In [11]:
# Columns holding one scalar per row. Kept as a list (not a set) so the rows
# of the stats table come out in this order on every run; set iteration
# order for strings changes between kernel sessions.
scalar_cols = [
    'gene', 
    'gene_symbol',
    'gene_name',
    'taxon_id', 
    'taxon_label', 
    'taxon_abbr', 
    'coordinates_chr_num', 
    'coordinates_start', 
    'coordinates_end', 
    'aspect', 
    'evidence_type'
]
    

# Distinct values per scalar column (a missing value counts as one value).
stats = {k: len(clean_annos_df[k].unique()) for k in scalar_cols }

# Nested columns need the dedicated collectors.
stats['terms']=len(count_unique_terms(clean_annos_df))
stats['slim_terms']=len(count_unique_slim_terms(clean_annos_df))
stats['references']=len(count_unique_refs(clean_annos_df))
stats['with_gene']=len(count_unique_withs(clean_annos_df))
stats_df = pd.DataFrame.from_dict(stats, orient ='index', columns=['Unique Count'])
stats_df
#clean_annos_df['evidence_type'].unique()

Unnamed: 0,Unique Count
taxon_abbr,1
taxon_label,1
coordinates_end,19391
evidence_type,3
coordinates_chr_num,26
aspect,3
coordinates_start,19406
taxon_id,1
gene,20851
gene_symbol,20807


In [None]:
# Same unique-count summary for the older annotations. Kept as a list (not a
# set) so the rows of the stats table come out in this order on every run.
# NOTE(review): the cell that builds the old table only merges taxon and
# coordinate columns - confirm the old input already carries gene_symbol and
# gene_name, otherwise the lookup below raises KeyError.
scalar_cols = [
    'gene', 
    'gene_symbol',
    'gene_name',
    'taxon_id', 
    'taxon_label', 
    'taxon_abbr', 
    'coordinates_chr_num', 
    'coordinates_start', 
    'coordinates_end', 
]
    

# Distinct values per scalar column (a missing value counts as one value).
stats = {k: len(old_clean_annos_df[k].unique()) for k in scalar_cols }

stats_df = pd.DataFrame.from_dict(stats, orient ='index', columns=['Unique Count'])
stats_df
#clean_annos_df['evidence_type'].unique()

In [None]:
# Distinct (gene, symbol, name) triples, keeping only the first row seen for
# each gene_symbol.
unique_genes = (
    clean_annos_df
    .drop_duplicates(subset=['gene', 'gene_symbol', 'gene_name'])
    .loc[:, ['gene', 'gene_symbol', 'gene_name']]
    .drop_duplicates(subset=['gene_symbol'])
)
# Gene ids that still occur more than once, i.e. under several symbols.
duplicate_genes = unique_genes.loc[unique_genes.duplicated(subset=['gene']), ['gene']]
# Every row belonging to such a gene id, grouped by gene for the CSV report.
res = (
    unique_genes[unique_genes['gene'].isin(duplicate_genes['gene'].tolist())]
    .sort_values(by=['gene'])
)
res.to_csv(duplicate_gene_fp)
res

In [None]:
# One row per gene id, ordered by gene name, reduced to the identifying columns.
unique_genes = (
    clean_annos_df
    .drop_duplicates(subset=['gene'])
    .sort_values(by=['gene_name'])
    .loc[:, ['gene', 'gene_symbol', 'gene_name']]
)
#unique_genes.to_csv(duplicate_gene_fp)
# Rows whose gene_symbol was already used by an earlier gene in that order.
duplicate_genes = unique_genes[unique_genes.duplicated(subset=['gene_symbol'])]
duplicate_genes
#unique_genes.to_csv('./downloads/genes_symb.csv')

In [None]:
# Same duplicate-symbol check on the older annotations.
unique_genes = (
    old_clean_annos_df
    .drop_duplicates(subset=['gene'])
    .sort_values(by=['gene_name'])
    .loc[:, ['gene', 'gene_symbol', 'gene_name']]
)
#unique_genes.to_csv(duplicate_gene_fp)
# Rows whose gene_symbol was already used by an earlier gene in that order.
duplicate_genes_2 = unique_genes[unique_genes.duplicated(subset=['gene_symbol'])]
duplicate_genes_2
#unique_genes.to_csv('./downloads/genes_symb.csv')

In [None]:
# Duplicate symbols in the current data that are not duplicate symbols in
# the older data.
res = duplicate_genes.loc[
    ~duplicate_genes['gene_symbol'].isin(duplicate_genes_2['gene_symbol'].tolist())
]
res

In [None]:
# Look up one gene by id. The frame is named genes_df (indexed by 'gene'),
# and .loc selects by label with square brackets; calling it with
# parentheses does not perform a lookup.
genes_df.loc['UniProtKB:X6R8D5']

In [27]:
# Pull each term's label into its own column so it can be filtered on.
clean_annos_df['term_label'] = clean_annos_df['term'].apply(lambda term: term['label'])
# Annotations whose term is the "Unknown cellular component" placeholder,
# reduced to one row per gene name.
unknown_cc_df = clean_annos_df.loc[
    clean_annos_df['term_label'].eq('Unknown cellular component')
]
unique_genes = unknown_cc_df.drop_duplicates(subset=['gene_name'])
unique_genes.shape

(6488, 21)