In [1]:
# Required libraries
import pandas as pd
import numpy as np
import os
import glob
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used
# below; consider narrowing the filter when upgrading dependencies.
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the input pathway and gene-symbol tables (relative path).
path_pathways = '../data/pathways/'
# Directory that receives the exported weight tables and gene lists.
path_weights = '../data/weights/'

## GENES from hiPathia PATHWAYS

In [3]:
# Load the hiPathia gene table (columns used later: 'gene_id' and 'symbol').
# The directory and the file name are passed to os.path.join as separate
# components, so the result no longer depends on path_pathways ending in '/'.
df_hipathia = pd.read_csv(os.path.join(path_pathways, 'hipathia_entrez_and_symbol.csv'))
# Lower-case the symbols so set comparisons between gene tables are
# case-insensitive. The vectorised .str.lower() keeps missing symbols as NaN
# instead of raising, which lets the NaN check further down actually report them.
df_hipathia['symbol'] = df_hipathia['symbol'].str.lower()
print('signaling pathway, ',df_hipathia.shape)

signaling pathway,  (3737, 2)


## GENES from KEGG (geneSCF) PATHWAYS

In [4]:
# Load the KEGG (geneSCF) gene table (columns used later: 'gene_id' and 'symbol').
# The directory and the file name are passed to os.path.join as separate
# components, so the result no longer depends on path_pathways ending in '/'.
df_scf = pd.read_csv(os.path.join(path_pathways, 'kegg_entrez_and_symbol.csv'))
# Lower-case the symbols so set comparisons between gene tables are
# case-insensitive. The vectorised .str.lower() keeps missing symbols as NaN
# instead of raising, which lets the NaN check further down actually report them.
df_scf['symbol'] = df_scf['symbol'].str.lower()
print('Metabolic pathway, ', df_scf.shape)

Metabolic pathway,  (7893, 2)


In [5]:
# Sanity check: count the rows whose gene symbol is missing in each table.
# An unnamed symbol could be matched incorrectly later on, so both counts
# are expected to be zero.
n_missing_hipathia = int(df_hipathia['symbol'].isna().sum())
n_missing_kegg = int(df_scf['symbol'].isna().sum())
print('hipathia - checking na value ( 0(zero) means OK), ', n_missing_hipathia)
print('kegg - checking na value ( 0(zero) means OK), ', n_missing_kegg)

hipathia - checking na value ( 0(zero) means OK),  0
kegg - checking na value ( 0(zero) means OK),  0


In [6]:
# Compare the gene symbols of the two sources: materialise each set once and
# report the sizes of their union, intersection and one-sided differences.
symbols_hipathia = set(df_hipathia['symbol'])
symbols_kegg = set(df_scf['symbol'])
print('Union        ,', len(symbols_hipathia | symbols_kegg))
print('Intersection ,', len(symbols_hipathia & symbols_kegg))
print('hipathia diff,', len(symbols_hipathia - symbols_kegg))
print('scf diff     ,', len(symbols_kegg - symbols_hipathia))

Union        , 7914
Intersection , 3716
hipathia diff, 21
scf diff     , 4177


## Paper dataset

In [7]:
# Tab-separated expression matrix shipped with the third-party code release;
# its 'Sample' column is used by later cells to filter the weight tables.
df_tpm_mouse = pd.read_csv(
    '../third_party/PMC5737331/NN_code_release/important_file/TPM_mouse_7_8_10_PPITF_gene_9437.txt',
    sep='\t',
)
print(df_tpm_mouse.shape)

(9439, 403)


## EXPORTING KEGG PATHWAY (geneSCF) WEIGHT

In [8]:
# Load the KEGG pathway table: one row per pathway, with the member gene ids
# stored as a comma-separated string in 'pathway_genes'.
# The directory and the file name are passed to os.path.join as separate
# components, so the result no longer depends on path_pathways ending in '/'.
df_kegg_pathway = pd.read_csv(os.path.join(path_pathways, 'kegg_pathways_final.csv'), sep=',')
print('df_genescf_pathway shape, ',df_kegg_pathway.shape)
df_kegg_pathway.head()

df_genescf_pathway shape,  (250, 4)


Unnamed: 0,pathway_id,pathway_name,pathway_genes,edit
0,mmu00010,Glycolysis_/_Gluconeogenesis,"103988,106557,110695,11522,11529,11532,11669,1...",1.0
1,mmu00020,Citrate_cycle_(TCA_cycle),"104112,11428,11429,12974,13382,14194,15926,159...",1.0
2,mmu00030,Pentose_phosphate_pathway,"100198,110208,110639,11674,11676,14120,14121,1...",1.0
3,mmu00040,Pentose_and_glucuronate_interconversions,"100559,100727,102448,110006,112417,11677,11997...",1.0
4,mmu00051,Fructose_and_mannose_metabolism,"110119,11674,11676,11677,11997,14120,14121,141...",1.0


In [9]:
# Build the gene x pathway membership table for the KEGG (geneSCF) pathways:
# one row per gene symbol, one column per pathway id, 1.0 where the gene's id
# is listed in that pathway and 0.0 otherwise.
df_kegg_weight = df_scf.copy()
print('df_metabolic_weight shape, ', df_kegg_weight.shape)

# Parse every pathway's comma-separated gene list into a set of numeric ids.
# Iterating over the two columns directly avoids relying on the row labels of
# df_kegg_pathway matching a positional counter, and using sets means a gene id
# repeated inside one list cannot duplicate gene rows in the result.
pathway_members = {}
for p_id, genes_csv in zip(df_kegg_pathway['pathway_id'], df_kegg_pathway['pathway_genes']):
    members = pathway_members.setdefault(p_id, set())
    if pd.isna(genes_csv):
        # Pathway without a gene list: keep an all-zero column for it.
        continue
    tokens = [tok for tok in str(genes_csv).split(',') if tok != '']
    members.update(pd.to_numeric(pd.Series(tokens, dtype='object')).tolist())

# One vectorised membership test per pathway instead of one merge per pathway.
# Values are stored as float (0.0 / 1.0), matching what the previous
# left-merge + fillna(0) produced, so the exported file keeps its format.
gene_ids = df_kegg_weight['gene_id']
membership = pd.DataFrame(
    {p_id: gene_ids.isin(members).astype(float) for p_id, members in pathway_members.items()},
    index=df_kegg_weight.index,
)
df_kegg_weight = pd.concat([df_kegg_weight, membership], axis=1)

# Keep only the genes present in the expression dataset, index by symbol and
# drop the numeric id so that only pathway columns remain.
df_kegg_weight = (
    df_kegg_weight
    .loc[df_kegg_weight['symbol'].isin(df_tpm_mouse['Sample'])]
    .set_index('symbol')
    .drop(columns='gene_id')
)

## EXPORTING - THE WEIGHT TABLE for METABOLIC PATHWAY
df_kegg_weight.to_csv(os.path.join(path_weights, 'pathway_metabolic_weight.txt'))
print('df_metabolic_weight_common shape (exported), ', df_kegg_weight.shape)

df_metabolic_weight shape,  (7893, 2)
df_metabolic_weight_common shape (exported),  (3922, 250)


## EXPORTING SIGNALING PATHWAY (hiPathia) WEIGHT

In [None]:
# Build the gene x pathway weight table for the hiPathia signaling pathways.
df_hipathia_weight = df_hipathia.copy()
print('df_signaling_weight shape,', df_hipathia_weight.shape)
# The merge key is cast to object and renamed to 'entrez', the key column of
# the per-pathway gene files.
# NOTE(review): the object cast is kept from the original; confirm it matches
# the dtype pandas infers for 'entrez' when reading those files.
df_hipathia_weight['gene_id'] = df_hipathia_weight['gene_id'].astype('object')
df_hipathia_weight = df_hipathia_weight.rename(columns={'gene_id' : 'entrez'})
# Left-merge every per-pathway file (in sorted, i.e. deterministic, order) so
# each file contributes its non-key columns; genes absent from a file get NaN.
for gene_list_ in sorted(glob.glob('../data/hipathia_genes_detail/mmu*.txt')):
    df_temp = pd.read_csv(gene_list_)
    df_hipathia_weight = pd.merge(left=df_hipathia_weight, right=df_temp, on='entrez', how='left')

# Non-inplace drop keeps the cell free of in-place mutation.
df_hipathia_weight = df_hipathia_weight.drop(columns='entrez')
df_hipathia_weight = df_hipathia_weight.fillna(0)
df_hipathia_weight = df_hipathia_weight.set_index('symbol')
# Reduce each column name to its second '-'-separated field.
# NOTE(review): presumably that field is the pathway id; a column name
# without '-' raises IndexError here - verify against the input files.
df_hipathia_weight.columns = [pw[1] for pw in df_hipathia_weight.columns.str.split('-')]
# Collapse columns that now share a name by taking the element-wise maximum.
# groupby(..., axis=1) is deprecated in recent pandas releases, so the frame
# is transposed, grouped on its row labels, and transposed back.
df_hipathia_weight = df_hipathia_weight.T.groupby(level=0).max().T

# Keep only the genes present in the expression dataset.
df_hipathia_weight = df_hipathia_weight.loc[df_hipathia_weight.index.isin(df_tpm_mouse['Sample'])]

## EXPORTING - THE WEIGHT TABLE for SIGNALING PATHWAY
df_hipathia_weight.to_csv(os.path.join(path_weights, 'pathway_signaling_weight.txt'))
print('df_signaling_weight shape (exported), ', df_hipathia_weight.shape)
print('EXPORTED!! - THE WEIGHT TABLE for SIGNALING PATHWAY')

df_signaling_weight shape, (3737, 2)


In [None]:
# Compare the genes kept in the two exported weight tables: materialise each
# index as a set once, then report union, intersection and differences.
genes_signaling = set(df_hipathia_weight.index)
genes_metabolic = set(df_kegg_weight.index)
print('Union        ,', len(genes_signaling | genes_metabolic))
print('Intersection ,', len(genes_signaling & genes_metabolic))
print('hipathia diff,', len(genes_signaling - genes_metabolic))
print('scf diff     ,', len(genes_metabolic - genes_signaling))

## EXPORTING GENE LISTS for RETRIEVAL ANALYSIS

In [None]:
# Write the gene symbols of each weight table to a plain text file, one symbol
# per line, with no header row and no index column.
# header=False is the documented way to omit the header (None only worked
# because it is falsy), and os.path.join receives the directory and file name
# as separate components so it no longer depends on a trailing '/'.
pd.DataFrame(df_hipathia_weight.index).to_csv(os.path.join(path_weights, 'gene_signaling_retrieval.txt'), index=False, header=False)
pd.DataFrame(df_kegg_weight.index).to_csv(os.path.join(path_weights, 'gene_met_sig_retrieval.txt'), index=False, header=False)

# Number of genes written to each list.
print(len(df_hipathia_weight.index))
print(len(df_kegg_weight.index))