In [1]:
import os
import pandas as pd

import warnings
# Blanket filter: every warning raised after this point is hidden, which
# includes pandas deprecation/parser warnings from the cells below.
warnings.filterwarnings('ignore')

In [2]:
# --- Output / working directories (trailing slash is kept on purpose) ---
path_pathways = '../data/pathways/'
path_disease = '../data/geneSCF/'

# --- Input: KEGG pathway table read in PART I ---
path_file_kegg = '../source/geneSCF/class/lib/db/mmu/kegg_database.txt'

# Flag legend for the `kept` and `edit` columns used below:
#   0 -> the pathway is removed
#   1 -> the pathway is kept

## PART I

Flag for removal (`kept = 0`) every pathway whose name contains the keyword 'cancer' or 'disease'; all other pathways are marked as kept (`kept = 1`).

In [3]:
# Load the raw KEGG table: one pathway per row, fields separated by '~' or a tab.
# - A regex separator is only supported by the python engine, so request it explicitly.
# - `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was removed in
#   pandas 2.0; malformed rows are skipped with a warning (requires pandas >= 1.3).
df_kegg_raw = pd.read_csv(
    path_file_kegg,
    sep='~|\t',
    header=None,
    engine='python',
    on_bad_lines='warn',
)
df_kegg_raw.columns = ['pathway_id', 'pathway_name', 'pathway_genes']
print('The number of pathways (KEGG), ', len(df_kegg_raw))

# Position of each keyword inside the pathway name; -1 means "not found".
# The search is case-sensitive.
df_kegg_raw['cancer'] = df_kegg_raw['pathway_name'].str.find('cancer')
df_kegg_raw['disease'] = df_kegg_raw['pathway_name'].str.find('disease')

# kept = 1 only when neither keyword occurs in the name, otherwise 0.
has_no_keyword = (df_kegg_raw['cancer'] == -1) & (df_kegg_raw['disease'] == -1)
df_kegg_raw['kept'] = has_no_keyword.astype('int64')
print('\n PART I - Disease or cancer related pathways\n', df_kegg_raw.groupby('kept').size())

# Export the flagged list. The row index is written on purpose: PART II reads
# it back through the resulting 'Unnamed: 0' column.
part1_export = df_kegg_raw[['pathway_id', 'pathway_name', 'kept']]
part1_export.to_csv(os.path.join(path_disease, 'kegg_database_wo_disease_related.csv'))

print(part1_export.shape)
df_kegg_raw.head()

The number of pathways (KEGG),  333

 PART I - Disease or cancer related pathways
 kept
0     26
1    307
dtype: int64
(333, 3)


Unnamed: 0,pathway_id,pathway_name,pathway_genes,cancer,disease,kept
0,mmu00010,Glycolysis_/_Gluconeogenesis,"103988,106557,110695,11522,11529,11532,11669,1...",-1,-1,1
1,mmu00020,Citrate_cycle_(TCA_cycle),"104112,11428,11429,12974,13382,14194,15926,159...",-1,-1,1
2,mmu00030,Pentose_phosphate_pathway,"100198,110208,110639,11674,11676,14120,14121,1...",-1,-1,1
3,mmu00040,Pentose_and_glucuronate_interconversions,"100559,100727,102448,110006,112417,11677,11997...",-1,-1,1
4,mmu00051,Fructose_and_mannose_metabolism,"110119,11674,11676,11677,11997,14120,14121,141...",-1,-1,1


## PART II

The dataset created in PART I was then reviewed, and other disease-related pathways were additionally marked for removal (`edit = 0`).

In [4]:
# Load the reviewed copy of the PART I export; 'Unnamed: 0' is the row index
# that PART I wrote to the CSV.
df_kegg_edit = pd.read_csv(
    os.path.join(path_disease, 'kegg_database_wo_disease_related_Inma_edit.csv'),
    index_col='Unnamed: 0',
)

# A blank `edit` cell means "keep" (1). Assign the result back instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form does
# not update the frame when pandas Copy-on-Write is active.
df_kegg_edit['edit'] = df_kegg_edit['edit'].fillna(1)
# Anything already dropped in PART I stays dropped, whatever the review says.
df_kegg_edit.loc[df_kegg_edit['kept'] == 0, 'edit'] = 0

# Re-attach the gene lists from the raw table. This is an inner join, so a
# pathway_id missing from either side is dropped from the result.
df_kegg = pd.merge(
    left=df_kegg_edit,
    right=df_kegg_raw[['pathway_id', 'pathway_genes']],
    on='pathway_id',
)

print('\nPART II - Disease or cancer related pathways\n', df_kegg_edit.groupby(['kept','edit']).size())

# Export the final pathway table (without the row index).
part2_path = os.path.join(path_pathways, 'kegg_pathways_edited.csv')
part2_export = df_kegg[['pathway_id', 'pathway_name', 'pathway_genes', 'edit']]
part2_export.to_csv(part2_path, index=False)
print('KEGG pathways exported!! - ', part2_path)

print(part2_export.shape)
df_kegg.head()


PART II - Disease or cancer related pathways
 kept  edit
0     0.0      26
1     0.0      57
      1.0     250
dtype: int64
KEGG pathways exported!! -  ../data/pathways/kegg_pathways_edited.csv
(333, 4)


Unnamed: 0,pathway_id,pathway_name,kept,edit,Unnamed: 5,pathway_genes
0,mmu00010,Glycolysis_/_Gluconeogenesis,1,1.0,,"103988,106557,110695,11522,11529,11532,11669,1..."
1,mmu00020,Citrate_cycle_(TCA_cycle),1,1.0,,"104112,11428,11429,12974,13382,14194,15926,159..."
2,mmu00030,Pentose_phosphate_pathway,1,1.0,,"100198,110208,110639,11674,11676,14120,14121,1..."
3,mmu00040,Pentose_and_glucuronate_interconversions,1,1.0,,"100559,100727,102448,110006,112417,11677,11997..."
4,mmu00051,Fructose_and_mannose_metabolism,1,1.0,,"110119,11674,11676,11677,11997,14120,14121,141..."
