In [1]:
import urllib3
from bs4 import BeautifulSoup
from Bio.KEGG import REST
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import goatools

In [2]:
main_page = "https://www.kegg.jp/kegg/pathway.html"
path_page = "https://www.genome.jp/dbget-bin/www_bget?pathway+hsa{}" # (HUMAN) inside the brackets goes the id num of the pathway

In [27]:
#human_pathways = REST.kegg_list("pathway", "hsa").read()
mus_pathways = REST.kegg_list("pathway", "mmu").read()

In [4]:
# Filter all human pathways for repair pathways
kegg_paths_human = {}
for line in human_pathways.rstrip().split("\n"):
    entry, description = line.split("\t")
    kegg_paths_human[entry[5:]] = [description[:-23]]

In [12]:
# Filter all human pathways for repair pathways
kegg_paths_mouse = {}
for line in mus_pathways.rstrip().split("\n"):
    print(line)
    entry, description = line.split("\t")
    kegg_paths_mouse[entry[5:]] = description[:-23]

path:mmu00010	Glycolysis / Gluconeogenesis - Mus musculus (mouse)
path:mmu00020	Citrate cycle (TCA cycle) - Mus musculus (mouse)
path:mmu00030	Pentose phosphate pathway - Mus musculus (mouse)
path:mmu00040	Pentose and glucuronate interconversions - Mus musculus (mouse)
path:mmu00051	Fructose and mannose metabolism - Mus musculus (mouse)
path:mmu00052	Galactose metabolism - Mus musculus (mouse)
path:mmu00053	Ascorbate and aldarate metabolism - Mus musculus (mouse)
path:mmu00061	Fatty acid biosynthesis - Mus musculus (mouse)
path:mmu00062	Fatty acid elongation - Mus musculus (mouse)
path:mmu00071	Fatty acid degradation - Mus musculus (mouse)
path:mmu00072	Synthesis and degradation of ketone bodies - Mus musculus (mouse)
path:mmu00100	Steroid biosynthesis - Mus musculus (mouse)
path:mmu00120	Primary bile acid biosynthesis - Mus musculus (mouse)
path:mmu00130	Ubiquinone and other terpenoid-quinone biosynthesis - Mus musculus (mouse)
path:mmu00140	Steroid hormone biosynthesis - Mus musculus

In [11]:
kegg_paths_mouse

{'mmu00010': 'Glycolysis / Gluconeogenesis',
 'mmu00020': 'Citrate cycle (TCA cycle)',
 'mmu00030': 'Pentose phosphate pathway',
 'mmu00040': 'Pentose and glucuronate interconversions',
 'mmu00051': 'Fructose and mannose metabolism',
 'mmu00052': 'Galactose metabolism',
 'mmu00053': 'Ascorbate and aldarate metabolism',
 'mmu00061': 'Fatty acid biosynthesis',
 'mmu00062': 'Fatty acid elongation',
 'mmu00071': 'Fatty acid degradation',
 'mmu00072': 'Synthesis and degradation of ketone bodies',
 'mmu00100': 'Steroid biosynthesis',
 'mmu00120': 'Primary bile acid biosynthesis',
 'mmu00130': 'Ubiquinone and other terpenoid-quinone biosynthesis',
 'mmu00140': 'Steroid hormone biosynthesis',
 'mmu00190': 'Oxidative phosphorylation',
 'mmu00220': 'Arginine biosynthesis',
 'mmu00230': 'Purine metabolism',
 'mmu00232': 'Caffeine metabolism',
 'mmu00240': 'Pyrimidine metabolism',
 'mmu00250': 'Alanine, aspartate and glutamate metabolism',
 'mmu00260': 'Glycine, serine and threonine metabolism',
 

### Write pathways to files (SKIP)

In [11]:
for path in kegg_paths_mouse:
    print(kegg_paths_human[path])
    pathway_file = REST.kegg_get(path).read()
    fout = open("./Paths/{}_{}.txt".format(path, kegg_paths_human[path].replace(" / ", "_")), mode='w')
    for line in pathway_file.rstrip().split("\n"):
        fout.write("{}\n".format(line))
    fout.close()

['Glycolysis / Gluconeogenesis', {'3101': ['HK3'], '3098': ['HK1'], '3099': ['HK2'], '80201': ['HKDC1'], '2645': ['GCK'], '2821': ['GPI'], '5213': ['PFKM'], '5214': ['PFKP'], '5211': ['PFKL'], '2203': ['FBP1'], '8789': ['FBP2'], '230': ['ALDOC'], '226': ['ALDOA'], '229': ['ALDOB'], '7167': ['TPI1'], '2597': ['GAPDH'], '26330': ['GAPDHS'], '5232': ['PGK2'], '5230': ['PGK1'], '5223': ['PGAM1'], '5224': ['PGAM2'], '441531': ['PGAM4'], '2027': ['ENO3'], '2026': ['ENO2'], '2023': ['ENO1'], '387712': ['ENO4'], '5315': ['PKM'], '5313': ['PKLR'], '5161': ['PDHA2'], '5160': ['PDHA1'], '5162': ['PDHB'], '1737': ['DLAT'], '1738': ['DLD'], '160287': ['LDHAL6A'], '92483': ['LDHAL6B'], '3939': ['LDHA'], '3945': ['LDHB'], '3948': ['LDHC'], '124': ['ADH1A'], '125': ['ADH1B'], '126': ['ADH1C'], '131': ['ADH7'], '127': ['ADH4'], '128': ['ADH5'], '130': ['ADH6'], '10327': ['AKR1A1'], '217': ['ALDH2'], '224': ['ALDH3A2'], '219': ['ALDH1B1'], '501': ['ALDH7A1'], '223': ['ALDH9A1'], '221': ['ALDH3B1'], '222

AttributeError: 'list' object has no attribute 'replace'

### Add genes to paths_dict

In [6]:
for path in kegg_paths_human:
    print(path)
    temp_genes = {}
    pathway_file = REST.kegg_get(path).read()
    # iterate through each KEGG pathway file, keeping track of which section
    # of the file we're in, only read the gene in each pathway
    current_section = None
    for idx, line in enumerate(pathway_file.rstrip().split("\n")):
        section = line[:12].strip()  # section names are within 12 columns
        if not section == "":
            current_section = section

        if current_section == "GENE":
            try:
                gene_identifiers, gene_description = line[12:].split("; ")
                gene_id, gene_symbol = gene_identifiers.split()
            except ValueError:
                print("Missing separator in {} at line {}".format(path, idx+1))
                gene_id = line[12:].split()[0]
                gene_symbol = "Not Found"
            
            if gene_id not in temp_genes:
                temp_genes[gene_id] = []
            temp_genes[gene_id].append(gene_symbol)
    kegg_paths_human[path].append(temp_genes)

hsa00010
hsa00020
hsa00030
hsa00040
hsa00051
hsa00052
hsa00053
hsa00061
hsa00062
hsa00071
hsa00072
hsa00100
hsa00120
hsa00130
hsa00140
hsa00190
hsa00220
hsa00230
Missing separator in hsa00230 at line 206
hsa00232
hsa00240
hsa00250
hsa00260
hsa00270
hsa00280
hsa00290
hsa00310
hsa00330
Missing separator in hsa00330 at line 78
hsa00340
hsa00350
hsa00360
hsa00380
hsa00400
hsa00410
hsa00430
hsa00440
hsa00450
hsa00471
hsa00472
hsa00480
hsa00500
hsa00510
hsa00511
hsa00512
hsa00514
hsa00515
hsa00520
hsa00524
hsa00531
hsa00532
hsa00533
hsa00534
hsa00561
hsa00562
hsa00563
hsa00564
hsa00565
hsa00590
hsa00591
hsa00592
hsa00600
hsa00601
hsa00603
hsa00604
hsa00620
hsa00630
hsa00640
hsa00650
hsa00670
hsa00730
Missing separator in hsa00730 at line 21
hsa00740
hsa00750
hsa00760
hsa00770
hsa00780
hsa00785
hsa00790
hsa00830
hsa00860
hsa00900
hsa00910
hsa00920
hsa00970
hsa00980
Missing separator in hsa00980 at line 46
hsa00982
Missing separator in hsa00982 at line 29
hsa00983
hsa01040
hsa01100
hsa01200
hs

In [14]:
kegg_paths_human

{'hsa00010': ['Glycolysis / Gluconeogenesis',
  {'3101': ['HK3'],
   '3098': ['HK1'],
   '3099': ['HK2'],
   '80201': ['HKDC1'],
   '2645': ['GCK'],
   '2821': ['GPI'],
   '5213': ['PFKM'],
   '5214': ['PFKP'],
   '5211': ['PFKL'],
   '2203': ['FBP1'],
   '8789': ['FBP2'],
   '230': ['ALDOC'],
   '226': ['ALDOA'],
   '229': ['ALDOB'],
   '7167': ['TPI1'],
   '2597': ['GAPDH'],
   '26330': ['GAPDHS'],
   '5232': ['PGK2'],
   '5230': ['PGK1'],
   '5223': ['PGAM1'],
   '5224': ['PGAM2'],
   '441531': ['PGAM4'],
   '2027': ['ENO3'],
   '2026': ['ENO2'],
   '2023': ['ENO1'],
   '387712': ['ENO4'],
   '5315': ['PKM'],
   '5313': ['PKLR'],
   '5161': ['PDHA2'],
   '5160': ['PDHA1'],
   '5162': ['PDHB'],
   '1737': ['DLAT'],
   '1738': ['DLD'],
   '160287': ['LDHAL6A'],
   '92483': ['LDHAL6B'],
   '3939': ['LDHA'],
   '3945': ['LDHB'],
   '3948': ['LDHC'],
   '124': ['ADH1A'],
   '125': ['ADH1B'],
   '126': ['ADH1C'],
   '131': ['ADH7'],
   '127': ['ADH4'],
   '128': ['ADH5'],
   '130': ['ADH6

In [15]:
with open("All_Kegs.csv", mode="w") as all_keggs:
    for key in kegg_paths_human:
        all_keggs.write("{},{}\n".format(key, kegg_paths_human[key][0].replace(",",';')))
        for gene in kegg_paths_human[key][1]:
            if len(kegg_paths_human[key][1][gene]) == 1:
                all_keggs.write("{},{}\n".format(gene,kegg_paths_human[key][1][gene][0]))
            else:
                all_keggs.write("{},{}\n".format(gene,",".join(kegg_paths_human[key][1][gene])))
                
all_keggs.close()