# This is a helper Script for getting KEGG pathways and dieases using requests

In [12]:
import requests
import json 
import urllib.request
from urllib.error import HTTPError
import re
import sys
import pandas as pd

In [14]:
# make a request for the KEGG pathway database
pathway = "hsa00534"
url = f"http://rest.kegg.jp/get/{pathway}"
url2 = "http://rest.kegg.jp/list/pathway/hsa"

#retrieve the kegg pathway and the api 
def kegg_genes(pathway:str, url:str)->list:
    """
    This function retrieve the genes corresponding to the KEGG pathway
    Args:
        pathway (str) <- selected pathway from the KEGG Database
    """
    url = url
    genes_liste =[]
    
    try:
        with urllib.request.urlopen(url) as f:
            lines = f.read().decode('utf-8').splitlines()
            want = 0
            for line in lines:
                fields = line.split()
                ## The list of genes starts here
                if fields[0] == 'GENE':
                    want = 1
                    ## The line with GENE is different
                    genes_liste.append(fields[2].rstrip(';'))
                ## We reached the next section of the file
                elif want == 1 and re.match('^\S', line):
                    return genes_liste;
                ## We're still in the list of genes
                if want == 1 and len(fields)>1:
                    genes_liste.append((fields[1].rstrip(';')))

    except HTTPError as e:
        print("Please use a valid pathway")
        
resulting_gene = kegg_genes(pathway, url)

['XYLT1',
 '64131',
 'XYLT2',
 'B4GALT7',
 'B3GALT6',
 'B3GAT3',
 'EXTL2',
 'EXTL3',
 'EXTL1',
 'EXT1',
 'EXT2',
 'NDST1',
 'NDST2',
 'NDST3',
 'NDST4',
 'GLCE',
 'HS2ST1',
 'HS6ST1',
 'HS6ST2',
 'HS6ST3',
 'HS3ST1',
 'HS3ST2',
 'HS3ST3B1',
 'HS3ST3A1',
 'HS3ST5']

In [15]:
# here we get the overall number of genes of a KEGG pathway

def find_all_pathways(url: str)-> dict:
    """This function can be used to identify all the genes belonging to a certain pathway
    args:
        url (str) <- url to the KEGG api holding the pathway"""
    
    try: 
        with urllib.request.urlopen(url) as f:
            lines = f.read().decode("utf-8").splitlines()
            pathway_dictionary = {"ID":[],"gene_name":[]}

            # go through the request files
            for i in lines:
                g = i.split("\t")
                ident = g[0].split(":")[1]
                name = g[1].split(" - ")[0]
                pathway_dictionary["ID"].append(ident)
                pathway_dictionary["gene_name"].append(name)

        return pathway_dictionary
    except HTTPError as e:
        print("no connection established")

def find_disease(url):
    """This should identify all the disease currently in the kegg database
    args:
        url (str) <- url connected to the disease database
        """
    
    try:
        with urllib.request.urlopen(url) as f:
            lines = f.read().decode("utf-8").splitlines()
            pathway_dictionary = {"ID":[],"gene_name":[]}
            for i in lines:
                g = i.split("\t")
                pathway_dictionary["ID"].append(g[0])
                pathway_dictionary["gene_name"].append(g[1])
        return pathway_dictionary
    except HTTPerror as e:
        print("NO connection established")

kegg_genes = find_all_pathways(url2)
url3 = "http://rest.kegg.jp/list/disease/"
find_disease(url3)

{'ID': ['ds:H00001',
  'ds:H00002',
  'ds:H00003',
  'ds:H00004',
  'ds:H00005',
  'ds:H00006',
  'ds:H00007',
  'ds:H00008',
  'ds:H00009',
  'ds:H00010',
  'ds:H00011',
  'ds:H00012',
  'ds:H00013',
  'ds:H00014',
  'ds:H00015',
  'ds:H00016',
  'ds:H00017',
  'ds:H00018',
  'ds:H00019',
  'ds:H00020',
  'ds:H00021',
  'ds:H00022',
  'ds:H00023',
  'ds:H00024',
  'ds:H00025',
  'ds:H00026',
  'ds:H00027',
  'ds:H00028',
  'ds:H00029',
  'ds:H00030',
  'ds:H00031',
  'ds:H00032',
  'ds:H00033',
  'ds:H00034',
  'ds:H00035',
  'ds:H00036',
  'ds:H00037',
  'ds:H00038',
  'ds:H00039',
  'ds:H00040',
  'ds:H00041',
  'ds:H00042',
  'ds:H00043',
  'ds:H00044',
  'ds:H00045',
  'ds:H00046',
  'ds:H00047',
  'ds:H00048',
  'ds:H00049',
  'ds:H00050',
  'ds:H00051',
  'ds:H00052',
  'ds:H00053',
  'ds:H00054',
  'ds:H00055',
  'ds:H00056',
  'ds:H00057',
  'ds:H00058',
  'ds:H00059',
  'ds:H00060',
  'ds:H00061',
  'ds:H00062',
  'ds:H00063',
  'ds:H00064',
  'ds:H00065',
  'ds:H00066',
  'd