# Collect KEGG pathways

In [None]:
# get KEGG pathways via KEGG API and pickle the results
import pickle

import requests
def parse_gene_record(gene_record):
    """Parse one GENE line of a KEGG pathway entry into an id and a record.

    The expected layout is ``<geneid> <symbol>; <name> [KEY:value] [KEY:value]``.
    The ``<symbol>;`` part and the bracketed annotations are both optional.

    Parameters
    ----------
    gene_record : str
        Text of a single gene line without the leading section keyword.

    Returns
    -------
    tuple
        ``(geneid, record)`` where ``record`` is a dict with the keys
        ``'ncbigeneid'``, ``'genesymbol'`` (``None`` when no symbol is
        present), ``'name'`` and one additional key per bracketed
        ``KEY:value`` annotation.

    Raises
    ------
    ValueError
        If ``gene_record`` is empty or contains only whitespace.
    """
    text = gene_record.strip()
    if not text:
        raise ValueError('empty KEGG gene record')

    try:
        idsym, namekeggec = text.split('; ', maxsplit = 1)
        geneid, symbol = idsym.split()

    except ValueError:
        # no '<id> <symbol>; ' prefix: the first token is the id and
        # everything after it (possibly nothing) is the description
        fields = text.split(maxsplit = 1)
        geneid, symbol = fields[0], None
        namekeggec = fields[1] if len(fields) > 1 else ''

    record = {
        'ncbigeneid': geneid,
        'genesymbol': symbol,
    }

    name = namekeggec.strip()
    # Only treat the tail as annotations when it actually is bracketed;
    # blindly dropping the last character would truncate names of records
    # that carry no annotation at all.
    if name.endswith(']') and ' [' in name:
        head, tail = name[:-1].split(' [', maxsplit = 1)
        try:
            # build the annotations separately so that a malformed item
            # cannot leave the record half-updated
            annotations = dict(
                item.split(':', 1) for item in tail.split('] [')
            )

        except ValueError:
            # brackets that do not hold KEY:value pairs belong to the name
            annotations = None

        if annotations is not None:
            name = head
            record.update(annotations)

    record['name'] = name

    return geneid, record


def parse_kegg_pathway(responsetxt):
    """Split the text of a KEGG pathway entry into its sections.

    A line whose first 12 characters contain text starts a new section
    named after that text; a line whose first 12 characters are blank
    continues the current section. GENE and COMPOUND sections are
    collected into dicts (one item per line), every other section into a
    list of stripped lines.

    Parameters
    ----------
    responsetxt : str
        Full text of one pathway entry.

    Returns
    -------
    dict
        Section name mapped to its parsed content, restricted to ENTRY,
        NAME, DESCRIPTION, CLASS, MODULE, GENE and COMPOUND.
    """
    wanted_sections = {
        'ENTRY',
        'NAME',
        'DESCRIPTION',
        'CLASS',
        'MODULE',
        'GENE',
        'COMPOUND'
    }

    def split_compound(text):
        # '<id> <name>' -> [id, name]; an id without a name -> (id, None)
        parts = text.split(maxsplit = 1)
        return parts if len(parts) > 1 else (text.strip(), None)

    # sections whose lines are turned into (key, value) items
    item_parsers = {
        'COMPOUND': split_compound,
        'GENE': parse_gene_record
    }

    sections = {}
    section, content = None, None
    for raw_line in responsetxt.split('\n'):
        if not raw_line:
            continue

        label = raw_line[:12].strip()

        if not label:
            # continuation line of the section that is currently open
            text = raw_line.strip()
            if section in item_parsers:
                item_key, item_val = item_parsers[section](text)
                content[item_key] = item_val

            else:
                content.append(text)

            continue

        # a new section starts: store the one that just ended
        if section:
            sections[section] = content

        section = label
        fields = raw_line.split(maxsplit = 1)

        if section in item_parsers:
            item_key, item_val = item_parsers[section](fields[1])
            content = {item_key: item_val}

        else:
            content = [fields[1] if len(fields) > 1 else '']

    # the last section is never followed by another header line
    sections[section] = content

    # keep only the sections of interest
    return {
        name: parsed
        for name, parsed in sections.items()
        if name in wanted_sections
    }


def get_kegg_pathway_maps(taxid = 'mmu', timeout = 60):
    """Download and parse all KEGG pathway maps of one organism.

    The pathway list of the organism is fetched first, then every listed
    pathway is requested individually and parsed with
    ``parse_kegg_pathway``.

    Parameters
    ----------
    taxid : str
        KEGG organism code inserted into the list URL (default ``'mmu'``).
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    dict
        Pathway id mapped to the parsed pathway entry.

    Raises
    ------
    requests.HTTPError
        If KEGG answers any request with an error status. Without this
        check an error page would be parsed and stored as if it were a
        pathway.
    """
    r = requests.get(
        f'http://rest.kegg.jp/list/pathway/{taxid}',
        timeout = timeout
    )
    r.raise_for_status()

    # Each non-empty line is '<pathway id>\t<description>'. The id may or
    # may not carry a 'path:' prefix, so take whatever follows the last
    # colon instead of insisting on a prefix being present.
    pathway_map_ids = [
        line.split('\t')[0].split(':')[-1]
        for line in r.text.split('\n')
        if line
    ]

    pathway_maps = {}
    for mapid in pathway_map_ids:
        r = requests.get(
            f'http://rest.kegg.jp/get/{mapid}',
            timeout = timeout
        )
        r.raise_for_status()
        pathway_maps[mapid] = parse_kegg_pathway(r.text)

    return pathway_maps


# fetch the pathway maps for the default organism and store the parsed
# result as a pickle file
pathwaymaps = get_kegg_pathway_maps()
with open('../raw/kegg_pathway_maps.pickle', 'wb') as handle:
    pickle.dump(pathwaymaps, handle)