In [1]:
from __future__ import division, print_function
import re
import numpy as np
import pandas as pd

In [2]:
def readin_keg(keg_file_name):
    """Parse a KEGG ``.keg`` hierarchy file into a flat list of records.

    Every line is stripped and matched against three kinds of entries:

    * ``A``/``B``/``C`` header lines (``<letter><digits> <label> [optional code]``),
      stored with ``level`` 0, 1 and 2 respectively;
    * ``D`` lines with a sequence id:
      ``D <seq_id>; K<digits> <short names>; <long name> [optional code]``;
    * ``D`` lines without a sequence id:
      ``D K<digits> <short names>; <long name> [optional code]``.

    ``D`` lines are stored with ``level`` 3. Lines that match none of these
    forms are ignored.

    Parameters
    ----------
    keg_file_name : str
        Path of the file to read.

    Returns
    -------
    list of dict
        One dict per recognised line, in file order. Keys (in this order):
        ``'Pathway name'``, ``'level'``, ``'id'``, ``'parent'``,
        ``'parent_id'``, ``'code'``, ``'KO'``, ``'KO in spec'``,
        ``'gene_number'``, ``'seq_id'``, ``'Pathway short names'``.
        ``'id'`` is the position of the record in the returned list.
        ``'parent'``/``'parent_id'`` hold the label and id of the most recent
        record one level up, or ``None``/``-1`` when no such record has been
        seen yet (always the case for level 0). ``'KO'``, ``'KO in spec'``
        and ``'gene_number'`` are always 0.
    """
    # Compile once instead of re-parsing the patterns for every line.
    header_re = re.compile(
        r'(?P<level>^[ABC])[ \t]*(?P<code>[0-9]+) *(?P<label>[^\[\]]+)(?P<p_code>\[.+])?')
    # Tried in this order: the form carrying a sequence id first, then the bare form.
    gene_res = (
        re.compile(r'^D *(?P<seq_id>[a-zA-Z0-9_.-]+); *(?P<code>K[0-9]+) *(?P<label>.*)'),
        re.compile(r'^D *(?P<code>K[0-9]+) *(?P<label>.*)'),
    )
    gene_label_re = re.compile(
        r'(?P<short_labels>.+); *(?P<long_label>[^\[\]]+)(?P<e_code>\[.+])?')
    header_levels = {'A': 0, 'B': 1, 'C': 2}
    gene_level = 3

    annotation = []
    # latest[k] is the (label, id) of the most recent record at level k.
    # Every slot starts as (None, -1), so an entry that shows up before its
    # parent header gets the "no parent" defaults instead of raising
    # UnboundLocalError. Key -1 is the permanent "no parent" slot for level 0.
    latest = {-1: (None, -1), 0: (None, -1), 1: (None, -1), 2: (None, -1)}

    def add_record(name, level, code, seq_id=None, short_names=None):
        """Append a record one level below ``latest[level - 1]`` and return it."""
        parent_name, parent_id = latest[level - 1]
        record = {'Pathway name': name,
                  'level': level,
                  # Ids are consecutive, so the next id is the current length.
                  'id': len(annotation),
                  'parent': parent_name,
                  'parent_id': parent_id,
                  'code': code,
                  'KO': 0,
                  'KO in spec': 0,
                  'gene_number': 0,
                  'seq_id': seq_id,
                  'Pathway short names': short_names}
        annotation.append(record)
        return record

    # The context manager closes the file even if parsing raises.
    with open(keg_file_name, 'r') as keg_file:
        for line in keg_file:
            line = line.strip()

            header = header_re.match(line)
            if header:
                level = header_levels[header.group('level')]
                name = header.group('label').strip()
                record = add_record(name, level, header.group('code'))
                latest[level] = (name, record['id'])
                continue

            for gene_re in gene_res:
                gene = gene_re.match(line)
                if not gene:
                    continue
                label = gene_label_re.match(gene.group('label'))
                if not label:
                    # No "<short names>; <long name>" part: try the next form.
                    continue
                # Short names are whitespace-separated; commas are removed and
                # tokens that end up empty (a stand-alone ",") are dropped.
                stripped = (token.replace(',', '')
                            for token in label.group('short_labels').split())
                short_names = [token for token in stripped if token]
                add_record(label.group('long_label').strip(),
                           gene_level,
                           gene.group('code'),
                           # Only the first gene pattern defines a seq_id group.
                           seq_id=gene.groupdict().get('seq_id'),
                           short_names=', '.join(short_names))
                break

    return annotation

In [3]:
# Parse the .keg file into a list of record dicts, then build one DataFrame row per record.
keg_records = readin_keg('/Users/nata/Documents/genome_project/Orhtonectids_project/annotation/KEGG/Intoshia_variabili.keg')
df = pd.DataFrame(keg_records)

In [4]:
# Remove the counter, code and parent-id columns from the frame.
df = df.drop(['KO', 'KO in spec', 'code', 'gene_number', 'parent_id'], axis=1)

In [14]:
what_to_find = 'potassium channel'
# Restrict the search to rows whose level is 3.
df_genes = df[df['level'] == 3]
# Search in the full name ...
search_result = df_genes.loc[df_genes['Pathway name'].str.contains(what_to_find)]
# ... or, alternatively, in the short names:
#search_result = df_genes.loc[df_genes['Pathway short names'].str.contains(what_to_find)]
search_result

Unnamed: 0,Pathway name,Pathway short names,id,level,parent,seq_id
2550,hyperpolarization activated cyclic nucleotide-...,HCN2,2550,3,cAMP signaling pathway,g926.t1
4460,potassium channel subfamily K member 9,"KCNK9, K2P9.1",4460,3,Aldosterone synthesis and secretion,g4164.t1
10633,potassium channel subfamily T member 1,"KCNT1, KNA1.1",10633,3,Ion channels,g4703.t1
10634,potassium channel subfamily K member 1,"KCNK1, K2P1.1",10634,3,Ion channels,g889.t1
10635,potassium channel subfamily K member 9,"KCNK9, K2P9.1",10635,3,Ion channels,g4164.t1
10648,hyperpolarization activated cyclic nucleotide-...,HCN2,10648,3,Ion channels,g926.t1


In [15]:
# Keep only the first row for each seq_id; later rows with the same seq_id are dropped.
search_result_no_duplicates = search_result.drop_duplicates(subset='seq_id', keep='first')
search_result_no_duplicates

Unnamed: 0,Pathway name,Pathway short names,id,level,parent,seq_id
2550,hyperpolarization activated cyclic nucleotide-...,HCN2,2550,3,cAMP signaling pathway,g926.t1
4460,potassium channel subfamily K member 9,"KCNK9, K2P9.1",4460,3,Aldosterone synthesis and secretion,g4164.t1
10633,potassium channel subfamily T member 1,"KCNT1, KNA1.1",10633,3,Ion channels,g4703.t1
10634,potassium channel subfamily K member 1,"KCNK1, K2P1.1",10634,3,Ion channels,g889.t1


In [10]:
# Number of non-null entries in each column of the de-duplicated search result.
# NOTE(review): the output recorded below reports 10 per column, while the table shown for the
# previous cell has 4 rows, and this cell's execution counter (In [10]) precedes In [14]/In [15].
# The output looks stale; restart the kernel and run all cells to refresh it.
search_result_no_duplicates.count()

Pathway name           10
Pathway short names    10
id                     10
level                  10
parent                 10
seq_id                 10
dtype: int64