## Procedural Notes  

### Enzyme/Pathway Selection  
Kegg glycolysis/Glucogenesis-Reference Pathway:  

>Enzymes selected for each pathway are:  
>- Glycolysis: K00844, K11645, K01689, K16370  
>- Citric Acid Cycle: K01647, K00030, K01900, K00116  
>- Pentose Phosphate Pathway: K13937, K00036, K01807, K06859  



In [48]:
# Creating the genes, pathways and enzymes table with respective assigned columns
import sqlite3
conn = sqlite3.connect('my.db')
c = conn.cursor()
# to avoid conflict with previously built table
c.execute("""DROP TABLE genes""")
c.execute("""DROP TABLE pathways""")
c.execute("""DROP TABLE enzymes""")

# Caution: use INTEGER instead of INT for primary key generation
c.execute("""CREATE TABLE genes (id INTEGER PRIMARY KEY, name TEXT, description TEXT, organism TEXT, nucleotide_sequence TEXT
chromosome TEXT, start INT, end INT, strand VARCHAR(1), translated_sequence TEXT);""")

c.execute("""CREATE TABLE pathways (id INTEGER PRIMARY KEY, name TEXT, description TEXT);""")

c.execute("""CREATE TABLE enzymes (id INTEGER PRIMARY KEY, kegg_id TEXT, name TEXT, function TEXT, ec_number TEXT);""")

conn.commit()


In [49]:
# Inserting values into the pathway table
import sqlite3
conn = sqlite3.connect('my.db')
c = conn.cursor()
cmd = ''' INSERT INTO pathways (name, description) VALUES(?,?)'''

names = ["Glycolysis","Citric Acid Cycle","Pentose Phosphate"]
descriptions = ["the breakdown of glucose by enzymes, releasing energy and pyruvic acid." , "the sequence of reactions by which most living cells generate energy during the process of aerobic respiration. It takes place in the mitochondria, consuming oxygen, producing carbon dioxide and water as waste products, and converting ADP to energy-rich ATP","a metabolic pathway parallel to glycolysis, It generates NADPH and pentoses (5-carbon sugars) as well as ribose 5-phosphate, the last one a precursor for the synthesis of nucleotides"]


# Take in two list with equal length and insert the into the corresponding field
for i in range(len(names)):
    name = names[i]
    description = descriptions[i]
    temp_value = (name,description)
    c.execute(cmd,temp_value)
    conn.commit()


### Query the name and ec_number of the selected Enzymes from KEGG
1. Make a sample.list file in the current directory with the following lines  
K00844   
K11645   
K01689  
K16370  
K01647  
K00030  
K01900  
K00116  
K13937  
K00036  
K01807  
K06859  

2. In the terminal execute: `curl -g -s -S http://rest.kegg.jp/list/ko | grep -f sample.list | sed "s/cpd\://" > sample_extracted.table.txt`

3. Inspect the output file sample_extracted.table.txt  

In [53]:
# parse in the sample_extracted.table.txt for enzymes table insertion
def parse_extracted_table():
    filename = "sample_extracted.table.txt"
    in_file = open(filename, "rt")
    lines = []
    for line in in_file:
        lines.append(line.rstrip('\n'))
        print(line)
    return lines

lines = parse_extracted_table()



def get_values(lines):
    kegg_ids = []
    names = []
    functions = []
    ec_numbers = []
    for line in lines:
        line_ec = line.split("[")
        ec_number = line_ec[-1]
        line_2_part = line_ec[0].split(";")
        line_part1 = line_2_part[0].split()
        line_part2 = line_2_part[1].split()
#         print(line_part1)
#         print(line_part2)
        kegg_id = line_part1[0]
        temp_name = line_part1[1:]
        temp_name[:] = [''.join(temp_name[:])]
        name = temp_name[0]
        temp_function = line_part2[:]
        temp_function[:] = [" ".join(temp_function[:])]
        function = temp_function[0]     
        kegg_ids.append(kegg_id)
        names.append(name)
        functions.append(function)
        ec_numbers.append(ec_number)
    return kegg_ids,names,functions,ec_numbers

get_values(lines)





(['ko:K00030',
  'ko:K00036',
  'ko:K00116',
  'ko:K00844',
  'ko:K01647',
  'ko:K01689',
  'ko:K01807',
  'ko:K01900',
  'ko:K06859',
  'ko:K11645',
  'ko:K13937',
  'ko:K16370'],
 ['IDH3',
  'G6PD,zwf',
  'mqo',
  'HK',
  'CS,gltA',
  'ENO,eno',
  'rpiA',
  'LSC2',
  'pgi1',
  'fbaB',
  'H6PD',
  'pfkB'],
 ['isocitrate dehydrogenase (NAD+)',
  'glucose-6-phosphate 1-dehydrogenase',
  'malate dehydrogenase (quinone)',
  'hexokinase',
  'citrate synthase',
  'enolase',
  'ribose 5-phosphate isomerase A',
  'succinyl-CoA synthetase beta subunit',
  'glucose-6-phosphate isomerase, archaeal',
  'fructose-bisphosphate aldolase, class I',
  'hexose-6-phosphate dehydrogenase',
  '6-phosphofructokinase 2'],
 ['EC:1.1.1.41]',
  'EC:1.1.1.49 1.1.1.363]',
  'EC:1.1.5.4]',
  'EC:2.7.1.1]',
  'EC:2.3.3.1]',
  'EC:4.2.1.11]',
  'EC:5.3.1.6]',
  'EC:6.2.1.4 6.2.1.5]',
  'EC:5.3.1.9]',
  'EC:4.1.2.13]',
  'EC:1.1.1.47 3.1.1.31]',
  'EC:2.7.1.11]'])

In [54]:
# insert values into the enzymes table
import sqlite3
conn = sqlite3.connect('my.db')
c = conn.cursor()
cmd = ''' INSERT INTO enzymes (kegg_id, name, function, ec_number) VALUES(?,?,?,?)'''


kegg_ids,names,functions,ec_numbers = get_values(lines)
for i in range(len(names)):
    name = names[i]
    function = functions[i]
    kegg_id = kegg_ids[i]
    ec_number = ec_numbers[i]
    temp_value = (kegg_id, name, function, ec_number)
    c.execute(cmd,temp_value)
    conn.commit()

In [13]:
from Bio import Entrez
#Entrez.esearch(db,term, sort)
#Entrez.efetch(db, id, rettype, retmode)
#for enzymes and genes only; Entrez doesn't have pathway info




Entrez.email = 'ych323@berkeley.edu'
handle = Entrez.esearch(db = 'nucleotide',
                       term = 'homo sapiens[ORGN] G6PD',
                       sort= 'relevance',
                       idtype= 'acc')

fetched_dict = Entrez.read(handle)
print(fetched_dict)

handle = Entrez.efetch(db = 'nucleotide', id = fetched_dict["IdList"][1], rettype = 'fasta', retmode = 'text')
print(handle.read())
# TODO: Make a function to iterate through the query
# for i in Entrez.read(handle)["IdList"]:
#     handle = Entrez.efetch(db = 'nucleotide', id = i, rettype = 'fasta', retmode = 'text')
#     print(handle.read())

DictElement({'Count': '303', 'RetMax': '20', 'RetStart': '0', 'IdList': ['L44140.1', 'KJ896841.1', 'S64462.1', 'S58359.1', 'NM_000402.4', 'NM_001360016.1', 'NM_001042351.2', 'AB376963.1', 'X55448.1', 'DQ173568.1', 'DQ839546.1', 'DQ832766.1', 'M12996.1', 'MG772799.1', 'DQ173642.1', 'DQ173641.1', 'DQ173640.1', 'DQ173639.1', 'DQ173638.1', 'DQ173637.1'], 'TranslationSet': [DictElement({'From': 'homo sapiens[ORGN]', 'To': '"Homo sapiens"[Organism]'}, attributes={})], 'TranslationStack': [DictElement({'Term': '"Homo sapiens"[Organism]', 'Field': 'Organism', 'Count': '16629335', 'Explode': 'Y'}, attributes={}), DictElement({'Term': 'G6PD[All Fields]', 'Field': 'All Fields', 'Count': '41938', 'Explode': 'N'}, attributes={}), 'AND'], 'QueryTranslation': '"Homo sapiens"[Organism] AND G6PD[All Fields]'}, attributes={})
>KJ896841.1 Synthetic construct Homo sapiens clone ccsbBroadEn_06235 G6PD gene, encodes complete protein
GTTCGTTGCAACAAATTGATGAGCAATGCTTTTTTATAATGCCAACTTTGTACAAAAAAGTTGGCATGGC
AGAG