In [1]:
# FUNCTION IMPLEMENTATIONS

import pandas as pd
import numpy as np

# Lookup table pairing each DNA base (the 'dna_nuc' index) with the RNA base
# stored in the 'rna_nuc' column: A->U, T->A, C->G, G->C.
rna_dict = pd.DataFrame(
    [('A', 'U'), ('T', 'A'), ('C', 'G'), ('G', 'C')],
    columns=['dna_nuc', 'rna_nuc'],
).set_index('dna_nuc')

# returns the mRNA obtained by pairing every DNA base through rna_dict
# NOTE(review): the previous comment described the input as the coding strand,
# but rna_dict maps each base to its complement (A->U, T->A, C->G, G->C),
# which is what transcribing a template strand produces -- confirm intent.
def dna_to_rna(dna):
    """Translate a DNA base array into an mRNA base array via ``rna_dict``.

    Parameters
    ----------
    dna : numpy.ndarray
        Array of single-character bases. Callers in this notebook pass an
        array of shape ``(1, n)``; the input is flattened, so a 1-D array of
        bases is accepted as well.

    Returns
    -------
    numpy.ndarray
        String array of shape ``(1, m)`` with the paired RNA bases, where
        ``m`` is the number of input bases other than ``'N'`` (those are
        skipped). The array has a string dtype even when ``m`` is 0.

    Raises
    ------
    KeyError
        If a base other than ``'N'`` is not present in the ``rna_dict`` index.
    """
    # Resolve the column once instead of on every base.
    pairing = rna_dict['rna_nuc']
    bases = np.asarray(dna).reshape(-1)

    # Collect into a list and convert once: np.append inside the loop copied
    # the whole array on every iteration, which is quadratic in the length.
    mrna = [pairing[bp] for bp in bases if bp != 'N']

    return np.expand_dims(np.array(mrna, dtype=str), axis=0)

# Codon lookup table read from 'amino_acid_dict.xlsx' and indexed by its
# 'codon' column.
aa_dict = pd.read_excel('amino_acid_dict.xlsx').set_index('codon')

# returns the corresponding polypeptide chain to mRNA
def rna_to_polypeptide(mrna):
    """Translate an mRNA base array into amino-acid names via ``aa_dict``.

    The bases are read in consecutive, non-overlapping groups of three
    starting at the first base; trailing bases that do not fill a complete
    codon are dropped. Every complete codon is looked up -- translation does
    not search for a start codon and does not stop at a ``'stop'`` entry.

    Parameters
    ----------
    mrna : numpy.ndarray
        Array of single-character RNA bases. ``dna_to_rna`` returns shape
        ``(1, n)``; the input is flattened, so a 1-D array works as well.

    Returns
    -------
    numpy.ndarray
        String array of shape ``(1, n // 3)`` holding the ``'amino acid'``
        entry of ``aa_dict`` for each codon. The array has a string dtype
        even when there are no complete codons.

    Raises
    ------
    KeyError
        If a codon is not present in the ``aa_dict`` index.
    """
    # Resolve the column once instead of on every codon.
    codon_table = aa_dict['amino acid']

    bases = np.asarray(mrna).reshape(-1)
    total_codons = bases.size // 3
    codon_sequence = bases[:total_codons * 3].reshape((total_codons, 3))

    # Collect into a list and convert once: np.append inside the loop copied
    # the whole array on every iteration, which is quadratic in the length.
    polypeptide = [codon_table[''.join(codon)] for codon in codon_sequence]

    return np.expand_dims(np.array(polypeptide, dtype=str), axis=0)

# total number of occurrences in sequence, summed over every phrase in keys
def total_instances_of(sequence, keys):
    """Count how often the phrases in ``keys`` occur in ``sequence``.

    For each phrase, ``np.char.count`` gives the number of (substring)
    occurrences inside every element of ``sequence``; those counts are summed
    over all elements and over all phrases.

    Parameters
    ----------
    sequence : array of str
        String array to search.
    keys : sequence of str
        Phrases to count.

    Returns
    -------
    int
        Combined count; ``0`` when ``keys`` is empty.
    """
    return sum(np.sum(np.char.count(sequence, phrase)) for phrase in keys)

# test cases
# dna = np.expand_dims(np.array(list('AAAGGGCCCTTT')), axis=0)
# mrna = dna_to_rna(dna)
# polypeptide = rna_to_polypeptide(mrna)

# print(mrna)
# print(polypeptide)
# print(total_instances_of(polypeptide, ['phe', 'pro']))

In [171]:
# FEATURE EXTRACTION
#
# Builds one 25-column feature vector per gene sequence, appends the class
# label as a 26th column and writes the table to an Excel file.

genomic_data = pd.read_table('human_data.txt') # columns: 'sequence', 'class'
num_sequences = genomic_data.shape[0]
print("Num of gene sequences: " + str(num_sequences))

# Bases and amino acids to count. The amino acids are grouped by side-chain
# class; concatenating the groups in this order gives the column order of the
# feature vector.
nucleotides = ['A', 'T', 'C', 'G']
non_polar_aa = ['gly', 'ala', 'val', 'cys', 'pro', 'leu', 'ile', 'met', 'trp', 'phe']
polar_aa = ['ser', 'thr', 'tyr', 'asn', 'gln']
pos_aa = ['lys', 'arg', 'his']
neg_aa = ['asp', 'glu']
amino_acids = non_polar_aa + polar_aa + pos_aa + neg_aa

# total base count + one column per nucleotide + one column per amino acid
num_features = 1 + len(nucleotides) + len(amino_acids)


def sequence_to_features(sequence):
    """Return the feature vector of one DNA sequence as a list of counts.

    Layout: ``[total A/T/C/G count, count of A, T, C, G, then the count of
    each amino acid in ``amino_acids`` order]``, where the amino-acid counts
    are taken on ``rna_to_polypeptide(dna_to_rna(dna))``.

    Parameters
    ----------
    sequence : str
        DNA sequence, one character per base.

    Returns
    -------
    list
        ``num_features`` counts.
    """
    dna = np.expand_dims(np.array(list(sequence)), axis=0)
    polypeptide = rna_to_polypeptide(dna_to_rna(dna))

    features = [total_instances_of(dna, nucleotides)]
    features.extend(total_instances_of(dna, [base]) for base in nucleotides)
    features.extend(total_instances_of(polypeptide, [aa]) for aa in amino_acids)
    return features


# convert all gene training examples into feature vectors
# (the loop previously read `human_data`, a name this notebook never defines;
# the frame loaded above is `genomic_data`)
feature_rows = [sequence_to_features(sequence) for sequence in genomic_data['sequence']]

# Build the matrix in one step instead of np.append-ing a row per iteration,
# which re-copied the whole matrix every time. The explicit reshape keeps the
# (0, num_features) shape when there are no sequences.
training_data = np.array(feature_rows, dtype=float).reshape((num_sequences, num_features))

print(training_data.shape)

# save training data to Excel file
labels = np.expand_dims(np.asarray(genomic_data['class']), axis=1)
training_data = np.append(training_data, labels, axis=1)

df = pd.DataFrame(training_data)
# NOTE(review): absolute, machine-specific output path; kept unchanged so the
# file still lands where it did before. Consider a path relative to the
# notebook or a configurable output directory.
df.to_excel(excel_writer = r'C:\Users\Nathan\Deep Learning\genome-to-protein-classifier\gene_features_dataset.xlsx')

Num of gene sequences: 4380
(4380, 25)


In [170]:
# Write the table held in `df` to the Excel dataset file.
# NOTE(review): `df` is not defined in this cell, so it must already exist in
# the kernel; the target path is the same one the feature-extraction cell
# writes to.
df.to_excel(r'C:\Users\Nathan\Deep Learning\genome-to-protein-classifier\gene_features_dataset.xlsx')

In [8]:
# Spot check: run the first sequence of the data set through both translation
# steps and keep the amino-acid names as a single-row array in `result`.
genomic_data = pd.read_table('human_data.txt')  # columns: 'sequence', 'class'
dna = np.array(list(genomic_data['sequence'][0]))[np.newaxis, :]

result = rna_to_polypeptide(dna_to_rna(dna))
# Collapse to one row containing every element.
result = result.reshape((1, result.shape[0] * result.shape[1]))


[['tyr' 'gly' 'val' 'asp' 'leu' 'stop' 'trp' 'his' 'thr' 'gly' 'trp'
  'tyr' 'stop' 'trp' 'gly' 'tyr' 'glu' 'glu' 'cys' 'asp' 'lys' 'glu'
  'stop' 'trp' 'val' 'asp' 'phe' 'tyr' 'asn' 'leu' 'cys' 'leu' 'met'
  'val' 'asp' 'gly' 'gly' 'ser' 'gly' 'phe' 'gly' 'tyr' 'phe' 'tyr' 'phe'
  'leu' 'ile' 'leu' 'phe' 'gly' 'thr' 'leu' 'gly' 'phe' 'thr' 'cys' 'phe'
  'stop' 'thr' 'ser' 'glu' 'val' 'ser' 'asn' 'gly' 'gly' 'val' 'arg'
  'ile']]
