In [17]:
import torch.nn as nn
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
from keras.preprocessing import sequence

In [15]:
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
pass

In [2]:
df = pd.read_csv(r'C:\Users\neele\OneDrive\Documents\Dataset\human_dna_sequence\human.txt', sep='\t')
df.head()

Unnamed: 0,sequence,class
0,ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA...,4
1,ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG...,4
2,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3
3,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3
4,ATGCAACAGCATTTTGAATTTGAATACCAGACCAAAGTGGATGGTG...,3


In [3]:
dna_numerical = {}

dna_numerical['A'] = 1
dna_numerical['C'] = 2
dna_numerical['G'] = 3
dna_numerical['T'] = 4

In [4]:
def dna_conversion(seq):
    sql_arr = []
    for i in range(len(seq)):
        if seq[i] in dna_numerical.keys():
            sql_arr.append(dna_numerical[seq[i]])

    return sql_arr

In [5]:
df['num_sequence'] = df['sequence'].apply(dna_conversion)

In [34]:
df['num_sequence'] = df['num_sequence'].apply(lambda x : x[:1000])

In [46]:
df['w_count'] = df['num_sequence'].apply(lambda x : len(x))
df.head()

Unnamed: 0,sequence,class,num_sequence,w_count
42,ATGAGGCCCGAGCGTCCCCGGCCGCGCGGCAGCGCCCCCGGCCCGA...,0,"[1, 4, 3, 1, 3, 3, 2, 2, 2, 3, 1, 3, 2, 3, 4, ...",543
43,ATGAGGCCCGAGCGTCCCCGGCCGCGCGGCAGCGCCCCCGGCCCGA...,0,"[1, 4, 3, 1, 3, 3, 2, 2, 2, 3, 1, 3, 2, 3, 4, ...",1000
44,NTGCAGGTGATTTTCAAAGCCAAGTCAAAATATTCTCCAGAATTAC...,0,"[4, 3, 2, 1, 3, 3, 4, 3, 1, 4, 4, 4, 4, 2, 1, ...",692
70,ATGCTCCAGTTCCCTCACATCAGCCAGTGCGAAGAGCTGCGGCTCA...,0,"[1, 4, 3, 2, 4, 2, 2, 1, 3, 4, 4, 2, 2, 2, 4, ...",190
71,ATGCTCCAGTTCCCTCACATCAGCCAGTGCGAAGAGCTGCGGCTCA...,0,"[1, 4, 3, 2, 4, 2, 2, 1, 3, 4, 4, 2, 2, 2, 4, ...",759


In [55]:
df = df[(df['w_count'] >= 300) & (df['w_count'] <= 600)]

In [56]:
df = df[(df['class'] == 0) | (df['class'] == 1)]

In [57]:
df.w_count.max()

600

In [58]:
X_train, X_test, y_train, y_test = train_test_split(sequence.pad_sequences(df['num_sequence'], maxlen=df.w_count.max()), df['class'], test_size = 0.20, random_state=42)

In [59]:
X_train = list(X_train)
X_test = list(X_test)
y_train = list(y_train)
y_test = list(y_test)

In [60]:
x_train = torch.FloatTensor(X_train).to(device)
y_train = torch.LongTensor(y_train).to(device)

x_test = torch.FloatTensor(X_test).to(device)
y_test = torch.LongTensor(y_test).to(device)

In [61]:
x_train.shape

torch.Size([174, 600])

In [65]:
class LSTMModule(nn.Module):

    def __init__(self):
        super(LSTMModule, self).__init__()

        self.input_layer_size = 600
        self.hidden_layer_size = 200
        self.output_layer_size = 1

        self.lstm = nn.LSTM(self.input_layer_size, self.hidden_layer_size)
        self.fc = nn.Linear(self.hidden_layer_size, self.output_layer_size)

        self.hidden_cell = (torch.zeros(1,1,self.hidden_layer_size).to(device), torch.zeros(1,1,self.hidden_layer_size).to(device))

        self.sigmoid = nn.Sigmoid()

    def forward(self, input_seq):

        lstm_out, self.hidden_cell = self.lstm(input_seq.view(1 ,1, -1), self.hidden_cell)
        predictions = self.sigmoid(self.fc(lstm_out.view(1, -1)))
        return predictions[-1]

In [66]:
model = LSTMModule().to(device)

criterion = nn.BCELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [68]:
print(model)

LSTMModule(
  (lstm): LSTM(600, 200)
  (fc): Linear(in_features=200, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [69]:
epochs = 10

running_loss_history = []
epoch_list = []
running_corrects_history = []
val_running_loss_history = []
val_running_corrects_history = []

model.train()

for i in range(epochs):

    running_corrects = 0.0
    val_running_loss = 0.0
    val_running_corrects = 0.0

    running_loss = 0.0

    for seq, labels in zip(x_train, y_train):

        optimizer.zero_grad()
        model.hidden_cell = (torch.zeros(1, 1, 200).to(device), torch.zeros(1, 1, 200).to(device))

        y_pred = model(seq)


        single_loss = criterion(y_pred, labels.unsqueeze(0).float())
        single_loss.backward()
        optimizer.step()

        preds = torch.round(y_pred).to(int).squeeze(0)
        running_corrects += torch.sum(preds == labels)

        running_loss += single_loss.item()


    else:
        with torch.no_grad():
            for val_inputs, val_labels in zip(x_test, y_test):

                val_outputs = model(val_inputs)
                val_loss = criterion(val_outputs, val_labels.unsqueeze(0).float())
                
                val_preds = torch.round(val_outputs).to(int).squeeze(0)

                val_running_loss += val_loss.item()
                val_running_corrects += torch.sum(val_preds == val_labels)

        
    epoch_loss = running_loss/len(x_train)
    epoch_acc = running_corrects.float()/ len(x_train)
    running_loss_history.append(epoch_loss)
    running_corrects_history.append(epoch_acc)

    val_epoch_acc = val_running_corrects.float()/ len(x_test)
    val_epoch_loss = val_running_loss/ len(x_test)
    val_running_loss_history.append(val_epoch_loss)
    val_running_corrects_history.append(val_epoch_acc)

    epoch_list.append(i + 1)

    print("Epoch : ",i + 1)
    print('training loss: {:.4f}, acc {:.4f} '.format(running_loss/len(x_train), epoch_acc.item()))
    print('validation loss: {:.4f}, validation acc {:.4f} '.format(val_epoch_loss, val_epoch_acc.item()))
    print('_'*20)
    

Epoch :  1
training loss: 0.8200, acc 0.5115 
validation loss: 0.6876, validation acc 0.6136 
____________________
Epoch :  2
training loss: 0.7467, acc 0.5402 
validation loss: 0.6857, validation acc 0.6136 
____________________
Epoch :  3
training loss: 0.7468, acc 0.5402 
validation loss: 0.6852, validation acc 0.6136 
____________________
Epoch :  4
training loss: 0.7469, acc 0.5402 
validation loss: 0.6849, validation acc 0.6136 
____________________
Epoch :  5
training loss: 0.7469, acc 0.5402 
validation loss: 0.6848, validation acc 0.6136 
____________________
Epoch :  6
training loss: 0.7469, acc 0.5402 
validation loss: 0.6847, validation acc 0.6136 
____________________
Epoch :  7
training loss: 0.7470, acc 0.5402 
validation loss: 0.6846, validation acc 0.6136 
____________________
Epoch :  8
training loss: 0.7470, acc 0.5402 
validation loss: 0.6846, validation acc 0.6136 
____________________
Epoch :  9
training loss: 0.7470, acc 0.5402 
validation loss: 0.6846, validatio

(218, 4)