In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/dna-sequence-dataset/dog.txt
/kaggle/input/dna-sequence-dataset/example_dna.fa
/kaggle/input/dna-sequence-dataset/human.txt
/kaggle/input/dna-sequence-dataset/chimpanzee.txt


In [15]:
human = pd.read_table('/kaggle/input/dna-sequence-dataset/human.txt')
human.head()

Unnamed: 0,sequence,class
0,ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA...,4
1,ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG...,4
2,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3
3,ATGTGTGGCATTTGGGCGCTGTTTGGCAGTGATGATTGCCTTTCTG...,3
4,ATGCAACAGCATTTTGAATTTGAATACCAGACCAAAGTGGATGGTG...,3


In [16]:
chimpanzee = pd.read_table('/kaggle/input/dna-sequence-dataset/chimpanzee.txt')
chimpanzee.head()

Unnamed: 0,sequence,class
0,ATGCCCCAACTAAATACCGCCGTATGACCCACCATAATTACCCCCA...,4
1,ATGAACGAAAATCTATTCGCTTCATTCGCTGCCCCCACAATCCTAG...,4
2,ATGGCCTCGCGCTGGTGGCGGTGGCGACGCGGCTGCTCCTGGAGGC...,4
3,ATGGCCTCGCGCTGGTGGCGGTGGCGACGCGGCTGCTCCTGGAGGC...,4
4,ATGGGCAGCGCCAGCCCGGGTCTGAGCAGCGTGTCCCCCAGCCACC...,6


In [17]:
dog = pd.read_table('/kaggle/input/dna-sequence-dataset/dog.txt')
dog.head()

Unnamed: 0,sequence,class
0,ATGCCACAGCTAGATACATCCACCTGATTTATTATAATCTTTTCAA...,4
1,ATGAACGAAAATCTATTCGCTTCTTTCGCTGCCCCCTCAATAATAG...,4
2,ATGGAAACACCCTTCTACGGCGATGAGGCGCTGAGCGGCCTGGGCG...,6
3,ATGTGCACTAAAATGGAACAGCCCTTCTACCACGACGACTCATACG...,6
4,ATGAGCCGGCAGCTAAACAGAAGCCAGAACTGCTCCTTCAGTGACG...,0


In [18]:
def get_kmers(sequence, size=8):
    return [sequence[x:x+size].lower() for x in range(len(sequence) - size + 1)]

In [19]:
def generate_dataset(dfs,kmer_size,max_features,split=5):
    kmer_dfs=[]
    for cur_df in dfs:
        cur_df['words']=cur_df.apply(lambda x: get_kmers(x['sequence'],size=kmer_size), axis=1)
        cur_df=cur_df.drop('sequence',axis=1)
        kmer_dfs.append(cur_df)
    all_data=pd.concat(kmer_dfs).reset_index(drop=True)
    perm=np.random.permutation(len(all_data)) #shuffle the data
    test_data=all_data[:len(all_data)//split]
    train_data=all_data[len(all_data)//split:]
    train_kmers=[]
    for cur_kmer_list in train_data.words.values:
        train_kmers.extend(cur_kmer_list)
    vectorizer = CountVectorizer(max_features=max_features).fit(train_kmers) 
    
    print(train_data["class"].value_counts())
    print(test_data["class"].value_counts())
    
    X_train=[]
    Y_train=[]
    X_test=[]
    Y_test=[]
    for cur_data, label in zip(train_data['words'],train_data['class']):
        cur_transformed=vectorizer.transform(cur_data)
        X_train.append(cur_transformed.toarray().sum(axis=0))
        Y_train.append(label)
    for cur_data, label in zip(test_data['words'],test_data['class']):
        cur_transformed=vectorizer.transform(cur_data)
        X_test.append(cur_transformed.toarray().sum(axis=0))
        Y_test.append(label)  
    return X_train, Y_train, X_test, Y_test

In [20]:
X_train, Y_train, X_test, Y_test=generate_dataset([human,chimpanzee,dog],kmer_size=9,max_features=1500)

6    1702
4     910
3     772
0     753
1     551
2     476
5     342
Name: class, dtype: int64
6    422
1    243
3    223
4    197
0    143
2     81
5     67
Name: class, dtype: int64


In [32]:
X_train[:1][:1]

[array([0, 0, 0, ..., 0, 0, 0])]

In [43]:
import torch
from torch import nn
import torch.nn.functional as F

In [75]:

class DnaSequencer(nn.Module):
	def __init__(self,input_dim,hidden_dim1,hidden_dim2,output_dim):
		super().__init__()
		self.input_dim = input_dim
		self.hidden_dim1 = hidden_dim1
		self.hidden_dim2 = hidden_dim2
		self.output_dim =output_dim
		self.linear1 = nn.Linear(input_dim,hidden_dim1)
		self.linear2 = nn.Linear(hidden_dim1,hidden_dim2)
		self.fc = nn.Linear(hidden_dim2,output_dim)

	def forward(self,x:torch.Tensor):
		x = F.relu(x)
		x = self.linear1(x)
		x = self.linear2(x)
		x = F.relu(x)
		x = self.fc(x)
		return x

# Converting our data into tensors
import torch.utils.data as data_utils
train_data = data_utils.TensorDataset(torch.tensor(X_train), torch.tensor(Y_train))
train_loader = data_utils.DataLoader(train_data, batch_size=4, shuffle=True)
test_data=data_utils.TensorDataset(torch.tensor(X_test), torch.tensor(Y_test))
test_loader=data_utils.DataLoader(test_data, batch_size=4, shuffle=True)


# loading into a dataloader so that we can batchify our data

train_loader = DataLoader(train_data,batch_size=32,shuffle=True)
test_loader = DataLoader(test_data,batch_size=32,shuffle=False)
num_features=len(train_data[0][0])
num_labels=7
model = DnaSequencer(input_dim = num_features,hidden_dim1=num_features//5,hidden_dim2=num_features//10,output_dim=num_labels)
optimizer = torch.optim.SGD(model.parameters(),lr=0.02)
criterion = nn.CrossEntropyLoss()



In [76]:
def train_model(model,num_epochs,train_loader,criterion,optimizer,verbose=False,learning_rate=5e-3):
    for epoch in range(num_epochs):
        if(verbose):
            print('*' * 10)
            print(f'epoch {epoch+1}')
        running_loss = 0.0
        running_acc = 0.0
        for i, data in enumerate(train_loader, 1):
            cur_seq, label = data
            cur_seq=cur_seq.float()
            label=label.squeeze_()
            optimizer.zero_grad()
            out = model(cur_seq)
            loss = criterion(out, label)
            running_loss += loss.item()
            _, pred = torch.max(out, 1)
            running_acc += (pred == label).float().mean()
            loss.backward()
            optimizer.step()
            if (verbose):
                if i % 400 == 0:
                    print(f'[{epoch+1}/{num_epochs}] Loss: {running_loss/i:.6f}, Acc: {running_acc/i:.6f}')
        if(verbose):
            print(f'Finish {epoch+1} epoch, Loss: {running_loss/i:.6f}, Acc: {running_acc/i:.6f}')
    return running_acc/i, model
def validate_model(model,test_loader):
        # eval
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            cur_seq, labels = data
            cur_seq=cur_seq.float()
            labels=labels.squeeze_()
            # calculate outputs by running images through the network
            out = model(cur_seq)
            _, predicted = torch.max(out, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct/total

In [77]:
train_model(model=model,num_epochs=20,train_loader=train_loader,criterion=criterion,optimizer=optimizer)

(tensor(0.9713),
 DnaSequencer(
   (linear1): Linear(in_features=1500, out_features=300, bias=True)
   (linear2): Linear(in_features=300, out_features=150, bias=True)
   (fc): Linear(in_features=150, out_features=7, bias=True)
 ))

In [55]:
accuracy,model=train_model(model,20,train_loader,criterion,optimizer,verbose=False,learning_rate=5e-3)
print("Model training accuracy is %.4f"%accuracy)

Model training accuracy is 0.9948


In [65]:
model_valuation = validate_model(model,test_loader)
print(f"The model validation stands as {model_valuation*100} %")


The model validation stands as 79.36046511627907 %


In [74]:
#THE NEURAL NETWORK LEARNS WELL FROM OUR DATA . A HIGHER TRAIN ACCURACY ,HOWEVER INDICATES THAT WE ARE
#OVERFITTING OUR MODEL .
#WE CAN TRY TO IMPROVE OUR MODEL BY ADJUSTING THE HYPERPARAMETERS

In [73]:
# ADJUSTING OUR HYPERPARAMETERS
accuracy,model=train_model(model,12,train_loader,criterion,optimizer,verbose=False,learning_rate=0.002)
print("Model training accuracy is %.4f"%accuracy) # there is still overfitting,

Model training accuracy is 0.9948


REFERENCES :

https://www.kaggle.com/code/chuckzzzz/dna-classification-with-deep-learning-part-1
https://www.kaggle.com/code/satyanarayanam/dna-sequencing