In [234]:
import torch
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
#import matplotlib.pyplot as plt
import numpy as np
import random

In [235]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [236]:
class MiniCNN(torch.nn.Module):
  """Small 1-D CNN: two conv/ReLU/max-pool stages followed by two linear layers.

  The network consumes 4-channel sequences (``Conv1d`` with ``in_channels=4``)
  and emits 2 raw scores per sample; no softmax is applied in ``forward``.

  Args:
    seq_len: Length of the input sequences. Defaults to 180, which gives the
      ``16 * 43`` flattened features the layers were originally sized for.

  Raises:
    ValueError: If ``seq_len`` is too short to survive both conv/pool stages.
  """

  def __init__(self, seq_len=180):
    super(MiniCNN, self).__init__()
    self.conv1 = torch.nn.Conv1d(4, 6, kernel_size=5)
    self.pool = torch.nn.MaxPool1d(kernel_size=2, stride=2)
    self.conv2 = torch.nn.Conv1d(6, 16, kernel_size=3)
    # Size the first linear layer from the sequence length instead of a
    # hard-coded constant so other input lengths can be used.
    self.flat_features = 16 * self._pooled_length(seq_len)
    self.fc1 = torch.nn.Linear(self.flat_features, 120)
    self.fc2 = torch.nn.Linear(120, 2)

  @staticmethod
  def _pooled_length(seq_len):
    """Return the sequence length left after conv1 -> pool -> conv2 -> pool."""
    after_conv1 = seq_len - 4        # kernel_size=5, no padding
    after_pool1 = after_conv1 // 2   # kernel_size=2, stride=2
    after_conv2 = after_pool1 - 2    # kernel_size=3, no padding
    after_pool2 = after_conv2 // 2
    if after_pool2 < 1:
      raise ValueError(
          "seq_len={} is too short for MiniCNN (minimum is 12)".format(seq_len))
    return after_pool2

  def forward(self, x):
    """Map ``x`` of shape (batch, 4, seq_len) to scores of shape (batch, 2).

    An unbatched (4, seq_len) input is treated as a batch of one.

    Raises:
      ValueError: If the input length does not match the ``seq_len`` the model
        was built for.
    """
    if x.dim() == 2:
      x = x.unsqueeze(0)
    x = self.pool(torch.nn.functional.relu(self.conv1(x)))
    x = self.pool(torch.nn.functional.relu(self.conv2(x)))
    # Flatten per sample. Reshaping with -1 on the batch axis could silently
    # fold a wrong-length input into extra "samples".
    x = x.flatten(start_dim=1)
    if x.size(1) != self.flat_features:
      raise ValueError(
          "expected {} flattened features but got {}; the input length does "
          "not match the seq_len this model was built for".format(
              self.flat_features, x.size(1)))
    x = torch.nn.functional.relu(self.fc1(x))
    x = self.fc2(x)
    return x


In [237]:
# Generating random data

# random_train_data = np.random.rand(32,1,28, 28)
# print(random_train_data.dtype)
# random_test_data = np.random.rand(16,1,28, 28)
# print(random_test_data.dtype)

In [238]:
# Converting the data to tensor type and floating point type

# tensor_train_data = torch.from_numpy(random_train_data).float()
# tensor_test_data = torch.from_numpy(random_test_data).float()

In [239]:
# print(tensor_train_data.shape)
# print(tensor_test_data.dtype)
# print(len(tensor_data))

In [240]:
# Creating random binary labels. and converting it to tensor

# label_test = np.random.choice([0, 1], size=len(tensor_test_data))
# label_train = np.random.choice([0, 1], size=len(tensor_train_data))

# print(label_train.dtype)
# label_test = torch.from_numpy(label_test)
# label_train = torch.from_numpy(label_train)
# print(label_test.dtype)

In [241]:

# The most important class: a custom Dataset (the object a DataLoader draws samples from). Make sure you understand how it works.

class data_class(Dataset):
    """Map-style dataset that pairs ``data[i]`` with ``labels[i]``."""

    def __init__(self, data, labels):
        # Both containers are stored as given; nothing is copied or validated.
        self.labels = labels
        self.data = data

    def __getitem__(self, id):
        """Return the ``(sample, label)`` pair stored at index ``id``."""
        return self.data[id], self.labels[id]

    def __len__(self):
        """Return the number of samples, taken from ``data``."""
        return len(self.data)

In [242]:
# Shuffle the raw dataset lines and split them 75% / 20% / remainder into
# train / test / validation text files.
# NOTE(review): the input path is an absolute, user-specific location; consider
# making it configurable.
DATASET_PATH = "/users/nsantil/smol_dataset.txt"
SPLIT_SEED = 42

with open(DATASET_PATH, "r") as dataset_file:
    # Drop blank lines and give every record exactly one trailing newline.
    # Without this, a final line lacking "\n" is glued to the record that
    # follows it once the shuffled lines are written back out.
    dataset = [line.rstrip("\n") + "\n" for line in dataset_file if line.strip()]

# A dedicated, seeded generator keeps the split identical across runs without
# touching the global ``random`` state.
random.Random(SPLIT_SEED).shuffle(dataset)

total_samples = len(dataset)
train_size = int(0.75 * total_samples)
test_size = int(0.20 * total_samples)
# Validation receives whatever is left so that no sample is lost to rounding.
validation_size = total_samples - train_size - test_size

train_data = dataset[:train_size]
test_data = dataset[train_size:train_size + test_size]
validation_data = dataset[train_size + test_size:]

with open("train_file.txt", "w") as train_file:
    train_file.writelines(train_data)

with open("test_file.txt", "w") as test_file:
    test_file.writelines(test_data)

with open("validation_file.txt", "w") as validation_file:
    validation_file.writelines(validation_data)


In [243]:
#Create one_hot_encoder and feed in the data
def one_hot_encoder(sequence):
    """One-hot encode a nucleotide sequence.

    Args:
        sequence: Iterable of single characters (typically a string).

    Returns:
        A ``float32`` array of shape ``(4, len(sequence))`` whose rows stand
        for A, C, G and T in that order. Any other symbol, including
        lowercase letters, leaves its column all zeros.
    """
    row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    encoded = np.zeros((4, len(sequence)), dtype=np.float32)

    for column, base in enumerate(sequence):
        row = row_of.get(base)
        if row is not None:
            encoded[row, column] = 1

    return encoded



In [244]:
def load_data(file_path):
    """Read ``<label> <sequence>`` lines and one-hot encode every sequence.

    Each non-blank line must start with an integer label, followed by
    whitespace and the sequence. Blank lines are skipped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(data, labels)`` tuple of NumPy arrays. ``data`` stacks the
        per-sequence ``(4, length)`` encodings and ``labels`` holds the
        integer labels in file order.

    Raises:
        ValueError: If a line has no sequence, its label is not an integer,
            or the sequences do not all have the same length.
    """
    data = []
    labels = []
    with open(file_path, "r") as handle:
        for line_number, line in enumerate(handle, start=1):
            parts = line.split(maxsplit=1)
            if not parts:
                # Blank or whitespace-only line: nothing to parse.
                continue
            if len(parts) < 2:
                raise ValueError(
                    "{}:{}: expected '<label> <sequence>', got {!r}".format(
                        file_path, line_number, line))
            label = int(parts[0])
            sequence = parts[1].strip()
            data.append(one_hot_encoder(sequence))
            labels.append(label)

    # Sequences of different lengths cannot be stacked into one array; fail
    # here with a clear message instead of deep inside NumPy or torch.
    lengths = {encoded.shape[1] for encoded in data}
    if len(lengths) > 1:
        raise ValueError(
            "{}: sequences have differing lengths {}".format(
                file_path, sorted(lengths)))

    return np.array(data), np.array(labels)

In [245]:
# Parse each split file back into (one-hot data, labels) NumPy arrays.
train_data, train_labels = load_data(file_path="train_file.txt")
test_data, test_labels = load_data(file_path="test_file.txt")
validation_data, validation_labels = load_data(file_path="validation_file.txt")

In [246]:
# Convert the NumPy arrays to tensors: float32 inputs, int64 class labels.
# ``torch.as_tensor`` is used instead of ``torch.tensor`` because this cell
# rebinds the same names. On a re-run the inputs are already tensors, and
# ``torch.tensor(tensor)`` would copy-construct them and emit a UserWarning.

train_data = torch.as_tensor(train_data, dtype=torch.float)
train_labels = torch.as_tensor(train_labels, dtype=torch.long)

test_data = torch.as_tensor(test_data, dtype=torch.float)
test_labels = torch.as_tensor(test_labels, dtype=torch.long)

validation_data = torch.as_tensor(validation_data, dtype=torch.float)
validation_labels = torch.as_tensor(validation_labels, dtype=torch.long)

In [247]:
# Wrap every split in the custom Dataset and batch it for the model.
# NOTE(review): the test loader is shuffled as well; that is not needed for
# evaluation - confirm whether it is intentional.
BATCH_SIZE = 2

train_dataloader = DataLoader(
    data_class(train_data, train_labels),
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_dataloader = DataLoader(
    data_class(test_data, test_labels),
    batch_size=BATCH_SIZE,
    shuffle=True,
)
validation_dataloader = DataLoader(
    data_class(validation_data, validation_labels),
    batch_size=BATCH_SIZE,
    shuffle=False,
)


In [248]:
# Print the label, shape and one-hot matrix of the first sample in the first
# batch of each split (test, then train, then validation).
for split_name, loader in (
    ("Test", test_dataloader),
    ("Train", train_dataloader),
    ("Validation", validation_dataloader),
):
    for data, label in loader:
        first_sample = data[0]
        print("{} Label:".format(split_name), label[0])
        print("Data shape:", first_sample.shape)
        print("One-hot encoded data sample:", first_sample)
        break  # only the first batch is inspected


Test Label: tensor(1)
Data shape: torch.Size([4, 180])
One-hot encoded data sample: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1.,
         1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0.,

In [249]:
# Build the network on the selected device, then attach an Adam optimizer and
# a cross-entropy loss.

model = MiniCNN()
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

In [250]:
def train(model,device,train_dataloader,optimizer,epochs,loss_fn=None,log_interval=2):
    """Run one training pass over ``train_dataloader``.

    Args:
        model: Network to optimise; switched to training mode.
        device: Device the batches are moved to before the forward pass.
        train_dataloader: Iterable of ``(inputs, labels)`` batches. Its
            ``dataset`` attribute and length are read for progress logging.
        optimizer: Optimizer stepped once per batch.
        epochs: Number of the current epoch; only used in the log line.
        loss_fn: Callable ``loss_fn(output, labels)`` returning a scalar
            loss. When omitted, the module-level ``loss_fn`` is used if one
            is defined, otherwise ``torch.nn.functional.cross_entropy``.
        log_interval: Print progress every this many batches. ``0`` or
            ``None`` turns progress logging off.
    """
    if loss_fn is None:
        # Keep existing five-argument calls working exactly as before by
        # preferring the notebook-level loss when there is one.
        loss_fn = globals().get("loss_fn")
    if loss_fn is None:
        loss_fn = torch.nn.functional.cross_entropy

    print("inside train")
    model.train()
    # NOTE(review): anomaly detection is a debugging aid that slows training.
    # It is a global switch, so it is set once here rather than on every
    # batch; consider removing it for real runs.
    torch.autograd.set_detect_anomaly(True)

    for batch_ids, (img, classes) in enumerate(train_dataloader):
        img = img.to(device)
        # The loss expects integer (long) class indices.
        classes = classes.to(device=device, dtype=torch.long)

        optimizer.zero_grad()
        output = model(img)
        loss = loss_fn(output, classes)

        loss.backward()
        optimizer.step()

        # Progress is reported inside the loop so that every
        # ``log_interval``-th batch is logged, and an empty loader never
        # reaches this line with ``batch_ids`` undefined.
        if log_interval and (batch_ids + 1) % log_interval == 0:
            print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                epochs, batch_ids* len(img), len(train_dataloader.dataset),
                100.*batch_ids / len(train_dataloader),loss.item()))

In [251]:
def test(model, device, test_dataloader):
    model.eval()
    test_loss=0
    correct=0
    with torch.no_grad():
        for img,classes in test_dataloader:
            img,classes=img.to(device), classes.to(device)
            y_hat=model(img)
            test_loss+=F.nll_loss(y_hat,classes,reduction='sum').item()
            _,y_pred=torch.max(y_hat,1)
            correct+=(y_pred==classes).sum().item()
        test_loss/=len(test_dataloader)
        print("\n Test set: Avarage loss: {:.0f},Accuracy:{}/{} ({:.0f}%)\n".format(
            test_loss,correct,len(test_dataloader),100.*correct/len(test_dataloader)))
        print('='*30)


In [252]:
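# NOTE: This notebook was originally written around random data (see the commented-out cells above), where the training and
# testing results did not matter; the aim is to showcase the use of a custom Dataset. The data is now read from a text file.
# In practice you have to clean the data before loading it into the model.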


if __name__=='__main__':
    seed=42
    EPOCHS=2

    # Apply the seed to every RNG the notebook imports so repeated runs of
    # this cell behave the same. The model weights were already initialised
    # in an earlier cell, so this only affects randomness from here on
    # (presumably the DataLoader shuffling order).
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # One training pass followed by one evaluation pass per epoch.
    for epoch in range(1,EPOCHS+1):
        train(model,device,train_dataloader,optimizer,epoch)
        test(model,device,test_dataloader)

inside train
