# Loading data and traing a viral classifier on metagenomic sequences of mosquitoes

## Import dependencies 

In [56]:
import pandas as pd 
import numpy as np 

from torchmetagen.datasets.utils import FastaHandler, DatasetSplit, InflateDataset
from torchmetagen.datasets import metagenomicdataset as meta
from torchmetagen.models import DeepVirFinder, deepvirfinder
from torchmetagen.transforms import *
from torchvision import transforms as tf
import torch

from utils import *

## Check for GPU devices

In [57]:
device =  torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [58]:
path_to_file = 'Dataset_v1_2'

viral = FastaHandler(path_to_file, 'viral.fasta',)
nonviral= FastaHandler(path_to_file, 'nonviral.fasta',)



In [59]:
splitter = DatasetSplit({'train':0.7,'val':0.3 })

viral_train, viral_test= splitter(viral)
nonviral_train, nonviral_test= splitter(nonviral)


In [60]:

inflate=InflateDataset(method='truncated', tol=0.5, chunk_size=1000)

viral_train_inflated = inflate(viral_train)
viral_test_inflated = inflate(viral_test)


nonviral_train_inflated =  inflate(nonviral_train)
nonviral_test_inflated = inflate(nonviral_test)

In [61]:
transforms_train=tf.Compose([
    ReverseComplement(),
    ToOneHot(['G','T', 'C', 'A']),
    ToTensor('one-hot')
    # ToKmer(),
    # ToTensor()
])

transforms_test=tf.Compose([
    ReverseComplement(),
    ToOneHot(['G','T', 'C', 'A']),
    ToTensor('one-hot')
    # ToKmer(),
    # ToTensor()
])


dataset_train= meta.MetagenomicSequenceData(pd.DataFrame({"data":np.concatenate((nonviral_train_inflated, viral_train_inflated)),
                                                          "class":np.concatenate((np.repeat("nonviral",len(nonviral_train_inflated)),
                                                                                  np.repeat("viral",len(viral_train_inflated))))}),
                                                     labels=['nonviral', 'viral'], transform=transforms_train)

dataset_test= meta.MetagenomicSequenceData(pd.DataFrame({"data":np.concatenate((nonviral_test_inflated, viral_test_inflated)),
                                                         "class":np.concatenate((np.repeat("nonviral",len(nonviral_test_inflated)),
                                                                                 np.repeat("viral",len(viral_test_inflated))))}),
                                                     labels=['nonviral', 'viral'], transform=transforms_test)

dataset={'train': dataset_train, 'val': dataset_test}
dataset_sizes = {'train':len(dataset_train), 'val':len(dataset_test)}

In [62]:
dataloaders = genDataLoader(dataset, {'train':256, 'val':64})

In [63]:
dataloaders['train'].dataset[10]




((tensor([[1., 1., 0.,  ..., 1., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 1.],
          [0., 0., 1.,  ..., 0., 1., 0.]]),
  tensor([[1., 0., 0.,  ..., 0., 0., 0.],
          [0., 1., 0.,  ..., 1., 0., 0.],
          [0., 0., 1.,  ..., 0., 1., 1.],
          [0., 0., 0.,  ..., 0., 0., 0.]])),
 0)

In [9]:
model_torch = deepvirfinder(pretrained=False, progress=True, M = 1000, K = 10, N = 1000)


In [12]:

#device =  torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
optimizer = torch.optim.Adam(model_torch.parameters(), lr = 1e-4)
criterion = torch.nn.BCELoss()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,4)
per_epoch, per_batch = train_model(model_torch.to(device), criterion, optimizer, 
                      scheduler, dataloaders, device, dataset_sizes, num_epochs=15)

Epoch 0/14
----------
train Loss: 0.6943 Acc: 0.5044
val Loss: 0.6938 Acc: 0.2393

Epoch 1/14
----------
train Loss: 0.6931 Acc: 0.5012
val Loss: 0.6821 Acc: 0.8347

Epoch 2/14
----------
train Loss: 0.6927 Acc: 0.5039
val Loss: 0.6784 Acc: 0.8347

Epoch 3/14
----------
train Loss: 0.6925 Acc: 0.5310
val Loss: 0.6830 Acc: 0.8347

Epoch 4/14
----------
train Loss: 0.6924 Acc: 0.5090
val Loss: 0.7021 Acc: 0.1721

Epoch 5/14
----------
train Loss: 0.6919 Acc: 0.5440
val Loss: 0.6727 Acc: 0.8347

Epoch 6/14
----------
train Loss: 0.6917 Acc: 0.5148
val Loss: 0.7085 Acc: 0.1716

Epoch 7/14
----------
train Loss: 0.6926 Acc: 0.5029
val Loss: 0.6839 Acc: 0.8499

Epoch 8/14
----------
train Loss: 0.6915 Acc: 0.5174
val Loss: 0.6605 Acc: 0.8347

Epoch 9/14
----------
train Loss: 0.6903 Acc: 0.5431
val Loss: 0.6972 Acc: 0.3380

Epoch 10/14
----------
train Loss: 0.6899 Acc: 0.5649
val Loss: 0.6915 Acc: 0.5158

Epoch 11/14
----------
train Loss: 0.6892 Acc: 0.5545
val Loss: 0.6691 Acc: 0.8358

Ep

In [18]:
model = DeepVirFinder(M = 1000, K = 10, N = 1000)
model_dvf = torch.load('torchmetaagen/models/DeepVirFinder_300.pth', map_location=device)


In [39]:
node_name_mapping = {
    'conv1.weight': 'conv1d_1.weight',
    'conv1.bias': 'conv1d_1.bias',
    'fully_connected.fc1.weight': 'dense_1.weight',
    'fully_connected.fc1.bias': 'dense_1.bias',
    'fully_connected.fc2.weight': 'dense_2.weight',
    'fully_connected.fc2.bias': 'dense_2.bias'
}

from collections import OrderedDict
new_model = OrderedDict(
    {node_name_mapping[k]: v for k, v in model_dvf.items() }
)


In [41]:
model.load_state_dict(new_model)

<All keys matched successfully>

In [42]:
model

DeepVirFinder(
  (conv1d_1): Conv1d(4, 1000, kernel_size=(10,), stride=(1,))
  (maxpool): GlobalMaxPooling1D()
  (dropout): Dropout(p=0.1, inplace=False)
  (dense_1): Linear(in_features=1000, out_features=1000, bias=True)
  (dense_2): Linear(in_features=1000, out_features=1, bias=True)
  (relu1): ReLU()
  (relu2): ReLU()
  (sigmoid): Sigmoid()
)

In [47]:



pd.DataFrame(evaluate(model.to(device), dataloaders['val'], device))


RuntimeError: stack expects each tensor to be equal size, but got [4, 821] at entry 0 and [4, 1523] at entry 1