In [1]:
import torch
import torchvision
from torch import nn 
#from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image 
import numpy as np
import random
import dataset_utils
from torch.utils.data import Dataset, DataLoader
import tqdm

In [2]:
all_images = torch.load('tensor_dataset/all_images.pt')
all_dnas = torch.load('tensor_dataset/all_dnas.pt')
all_labels = torch.load('tensor_dataset/all_labels.pt')
train_loc = torch.load('tensor_dataset/train_loc.pt')
val_seen_loc = torch.load('tensor_dataset/val_seen_loc.pt')
val_unseen_loc = torch.load('tensor_dataset/val_unseen_loc.pt')
test_seen_loc = torch.load('tensor_dataset/test_seen_loc.pt')
test_unseen_loc = torch.load('tensor_dataset/test_unseen_loc.pt')
species2genus = torch.load('tensor_dataset/species2genus.pt')


In [3]:
all_dnas.unique().shape

torch.Size([2])

In [4]:
dna_train = torch.clone(all_dnas[train_loc].data)
dna_val = torch.clone(torch.cat((all_dnas[val_seen_loc],all_dnas[val_unseen_loc])).data)
dna_test = torch.clone(torch.cat((all_dnas[test_seen_loc],all_dnas[test_unseen_loc])).data)

labels_train = torch.clone(all_labels[train_loc].data)
labels_val = torch.clone(torch.cat((all_labels[val_seen_loc],all_labels[val_unseen_loc])).data)
labels_test = torch.clone(torch.cat((all_labels[test_seen_loc],all_labels[test_unseen_loc])).data)
#un_train = torch.unique(dna_train.view(dna_train.size(0), -1), dim=0)
#un_val = torch.unique(dna_val.view(dna_val.size(0), -1), dim=0)
#un_test = torch.unique(dna_test.view(dna_test.size(0), -1), dim=0)

In [5]:
class DNAdataset(Dataset):
    def __init__(self, data, targets):
        self.data = data.float()
        self.targets = torch.tensor(targets.clone().detach())
        
    def __getitem__(self, index):
        x = self.data[index].unsqueeze(0)
        y = self.targets[index]
        return x, y
    def __len__(self):
        return len(self.data)
        
d_train = DNAdataset(dna_train, labels_train)
d_val = DNAdataset(dna_val, labels_val)
d_train_val = DNAdataset(torch.cat((dna_train,dna_val)), torch.cat((labels_train,labels_val)))
d_test = DNAdataset(dna_test,labels_test)

  self.targets = torch.tensor(targets.clone().detach())


In [6]:
dataloader_train = DataLoader(d_train, batch_size=32,shuffle=True)
dataloader_val = DataLoader(d_val, batch_size=32,shuffle=True)
dataloader_train_val = DataLoader(d_train_val, batch_size=32,shuffle=True)
dataloader_test = DataLoader(d_test, batch_size=32,shuffle=True)
dataloaders = {'train':dataloader_train,'val':dataloader_val,'train_val':dataloader_train_val,'test':dataloader_test}
dataset_sizes = {'train': d_train.data.shape[0], 'val':d_val.data.shape[0],'train_val':d_train_val.data.shape[0],'test':d_test.data.shape[0]}

In [7]:
is_train_val = False # SET TO TRUE IF YOU WANT TO USE TRAIN+VAL FOR TRAINING
if is_train_val:
    dataloaders['train'] = dataloaders['train_val']
    dataloaders['val'] = dataloaders['test']
    dataset_sizes['train'] = dataset_sizes['train_val']
    dataset_sizes['val'] = dataset_sizes['test']

In [8]:

from tqdm.notebook import tqdm
def fit(epochs,dataloaders,optimizer,model,start_idx=0):
    criterion = torch.nn.CrossEntropyLoss()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.cuda.empty_cache()
    
    train_losses = []
    train_scores = []
    val_losses = []
    val_scores = []
    for epoch in range(epochs):
        running_train_corrects = 0
        for dnas,labels in tqdm(dataloaders['train']):
            model.train()
            dnas = dnas.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            #print(dnas.shape)
            predicted_labels = model(dnas)
            train_loss = criterion(predicted_labels,labels)
            train_loss.backward()
            optimizer.step()
            
            _, preds = torch.max(predicted_labels, 1)
            #print(preds)
            #print(labels.data)
            running_train_corrects += torch.sum(preds == labels.data)
        train_losses.append(train_loss)
        
        running_val_corrects = 0
        for dnas,labels in tqdm(dataloaders['val']):
            
            model.eval()
            with torch.no_grad():
                dnas = dnas.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                
                predicted_labels = model(dnas)
                val_loss = criterion(predicted_labels,labels)
                
                _, preds = torch.max(predicted_labels, 1)
                #print(preds)
                #print(labels.data)
                running_val_corrects += torch.sum(preds == labels.data)
        val_losses.append(val_loss)
        
        
        
        #real_scores.append(real_score)
        #fit_p.writer.add_scalar('loss_g', loss_g, epoch)
        # Log losses & scores (last batch)
        
        epoch_train_acc = running_train_corrects.double() / dataset_sizes['train']
        epoch_val_acc = running_val_corrects.double() / dataset_sizes['val']
        print("Epoch [{}/{}], train_loss: {:.4f},  train_score: {:.4f},val_loss: {:.4f},  val_score: {:.4f}".format(
            epoch+1, epochs, train_loss, epoch_train_acc,val_loss,epoch_val_acc))
        #print(f"class accuracy real {class_accuracy_real}")
    
    return train_losses

In [9]:
class TinyModel(torch.nn.Module):
    def __init__(self):
        super(TinyModel, self).__init__()

        self.conv1 = torch.nn.Conv2d(1,16,(5,1))
        self.activation1 = torch.nn.LeakyReLU()
        self.norm1 = torch.nn.BatchNorm2d(16)
        self.conv2 = torch.nn.Conv2d(16,1,(5,1))
        self.activation2 = torch.nn.LeakyReLU()
        self.norm2 = torch.nn.BatchNorm2d(1)
        self.dropout1= torch.nn.Dropout(0.70)
        self.flat = torch.nn.Flatten()
        self.linear = torch.nn.Linear(3250,1500)
        self.dropout2= torch.nn.Dropout(0.70)
        self.activation3 = torch.nn.LeakyReLU()
        self.linear2 = torch.nn.Linear(1500,1050)
        #self.softmax = torch.nn.Softmax()
    def forward(self, x):
        #print(x.shape)
        x = self.conv1(x)
        x = self.activation1(x)
        x = self.norm1(x)
        x = self.conv2(x)
        x = self.activation2(x)
        x = self.norm2(x)
        x = self.dropout1(x)
        x = self.flat(x)
        x = self.linear(x)
        x = self.dropout2(x)
        x = self.activation3(x)
        x = self.linear2(x)
        return x
    def feature_extract(self,x):
        x = self.conv1(x)
        x = self.activation1(x)
        x = self.norm1(x)
        x = self.conv2(x)
        x = self.flat(x)
        return x




tinymodel = TinyModel()
tinymodel.cuda()
 
optimizer = torch.optim.Adam(tinymodel.parameters(),weight_decay=1e-5)
n_params = dataset_utils.count_trainable_parameters(tinymodel);
print(n_params)

6452761


In [23]:
fit(50,dataloaders,optimizer,tinymodel)

  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [1/50], train_loss: 3.6584,  train_score: 0.1795,val_loss: 6.6115,  val_score: 0.2296


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [2/50], train_loss: 2.9625,  train_score: 0.3999,val_loss: 7.1019,  val_score: 0.3405


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [3/50], train_loss: 1.5370,  train_score: 0.5678,val_loss: 7.4406,  val_score: 0.4079


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [4/50], train_loss: 1.9220,  train_score: 0.6883,val_loss: 8.5836,  val_score: 0.4315


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [5/50], train_loss: 0.8355,  train_score: 0.7705,val_loss: 7.9490,  val_score: 0.4446


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [6/50], train_loss: 0.8697,  train_score: 0.8290,val_loss: 8.4919,  val_score: 0.4512


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [7/50], train_loss: 0.1395,  train_score: 0.8628,val_loss: 8.9707,  val_score: 0.4523


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [8/50], train_loss: 0.2098,  train_score: 0.8895,val_loss: 4.9831,  val_score: 0.4539


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [9/50], train_loss: 0.0271,  train_score: 0.9080,val_loss: 5.3196,  val_score: 0.4549


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [10/50], train_loss: 0.2506,  train_score: 0.9187,val_loss: 10.2004,  val_score: 0.4561


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [11/50], train_loss: 0.4058,  train_score: 0.9317,val_loss: 6.9908,  val_score: 0.4558


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [12/50], train_loss: 0.1509,  train_score: 0.9406,val_loss: 5.9707,  val_score: 0.4561


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [13/50], train_loss: 0.0237,  train_score: 0.9443,val_loss: 7.4404,  val_score: 0.4574


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [14/50], train_loss: 0.3026,  train_score: 0.9521,val_loss: 2.8087,  val_score: 0.4564


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [15/50], train_loss: 0.8947,  train_score: 0.9561,val_loss: 3.4457,  val_score: 0.4564


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [16/50], train_loss: 0.0152,  train_score: 0.9597,val_loss: 7.3586,  val_score: 0.4574


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [17/50], train_loss: 0.0417,  train_score: 0.9611,val_loss: 10.0662,  val_score: 0.4569


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [18/50], train_loss: 0.0044,  train_score: 0.9634,val_loss: 4.3055,  val_score: 0.4571


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [19/50], train_loss: 0.0086,  train_score: 0.9673,val_loss: 5.6134,  val_score: 0.4569


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [20/50], train_loss: 0.8174,  train_score: 0.9672,val_loss: 4.3430,  val_score: 0.4574


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [21/50], train_loss: 0.0240,  train_score: 0.9693,val_loss: 4.8968,  val_score: 0.4578


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [22/50], train_loss: 0.0170,  train_score: 0.9683,val_loss: 2.7679,  val_score: 0.4578


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [23/50], train_loss: 0.0500,  train_score: 0.9740,val_loss: 4.9642,  val_score: 0.4582


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [24/50], train_loss: 0.0015,  train_score: 0.9737,val_loss: 2.9272,  val_score: 0.4571


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [25/50], train_loss: 0.1496,  train_score: 0.9748,val_loss: 5.5792,  val_score: 0.4578


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [26/50], train_loss: 0.0802,  train_score: 0.9766,val_loss: 6.8140,  val_score: 0.4577


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [27/50], train_loss: 0.0122,  train_score: 0.9755,val_loss: 2.2980,  val_score: 0.4578


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [28/50], train_loss: 0.0001,  train_score: 0.9775,val_loss: 4.2163,  val_score: 0.4577


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [29/50], train_loss: 1.9064,  train_score: 0.9761,val_loss: 6.7406,  val_score: 0.4581


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [30/50], train_loss: 0.2466,  train_score: 0.9795,val_loss: 2.8009,  val_score: 0.4590


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [31/50], train_loss: 0.0068,  train_score: 0.9811,val_loss: 6.9539,  val_score: 0.4585


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [32/50], train_loss: 0.0062,  train_score: 0.9801,val_loss: 6.2957,  val_score: 0.4585


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [33/50], train_loss: 0.1432,  train_score: 0.9797,val_loss: 9.4805,  val_score: 0.4587


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [34/50], train_loss: 0.0516,  train_score: 0.9788,val_loss: 4.1179,  val_score: 0.4571


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [35/50], train_loss: 0.0773,  train_score: 0.9806,val_loss: 4.7643,  val_score: 0.4585


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [36/50], train_loss: 0.0001,  train_score: 0.9833,val_loss: 9.2821,  val_score: 0.4588


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [37/50], train_loss: 1.1215,  train_score: 0.9832,val_loss: 5.3419,  val_score: 0.4587


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [38/50], train_loss: 0.0084,  train_score: 0.9821,val_loss: 1.5947,  val_score: 0.4585


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [39/50], train_loss: 0.0067,  train_score: 0.9841,val_loss: 4.2913,  val_score: 0.4587


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [40/50], train_loss: 0.0400,  train_score: 0.9833,val_loss: 3.4368,  val_score: 0.4584


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [41/50], train_loss: 0.0004,  train_score: 0.9855,val_loss: 5.2015,  val_score: 0.4578


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [42/50], train_loss: 0.1077,  train_score: 0.9829,val_loss: 8.7787,  val_score: 0.4579


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [43/50], train_loss: 0.2479,  train_score: 0.9826,val_loss: 6.6061,  val_score: 0.4582


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [44/50], train_loss: 0.0001,  train_score: 0.9848,val_loss: 5.3597,  val_score: 0.4590


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [45/50], train_loss: 0.0007,  train_score: 0.9854,val_loss: 5.4327,  val_score: 0.4581


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [46/50], train_loss: 0.1402,  train_score: 0.9870,val_loss: 7.2213,  val_score: 0.4585


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [47/50], train_loss: 0.0034,  train_score: 0.9857,val_loss: 10.4260,  val_score: 0.4588


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [48/50], train_loss: 0.0039,  train_score: 0.9846,val_loss: 2.1063,  val_score: 0.4582


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [49/50], train_loss: 0.0426,  train_score: 0.9829,val_loss: 5.8326,  val_score: 0.4584


  0%|          | 0/408 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

Epoch [50/50], train_loss: 0.2047,  train_score: 0.9849,val_loss: 7.4212,  val_score: 0.4585


[tensor(3.6584, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(2.9625, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(1.5370, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(1.9220, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.8355, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.8697, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.1395, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.2098, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.0271, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.2506, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.4058, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.1509, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.0237, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.3026, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.8947, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(0.0152, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor(

In [10]:
state_dict = torch.load('checkpoints/CNN_DNA_weights_for_unseen')
tinymodel.load_state_dict(state_dict['model_state_dict'])
optimizer.load_state_dict(state_dict['optimizer_state_dict'])

In [11]:
import scipy.io as io
matdataset = io.loadmat("get_unseen_fresh_samples/unseen_insect_dataset.mat")
all_dnas = torch.tensor(matdataset['all_dnas']).float()
all_genus_labels = matdataset['all_genus_labels'].squeeze()-1
all_dnas.shape


torch.Size([40050, 658, 5])

In [21]:
import scipy.io as io
def new_extract_expanded_dna_features(model : nn.Module,device :str ,save_to_disk : bool = False, save_name_prefix : str = ""):
    class DNAdataset(Dataset):
        def __init__(self, data, targets):
            self.data = torch.tensor(data)
            self.targets = torch.tensor(targets)
            
        def __getitem__(self, index):
            x = self.data[index].unsqueeze(0)
            y = self.targets[index]
            
            
            return x, y
        
        def __len__(self):
            return len(self.data)


    #matdataset = io.loadmat("get_unseen_fresh_samples/unseen_insect_dataset.mat")
    #all_dnas = matdataset['all_dnas']
    #all_genus_labels = matdataset['all_genus_labels'].squeeze()-1
    dataset = DNAdataset(all_dnas, all_genus_labels)
    dataloader= DataLoader(dataset, batch_size=32,shuffle=False)
    print(all_dnas.shape)
    print(all_genus_labels.shape)
    ###actual extraction of the feature from the model
    model.eval()
    with torch.no_grad():
        features = []
        labels = np.array([]) 
        for dnas,batch_labels in dataloader:
            #print(dnas.shape)
            dnas = dnas
            dnas = dnas.to(device)
            fts = model.feature_extract(dnas)
            labels = np.concatenate((labels, batch_labels.cpu().numpy()))
            features.append(fts.cpu().numpy())
            torch.cuda.empty_cache()
        features = torch.tensor(np.concatenate(features))
        labels = torch.tensor(labels)
    return features

In [22]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
import importlib 
all_dna_features = new_extract_expanded_dna_features(tinymodel,device,
                                               save_to_disk=False)

  self.data = torch.tensor(data)


torch.Size([40050, 658, 5])
(40050,)


In [17]:
all_dnas[0].shape

torch.Size([658, 5])

In [29]:
all_dnas[0].unsqueeze(0).unsqueeze(0).shape

torch.Size([1, 1, 658, 5])

In [30]:
tinymodel.eval()
with torch.no_grad():
   print( tinymodel.feature_extract(all_dnas[0].unsqueeze(0).unsqueeze(0).to(device)))

tensor([[ 0.6922, -0.1530,  0.7426,  ...,  0.7426,  0.0632,  0.7426]],
       device='cuda:0')


In [25]:
all_dna_features[0]

tensor([ 0.6922, -0.1530,  0.7426,  ...,  0.7426,  0.0632,  0.7426])

In [57]:
olddataset = io.loadmat('matlab_dataset/insect_dataset.mat')
oldfeatures = olddataset['all_dna_features_cnn_new']
oldgenus_labels = species2genus[olddataset['all_labels']-1].numpy().squeeze()

In [58]:
oldfeatures.shape

(32424, 3250)

In [59]:
oldgenus_labels.shape

(32424,)

In [60]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(min_samples_leaf=2,n_jobs=-1)
clf = clf.fit(oldfeatures, oldgenus_labels)
train_predicted_labels = clf.predict(oldfeatures)
print(f"Training genus accuracy:{np.count_nonzero(train_predicted_labels==oldgenus_labels)/len(oldgenus_labels)}")


Training genus accuracy:0.9999383172958303


In [61]:

val_predicted_labels= clf.predict(all_dna_features)
print(f"Validation genus accuracy:{np.count_nonzero(val_predicted_labels==all_genus_labels)/len(all_genus_labels)}")

Validation genus accuracy:0.2549063670411985


In [62]:
type(all_dna_features)

torch.Tensor

# Save extracted features in .mat

In [63]:
import scipy.io as io
all_dataset = io.loadmat('get_unseen_fresh_samples/unseen_insect_dataset.mat')
all_dataset['all_dna_features_cnn_new'] = all_dna_features.numpy()
io.savemat('get_unseen_fresh_samples/unseen_insect_dataset.mat',all_dataset)



# Random Forest (only used to try features, not indicative of final results)

In [14]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(min_samples_leaf=2,n_jobs=-1)
clf = clf.fit(expanded_train_dna_features,expanded_train_dna_labels )

In [15]:
train_predicted_labels = clf.predict(expanded_train_dna_features)
print(f"Training species accuracy:\
{np.count_nonzero(train_predicted_labels==expanded_train_dna_labels.numpy())/len(expanded_train_dna_labels)}")

Training species accuracy:0.9993864560165657


In [16]:
val_predicted_labels= clf.predict(expanded_val_dna_features)
print(f"Validation species accuracy:{np.count_nonzero(val_predicted_labels==expanded_val_dna_labels.numpy())/len(expanded_val_dna_labels)}")

Validation species accuracy:0.4567936736161035


In [17]:

temp_val_predicted_probs = clf.predict_proba(expanded_val_dna_features)
val_predicted_probs = np.zeros((len(temp_val_predicted_probs),1050))
for i, cls in enumerate(np.arange(1050)):
    if cls in clf.classes_:
        val_predicted_probs[:, cls] = temp_val_predicted_probs[:, list(clf.classes_).index(cls)]
import math
n_correct_genus = 0
for i in range(len(expanded_val_dna_labels)):
    #label_best_specie = val_predicted_probs[i].argmax()
    label_best_specie = val_predicted_labels[i]
    assert(val_predicted_labels[i]==val_predicted_probs[i].argmax())
    #print(label_best_specie.item())
    genus_of_best_species = species2genus[int(label_best_specie.item())]
    #species_same_genus = [k for k,v in species2genus.items() if v == genus_of_best_species]
    #reduced_species = val_predicted_probs[i][species_same_genus]
    #normalized_reduced_species = reduced_species/(reduced_species.sum())
    
    real_genus = species2genus[int(expanded_val_dna_labels[i].item())]
    predicted_genus = genus_of_best_species
    if real_genus == predicted_genus:
        n_correct_genus+=1
print(f"Validation genus accuracy: {n_correct_genus/len(expanded_val_dna_labels)}")


Validation genus accuracy: 0.8046010064701653
