In [1]:
import torch
import torchvision
from torch import nn 
from torchvision import transforms
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image 
import numpy as np
from scipy.io import loadmat
import dataset_utils

from bayesian_cls import *

In [2]:
df = pd.read_csv('final_dataset.csv',index_col=0)
tform = transforms.Compose([transforms.Resize((64,64)),transforms.PILToTensor(),transforms.ConvertImageDtype(torch.float),transforms.Normalize(0.5,0.5)])
image_dataset = torchvision.datasets.ImageFolder("image_dataset/",transform=tform)
species2genus = dataset_utils.species_label_to_genus_label(df,image_dataset)
batch_size = 1000 
import random
import dataset_utils
img2dna = dataset_utils.get_imgs_bold_id(image_dataset,df)

nucleotides = df[['nucleotide','species_name','genus_name','processid','image_urls']]
colonna_dna = df.loc[:,"nucleotide"]
nucleotides.loc[:,'nucleotide'] = colonna_dna.apply(dataset_utils.one_hot_encoding)
random.seed(42)

X_train_1, X_test, y_train_1, y_test = dataset_utils.data_split(nucleotides,0.2,random_state=42)
#print(y_test)
train_data = X_train_1
train_data['species_name'] = y_train_1

X_train, X_validation, y_train, y_validation = dataset_utils.data_split(train_data,0.2,drop_labels=False,random_state=42)
train_indices, val_indices, test_indices = dataset_utils.image_splits_from_df(X_train,X_validation,X_test,image_dataset)

In [3]:
all_labels = [element[1] for element in image_dataset.imgs]

In [4]:
paper_data = loadmat('data/INSECTS/data.mat')
paper_splits = loadmat('data/INSECTS/splits.mat')

In [5]:
expanded_train_dna_features = torch.load('new_features/dna_train_features.pt')
expanded_train_dna_labels = torch.load('new_features/dna_train_labels.pt')
image_train_features = torch.load('new_features/img_train_features.pt')
image_train_labels= torch.load('new_features/img_train_labels.pt')
assert((image_train_labels == expanded_train_dna_labels).all())

In [6]:
expanded_val_dna_features = torch.load('new_features/dna_val_features.pt')
expanded_val_dna_labels = torch.load('new_features/dna_val_labels.pt')
image_val_features = torch.load('new_features/img_val_features.pt')
image_val_labels = torch.load('new_features/img_val_labels.pt')
assert((image_val_labels == expanded_val_dna_labels).all())

In [7]:
expanded_test_dna_features = torch.load('new_features/dna_test_features.pt')
expanded_test_dna_labels = torch.load('new_features/dna_test_labels.pt')
image_test_features = torch.load('new_features/img_test_features.pt')
image_test_labels= torch.load('new_features/img_test_labels.pt')
assert((image_test_labels == expanded_test_dna_labels).all())

In [8]:
complete_train_features = torch.cat((expanded_train_dna_features,image_train_features),dim=1)
complete_val_features = torch.cat((expanded_val_dna_features,image_val_features),dim=1)

In [9]:
sorted_keys = sorted(species2genus.keys())
species2genus = np.array([species2genus[key] for key in sorted_keys])
print(species2genus)
species2genus = species2genus[...,np.newaxis]
species2genus.shape

[234 234 235 ...  44  44 311]


(1050, 1)

In [10]:
described_species_labels_test = np.array([image_dataset.targets[i] for i in train_indices + val_indices])
described_species_labels_test = np.unique(described_species_labels_test)

In [11]:
described_species_labels_val = np.array([image_dataset.targets[i] for i in train_indices])
described_species_labels_val = np.unique(described_species_labels_val)

In [12]:
described_species_labels_val.shape

(703,)

In [13]:
described_species_labels_test.shape

(835,)

In [14]:
train_loc = train_indices
train_loc = np.array(train_loc)[np.newaxis,...]

In [15]:
train_loc.shape

(1, 13039)

In [16]:
paper_splits['train_loc'].shape

(1, 12481)

In [17]:
trainval_loc = train_indices + val_indices
trainval_loc = np.array(trainval_loc)[np.newaxis,...]

In [18]:
trainval_loc.shape

(1, 19994)

In [19]:
test_seen_loc = []
test_unseen_loc = []
for element in range(len(test_indices)):
    if element in described_species_labels_test:
        test_seen_loc.append(element)
    else:
        test_unseen_loc.append(element)
test_seen_loc = np.array(test_seen_loc)[np.newaxis,...]
test_unseen_loc = np.array(test_unseen_loc)[np.newaxis,...]

In [20]:
test_seen_loc.shape

(1, 835)

In [21]:
test_unseen_loc.shape

(1, 11595)

In [22]:
val_seen_loc = []
val_unseen_loc = []
for element in range(len(val_indices)):
    if element in described_species_labels_val:
        val_seen_loc.append(element)
    else:
        val_unseen_loc.append(element)
val_seen_loc = np.array(val_seen_loc)[np.newaxis,...]
val_unseen_loc = np.array(val_unseen_loc)[np.newaxis,...]
    

In [23]:
train_dna_features = expanded_train_dna_features.numpy()
train_image_features = image_train_features.numpy()

In [24]:
val_seen_dna_features = []
val_unseen_dna_features = []

val_seen_image_features = []
val_unseen_image_features = []

for index in val_seen_loc:
    val_seen_dna_features.append(expanded_val_dna_features[index])
    val_seen_image_features.append(image_val_features[index])

for index in val_unseen_loc:
    val_unseen_dna_features.append(expanded_val_dna_features[index])
    val_unseen_image_features.append(image_val_features[index])

val_seen_dna_features = np.squeeze(np.array(val_seen_dna_features), axis=0)
val_unseen_dna_features = np.squeeze(np.array(val_unseen_dna_features), axis=0)

val_seen_image_features = np.squeeze(np.array(val_seen_image_features), axis=0)
val_unseen_image_features = np.squeeze(np.array(val_unseen_image_features), axis=0)

In [25]:
test_seen_dna_features = []
test_unseen_dna_features = []

test_seen_image_features = []
test_unseen_image_features = []

for index in test_seen_loc:
    test_seen_dna_features.append(expanded_test_dna_features[index])
    test_seen_image_features.append(image_test_features[index])

for index in test_unseen_loc:
    test_unseen_dna_features.append(expanded_test_dna_features[index])
    test_unseen_image_features.append(image_test_features[index])

test_seen_dna_features = np.squeeze(np.array(test_seen_dna_features), axis=0)
test_unseen_dna_features = np.squeeze(np.array(test_unseen_dna_features), axis=0)

test_seen_image_features = np.squeeze(np.array(test_seen_image_features), axis=0)
test_unseen_image_features = np.squeeze(np.array(test_unseen_image_features), axis=0)

In [26]:
print(train_dna_features.shape)
print(val_seen_dna_features.shape)
print(val_unseen_dna_features.shape)
print(test_seen_dna_features.shape)
print(test_unseen_dna_features.shape)

(13039, 3250)
(703, 3250)
(6252, 3250)
(835, 3250)
(11595, 3250)


In [27]:
print(train_image_features.shape)
print(val_seen_image_features.shape)
print(val_unseen_image_features.shape)
print(test_seen_image_features.shape)
print(test_unseen_image_features.shape)

(13039, 2048)
(703, 2048)
(6252, 2048)
(835, 2048)
(11595, 2048)


In [28]:
all_labels = np.array(all_labels)[...,np.newaxis]
all_labels.shape

(32424, 1)

In [29]:
data = {
    'G': species2genus+1,
    'embeddings_dna': np.concatenate((train_dna_features, val_seen_dna_features, val_unseen_dna_features, test_seen_dna_features, test_unseen_dna_features)),
    'embeddings_img': np.concatenate((train_image_features, val_seen_image_features, val_unseen_image_features, test_seen_image_features, test_unseen_image_features)),
    'labels': np.array(all_labels)+1,
}

In [30]:
print(paper_data['G'].shape)
print(paper_data['embeddings_dna'].shape)
print(paper_data['embeddings_img'].shape)
print(paper_data['labels'].shape)
print('\n')
print(data['G'].shape)
print(data['embeddings_dna'].shape)
print(data['embeddings_img'].shape)
print(data['labels'].shape)

(1040, 1)
(32848, 500)
(32848, 2048)
(32848, 1)


(1050, 1)
(32424, 3250)
(32424, 2048)
(32424, 1)


In [31]:
splits = {
    'train_loc': np.arange(0,len(train_dna_features))[np.newaxis,...]+1,
    'trainval_loc': np.arange(0,len(train_dna_features)+len(val_seen_dna_features)+len(val_unseen_dna_features))[np.newaxis,...]+1,

    'val_seen_loc': np.arange(len(train_dna_features),len(train_dna_features)+len(val_seen_dna_features))[np.newaxis,...]+1,
    'val_unseen_loc': np.arange(len(train_dna_features)+len(val_seen_dna_features),len(train_dna_features)+len(val_seen_dna_features)+len(val_unseen_dna_features))[np.newaxis,...]+1,

    'test_seen_loc': np.arange(len(train_dna_features)+len(val_seen_dna_features)+len(val_unseen_dna_features),len(train_dna_features)+len(val_seen_dna_features)+len(val_unseen_dna_features)+len(test_seen_dna_features))[np.newaxis,...]+1,
    'test_unseen_loc': np.arange(len(train_dna_features)+len(val_seen_dna_features)+len(val_unseen_dna_features)+len(test_seen_dna_features),len(train_dna_features)+len(val_seen_dna_features)+len(val_unseen_dna_features)+len(test_seen_dna_features)+len(test_unseen_dna_features))[np.newaxis,...]+1,
    
}

In [32]:
print(paper_splits['train_loc'].shape)
print(paper_splits['trainval_loc'].shape)
print(paper_splits['val_seen_loc'].shape)
print(paper_splits['val_unseen_loc'].shape)
print(paper_splits['test_seen_loc'].shape)
print(paper_splits['test_unseen_loc'].shape)
print('\n')
print(splits['train_loc'].shape)
print(splits['trainval_loc'].shape)
print(splits['val_seen_loc'].shape)
print(splits['val_unseen_loc'].shape)
print(splits['test_seen_loc'].shape)
print(splits['test_unseen_loc'].shape)

(1, 12481)
(1, 19420)
(1, 2786)
(1, 4153)
(1, 4965)
(1, 8463)


(1, 13039)
(1, 19994)
(1, 703)
(1, 6252)
(1, 835)
(1, 11595)


In [275]:
from scipy.io import savemat

savemat('data_f.mat', data)
savemat('splits_f.mat', splits)