In [1]:
import torch
import torchvision
from torch import nn 
from torchvision import transforms
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image 
import numpy as np
from scipy.io import loadmat
import dataset_utils

#from bayesian_cls import *

In [2]:
df = pd.read_csv('final_dataset.csv',index_col=0)
tform = transforms.Compose([transforms.Resize((64,64)),transforms.PILToTensor(),transforms.ConvertImageDtype(torch.float),transforms.Normalize(0.5,0.5)])
image_dataset = torchvision.datasets.ImageFolder("image_dataset/",transform=tform)
species2genus = dataset_utils.species_label_to_genus_label(df,image_dataset)
batch_size = 1000 
import random
import dataset_utils
img2dna = dataset_utils.get_imgs_bold_id(image_dataset,df)

nucleotides = df[['nucleotide','species_name','genus_name','processid','image_urls']]
colonna_dna = df.loc[:,"nucleotide"]
nucleotides.loc[:,'nucleotide'] = colonna_dna.apply(dataset_utils.one_hot_encoding)
random.seed(42)

X_train_1, X_test, y_train_1, y_test = dataset_utils.data_split(nucleotides,0.2,random_state=42)
#print(y_test)
train_data = X_train_1
train_data['species_name'] = y_train_1

X_train, X_validation, y_train, y_validation = dataset_utils.data_split(train_data,0.2,drop_labels=False,random_state=42)
train_indices, val_indices, test_indices = dataset_utils.image_splits_from_df(X_train,X_validation,X_test,image_dataset)

In [3]:
all_labels = [element[1] for element in image_dataset.imgs]

In [4]:
paper_data = loadmat('data/INSECTS/data.mat')
paper_splits = loadmat('data/INSECTS/splits.mat')

In [5]:
expanded_train_dna_features = torch.load('dna_train_features.pt')
expanded_train_dna_labels = torch.load('dna_train_labels.pt')
image_train_features = torch.load('img_train_features.pt')
image_train_labels= torch.load('img_train_labels.pt')
assert((image_train_labels == expanded_train_dna_labels).all())

In [6]:
expanded_val_dna_features = torch.load('dna_val_features.pt')
expanded_val_dna_labels = torch.load('dna_val_labels.pt')
image_val_features = torch.load('img_val_features.pt')
image_val_labels = torch.load('img_val_labels.pt')
assert((image_val_labels == expanded_val_dna_labels).all())

In [7]:
expanded_test_dna_features = torch.load('dna_test_features.pt')
expanded_test_dna_labels = torch.load('dna_test_labels.pt')
image_test_features = torch.load('img_test_features.pt')
image_test_labels= torch.load('img_test_labels.pt')
assert((image_test_labels == expanded_test_dna_labels).all())

In [8]:
#complete_train_features = torch.cat((expanded_train_dna_features,image_train_features),dim=1)
#complete_val_features = torch.cat((expanded_val_dna_features,image_val_features),dim=1)

In [9]:
sorted_keys = sorted(species2genus.keys())
species2genus = np.array([species2genus[key] for key in sorted_keys])
print(species2genus)
species2genus = species2genus[...,np.newaxis]
species2genus.shape

[234 234 235 ...  44  44 311]


(1050, 1)

In [10]:
described_species_labels_train = set(np.array([image_dataset.targets[i] for i in train_indices]))
described_species_labels_trainval = set(np.array([image_dataset.targets[i] for i in train_indices+val_indices]))

In [11]:
train_loc = train_indices
trainval_loc = train_indices+val_indices

In [12]:
n_seen_val = 0
val_all_loc = set()
val_seen_loc = set() 
for i in val_indices:
    val_all_loc.add(i)
    if image_dataset[i][1] in described_species_labels_train:
        n_seen_val+=1
        val_seen_loc.add(i)
val_unseen_loc = val_all_loc - val_seen_loc

In [13]:
n_seen_test = 0
test_all_loc = set()
test_seen_loc = set() 
for i in test_indices:
    test_all_loc.add(i)
    if image_dataset[i][1] in described_species_labels_trainval:
        n_seen_test+=1
        test_seen_loc.add(i)
test_unseen_loc = test_all_loc - test_seen_loc

In [14]:
# Features training set
train_dna_features = expanded_train_dna_features.numpy()
train_image_features = image_train_features.numpy()

In [15]:
# Features validation set
val_seen_dna_features = expanded_val_dna_features.numpy()[:len(val_seen_loc)]
val_unseen_dna_features = expanded_val_dna_features.numpy()[len(val_seen_loc):]
val_seen_image_features = image_val_features.numpy()[:len(val_seen_loc)]
val_unseen_image_features= image_val_features.numpy()[len(val_seen_loc):]

In [16]:
# Features test set
test_seen_dna_features = expanded_test_dna_features.numpy()[:len(test_seen_loc)]
test_unseen_dna_features = expanded_test_dna_features.numpy()[len(test_seen_loc):]

test_seen_image_features = image_test_features.numpy()[:len(test_seen_loc)]
test_unseen_image_features= image_test_features.numpy()[len(test_seen_loc):]

In [17]:
print(train_dna_features.shape)
print(val_seen_dna_features.shape)
print(val_unseen_dna_features.shape)
print(test_seen_dna_features.shape)
print(test_unseen_dna_features.shape)

(13039, 3250)
(3234, 3250)
(3721, 3250)
(4990, 3250)
(7440, 3250)


In [18]:
print(train_image_features.shape)
print(val_seen_image_features.shape)
print(val_unseen_image_features.shape)
print(test_seen_image_features.shape)
print(test_unseen_image_features.shape)

(13039, 2048)
(3234, 2048)
(3721, 2048)
(4990, 2048)
(7440, 2048)


In [19]:
all_labels = np.array(all_labels)[...,np.newaxis]
all_labels.shape

(32424, 1)

In [20]:
data = {
    'G': species2genus+1,
    'embeddings_dna': np.concatenate((train_dna_features, val_seen_dna_features, val_unseen_dna_features, test_seen_dna_features, test_unseen_dna_features)),
    'embeddings_img': np.concatenate((train_image_features, val_seen_image_features, val_unseen_image_features, test_seen_image_features, test_unseen_image_features)),
    'labels': np.array(all_labels)+1,
}

In [21]:
print(paper_data['G'].shape)
print(paper_data['embeddings_dna'].shape)
print(paper_data['embeddings_img'].shape)
print(paper_data['labels'].shape)
print('\n')
print(data['G'].shape)
print(data['embeddings_dna'].shape)
print(data['embeddings_img'].shape)
print(data['labels'].shape)

(1040, 1)
(32848, 500)
(32848, 2048)
(32848, 1)


(1050, 1)
(32424, 3250)
(32424, 2048)
(32424, 1)


In [22]:
train_loc = (np.array(train_loc)+1)[...,np.newaxis]
trainval_loc = (np.array(trainval_loc)+1)[...,np.newaxis]

val_seen_loc = (np.array(list(val_seen_loc))+1)[...,np.newaxis]
val_unseen_loc = (np.array(list(val_unseen_loc))+1)[...,np.newaxis]



In [23]:

test_seen_loc = (np.array(list(test_seen_loc))+1)[...,np.newaxis]
test_unseen_loc = (np.array(list(test_unseen_loc))+1)[...,np.newaxis]

In [28]:
test_seen_loc

array([[   10],
       [   20],
       [   22],
       ...,
       [32408],
       [32416],
       [32419]])

In [29]:
val_seen_loc

array([[24577],
       [24580],
       [24582],
       ...,
       [24562],
       [24564],
       [24570]])

In [25]:
splits = {
    'train_loc': train_loc,
    'trainval_loc': trainval_loc,

    'val_seen_loc': val_seen_loc,
    'val_unseen_loc': val_unseen_loc,

    'test_seen_loc': test_seen_loc,
    'test_unseen_loc': test_unseen_loc,
    
}

In [26]:
print(paper_splits['train_loc'].shape)
print(paper_splits['trainval_loc'].shape)
print(paper_splits['val_seen_loc'].shape)
print(paper_splits['val_unseen_loc'].shape)
print(paper_splits['test_seen_loc'].shape)
print(paper_splits['test_unseen_loc'].shape)
print('\n')
print(splits['train_loc'].shape)
print(splits['trainval_loc'].shape)
print(splits['val_seen_loc'].shape)
print(splits['val_unseen_loc'].shape)
print(splits['test_seen_loc'].shape)
print(splits['test_unseen_loc'].shape)

(1, 12481)
(1, 19420)
(1, 2786)
(1, 4153)
(1, 4965)
(1, 8463)


(13039, 1)
(19994, 1)
(3234, 1)
(3721, 1)
(4990, 1)
(7440, 1)


In [27]:
from scipy.io import savemat

savemat('data_f.mat', data)
savemat('splits_f.mat', splits)

In [30]:
train_loc

array([[    1],
       [    2],
       [    3],
       ...,
       [32422],
       [32423],
       [32424]])