In [19]:
import optuna
from tqdm import tqdm
import logging
import sys
import torch
import os
from datetime import datetime
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#import torch.utils.data
#from torchvision import datasets
from torchvision import transforms
import matplotlib.pyplot as plt
from torch.utils.data import random_split, Dataset, DataLoader
from PIL import Image

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


### Define Dataset function

In [20]:
class GalaxyJungle(Dataset):
    
    #the init function initializes the directory containing the image,
    #the annotations file,
    #and both transforms
    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None, is_rgb=False):
        self.rgb = is_rgb
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform

    #returns number of samples in the dataset
    def __len__(self):
        return (self.img_labels).shape[0]

    #loads a sample from the dataset
    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, str(self.img_labels.iloc[idx, 0])) + '.jpg'
        #retrieves the image
        image = Image.open(img_path)
        if not self.rgb: image = image.convert('L')
        #retrieves corresponding label
        label = self.img_labels.iloc[idx, 1:]
        #if possible, transform the image and the label into a tensor.
        if self.transform:
            image = self.transform(image)
        label = torch.tensor(label.values, dtype=torch.float32)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label, self.img_labels.iloc[idx, 0]
    

transfs = transforms.Compose([
    transforms.ToTensor(), # Riscala le immagini tra 0 e 1
    # sarebbe interessante implementare un random crop prima del center crop per decentrare un poco le immagini????
    transforms.RandomHorizontalFlip(), # horizontal flip
    transforms.RandomVerticalFlip(), # vertical flip
    transforms.CenterCrop(324)          #CROP
    ]) #transforms.compose per fare una pipe di transformazioni

## NEURAL NETWORK

In [22]:
class GalaxyNet(nn.Module):
    def __init__(self, n_conv_layers, num_filters, num_neurons1, num_neurons2, activation, is_rgb=False, verbose=False):
        super().__init__()
        rgb = 3 if is_rgb else 1
        input_size = 324
        num_labels = 37
        self.loss_dict = {'batch' : [], 'epoch' : [], 'vbatch' : [], 'vepoch' : []}
        self.activation = activation
        self.num_convs = n_conv_layers


        stride = 2
        kernel_size = 3
        kernel_size_pool = 2
        
        ## convolutional layers
        self.convs = nn.ModuleList([
            nn.Conv2d(rgb, num_filters[0], kernel_size=kernel_size, stride=stride),
            nn.BatchNorm2d(num_filters[0])
            ])
        output_size = (input_size - kernel_size + stride) // (stride*kernel_size_pool)

        if verbose: print('output size after first conv layer: ', output_size)

        for i in range(1,n_conv_layers):
            self.convs.append(nn.Conv2d(num_filters[i-1], num_filters[i], kernel_size=kernel_size, stride=stride)) # num filters are the number of channel the conv layer outputs.
            self.convs.append(nn.BatchNorm2d(num_filters[i]))
            output_size = (output_size - kernel_size + stride) // (stride*kernel_size_pool) #padding 0, dilation = 1
            if verbose: 
                if i != n_conv_layers - 1: print('output size after conv layer {}: '.format(i), output_size)
                else: 
                    print('output size of the last conv layer: ', output_size)
                    print('len self convs: ',len(self.convs))
        if output_size in (0, -1): output_size = 1
        print(output_size)
        self.pool = nn.MaxPool2d(kernel_size=kernel_size_pool)
        #self.convs.append(nn.dropout(p= value)) ## to be added in the future to test claims of BatchnOrm paper
        
        self.out_feature = num_filters[-1] *output_size * output_size # output size of the last conv layer, should be 38
        self.fc1 = nn.Linear(self.out_feature, num_neurons1) # fully connected layer
        # dropout here if you want
        self.fc2 = nn.Linear(num_neurons1, num_neurons2)
        #dropout here if u want
        self.fc3 = nn.Linear(num_neurons2, num_labels)
        

    def init_weights(self):
        if self.activation == 'ReLU': # perchè kaiming normal e non uniform??1
            nonlin = 'relu'
            for i in range(0, self.num_convs*2, 2):
                nn.init.kaiming_normal_(self.convs[i].weight, nonlinearity=nonlin)
                if self.convs[i].bias is not None:
                    nn.init.constant_(self.convs[i].bias,0)
            nn.init.kaiming_normal_(self.fc1.weight, nonlinearity=nonlin)
            if self.fc1.bias is not None:
                nn.init.constant_(self.fc1.bias,0)
            nn.init.kaiming_normal_(self.fc2.weight, nonlinearity=nonlin)
            if self.fc2.bias is not None:
                nn.init.constant_(self.fc2.bias,0)

        elif self.activation == 'LeakyReLU':
            nonlin = 'leaky_relu'
            for i in range(0, self.num_convs*2, 2):
                nn.init.kaiming_normal_(self.convs[i].weight, a = 0.01, nonlinearity=nonlin)
                if self.convs[i].bias is not None:
                    nn.init.constant_(self.convs[i].bias,0)
            nn.init.kaiming_normal_(self.fc1.weight, a = 0.01, nonlinearity=nonlin)
            if self.fc1.bias is not None:
                nn.init.constant_(self.fc1.bias,0)
            nn.init.kaiming_normal_(self.fc2.weight, a = 0.01, nonlinearity=nonlin)
            if self.fc2.bias is not None:
                nn.init.constant_(self.fc2.bias,0) 
        nn.init.xavier_uniform_(self.fc3.weight)
        if self.fc3.bias is not None:
            nn.init.constant_(self.fc3.bias,0)
        return print('weights initialized with {}'.format(self.activation))         
        

    def forward(self, x):
        verbose=False
        for i in range(0, len(self.convs),2):
            x = self.convs[i](x)  # conv
            x = self.activation(x)  # act
            x = self.convs[i+1](x)  # batch norm
            x = self.pool(x)  # pool
            if verbose: print(x.shape)
        x = torch.flatten(x,1) # flatten operation -> 1 dimensional
        if verbose: print('last conv layer flattened',x.shape)
        if verbose: print('out-feature: ',self.out_feature)
        x = self.activation(self.fc1(x)) # apply relu al'output dei fully connected
        if verbose: print(x.shape)
        x = self.activation(self.fc2(x)) # idem sopra
        if verbose: print(x.shape)
        x = self.fc3(x)
        # x = nn.Sigmoid()(x)
        return x

    
    def log_the_loss(self, item,epoch=False): # per avere una history della loss???
        verbose=False
        train = self.__getstate__()['training']
        if verbose: print(train)
        if epoch and train:
            self.loss_dict['epoch'].append(item) ### get state of the model so you can ditch the validation parameter
        elif not epoch and train:
            self.loss_dict['batch'].append(item)
        elif not train and epoch:
            self.loss_dict['vepoch'].append(item)
        elif not train and not epoch:
            self.loss_dict['vbatch'].append(item)
        return item

            

img, lab, indx = DS.__getitem__(0)
#print(lab)
#print(img)         #3D TENSOR    
if DS.rgb:
    fig, ax = plt.subplots(1,3, figsize=(24,7))
    color = ['Reds', 'Greens', 'Blues']
    for i,j in enumerate(img):
        ax[i].imshow(j, cmap=color[i])
else:
    fig, ax = plt.subplots(1,1, figsize=(24,7))
    ax.imshow(img[0], cmap='magma')
#print(img.shape)


## TRAINING + VALIDATION

In [None]:
def one_epoch_train(model, train_loader, optimizer, loss_function, verbose=False):
    running_loss = 0
    last_loss = 0
    model.train()
    for i, data in tqdm(enumerate(train_loader)):
        inputs,labels, _ = data
        inputs,labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs) #, activation=F.relu)
        loss=loss_function(outputs, labels)
        loss.backward()
        optimizer.step() # fa update del parameter
        RMSEloss = np.sqrt(loss.item())
        running_loss += RMSEloss
        if verbose and i%10 ==0: print(f'Batch {i+1}/{len(train_loader)} - Loss: {RMSEloss:.3f}')

        model.log_the_loss(RMSEloss, epoch=False)
    epochmean_loss = running_loss / len(train_loader)
    print(f'\nLoss: {epochmean_loss:.3f}')
    model.log_the_loss(epochmean_loss, epoch=True)
    last_loss = RMSEloss
    print(f"Last loss: {last_loss:.3f}")
    return epochmean_loss



def one_epoch_eval(model, test_loader, loss_function, verbose=False):
    model.eval()
    running_validation_loss = 0.
   
    with torch.no_grad(): # deactivates gradient evaluation
        for i, vdata in enumerate(test_loader):
            inputs,labels, _ = vdata
            inputs,labels= inputs.to(device), labels.to(device)
            outputs = model(inputs)#, activation=F.relu)
            loss = loss_function(outputs,labels)
            RMSEloss = np.sqrt(loss.item())
            running_validation_loss +=RMSEloss
            model.log_the_loss(RMSEloss,epoch=False)
    mean_vloss=model.log_the_loss(running_validation_loss/len(test_loader),epoch=True)
    if verbose: print(f"Validation Loss: {mean_vloss:.3f}\n---")
    return mean_vloss

## OPTUNA


In [26]:
DS = GalaxyJungle('../data/training/training_solutions_rev1.csv', '../data/training/', transfs)
training, test, true_test = random_split(DS, [.65, .2, .15])

def objective(trial:optuna.Trial):
   
    ## Hyperspace
    num_conv_layers = 3
    #qui tuniamo il numero di filri, per layer più profondi ci vogliono più filtri (64-28 è consigliato per pattern astratti e combinazioni, mentre fino a 32 per dettagli locali) quindi proviamo (VGG usa fino a 512 per esempio).
    num_filters = [int(trial.suggest_int("num_filters_"+str(i), 16, 128, step=16)) for i in range(num_conv_layers)]
    ## abbiamo numneurons1 e numn neurons2,se mettiamo un grid sampler o un random sampler con num_neurons e basta penso che lui provi diverse combinazioni
    num_neurons1 = trial.suggest_int("num_neurons",50,200,step=10) 
    num_neurons2 = trial.suggest_int("num_neurons2",10,150,step=10)
    ### abbiamo chiamato mode l'activation function nell'initialization dei pesi o la chiamiamo activation o FUNZIONEDIATTIVAZIONE così optuna poi iniializza in base a quello
    activation = trial.suggest_categorical("activation", ["ReLU", "LeakyReLU"])
    optimizer = trial.suggest_categorical("optimizer", ["Adam", "SGD", "AdamW"]) #AdamW è suggerito per CNN.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True) #log true cerca i valori in scala logaritmica
    momentum = trial.suggest_float("momentum", 0.5, 0.9, step=0.1) #per SGD
    # batch size da tunare?
    batch_size = 32
    epochs = 50
    loss_function = nn.MSELoss()
    
    ##### Training phase
    
    
    
    train_loader = DataLoader(training, batch_size=batch_size, shuffle=True, num_workers=8)
    test_loader = DataLoader(test, batch_size=batch_size, shuffle=False, num_workers=8) 

    activation = getattr(nn, activation)()
    model = GalaxyNet(num_conv_layers, num_filters, num_neurons1, num_neurons2, activation).to(device)
    if optimizer == 'SGD': optimizer = getattr(optim, optimizer)(model.parameters(), lr=learning_rate, momentum = momentum)
    else: optimizer = getattr(optim, optimizer)(model.parameters(), lr=learning_rate)
    model.init_weights()

    for epoch in range(0, epochs):
        print(f'Training epoch {epoch}')
        one_epoch_train(model, train_loader, optimizer, loss_function, verbose=False)
        print(f'Validation epoch {epoch}')
        epoch_last_val_loss = one_epoch_eval(model, test_loader, loss_function, verbose=True)
        trial.report(epoch_last_val_loss,epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    
    score = epoch_last_val_loss
    return score

In [27]:
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
study_name = "first"  # Unique identifier of the study.
storage_name = "sqlite:///{}.db".format(study_name)
study = optuna.create_study(direction='minimize', study_name=study_name, storage=storage_name, load_if_exists=True)
study.optimize(objective, n_trials=5)

[I 2025-05-19 12:01:51,961] Using an existing study with name 'first' instead of creating a new one.


Using an existing study with name 'first' instead of creating a new one.
Using an existing study with name 'first' instead of creating a new one.
Using an existing study with name 'first' instead of creating a new one.
4
weights initialized with LeakyReLU(negative_slope=0.01)
Training epoch 0


1251it [00:43, 28.90it/s]


Loss: 0.144
Last loss: 0.131
Validation epoch 0





Validation Loss: 0.128
---
Training epoch 1


1251it [00:44, 28.25it/s]


Loss: 0.126
Last loss: 0.142
Validation epoch 1





Validation Loss: 0.128
---
Training epoch 2


1251it [00:43, 28.98it/s]


Loss: 0.120
Last loss: 0.114
Validation epoch 2





Validation Loss: 0.118
---
Training epoch 3


1251it [00:45, 27.38it/s]


Loss: 0.116
Last loss: 0.109
Validation epoch 3





Validation Loss: 0.113
---
Training epoch 4


1251it [00:43, 28.86it/s]


Loss: 0.114
Last loss: 0.114
Validation epoch 4





Validation Loss: 0.116
---
Training epoch 5


1251it [01:33, 13.37it/s]


Loss: 0.112
Last loss: 0.123
Validation epoch 5





Validation Loss: 0.116
---
Training epoch 6


1251it [04:32,  4.60it/s]


Loss: 0.111
Last loss: 0.114
Validation epoch 6





Validation Loss: 0.112
---
Training epoch 7


1251it [04:32,  4.60it/s]


Loss: 0.110
Last loss: 0.100
Validation epoch 7





Validation Loss: 0.110
---
Training epoch 8


1251it [00:43, 28.84it/s]


Loss: 0.109
Last loss: 0.089
Validation epoch 8





Validation Loss: 0.113
---
Training epoch 9


1251it [00:37, 33.44it/s]


Loss: 0.108
Last loss: 0.103
Validation epoch 9





Validation Loss: 0.112
---
Training epoch 10


1251it [00:38, 32.31it/s]


Loss: 0.107
Last loss: 0.106
Validation epoch 10





Validation Loss: 0.111
---
Training epoch 11


1251it [00:39, 31.38it/s]


Loss: 0.106
Last loss: 0.103
Validation epoch 11





Validation Loss: 0.110
---
Training epoch 12


1251it [00:40, 31.23it/s]


Loss: 0.106
Last loss: 0.091
Validation epoch 12





Validation Loss: 0.108
---
Training epoch 13


1251it [00:39, 31.79it/s]


Loss: 0.105
Last loss: 0.103
Validation epoch 13





Validation Loss: 0.106
---
Training epoch 14


1251it [00:40, 30.67it/s]


Loss: 0.105
Last loss: 0.095
Validation epoch 14





Validation Loss: 0.107
---
Training epoch 15


1251it [00:40, 30.76it/s]


Loss: 0.104
Last loss: 0.111
Validation epoch 15





Validation Loss: 0.107
---
Training epoch 16


1251it [00:46, 26.63it/s]


Loss: 0.103
Last loss: 0.105
Validation epoch 16





Validation Loss: 0.108
---
Training epoch 17


259it [00:11, 22.57it/s]
[W 2025-05-19 12:27:06,196] Trial 13 failed with parameters: {'num_filters_0': 16, 'num_filters_1': 128, 'num_filters_2': 128, 'num_neurons': 130, 'num_neurons2': 80, 'activation': 'LeakyReLU', 'optimizer': 'Adam', 'learning_rate': 0.0002738011654544197, 'momentum': 0.5} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/teobaldo/miniconda3/envs/jungle/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_20210/3746711845.py", line 38, in objective
    one_epoch_train(model, train_loader, optimizer, loss_function, verbose=False)
  File "/tmp/ipykernel_20210/2758126940.py", line 7, in one_epoch_train
    inputs,labels = inputs.to(device), labels.to(device)
                    ^^^^^^^^^^^^^^^^^
KeyboardInterrupt


Trial 13 failed with parameters: {'num_filters_0': 16, 'num_filters_1': 128, 'num_filters_2': 128, 'num_neurons': 130, 'num_neurons2': 80, 'activation': 'LeakyReLU', 'optimizer': 'Adam', 'learning_rate': 0.0002738011654544197, 'momentum': 0.5} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/teobaldo/miniconda3/envs/jungle/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_20210/3746711845.py", line 38, in objective
    one_epoch_train(model, train_loader, optimizer, loss_function, verbose=False)
  File "/tmp/ipykernel_20210/2758126940.py", line 7, in one_epoch_train
    inputs,labels = inputs.to(device), labels.to(device)
                    ^^^^^^^^^^^^^^^^^
KeyboardInterrupt
Trial 13 failed with parameters: {'num_filters_0': 16, 'num_filters_1': 128, 'num_filters_2': 128, 'num_neurons': 130, 'num_neurons2': 80, 

[W 2025-05-19 12:27:06,199] Trial 13 failed with value None.


Trial 13 failed with value None.
Trial 13 failed with value None.
Trial 13 failed with value None.


KeyboardInterrupt: 