In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torchvision import transforms
import zookeeper as zk
import os
from tqdm import tqdm
from PIL import Image
import numpy as np

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'spada e s{device}')

spada e scuda


In [3]:
class GalaxyJungle(Dataset):
    """Image dataset described by a CSV file.

    The first CSV column is the galaxy id, which doubles as the image file
    stem (``<img_dir>/<id>.jpg``). All remaining columns form the label vector.
    """

    def __init__(self, annotations_file, img_dir, transform=None, mappy=False, is_rgb=False):
        """Load the annotation table and remember how samples must be prepared.

        Args:
            annotations_file: CSV path passed to ``pd.read_csv``.
            img_dir: directory that contains the ``.jpg`` images.
            transform: optional callable applied to the opened PIL image.
            mappy: when truthy, each label goes through ``zk.mappy``
                (external helper; its mapping is not visible in this file).
            is_rgb: when falsy, images are converted to single-channel 'L' mode.
        """
        self.img_labels = pd.read_csv(annotations_file)
        self.img_dir = img_dir
        self.transform = transform
        self.mappy = mappy
        self.rgb = is_rgb

    def __len__(self):
        """Number of rows in the annotation table."""
        return len(self.img_labels.index)

    def __getitem__(self, idx):
        """Return ``(image, label, gal_id)`` for the row at position ``idx``."""
        # Column 0 is both the returned id and the image file stem.
        gal_id = self.img_labels.iloc[idx, 0]
        img_path = os.path.join(self.img_dir, f'{gal_id}.jpg')

        image = Image.open(img_path)
        if not self.rgb:
            image = image.convert('L')
        if self.transform:
            image = self.transform(image)

        # Every column after the id is part of the label vector.
        target_row = self.img_labels.iloc[idx, 1:]
        label = torch.tensor(target_row.values, dtype=torch.float32)
        if self.mappy:
            label = zk.mappy(label)

        return image, label, gal_id

In [4]:
class GalaxyNet(nn.Module):
    """Small CNN regressor: one conv stage followed by a two-layer MLP head.

    The conv stage is Conv2d(3x3, 64 filters, no bias) -> activation ->
    BatchNorm2d -> MaxPool2d(2). The head flattens the feature map and maps it
    through Linear(…, 100) -> activation -> Linear(100, 37).

    The size of the first linear layer is derived from a hard-coded 128x128
    input, so the network expects 128x128 images.

    Attributes:
        loss_dict: lists of logged losses keyed by 'batch', 'epoch' (training
            mode) and 'vbatch', 'vepoch' (evaluation mode).
    """

    def __init__(self, activation, initialization=False, mappy=False, is_rgb=False):
        """Build the layers.

        Args:
            activation: activation *class* (e.g. ``nn.ReLU``); it is called
                with no arguments to create each activation layer.
            initialization: when truthy, ``init_weights`` is run at the end.
            mappy: when truthy, ``forward`` passes its output through
                ``zk.mappy2D`` (external helper, not visible in this file).
            is_rgb: 3 input channels when truthy, otherwise 1.

        Raises:
            ValueError: if the computed feature-map side is smaller than 2, or
                (via ``init_weights``) if ``initialization`` is requested for
                an unsupported activation.
        """
        super().__init__()

        self.mappy = mappy
        rgb = 3 if is_rgb else 1
        input_size = 128
        num_labels = 37
        self.loss_dict = {'batch' : [], 'epoch' : [], 'vbatch' : [], 'vepoch' : []}
        self.activation = activation

        self.convs = nn.Sequential(
            nn.Conv2d(rgb, 64, 3, bias=False),
            self.activation(),
            nn.BatchNorm2d(64),

            nn.MaxPool2d(2)
            )

        # Track the spatial side through the conv stage so the first linear
        # layer can be sized. zk.convool_size is an external helper; presumably
        # it returns the output side for (size, kernel, stride, padding) — TODO confirm.
        # NOTE: kernel 3 / stride 1 for convs and kernel 2 / stride 2 for pools
        # are hard-coded here and must match the layers declared above.
        for layer in self.convs:
            if isinstance(layer, nn.Conv2d):
                padding = 'same' if layer.padding == 'same' else 0
                input_size = zk.convool_size(input_size, 3, 1, padding)
            elif isinstance(layer, nn.MaxPool2d):
                input_size = zk.convool_size(input_size, 2, 2)
        if input_size < 2:
            raise ValueError('You shrank too much dude.')
        print(f'Convs output size: {input_size}')

        # 64 is the channel count produced by the last conv layer above.
        input_linear = 64 * input_size * input_size

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(input_linear, 100),
            self.activation(),
            nn.Linear(100, num_labels)
            )

        if initialization:
            self.init_weights()


    def forward(self, x):
        """Run the conv stage and the head; optionally post-process with ``zk.mappy2D``."""
        x = self.convs(x)
        x = self.fc(x)
        if self.mappy:
            x = zk.mappy2D(x)
        return x


    def init_weights(self):
        """Re-initialise the weights to suit the configured activation.

        Conv layers and the hidden linear layer get Kaiming-normal weights, the
        output linear layer gets Xavier-uniform weights, and both linear biases
        are zeroed.

        Raises:
            ValueError: if the activation is neither ``nn.ReLU`` nor
                ``nn.LeakyReLU``. Raised before any parameter is modified.
        """
        if self.activation == nn.ReLU:
            nonlin = 'relu'
            a = 0
        elif self.activation == nn.LeakyReLU:
            nonlin = 'leaky_relu'
            a = .01
        else:
            # Previously this fell through and crashed later with an
            # UnboundLocalError on `nonlin` / `a`.
            raise ValueError(
                f'init_weights supports only nn.ReLU and nn.LeakyReLU activations, got {self.activation!r}'
            )

        for layer in self.convs:
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight, a=a, nonlinearity=nonlin)

        # fc[1] is the hidden linear layer, fc[-1] the output linear layer.
        for i in (1, -1):
            nn.init.constant_(self.fc[i].bias, 0)

        nn.init.kaiming_normal_(self.fc[1].weight, a=a, nonlinearity=nonlin)
        nn.init.xavier_uniform_(self.fc[-1].weight)


    def log_the_loss(self, item, epoch=False):
        """Append ``item`` to the matching list in ``loss_dict`` and return it.

        The list is chosen from the module mode and the ``epoch`` flag:
        training -> 'batch' / 'epoch', evaluation -> 'vbatch' / 'vepoch'.
        """
        # `self.training` is the public flag toggled by train() / eval().
        prefix = '' if self.training else 'v'
        suffix = 'epoch' if epoch else 'batch'
        self.loss_dict[prefix + suffix].append(item)
        return item

In [None]:
# Build the network and the loss. The constructor arguments must match the ones
# used when the checkpoint below was saved, otherwise load_state_dict fails.
model = GalaxyNet(nn.ReLU, initialization=False, mappy=False, is_rgb=False).to(device)
loss_function = nn.MSELoss()

# Restore previously trained weights (comment out the next two lines to start from scratch).
# NOTE: loss_dict will be empty after loading; remember which loss_{}.pickle file
# was written last by the previous training runs so the values can be appended together.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine too.
loader = torch.load('../PADella/model_optim_110.pt', map_location=device, weights_only=True)
model.load_state_dict(loader['model_state_dict'])

Convs output size: 63


<All keys matched successfully>

In [7]:
# Preprocessing applied to every test image.
transform = transforms.Compose([
    transforms.ToTensor(),  # rescales the images to the [0, 1] range
    transforms.CenterCrop(324),
    transforms.Resize(128),
])

DS = GalaxyJungle('../data/test_2/test_solutions_rev1.csv', '../data/test_2/', transform, mappy=False, is_rgb=False)
# os.cpu_count() may return None; fall back to loading in the main process.
test_loader = DataLoader(DS, batch_size=512, shuffle=False, num_workers=os.cpu_count() or 0)

RESULT_COLUMNS = ['GalaxyID','E0','E3','E6','S0a_eon','SB0a_eon','Scd_eon','SoB','SoA','SAa','SAb','SAc','SAd','SBa','SBb','SBc','SBd','A']

# Placeholders kept from the original cell; nothing in this cell reads them.
true_results = pd.DataFrame(columns=RESULT_COLUMNS)
train_results = pd.DataFrame(columns=RESULT_COLUMNS)
train_list = []
running_validation_loss = 0

model.eval()
batches = []  # one (batch_size, 18) tensor per batch; concatenated once after the loop
with torch.no_grad():
    for i, vdata in enumerate(tqdm(test_loader)):
        inputs, labels, GalaxyID = vdata
        # zk.mappy2D is an external helper; presumably it maps the 37 raw
        # outputs onto the 17 class columns listed above — TODO confirm.
        outputs = zk.mappy2D(model(inputs.to(device)))
        # Ids and scores share one tensor. float64 keeps integer ids exact
        # (float32 is exact only up to 2**24).
        GalaxyID = GalaxyID.to(torch.float64).unsqueeze(1)
        batch = torch.cat((GalaxyID, outputs.cpu().to(torch.float64)), dim=-1)
        batches.append(batch)

# Column 0 is the GalaxyID, the remaining columns are the per-class scores.
tens = torch.cat(batches, dim=0) if batches else torch.empty((0, len(RESULT_COLUMNS)), dtype=torch.float64)

print(tens.shape)



13it [00:02,  4.89it/s]

torch.Size([6158, 18])





In [8]:
# --- Model predictions -------------------------------------------------------
# Convert the tensor explicitly instead of relying on pandas to coerce it.
# Negative scores are clamped to 0, as before.
train_results = pd.DataFrame(
    tens.detach().cpu().numpy(),
    columns=['GalaxyID','E0','E3','E6','S0a_eon','SB0a_eon','Scd_eon','SoB','SoA','SAa','SAb','SAc','SAd','SBa','SBb','SBc','SBd','A'],
).clip(lower=0)
train_results['GalaxyID'] = train_results['GalaxyID'].astype(int)
train_results = train_results.sort_values('GalaxyID', ignore_index=True)

# --- Ground truth ------------------------------------------------------------
path = '../data/test_2/test_solutions_rev1.csv'  # original note: "the path is universal"
dataframe = pd.read_csv(path, sep=',')
# zk.mappy_df is an external helper; presumably it returns GalaxyID followed by
# the same class columns as train_results — TODO confirm.
true_results = zk.mappy_df(dataframe)
true_results = true_results.sort_values('GalaxyID', ignore_index=True)

# --- Top class and its score -------------------------------------------------
# Slice the score columns (everything after GalaxyID) once, before any label or
# value column is appended, so the result does not depend on statement order.
train_scores = train_results.iloc[:, 1:]
lbs_train = train_scores.idxmax(axis=1)
val_train = train_scores.max(axis=1)

true_scores = true_results.iloc[:, 1:]
lbs_test = true_scores.idxmax(axis=1)
val_true = true_scores.max(axis=1)

train_results['train_label'] = lbs_train.values
true_results['true_label'] = lbs_test.values
train_results['train_value'] = val_train.values
true_results['true_value'] = val_true.values



  class4_2 / class4_2.sum(axis=1, keepdims=True),
  class5_4 / class5_4.sum(axis=1, keepdims=True),


In [12]:
# Keep only galaxies whose ground-truth top class is reasonably confident.
MIN_TRUE_VALUE = 0.6
true_new = true_results[true_results['true_value'] > MIN_TRUE_VALUE]

# One row per galaxy present in both frames: predicted label next to true label.
confronto = pd.merge(train_results[['GalaxyID','train_label']], true_new[['GalaxyID','true_label']], on='GalaxyID')

total_count = confronto.groupby('true_label').size().reset_index(name='total_count')
mislabels = confronto[confronto['train_label'] != confronto['true_label']]
count = (
    mislabels.groupby('true_label').size()
    .reset_index(name='num_misclassified')
    .sort_values(by='num_misclassified', ascending=False)
)

final_count = pd.merge(total_count, count, on='true_label', how='left')
# Classes with no mistakes are missing from `count`; the left merge leaves NaN
# there, which must read as 0 mistakes rather than produce a 'nan%' score.
final_count['num_misclassified'] = final_count['num_misclassified'].fillna(0).astype(int)

# Round *after* scaling to percent: round(x, 2) * 100 yields artefacts such as
# 28.999999999999996.
accuracy_pct = (100 * (1 - final_count['num_misclassified'] / final_count['total_count'])).round(0)
final_count['score'] = accuracy_pct.astype('str') + '%'

final_count = final_count.sort_values(by='total_count', ascending=False, kind='stable')
display(final_count)



Unnamed: 0,true_label,total_count,num_misclassified,score
1,E0,605,34,94.0%
2,E3,439,86,80.0%
4,S0a_eon,103,18,83.0%
10,SoA,78,37,53.0%
6,SAc,40,3,92.0%
9,Scd_eon,39,7,82.0%
11,SoB,22,16,27.0%
3,E6,14,10,28.999999999999996%
5,SAb,10,7,30.0%
8,SBc,10,8,20.0%


In [13]:
# Number of galaxies for every (true label, predicted label) pair.
label_pairs = confronto.groupby(by=['true_label', 'train_label'])
x = label_pairs.count()
display(x)

Unnamed: 0_level_0,Unnamed: 1_level_0,GalaxyID
true_label,train_label,Unnamed: 2_level_1
A,A,1
A,SoA,1
E0,E0,571
E0,E3,15
E0,SAc,2
E0,SBb,1
E0,SoA,16
E3,E0,61
E3,E3,353
E3,SAc,1
