## Redes Neurais e Aprendizado Profundo

#### Dataset customizado em PyTorch

Moacir A Ponti - 2022

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

Para definir um dataset personalizado, para qualquer tipo de dados, projetamos uma classe herdando de `Dataset` e contendo os métodos:
- `__init__` que vai atribuir os dados
- `__len__` que retorna o tamanho do dataset
- `__getitem__` que retorna um item baseado em um índice

In [3]:
class MyDataset(Dataset):
    """Minimal map-style dataset that wraps any indexable container.

    The object given to the constructor is stored as-is; both the length
    query and item lookups are delegated directly to it.
    """

    def __init__(self, data):
        # Keep a reference to the caller's container (no copy is made).
        self.data = data

    def __len__(self):
        """Return the size reported by ``len`` on the wrapped container."""
        n_items = len(self.data)
        return n_items

    def __getitem__(self, idx):
        """Return whatever the wrapped container yields for ``idx``."""
        selected = self.data[idx]
        return selected

Vamos usar essa classe para armazenar dados

In [5]:
# Draw 20 random integers from [0, 20) into a 1-D array and wrap it in the dataset.
random_data = np.random.randint(low=0, high=20, size=20)
print('Dados: ', random_data)

dataset = MyDataset(data=random_data)
print('Total de exemplos no dataset: ', len(dataset))
# Integer indices and slices are both forwarded to the wrapped array by __getitem__.
print('Podemos usar colchetes pois implementamos o __getitem__, exemplo:', dataset[2])
print(dataset[:5])

Dados:  [ 8 16  3  0  9 11  6  0 17 15  4 16  3  7  2 19  3 10 19  5]
Total de exemplos no dataset:  20
Podemos usar colchetes pois implementamos o __getitem__, exemplo: 3
[ 8 16  3  0  9]


In [6]:
# Serve the dataset through a DataLoader: batches of 6, original order preserved.
train_dataloader = DataLoader(
    dataset=dataset,
    batch_size=6,
    shuffle=False,
)
# A freshly created iterator starts at the first batch.
next_batch = next(iter(train_dataloader))
print(next_batch)

tensor([ 8, 16,  3,  0,  9, 11])


In [13]:
# Step through the loader by hand, one next() call per batch.
iterator_train = iter(train_dataloader)
# len() of a DataLoader is its number of batches, so the loop length follows the
# data instead of a hard-coded count (here 20 examples with batch_size=6 give
# 4 batches, the last one holding the 2 remaining examples).
for _batch_idx in range(len(train_dataloader)):
    next_batch = next(iterator_train)
    print(next_batch)
# One more next(iterator_train) at this point would raise StopIteration,
# because the iterator has no batches left to return.

tensor([ 8, 16,  3,  0,  9, 11])
tensor([ 6,  0, 17, 15,  4, 16])
tensor([ 3,  7,  2, 19,  3, 10])
tensor([19,  5])


In [14]:
# A second dataset: 10 instances with 4 features each, integer values in [1, 5).
random_data2 = np.random.randint(low=1, high=5, size=(10, 4))
print('Dados: ', random_data2)

dataset2 = MyDataset(data=random_data2)
print('Total de exemplos no dataset: ', len(dataset2))
# Indexing the 2-D array with an integer returns one row, i.e. one instance.
print('Podemos usar colchetes pois implementamos o __getitem__, exemplo:', dataset2[2])
print(dataset2[0:3])

Dados:  [[2 3 1 3]
 [1 4 3 4]
 [3 3 3 1]
 [2 2 2 4]
 [1 1 4 3]
 [1 4 2 2]
 [2 1 3 1]
 [3 3 3 3]
 [4 2 1 4]
 [4 4 3 3]]
Total de exemplos no dataset:  10
Podemos usar colchetes pois implementamos o __getitem__, exemplo: [3 3 3 1]
[[2 3 1 3]
 [1 4 3 4]
 [3 3 3 1]]


In [29]:
# Loader for the second dataset: batches of 3 instances, served in shuffled order.
train_dataloader2 = DataLoader(
    dataset=dataset2,
    batch_size=3,
    shuffle=True,
)
# NOTE: rebinds iterator_train, which an earlier cell used for the first loader.
iterator_train = iter(train_dataloader2)

In [30]:
# Drain the iterator, using None as the sentinel that next() returns once
# there are no batches left.
next_batch = next(iterator_train, None)
# Identity test on purpose: `!= None` would be dispatched to the tensor's
# overloaded comparison operator instead of simply checking for the sentinel.
while next_batch is not None:
    print(next_batch)
    next_batch = next(iterator_train, None)

tensor([[1, 4, 2, 2],
        [1, 4, 3, 4],
        [3, 3, 3, 3]])
tensor([[2, 3, 1, 3],
        [2, 2, 2, 4],
        [3, 3, 3, 1]])
tensor([[2, 1, 3, 1],
        [1, 1, 4, 3],
        [4, 2, 1, 4]])
tensor([[4, 4, 3, 3]])


---
### Um dataset mais completo

Agora com rótulos e usando pandas DataFrame

In [32]:
# Read the table from 'data.csv' (path is relative to the working directory).
dataframe_breast_cancer = pd.read_csv('data.csv')
# Column names as a plain Python list.
cols_dataframe = dataframe_breast_cancer.columns.tolist()
print(dataframe_breast_cancer.columns)

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')


In [36]:
# Check how the 'diagnosis' column, which provides the label, is distributed.
dataframe_breast_cancer.loc[:, 'diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [45]:
# Turn the label into a number: 1.0 where diagnosis equals 'M', 0.0 otherwise.
dataframe_breast_cancer['label'] = dataframe_breast_cancer['diagnosis'].eq('M').astype(float)

In [38]:
# Keep only the columns that will be used, dropping 'Unnamed: 32', 'id' and
# the raw string 'diagnosis' column.
# Filtering (rather than chained list.remove calls) does not raise ValueError
# when one of these columns is absent from the loaded CSV.
col_features = [
    col
    for col in dataframe_breast_cancer.columns
    if col not in ('Unnamed: 32', 'id', 'diagnosis')
]

In [46]:
# Preview 5 randomly chosen rows restricted to the selected columns.
dataframe_breast_cancer.loc[:, col_features].sample(n=5)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,label
167,16.78,18.8,109.3,886.3,0.08865,0.09182,0.08422,0.06576,0.1893,0.05534,...,26.3,130.7,1260.0,0.1168,0.2119,0.2318,0.1474,0.281,0.07228,1.0
133,15.71,13.93,102.0,761.7,0.09462,0.09462,0.07135,0.05933,0.1816,0.05723,...,19.25,114.3,922.8,0.1223,0.1949,0.1709,0.1374,0.2723,0.07071,0.0
524,9.847,15.68,63.0,293.2,0.09492,0.08419,0.0233,0.02416,0.1387,0.06891,...,22.99,74.32,376.5,0.1419,0.2243,0.08434,0.06528,0.2502,0.09209,0.0
543,13.21,28.06,84.88,538.4,0.08671,0.06877,0.02987,0.03275,0.1628,0.05781,...,37.17,92.48,629.6,0.1072,0.1381,0.1062,0.07958,0.2473,0.06443,0.0
68,9.029,17.33,58.79,250.5,0.1066,0.1413,0.313,0.04375,0.2111,0.08046,...,22.65,65.5,324.7,0.1482,0.4365,1.252,0.175,0.4228,0.1175,0.0


In [47]:
# Random train/test split (20% held out for test). The fixed random_state
# makes the split repeatable across kernel restarts.
train, test = train_test_split(
    dataframe_breast_cancer[col_features],
    test_size=0.2,
    random_state=42,
)
# Renumber the rows of each split from 0; reassigning (instead of
# inplace=True) keeps the cell free of in-place mutation.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [48]:
# Show the training split as this cell's output (bare last expression).
train

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,label
0,14.540,27.54,96.73,658.8,0.11390,0.15950,0.163900,0.073640,0.2303,0.07077,...,37.13,124.10,943.2,0.16780,0.65770,0.70260,0.17120,0.4218,0.13410,1.0
1,18.010,20.56,118.40,1007.0,0.10010,0.12890,0.117000,0.077620,0.2116,0.06077,...,26.06,143.40,1426.0,0.13090,0.23270,0.25440,0.14890,0.3251,0.07625,1.0
2,9.777,16.99,62.50,290.2,0.10370,0.08404,0.043340,0.017780,0.1584,0.07065,...,21.47,71.68,367.0,0.14670,0.17650,0.13000,0.05334,0.2533,0.08468,0.0
3,15.700,20.31,101.20,766.6,0.09597,0.08799,0.065930,0.051890,0.1618,0.05549,...,32.82,129.30,1269.0,0.14140,0.35470,0.29020,0.15410,0.3437,0.08631,1.0
4,12.950,16.02,83.14,513.7,0.10050,0.07943,0.061550,0.033700,0.1730,0.06470,...,19.93,88.81,585.4,0.14830,0.20680,0.22410,0.10560,0.3380,0.09584,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,12.250,22.44,78.18,466.5,0.08192,0.05200,0.017140,0.012610,0.1544,0.05976,...,31.99,92.74,622.9,0.12560,0.18040,0.12300,0.06335,0.3100,0.08203,0.0
451,9.173,13.86,59.20,260.9,0.07721,0.08751,0.059880,0.021800,0.2341,0.06963,...,19.23,65.59,310.1,0.09836,0.16780,0.13970,0.05087,0.3282,0.08490,0.0
452,15.300,25.27,102.40,732.4,0.10820,0.16970,0.168300,0.087510,0.1926,0.06540,...,36.71,149.30,1269.0,0.16410,0.61100,0.63350,0.20240,0.4027,0.09876,1.0
453,11.080,14.71,70.21,372.7,0.10060,0.05743,0.023630,0.025830,0.1566,0.06669,...,16.82,72.01,396.5,0.12160,0.08240,0.03938,0.04306,0.1902,0.07313,0.0


In [51]:
class ClassificationDataset(Dataset):
    """Map-style dataset holding features and optional labels as float32 tensors.

    Args:
        features: array-like (anything ``np.array`` accepts, e.g. a DataFrame
            or ndarray); stored in ``self.X`` as a ``torch.float32`` tensor.
        labels: optional array-like with one entry per row of ``features``;
            stored in ``self.y`` as a ``torch.float32`` tensor. When omitted,
            ``self.y`` is ``None`` and items are returned without a label.

    Raises:
        ValueError: if ``labels`` is given and its length differs from the
            number of rows in ``features``.
    """

    def __init__(self, features, labels=None):
        self.X = torch.tensor(np.array(features), dtype=torch.float32)
        if labels is None:
            self.y = None
        else:
            self.y = torch.tensor(np.array(labels), dtype=torch.float32)
            # Fail early on misaligned inputs instead of at some later index
            # lookup. Comparing the leading dimension via a shape slice also
            # covers 0-dimensional inputs without raising a different error.
            if self.y.shape[:1] != self.X.shape[:1]:
                raise ValueError(
                    'features and labels must have the same number of examples, '
                    f'got {tuple(self.X.shape[:1])} and {tuple(self.y.shape[:1])}'
                )

    def __len__(self):
        """Return the number of examples (rows of ``self.X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for ``idx``, or only the features when unlabeled."""
        data = self.X[idx]
        if self.y is None:
            return data
        return (data, self.y[idx])

In [55]:
# Build the datasets from each split: every column except 'label' is a
# feature, and 'label' is the target. Selecting by name (instead of the
# positional iloc[:, :-1]) stays correct even if 'label' is not the last column.
bc_train = ClassificationDataset(train.drop(columns='label'), train['label'])
bc_test = ClassificationDataset(test.drop(columns='label'), test['label'])
print('Dataset treinamento: ', len(bc_train))
print('Dataset teste: ', len(bc_test))

Dataset treinamento:  455
Dataset teste:  114


In [56]:
# Batched, shuffled loaders over the two splits.
# NOTE: rebinds train_dataloader, which an earlier cell used for the toy dataset.
train_dataloader = DataLoader(
    dataset=bc_train,
    batch_size=10,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=bc_test,
    batch_size=2,
    shuffle=True,
)

In [58]:
# Fetch one batch from the training loader and print the shapes of its two
# parts (the features and the labels, in the order __getitem__ returns them).
bc_iterator_train = iter(train_dataloader)
next_train_batch = next(bc_iterator_train)
print(*(part.shape for part in next_train_batch[:2]))

torch.Size([10, 30]) torch.Size([10])
