In [1]:
from tqdm.notebook import tqdm
import numpy as np

import sklearn
import sklearn.datasets
import sklearn.model_selection
import sklearn.metrics

import torch
import torchvision
from torch import nn
from torch.nn import functional as F


PYTORCH_DEVICE = torch.device("cpu" if not torch.cuda.is_available() else "cuda:0")
print(PYTORCH_DEVICE)

cuda:0


In [2]:
breast_cancer_datset = sklearn.datasets.load_breast_cancer()

X = breast_cancer_datset['data']
Y = breast_cancer_datset['target']

print("X_shape", X.shape)
print("Y_shape", Y.shape)

X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, Y, test_size=0.3, random_state=67)

X_shape (569, 30)
Y_shape (569,)


In [3]:
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, X, Y):
        assert X.shape[0] == Y.shape[0]
        self.X = torch.FloatTensor(X)
        self.Y = torch.LongTensor(Y)

    def __getitem__(self, index):
        return self.X[index], self.Y[index]

    def __len__(self):
        return self.X.shape[0]

train_dataset = SimpleDataset(X_train, Y_train)
test_dataset = SimpleDataset(X_test, Y_test)

BATCH_SIZE = 64

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [4]:
def infinite_dataloader_wrapper(dataloader):
    while True:
        for batch in dataloader:
            yield batch
            
infinite_train_dataloader = infinite_dataloader_wrapper(train_dataloader)
infinite_test_dataloader = infinite_dataloader_wrapper(test_dataloader)

In [5]:
class SimpleClassificationModel(torch.nn.Module):
    def __init__(self, input_features_count, num_classes):
        super(SimpleClassificationModel, self).__init__()
        self.some_sequential_block = nn.Sequential(
            nn.Linear(input_features_count, input_features_count),
            nn.ReLU(),
        )
        self.final_fc = nn.Linear(input_features_count, num_classes)

    def forward(self, x):
        x = self.some_sequential_block(x)
        x = self.final_fc(x)
        return x

In [6]:
model = SimpleClassificationModel(X.shape[1], max(Y.tolist()) + 1).to(PYTORCH_DEVICE)
loss_function = torch.nn.CrossEntropyLoss().to(PYTORCH_DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [7]:
from torch.utils.tensorboard import SummaryWriter

USE_TENSORBOARD = False
tensorboard_writer = SummaryWriter(comment=f"_{str(model.__class__.__name__)}") if USE_TENSORBOARD else None

def process_batch_metrics(results, is_train, iteration):
    train_or_test = "train" if is_train else "test"

    accuracy = sklearn.metrics.accuracy_score(results["y_numpy"], results["y_predicted_class_numpy"])
    
    if iteration is not None:
        if tensorboard_writer:
            tensorboard_writer.add_scalar(f'{train_or_test}/loss', results["loss_numpy"], iteration)
            tensorboard_writer.add_scalar(f'{train_or_test}/accuracy', accuracy, iteration)
        else:
            if iteration % 250 == 0:
                print(iteration, train_or_test)
                print("loss", results["loss_numpy"])
                print("accuracy", accuracy)
    

def process_batch(batch, is_train, iteration):
    if is_train:
        model.train()
    else:
        model.eval()

    x, y = batch
    x = x.to(PYTORCH_DEVICE)
    y = y.to(PYTORCH_DEVICE)

    if is_train:
        optimizer.zero_grad()

    y_predicted = model(x)
    assert not torch.any(torch.isnan(y_predicted))
    loss = loss_function(y_predicted, y)

    if is_train:
        loss.backward()
        optimizer.step()
    
    results = {}
    results["y_numpy"] = y.cpu().detach().numpy()
    results["y_predicted_numpy"] = y_predicted.cpu().detach().numpy()
    results["y_predicted_class_numpy"] = results["y_predicted_numpy"].argmax(axis=1)
    results["loss_numpy"] = loss.cpu().detach().numpy()
    
    process_batch_metrics(results, is_train, iteration)

    return results
    
iteration = None
for iteration in tqdm(range(1_000)):
    process_batch(next(infinite_train_dataloader), True, iteration)
    with torch.no_grad():
        process_batch(next(infinite_test_dataloader), False, iteration)

  0%|          | 0/1000 [00:00<?, ?it/s]

0 train
loss 31.68948
accuracy 0.34375
0 test
loss 27.595242
accuracy 0.34375
250 train
loss 0.120098464
accuracy 0.953125
250 test
loss 0.24515913
accuracy 0.921875
500 train
loss 0.18063208
accuracy 0.890625
500 test
loss 0.09391493
accuracy 0.9534883720930233
750 train
loss 0.20087877
accuracy 0.9375
750 test
loss 0.11801361
accuracy 0.96875


In [8]:
# hacking of last layer on neural network and train only him
for param in model.parameters():
    model.requires_grad = False

model.fc = nn.Sequential(
    nn.Linear(X.shape[1], 10),
    nn.ReLU(),
    nn.Linear(X.shape[1], 2),
)

for iteration in tqdm(range(iteration, iteration + 100)):
    process_batch(next(infinite_train_dataloader), True, iteration)
    with torch.no_grad():
        process_batch(next(infinite_test_dataloader), False, iteration)

  0%|          | 0/100 [00:00<?, ?it/s]

1000 train
loss 0.17885324
accuracy 0.921875
1000 test
loss 0.18177253
accuracy 0.8837209302325582


In [9]:
# final metrics

def predict_for_dataloader(model, dataloader):
    Y = []
    Y_predicted = []
    for batch in tqdm(dataloader):
        result = process_batch(batch, False, None)
        Y.append(result["y_numpy"])
        Y_predicted.append(result["y_predicted_numpy"])
    Y = np.concatenate(Y)
    Y_predicted = np.concatenate(Y_predicted)
    Y_predicted_class = Y_predicted.argmax(axis=1)
    return Y, Y_predicted_class


train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

Y_true_train, Y_predicted_class_train = predict_for_dataloader(model, train_dataloader)
Y_true_test, Y_predicted_class_test = predict_for_dataloader(model, test_dataloader)

print("Train accuracy:", sklearn.metrics.accuracy_score(Y_true_train, Y_predicted_class_train))
print("Test accuracy:", sklearn.metrics.accuracy_score(Y_true_test, Y_predicted_class_test))

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Train accuracy: 0.9371859296482412
Test accuracy: 0.935672514619883


In [10]:
# bonus for image tasks. for getting dataset for images and transform it. you coud use such code
# from torchvision import transforms as T

# transform = T.Compose([
#     T.transforms.ToTensor(), 
#     T.transforms.Normalize([0.4, 0.4, 0.4], [0.4, 0.4, 0.4])]

#     # Augmentations,
#     # https://pytorch.org/vision/main/auto_examples/plot_transforms.html#sphx-glr-auto-examples-plot-transforms-py
    
# )

# dataset = ImageFolder("your/folder", transform=transform)