In [1]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import datasets

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch import optim
from torch.utils.data import Dataset, DataLoader

from matplotlib import pyplot as plt

from torch.utils.data import Dataset, DataLoader
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import pickle
from glob import glob
import os

In [3]:
# Get data
labeled_portion = 0.8  # fraction of the MNIST training set that keeps its labels
train_portion = 0.8    # fraction of the labeled part used for training (rest: validation)
split_seed = 0         # fixed seed so both random splits are identical on every run

full_train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transforms.ToTensor(),  download=True)

# A dedicated generator keeps the splits reproducible without touching the
# global RNG that is later used for weight initialisation and shuffling.
split_rng = torch.Generator().manual_seed(split_seed)

labeled_size = int(labeled_portion * len(full_train_dataset))
unlabeled_size = len(full_train_dataset) - labeled_size
labeled_data, unlabeled_subset = torch.utils.data.random_split(
    full_train_dataset, [labeled_size, unlabeled_size], generator=split_rng)

train_size = int(train_portion * len(labeled_data))
val_size = len(labeled_data) - train_size
train_data, val_data = torch.utils.data.random_split(
    labeled_data, [train_size, val_size], generator=split_rng)

test_data = torchvision.datasets.MNIST(root='./data', train=False, transform=transforms.ToTensor())

# Here we take out all unlabeled data and ignore the labels.
# The result is a plain Python list of image tensors (mutated in place later on).
unlabeled_data = [unlabeled_subset[i][0] for i in range(len(unlabeled_subset))]

# Print the number of unlabeled data
print(f'There are {len(unlabeled_data)} unlabeled data')

# create dataset
class labeled_dataset(Dataset):
    """Map-style dataset that pairs every sample with its label.

    Args:
        labeled_data: indexable collection of samples.
        labels: indexable collection of labels, read with the same index as
            ``labeled_data``.
    """

    def __init__(self, labeled_data, labels):
        self.samples, self.labels = labeled_data, labels
        # The length is derived from the samples only and fixed at construction.
        self.n_samples = len(self.samples)

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.n_samples

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair stored at ``index``."""
        sample = self.samples[index]
        label = self.labels[index]
        return sample, label

class unlabeled_datasest(Dataset):
    """Map-style dataset over samples that carry no label.

    The class name keeps its original spelling so existing references keep
    working.

    Args:
        unlabeled_data: indexable, sized collection of samples.
    """

    def __init__(self, unlabeled_data):
        # Use the constructor argument: the previous version ignored it and
        # read unrelated module-level names instead.
        self.samples = unlabeled_data
        self.n_samples = len(unlabeled_data)

    def __getitem__(self, index):
        """Return the sample stored at ``index`` (no label is attached)."""
        return self.samples[index]

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.n_samples

batch_size = 64

# One mini-batch loader per split. Every loader reshuffles each epoch except
# the test loader, which keeps the dataset order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
unlabeled_loader = DataLoader(unlabeled_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Preprocess


There are 12000 unlabeled data


In [4]:
# Model setting

# Hyperparameters
in_dim = 28 * 28   # length of a flattened input image (784)
hid_dim = 300      # width of the hidden layer
out_dim = 10       # number of network outputs
n_epoch = 5        # epochs per call to train()
lr = 1e-4          # learning rate passed to the optimizer

# Early-stopping patience: train() breaks out of its epoch loop once its
# no-improvement counter exceeds this value.
MAX_ESC = 10
class FC1(nn.Module):
    """Fully connected network with one hidden layer plus a mirrored branch.

    ``shrink`` maps ``in_dim -> hid_dim -> out_dim`` and is the only branch
    used by ``forward``. ``expand`` maps ``out_dim -> hid_dim -> in_dim``; it
    is constructed and registered (so its parameters appear in
    ``parameters()`` and ``state_dict()``) but ``forward`` never calls it.

    Args:
        in_dim: size of each input vector.
        hid_dim: size of the hidden layer in both branches.
        out_dim: size of each output vector.
    """

    def __init__(self, in_dim, hid_dim, out_dim):
        super().__init__()
        self.shrink = nn.Sequential(
            nn.Linear(in_dim, hid_dim),
            nn.ReLU(),
            nn.Linear(hid_dim, out_dim),
        )
        self.expand = nn.Sequential(
            nn.Linear(out_dim, hid_dim),
            nn.ReLU(),
            nn.Linear(hid_dim, in_dim),
        )

    def forward(self, x):
        """Return the raw ``shrink`` outputs for a batch ``x`` of ``in_dim``-sized rows."""
        return self.shrink(x)

# Build the network and move it to `device` before the optimizer is created,
# so the optimizer is constructed over the parameters that are actually
# trained. `.to(device)` also works on CPU-only machines, where `.cuda()`
# raises.
model = FC1(in_dim, hid_dim, out_dim).to(device)
loss_fcn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# Show the architecture as the cell output.
model

FC1(
  (shrink): Sequential(
    (0): Linear(in_features=784, out_features=300, bias=True)
    (1): ReLU()
    (2): Linear(in_features=300, out_features=10, bias=True)
  )
  (expand): Sequential(
    (0): Linear(in_features=10, out_features=300, bias=True)
    (1): ReLU()
    (2): Linear(in_features=300, out_features=784, bias=True)
  )
)

In [5]:
#----------------#
# Initialization #
#----------------#

def init_weights(m):
    """Initialise a single module in place; intended for ``Module.apply``.

    ``nn.Linear`` layers (including subclasses) get Xavier-uniform weights and
    a constant bias of 0.01. Layers built with ``bias=False`` only have their
    weights initialised. Every other module type is left untouched.

    Args:
        m: the module currently visited by ``Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_uniform_(m.weight)
    # `bias` is None for nn.Linear(..., bias=False); skip it instead of crashing.
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.01)

# Run init_weights on every submodule of `model` (and on `model` itself), in place.
model.apply(init_weights)

def train(self):
    """Train the global ``model`` for up to ``n_epoch`` epochs, validating after each.

    Reads the notebook-level globals ``model``, ``optimizer``, ``loss_fcn``,
    ``train_loader``, ``val_loader``, ``device``, ``in_dim``, ``n_epoch`` and
    ``MAX_ESC``. After every epoch the model is evaluated on ``val_loader``.
    Whenever the validation accuracy beats the best value seen during this
    call, the weights are saved to ``./checkpoint/NN.pth`` (the directory is
    created if needed). The epoch loop stops early once more than ``MAX_ESC``
    consecutive epochs bring no improvement.

    The best weights are only written to disk; they are not loaded back, so
    ``model`` holds the weights of the last epoch when this returns.

    Args:
        self: unused; present so the function can be bound as an ``SSL`` method.

    Returns:
        None. Progress and the best validation accuracy are printed.
    """
    n_batch = len(train_loader)
    best_val_acc = 0
    esc = 0        # consecutive epochs without a validation improvement
    val_acc = 0    # last measured validation accuracy (shown in the progress line)

    checkpoint_path = "./checkpoint/NN.pth"
    # torch.save does not create missing directories.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    for epoch in range(n_epoch):

        #----------#
        # Training #
        #----------#

        # Re-enter training mode every epoch: the validation pass below
        # switches the model to eval mode.
        model.train()
        total_cnt, correct_cnt = 0, 0

        for batch, (images, labels) in enumerate(train_loader):
            # No-op when `device` is the CPU.
            images = images.to(device)
            labels = labels.to(device)

            predictions = model(images.view(-1, in_dim))
            loss = loss_fcn(predictions, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Running training accuracy over the epoch so far
            _, pred_labels = torch.max(predictions, 1)
            total_cnt += images.size(0)
            correct_cnt += (pred_labels == labels).sum().item()

            acc = correct_cnt / total_cnt
            print(f"\rEpoch [{epoch+1}/{n_epoch}], Step [{batch:3}/{n_batch}], Train loss: {loss.item():.6f}, Train acc: {acc * 100:.3f} % val_acc: {val_acc * 100:.3f} %, {esc} / {MAX_ESC}", end='')

        #------------#
        # Validating #
        #------------#

        model.eval()

        with torch.no_grad():  # No need BP
            total_cnt, correct_cnt = 0, 0

            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)

                # Forward pass
                predictions = model(images.view(-1, in_dim))

                _, pred_labels = torch.max(predictions, 1)
                total_cnt += images.size(0)
                correct_cnt += (pred_labels == labels).sum().item()

        # An empty validation loader yields accuracy 0 instead of ZeroDivisionError.
        val_acc = correct_cnt / total_cnt if total_cnt else 0

        if val_acc > best_val_acc:
            best_val_acc = val_acc

            # Save trained model
            torch.save(model.state_dict(), checkpoint_path)
            esc = 0
        else:
            esc += 1

        if esc > MAX_ESC:
            break

    print('\nFinish training')
    print(f'Best val acc = {best_val_acc}')

In [6]:
###########################
# Pseudo-labeling by rank #
###########################

# Number of highest-confidence unlabeled samples that pseudoLabel() selects per call.
top = 50

def pseudoLabel(self):
    """Move the ``top`` most confident unlabeled samples into the training set.

    Steps:
      1. Score every sample in the global list ``unlabeled_data`` with the
         global ``model``. The confidence of a sample is the maximum raw model
         output (no softmax is applied); its pseudo-label is the arg-max.
      2. Pick the ``top`` samples with the highest confidence (fewer if fewer
         remain).
      3. Remove exactly those samples from ``unlabeled_data`` (in place).
      4. Append them, with their pseudo-labels, to the labeled training
         tensors, rebuild the global ``train_loader`` and write the new
         tensors to ``x.pkl`` / ``y.pkl``.

    Scoring uses a non-shuffled loader, so position ``k`` of the score vector
    always refers to ``unlabeled_data[k]``. Scoring through the shuffled
    ``unlabeled_loader`` would make the selected indices point at different
    samples than the ones that were scored.

    NOTE(review): when ``x.pkl`` and ``y.pkl`` already exist they are treated
    as the current training set, including files left over from an earlier
    session. Delete them before a fresh run.

    Args:
        self: unused; present so the function can be bound as an ``SSL`` method.

    Returns:
        None.
    """
    global train_loader

    if len(unlabeled_data) == 0:
        print('No unlabeled data left to pseudo-label')
        return

    print('Getting pseudolabels', end=': ')
    scoring_loader = torch.utils.data.DataLoader(unlabeled_data, batch_size=batch_size, shuffle=False)
    confidence_chunks = []
    prediction_chunks = []

    # Score in eval mode, then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    with torch.no_grad():  # No need BP
        for images in scoring_loader:
            images = images.to(device)

            # Predict via forward pass on flattened images
            predictions = model(images.view(images.size(0), -1))

            confidence, predicted_labels = torch.max(predictions, dim=1)
            confidence_chunks.append(confidence.cpu())
            prediction_chunks.append(predicted_labels.cpu())
    model.train(was_training)

    all_confidence = torch.cat(confidence_chunks)
    pred_list = torch.cat(prediction_chunks)
    print('done')

    print(f'Finding the top {top}', end=': ')
    # Never ask for more samples than are available.
    n_selected = min(top, all_confidence.numel())
    top_c = torch.topk(all_confidence, n_selected).indices
    selected_indices = top_c.tolist()
    pseudo_labels = pred_list[top_c]
    confident_unlabeled = torch.stack([unlabeled_data[i] for i in selected_indices])
    print('done')

    print('Updating U-data', end=': ')
    # In-place slice assignment keeps every existing reference to the list
    # (e.g. the global `unlabeled_loader`) pointing at the reduced data.
    selected_set = set(selected_indices)
    unlabeled_data[:] = [sample for i, sample in enumerate(unlabeled_data) if i not in selected_set]
    print('done')

    print('Updating L-data', end=': ')
    if os.path.isfile('x.pkl') and os.path.isfile('y.pkl'):
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that this notebook wrote itself.
        with open('x.pkl', 'rb') as handle:
            original_dataset = pickle.load(handle)
        with open('y.pkl', 'rb') as handle:
            original_labels = pickle.load(handle)
    else:
        # First call: materialise the current training set from the loader.
        image_batches = []
        label_batches = []
        for images, labels in train_loader:
            image_batches.append(images)
            label_batches.append(labels)
        original_dataset = torch.cat(image_batches)
        original_labels = torch.cat(label_batches)

    new_dataset = torch.cat((original_dataset, confident_unlabeled))
    new_labels = torch.cat((original_labels, pseudo_labels))
    train_data = labeled_dataset(new_dataset, new_labels)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

    with open('x.pkl', 'wb') as handle:
        pickle.dump(new_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('y.pkl', 'wb') as handle:
        pickle.dump(new_labels, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print('done')

In [7]:
class SSL():
    """Driver object for the self-training loop.

    ``train`` and ``pseudoLabel`` are the module-level functions of the same
    names, bound here as methods. All remaining hooks are unimplemented
    placeholders that return ``None``.
    """

    def __init__(self):
        # Keep a handle on the notebook-level labeled split.
        self.labeled_data = labeled_data

    def describe(self):
        """Placeholder: describe data and model (not implemented)."""

    # Module-level training routine, exposed as a method.
    train = train

    def validate(self):
        """Placeholder (todo, not implemented)."""

    def test(self):
        """Placeholder (todo, not implemented)."""

    # Module-level pseudo-labeling routine, exposed as a method.
    pseudoLabel = pseudoLabel

    def selection(self):
        """Placeholder (todo, not implemented)."""

    def oneRun(self):
        """Run one round: a training pass followed by a pseudo-labeling pass."""
        self.train()
        self.pseudoLabel()
        # todo

    def shouldTerminate(self):
        """Placeholder (todo, not implemented)."""

In [8]:
# Single driver instance used by the round loop below.
# NOTE(review): this name would shadow the standard-library `ssl` module if that were ever imported here.
ssl = SSL()

In [None]:
# Alternate training and pseudo-labeling for a fixed number of rounds.
n_rounds = 100
for i in range(n_rounds):
    print(f'--- Round {i} ---')
    ssl.oneRun()

--- Round 0 ---
Epoch [5/5], Step [599/600], Train loss: 2.090021, Train acc: 26.505 % val_acc: 22.302 %, 0 / 10
Finish training
Best val acc = 0.3103125
Getting pseudolabels: done
Finding the top 50: done
Updating U-data: done
Updating L-data: done
--- Round 1 ---
Epoch [5/5], Step [613/614], Train loss: 1.812702, Train acc: 57.646 % val_acc: 54.844 %, 0 / 10
Finish training
Best val acc = 0.5827083333333334
Getting pseudolabels: done
Finding the top 50: done
Updating U-data: done
Updating L-data: done
--- Round 2 ---
Epoch [5/5], Step [614/615], Train loss: 1.469839, Train acc: 68.855 % val_acc: 67.312 %, 0 / 10
Finish training
Best val acc = 0.6859375
Getting pseudolabels: done
Finding the top 50: done
Updating U-data: done
Updating L-data: done
--- Round 3 ---
Epoch [4/5], Step [614/615], Train loss: 1.396303, Train acc: 73.337 % val_acc: 71.740 %, 0 / 10