# The Pursuit of Human Labeling: A New Perspective on Unsupervised Learning

A new approach to unsupervised classification based on existing vision models

Implementation of the research paper published by Artyom Gadetsky and Maria Brbic from the EPFL AI Reasearch Center. The abstract can be found [here](https://openreview.net/pdf?id=3GpIeVYw8X).

**Approach** : Human labeled points are linearly separable in a sufficiently strong
representation space, and are invariant to the underlying model and resulting representation space.

In [1]:
import torch
import torchvision.models as models
import numpy as np

phi2model=torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')

phi2model.eval()
statedict=phi2model.state_dict()


Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vits14_pretrain.pth
100%|██████████| 84.2M/84.2M [00:01<00:00, 77.5MB/s]


In [2]:
# convert to numpy
numpy_params = {key: value.numpy() for key, value in statedict.items()}
#save npy file
np.save('dinov2basic.npy', numpy_params)

## Getting the First representation phi1

Finetuned on CIFAR10

In [3]:
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torchvision import models


# create a transform class for applying the normalization
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # normalize the representations to have unit norm
])

In [4]:
# load the data
train_dataset = CIFAR10(root='./data', train=True, transform=transform, download=True)
test_dataset = CIFAR10(root='./data', train=False, transform=transform, download=True)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 43219421.81it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [5]:
# Define DataLoader
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)

In [6]:
device=torch.device('cuda')

In [7]:
print(device)

cuda


We need to check the architecture of the chosen model and modify the last layer according to the original layer in the head

In [8]:
from torchvision import models

resnet = models.resnet50(True)
vgg16=models.vgg16(True)
num_classes = 10

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 170MB/s]
Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:03<00:00, 154MB/s]


In [9]:
for name, param in resnet.named_parameters():
  param.requires_grad = False

In [10]:
for name, param in vgg16.named_parameters():
  param.requires_grad = False

In [11]:
resnet.fc = nn.Linear(resnet.fc.in_features, num_classes)

In [12]:
vgg16.classifier[6] = nn.Linear(vgg16.classifier[6].in_features,10)

In [None]:
for name, param in vgg16.named_parameters():
  print(name, param.requires_grad)

In [14]:
resnet=resnet.to(device)

In [15]:
vgg16=vgg16.to(device)

In [16]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(vgg16.parameters(), lr = 0.001, momentum=0.9, weight_decay=0.0001)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

num_epochs = 10

In [17]:
for epoch in range(num_epochs):

  vgg16.train()
  for images, labels in train_loader:
    images, labels = images.to(device), labels.to(device)

    outputs = vgg16(images)
    loss = criterion(outputs, labels)

    optimizer.zero_grad()

    loss.backward()
    optimizer.step()
  print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss.item()}')

  scheduler.step()

  vgg16.eval()
  total_loss = 0
  correct = 0
  total = 0

  with torch.no_grad():
    for images, labels in test_loader:
      images, labels = images.to(device), labels.to(device)
      outputs = vgg16(images)
      loss = criterion(outputs, labels)
      total_loss += loss.item() * labels.size(0)
      total += labels.size(0)

      _, predicted = torch.max(outputs.data, 1)
      correct += (predicted == labels).sum().item()


  average = total_loss / total
  print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {average:.4f}')
  accuracy = correct / total
  print(f'Epoch [{epoch+1}/{num_epochs}], Test Accuracy: {100 * accuracy:.2f}%')

Epoch [1/10], Training Loss: 0.9663029909133911
Epoch [1/10], Validation Loss: 1.1510
Epoch [1/10], Test Accuracy: 59.65%
Epoch [2/10], Training Loss: 1.3450578451156616
Epoch [2/10], Validation Loss: 1.1284
Epoch [2/10], Test Accuracy: 60.44%
Epoch [3/10], Training Loss: 0.8741094470024109
Epoch [3/10], Validation Loss: 1.1287
Epoch [3/10], Test Accuracy: 60.30%
Epoch [4/10], Training Loss: 1.268105149269104
Epoch [4/10], Validation Loss: 1.1059
Epoch [4/10], Test Accuracy: 60.79%
Epoch [5/10], Training Loss: 1.1579726934432983
Epoch [5/10], Validation Loss: 1.1050
Epoch [5/10], Test Accuracy: 61.25%
Epoch [6/10], Training Loss: 1.1116459369659424
Epoch [6/10], Validation Loss: 1.1022
Epoch [6/10], Test Accuracy: 60.62%
Epoch [7/10], Training Loss: 1.4910484552383423
Epoch [7/10], Validation Loss: 1.1068
Epoch [7/10], Test Accuracy: 61.39%
Epoch [8/10], Training Loss: 1.1516261100769043
Epoch [8/10], Validation Loss: 1.1071
Epoch [8/10], Test Accuracy: 61.20%
Epoch [9/10], Training Lo

In [18]:
vgg16.to("cpu")
vgg16.eval()
# sae the model state
state_dictphi1 = vgg16.state_dict()

tonumpystate= {key: value.numpy() for key, value in state_dictphi1.items()}

# Save the NumPy parameters to a .npy file
np.save('vgg16cifar10.npy', tonumpystate)

# Creating a task with those two representations

In [19]:
!pip3 install learn2learn

Collecting learn2learn
  Downloading learn2learn-0.2.0.tar.gz (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gsutil (from learn2learn)
  Downloading gsutil-5.27.tar.gz (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting qpth>=0.0.15 (from learn2learn)
  Downloading qpth-0.0.16.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting argcomplete>=1.9.4 (from gsutil->learn2learn)
  Downloading argcomplete-3.2.2-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting crcmod>=1.7 (from gsutil->learn2learn)
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━

In [20]:
import os
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import learn2learn as l2l
import numpy as np
from tqdm import tqdm

In [21]:
exp_path="/content/linear/"
phi1_path="/content/vgg16cifar10.npy"
phi2_path="/content/dinov2basic.npy"
device = torch.device("cuda")

labels_path='/content/cifar10train_targets.npy'

classes=10
seed=98

In [37]:
phi1 = np.load(phi1_path, allow_pickle=True)
val=1
top = np.array(phi1.item()[key] for key in phi1.item())
top.astype(np.float32)

TypeError: float() argument must be a string or a real number, not 'generator'

In [24]:
if not os.path.exists(exp_path):
        os.makedirs(exp_path)

phi1 = np.load(phi1_path, allow_pickle=True).astype(np.float32)
phi2 = np.load(phi2_path, allow_pickle=True).astype(np.float32)

phi1_val = np.copy(phi1)
phi2_val = np.copy(phi2)

ylabels_val=np.load(labels_path)

TypeError: float() argument must be a string or a real number, not 'dict'

In [22]:
assert phi1.shape[0] == phi2.shape[0]
assert phi1_val.shape[0] == phi2_val.shape[0]
assert phi1_val.shape[0] == y_true_val.shape[0]

NameError: name 'phi1' is not defined

In [None]:
n_train = phi1.shape[0]
#dimensions of representations
d1, d2 = phi2.shape[1], phi1.shape[1]

inner_linear = nn.Linear(d1, classes, bias=True).to(device)
inner_lr=0.001

# MAML Algorithm  - Meta optimization algorithm to perform the optimization on the cross task distribution =
inner_linear = l2l.algorithms.MAML(inner_linear, lr=inner_lr)

optimize the cross task distribution - > solve a multi class logistic regression

In [None]:
# Instantiate task encoder with orthogonal weights parametrization (Equation 3)
task_encoder = nn.Linear(d2, classes, bias=False).to(device)
task_encoder = nn.utils.parametrizations.orthogonal(task_encoder)

outer_lr=0.001
temperature=0.1

optimizer = torch.optim.Adam(task_encoder.parameters(), lr=outer_lr)
scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[100, 200],
        gamma=0.1)

### Implementing the SparseMax activation function

The Sparsemax function operates element-wise on a real-valued vector and outputs a probability distribution that is sparse. Given an input vector
z, the Sparsemax function is defined as:



> Sparsemax(z)i=max(0,zi−τ(z))



where **τ(z)** is a threshold computed based on the input vector


The threshold is determined in such a way that the output is a probability distribution with a specified number of non-zero values.



The Sparsemax transforms logits into probabilities and when encouraging sparsity, it creates more distinctive representations for each class in the self-supervised learning task.

In [None]:
# Author: Mathieu Blondel
# License: Simplified BSD

"""
PyTorch implementation of

Learning Classifiers with Fenchel-Young Losses:
    Generalized Entropies, Margins, and Algorithms.
Mathieu Blondel, André F. T. Martins, Vlad Niculae.
https://arxiv.org/abs/1805.09717
"""


import torch

# begin: From OpenNMT-py
def threshold_and_support(z, dim=0):
    """
    z: any dimension
    dim: dimension along which to apply the sparsemax
    """
    sorted_z, _ = torch.sort(z, descending=True, dim=dim)
    z_sum = sorted_z.cumsum(dim) - 1  # sort of a misnomer
    k = torch.arange(1, sorted_z.size(dim) + 1, device=z.device).type(z.dtype).view(
        torch.Size([-1] + [1] * (z.dim() - 1))
    ).transpose(0, dim)
    support = k * sorted_z > z_sum

    k_z_indices = support.sum(dim=dim).unsqueeze(dim)
    k_z = k_z_indices.type(z.dtype)
    tau_z = z_sum.gather(dim, k_z_indices - 1) / k_z
    return tau_z, k_z


class SparsemaxFunction(torch.autograd.Function):

    @staticmethod
    def forward(ctx, input, dim=0):
        """
        input (FloatTensor): any shape
        returns (FloatTensor): same shape with sparsemax computed on given dim
        """
        ctx.dim = dim
        tau_z, k_z = threshold_and_support(input, dim=dim)
        output = torch.clamp(input - tau_z, min=0)
        ctx.save_for_backward(k_z, output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        k_z, output = ctx.saved_tensors
        dim = ctx.dim
        grad_input = grad_output.clone()
        grad_input[output == 0] = 0

        v_hat = (grad_input.sum(dim=dim) / k_z.squeeze()).unsqueeze(dim)
        grad_input = torch.where(output != 0, grad_input - v_hat, grad_input)
        return grad_input, None


sparsemax = SparsemaxFunction.apply


class Sparsemax(torch.nn.Module):

    def __init__(self, dim=0):
        self.dim = dim
        super(Sparsemax, self).__init__()

    def forward(self, input):
        return sparsemax(input, self.dim)
# end: From OpenNMT-py

### Cross validation function

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

def cv_score(X, y):
    cv = KFold(n_splits=10, random_state=1, shuffle=True)
    clf = LogisticRegression(penalty=None)
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return np.mean(scores)

### Main training loop

In [None]:
 sparsemax= Sparsemax(dim=1)

In [None]:
iterations=1000

linear_steps=300
#Number of inner iterations to fit linear model

In [None]:
number_subsets=20
#Number of (Xtr, Xte) subsets for averaging HUME's loss
size_subet=10000
#Size of union of each (Xtr, Xte) subset

In [None]:
for i in tqdm(range(iterations)):
        optimizer.zero_grad()
        mean_train_error = 0.0
        mean_valid_error = 0.0
        mean_valid_acc = 0.0
        mean_train_acc = 0.0
        mean_label_dist = 0.0
        mean_sparsity = 0.0

        for j in range(number_subsets):
            # Sample X_tr and X_te
            subset = np.random.choice(n_train, size=size_subset, replace=False)
            subset_tr = subset[:int(subset_size * 0.9)]
            subset_te = subset[int(subset_size * 0.9):]

            phi1_tr = torch.from_numpy(phi1[subset_tr]).to(device)
            phi1_te = torch.from_numpy(phi1[subset_te]).to(device)
            phi2_tr = torch.from_numpy(phi2[subset_tr]).to(device)
            phi2_te = torch.from_numpy(phi2[subset_te]).to(device)

            # Get labels using current task encoder
            task_labels_tr = sparsemax_act(task_encoder(phi1_tr) / temperature)
            task_labels_te = sparsemax_act(task_encoder(phi1_te) / temperature)
            task_labels_all = torch.cat((task_labels_tr, task_labels_te))

            """
            Perform inner optimization from the random initialization or
            from fixed w0 (corresponds to Cold Start BLO for Equation 5)
            """

            learner = inner_linear.clone()

            for step in range(linear_steps):
                train_error = F.cross_entropy(learner(phi2_tr), task_labels_tr)
                learner.adapt(train_error)

            # Compute HUME's objective (Equation 7)
            label_dist = task_labels_all.mean(0)
            entr = torch.special.entr(label_dist)
            valid_error = F.cross_entropy(learner(phi2_te), task_labels_te)

            # Accumulate gradients across args.num_subsets
            (valid_error - float(10)* entr.sum()).backward()

            # Compute training stats
            mean_train_error += train_error.item()
            mean_train_acc += torch.eq(learner(phi2_tr).argmax(1),task_labels_tr.argmax(1)).float().mean().item()
            mean_valid_error += valid_error.item()
            mean_valid_acc += torch.eq(learner(phi2_te).argmax(1),task_labels_te.argmax(1)).float().mean().item()
            mean_label_dist += label_dist.detach().cpu().numpy()
            mean_sparsity += task_labels_all[torch.arange(task_labels_all.shape[0]),task_labels_all.argmax(1)].mean().item()

        # Average gradients over subsets and update the task encoder parameters
        for p in task_encoder.parameters():
            p.grad.data.mul_(1.0 / number_subsets)
            print(f"Grad norm: {torch.norm(p.grad.data).item()}")
        nn.utils.clip_grad_norm_(task_encoder.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Anneal step size and temperature
        if scheduler.get_last_lr()[0] != outer_lr:
            print("Annealed Learning rate")
            outer_lr = scheduler.get_last_lr()[0]
            print("Annealed Temperature")
            temperature = temperature / 10
            print()

        # Print train stats
        print("Train stats:")
        print(f"Mean TrainError {mean_train_error / number_subsets}")
        print(f"Mean ValidError {mean_valid_error /number_subsets}")
        print(f"Mean TrainAcc {mean_train_acc / number_subsets}")
        print(f"Mean ValidAcc {mean_valid_acc / number_subsets}")
        print(f"Mean Sparsity {mean_sparsity / number_subsets}")
        print("Mean Label Dist:", mean_label_dist / number_subsets)
        print()


        # Compute cross-validation accuracy w.r.t. found task and save the results
        out_all_val = task_encoder(torch.from_numpy(phi1_val).to(device))
        task_val = torch.argmax(out_all_val, dim=1).detach().cpu().numpy()
        crossvalscore= cv_score(phi2_val, task_val)
        with open(exp_path + f"results_{seed}.pickle", "wb") as handle:
            pickle.dump({"CV_Score": crossvalscore}, handle, protocol=pickle.HIGHEST_PROTOCOL)
        torch.save(task_encoder.state_dict(), exp_path + f"linear_task_{seed}.pt")