In [1]:
import torch
import numpy as np
from scipy.spatial import distance
import os
import pandas as pd
import sys
sys.path.append('..')
import registry
import datafree

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"] = ["SimHei"]
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

In [2]:
!nvidia-smi

Fri Nov  3 01:38:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.60.02    Driver Version: 510.60.02    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:05:00.0 Off |                  N/A |
| 36%   48C    P2   114W / 350W |  23766MiB / 24576MiB |     30%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:09:00.0 Off |                  N/A |
| 58%   61C    P2   260W / 350W |  12323MiB / 24576MiB |     94%      Defaul

In [3]:
# --- Run configuration shared by the cells below ---------------------------
num_classes = 200    # number of output classes requested from registry.get_model
batch_size = 32      # batch size handed to the validation DataLoader
workers = 8          # worker-count setting; the validation DataLoader below hard-codes num_workers=4 instead
gpu = 1              # CUDA device index used by prepare_model and by x.to(gpu); None selects the DataParallel path
distributed = False  # True selects the DistributedDataParallel path in prepare_model
def prepare_model(model):
    """Place ``model`` on the device(s) selected by the notebook configuration.

    Reads the module-level ``distributed`` and ``gpu`` settings:

    * no CUDA available      -> the model is returned unchanged (CPU).
    * ``distributed`` truthy -> the model is moved to CUDA and wrapped in
      ``DistributedDataParallel`` (pinned to ``gpu`` when it is not ``None``).
    * ``gpu`` not ``None``   -> the model is moved to that single CUDA device.
    * otherwise              -> the model is wrapped in ``DataParallel``.

    Args:
        model: the ``torch.nn.Module`` to place.

    Returns:
        The same module moved to a device, or a parallel wrapper around it.
    """
    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
        return model

    if distributed:
        model.cuda()
        if gpu is None:
            # No device scope given: DistributedDataParallel uses all available devices.
            return torch.nn.parallel.DistributedDataParallel(model)
        # `gpu` may be a single index (int) or a comma-separated string such as "0,1".
        # The earlier in-function rescaling of batch_size/workers was a divide-by-one
        # no-op that raised UnboundLocalError (assignment to a global without
        # `global`), so it has been dropped.
        device_ids = [int(d) for d in str(gpu).split(',')]
        return torch.nn.parallel.DistributedDataParallel(model, device_ids=device_ids)

    if gpu is not None:
        torch.cuda.set_device(gpu)
        return model.cuda(gpu)

    # DataParallel will divide and allocate batch_size to all available GPUs
    return torch.nn.DataParallel(model).cuda()

In [5]:
from torchvision.datasets import CIFAR10,CIFAR100
import datafree
import registry
from torch import nn
# Instantiate both backbones from the project registry; the teacher starts in eval mode.
student = registry.get_model('resnet18_imagenet', num_classes=num_classes)
teacher = registry.get_model('resnet50_imagenet', num_classes=num_classes, pretrained=True).eval()
normalizer = datafree.utils.Normalizer(**registry.NORMALIZE_DICT['tiny_imagenet'])

# Swap the stem and head of the teacher: 3x3 stride-1 first convolution, no
# max-pool, global average pooling and a fresh linear classifier.
# The classifier width follows `num_classes` instead of a second hard-coded 200.
teacher.avgpool = nn.AdaptiveAvgPool2d(1)
num_ftrs = teacher.fc.in_features
teacher.fc = nn.Linear(num_ftrs, num_classes)
teacher.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
teacher.maxpool = nn.Sequential()
teacher = prepare_model(teacher)

# Load the trained weights. Every key loses its first dotted component
# (presumably a wrapper prefix such as "module." -- confirm against the
# script that wrote the checkpoint).
ckpt = torch.load('../checkpoints/scratch/tiny_imagenet_resnet50_imagenet.pth', map_location='cpu')
dict_ckpt = {key.partition('.')[2]: weight for key, weight in ckpt['state_dict'].items()}
teacher.load_state_dict(dict_ckpt)

print('Pretrained teacher model Acc@1 = ' + str(ckpt['best_acc1']))
teacher.eval()

Pretrained teacher model Acc@1 = 67.17999999999999


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): Sequential()
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, aff

In [11]:

# Read the distilled student checkpoint. Every key loses its first dotted
# component (presumably a wrapper prefix such as "module." -- confirm against
# the script that wrote the checkpoint).
ckpt = torch.load('../checkpoints/datafree-adadfkd/tiny_imagenet-resnet50_imagenet-resnet18_imagenet--adadfkd_new2-R0.pth', map_location='cpu')
dict_ckpt = {key.partition('.')[2]: weight for key, weight in ckpt['state_dict'].items()}
print('Student model Acc@1 = ' + str(ckpt['best_acc1']))

# Swap the stem and head BEFORE placing the model on a device: prepare_model
# may return a DataParallel/DistributedDataParallel wrapper, which does not
# expose `.fc`/`.conv1`, and calling it twice would wrap the model twice.
student.avgpool = nn.AdaptiveAvgPool2d(1)
num_ftrs = student.fc.in_features
student.fc = nn.Linear(num_ftrs, num_classes)
student.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
student.maxpool = nn.Sequential()

# Place the finished architecture once, then load the weights and switch to eval mode.
student = prepare_model(student)
student.load_state_dict(dict_ckpt)
student.eval()

Student model Acc@1 = 58.18


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): Sequential()
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1,

In [12]:
import torchvision.transforms as T
from registry import NORMALIZE_DICT, TinyImageNet
from torchvision import datasets

# Validation preprocessing: tensor conversion followed by the dataset's
# normalisation statistics; no resizing step is applied.
val_transform = T.Compose(
    [
        T.ToTensor(),
        T.Normalize(**NORMALIZE_DICT['tiny_imagenet']),
    ]
)

# Validation split of the dataset, read from the local dataset root.
val_dst = TinyImageNet(
    '/data/lijingru/timagenet/tiny-imagenet-200/',
    split='val',
    transform=val_transform,
)

# Fixed iteration order (shuffle=False) so predictions and labels are collected in dataset order.
val_loader = torch.utils.data.DataLoader(
    val_dst,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

In [13]:
import tqdm
# Per-batch softmax outputs of student and teacher, plus the ground-truth labels.
probs_s = []
probs_t = []
ys = []

# Pure inference: disabling autograd avoids building a graph for both networks
# on every batch, and each batch is copied to the device once, not twice.
with torch.no_grad():
    for x, y in tqdm.tqdm(val_loader, desc='calculate_agg'):
        x_dev = x.to(gpu)
        s_out = student(x_dev)
        t_out = teacher(x_dev)
        probs_s.append(torch.softmax(s_out, 1).cpu().numpy())
        probs_t.append(torch.softmax(t_out, 1).cpu().numpy())
        ys.append(y.numpy())

# Stack the per-batch arrays along the sample axis.
probs_s = np.concatenate(probs_s, 0)
probs_t = np.concatenate(probs_t, 0)
ys = np.concatenate(ys, 0)

calculate_agg: 100%|██████████| 313/313 [00:21<00:00, 14.36it/s]


In [14]:
# Output directory; exist_ok makes the cell safe to re-run.
os.makedirs('prob_loyalty_distribution/', exist_ok=True)

# Top-1 predictions of both networks.
pred_s = np.argmax(probs_s, 1)
pred_t = np.argmax(probs_t, 1)
print('Acc@1 of student is {:.4f}'.format(np.sum(pred_s == ys) / len(val_dst)))
print('Acc@1 of teacher is {:.4f}'.format(np.sum(pred_t == ys) / len(val_dst)))
print('Agree@1 of student is {:.4f}'.format(np.sum(pred_t == pred_s) / len(val_dst)))

# Transposing puts the class axis first, so jensenshannon (which reduces over
# axis 0 by default) yields one value per sample.
dist = distance.jensenshannon(probs_s.T, probs_t.T)

# Repair NaN entries per sample: 0 when the two distributions coincide
# (L1 gap below 1e-6), 1 otherwise. Previously a single value, derived from
# the summed gap over all NaN samples, was written to every NaN entry.
nan_mask = np.isnan(dist)
if nan_mask.any():
    l1_gap = np.abs(probs_s[nan_mask] - probs_t[nan_mask]).sum(axis=1)
    dist[nan_mask] = np.where(l1_gap < 1e-6, 0.0, 1.0)

# NOTE(review): scipy's jensenshannon already returns the square root of the
# JS divergence, so np.sqrt(dist) applies a second root. Left unchanged so the
# reported number stays comparable -- confirm against the intended definition
# of probability loyalty.
prob_loyalty = 1 - np.sqrt(dist)
print('Probability loyalty of student is {:.4f}'.format(np.nanmean(prob_loyalty)))

Acc@1 of student is 0.5818
Acc@1 of teacher is 0.6718
Agree@1 of student is 0.6883
Probability loyalty of student is 0.4753
