# Custom Dataset

## In this tutorial, we provide an example of adapting USB to a custom dataset.

In [1]:
import numpy as np
from torchvision import transforms
from semilearn import get_data_loader, get_net_builder, get_algorithm, get_config, Trainer
from semilearn import split_ssl_data, BasicDataset

  from .autonotebook import tqdm as notebook_tqdm


## Specify configs and define the model

In [3]:
# Assemble the experiment settings group by group, then merge them (in the
# same key order) into one flat dict that is handed to get_config.

# which algorithm / backbone to use and where its pretrained weights live
model_settings = {
    'algorithm': 'fixmatch',
    'net': 'vit_tiny_patch2_32',
    'use_pretrain': True,
    'pretrain_path': 'https://github.com/microsoft/Semi-supervised-learning/releases/download/v.0.0.0/vit_tiny_patch2_32_mlp_im_1k_32.pth',
}

# optimization configs
optimization_settings = {
    'epoch': 1,
    'num_train_iter': 1000,
    'num_eval_iter': 500,
    'num_log_iter': 50,
    'optim': 'AdamW',
    'lr': 5e-4,
    'layer_decay': 0.5,
    'batch_size': 16,
    'eval_batch_size': 16,
}

# dataset configs
dataset_settings = {
    'dataset': 'mnist',
    'num_labels': 40,
    'num_classes': 10,
    'img_size': 32,
    'crop_ratio': 0.875,
    'data_dir': './data',
}

# algorithm specific configs
algorithm_settings = {
    'hard_label': True,
    'uratio': 2,
    'ulb_loss_ratio': 1.0,
}

# device configs
device_settings = {
    'gpu': 0,
    'world_size': 1,
    "num_workers": 2,
    'distributed': False,
}

# `config` ends up bound to whatever get_config returns for the merged dict.
config = get_config({
    **model_settings,
    **optimization_settings,
    **dataset_settings,
    **algorithm_settings,
    **device_settings,
})

/bin/sh: 1: netstat: not found


In [4]:
# Look up the network builder for the configured backbone first, then pass it
# to get_algorithm together with the config (no tensorboard log, no logger).
net_builder = get_net_builder(config.net, from_name=False)
algorithm = get_algorithm(config, net_builder, tb_log=None, logger=None)

_IncompatibleKeys(missing_keys=['head.weight', 'head.bias'], unexpected_keys=[])
Create optimizer and scheduler


## Create dataset

In [5]:
# replace with your own code
# Placeholder data: 1000 random 32x32 RGB uint8 images with random class labels.
# np.random.randint's upper bound is exclusive, so 256 is required to cover
# the full uint8 range [0, 255].
data = np.random.randint(0, 256, size=3072 * 1000).reshape((-1, 32, 32, 3))
data = np.uint8(data)
target = np.random.randint(0, config.num_classes, size=1000)

# Split the samples into a labeled part (config.num_labels labels in total)
# and an unlabeled part. The class count comes from the config so it cannot
# drift from the value used to build the datasets below.
lb_data, lb_target, ulb_data, ulb_target = split_ssl_data(config, data, target, config.num_classes,
                                                          config.num_labels, include_lb_to_ulb=config.include_lb_to_ulb)

# Weak augmentation: random flip + reflect-padded random crop, then normalize.
train_transform = transforms.Compose([transforms.RandomHorizontalFlip(),
                                      transforms.RandomCrop(32, padding=int(32 * 0.125), padding_mode='reflect'),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

# NOTE(review): as written, this "strong" pipeline is identical to the weak one
# above. For a real dataset you most likely want to insert a stronger
# augmentation step (e.g. a RandAugment-style op) before ToTensor — confirm
# which implementation is available in your environment.
train_strong_transform = transforms.Compose([transforms.RandomHorizontalFlip(),
                                             transforms.RandomCrop(32, padding=int(32 * 0.125), padding_mode='reflect'),
                                             transforms.ToTensor(),
                                             transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

lb_dataset = BasicDataset(config.algorithm, lb_data, lb_target, config.num_classes, train_transform, is_ulb=False)
# The unlabeled dataset must be built from the unlabeled split (ulb_data /
# ulb_target), not from the labeled samples.
ulb_dataset = BasicDataset(config.algorithm, ulb_data, ulb_target, config.num_classes, train_transform, is_ulb=True, strong_transform=train_strong_transform)

In [6]:
# replace with your own code
# Placeholder evaluation data: 100 random 32x32 RGB uint8 images with random labels.
# np.random.randint's upper bound is exclusive, so 256 is required to cover
# the full uint8 range [0, 255].
eval_data = np.random.randint(0, 256, size=3072 * 100).reshape((-1, 32, 32, 3))
eval_data = np.uint8(eval_data)
eval_target = np.random.randint(0, config.num_classes, size=100)

# Evaluation uses no random augmentation: resize, convert to tensor, normalize.
eval_transform = transforms.Compose([transforms.Resize(32),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])])

# Build the evaluation dataset from the held-out eval_data / eval_target
# created above, not from the labeled training samples.
eval_dataset = BasicDataset(config.algorithm, eval_data, eval_target, config.num_classes, eval_transform, is_ulb=False)

In [7]:
# Build one loader per dataset. The unlabeled loader's batch size is the
# labeled batch size scaled by config.uratio (truncated to an int).
lb_batch_size = config.batch_size
ulb_batch_size = int(lb_batch_size * config.uratio)

train_lb_loader = get_data_loader(config, lb_dataset, lb_batch_size)
train_ulb_loader = get_data_loader(config, ulb_dataset, ulb_batch_size)
eval_loader = get_data_loader(config, eval_dataset, config.eval_batch_size)

## Training and evaluation

In [8]:
# training and evaluation
# Wrap the config and the algorithm created earlier in a Trainer.
trainer = Trainer(config, algorithm)
# Run training with the labeled, unlabeled and evaluation loaders.
trainer.fit(train_lb_loader, train_ulb_loader, eval_loader)
# Evaluate on the evaluation loader; as the last expression of the cell,
# its return value is what the notebook displays.
trainer.evaluate(eval_loader)

Epoch: 0
50 iteration USE_EMA: True, train/sup_loss: 2.3674, train/unsup_loss: 0.0000, train/total_loss: 2.3674, train/util_ratio: 0.0000, train/run_time: 0.1369, lr: 0.0005, train/prefecth_time: 0.0040 
100 iteration USE_EMA: True, train/sup_loss: 2.3490, train/unsup_loss: 0.0000, train/total_loss: 2.3490, train/util_ratio: 0.0000, train/run_time: 0.1373, lr: 0.0005, train/prefecth_time: 0.0038 
150 iteration USE_EMA: True, train/sup_loss: 2.3455, train/unsup_loss: 0.0000, train/total_loss: 2.3455, train/util_ratio: 0.0000, train/run_time: 0.1377, lr: 0.0005, train/prefecth_time: 0.0037 


KeyboardInterrupt: 