In [1]:
import os
import time
import torch
import numpy as np
from torch.autograd import Variable
import models
from config import cfg
from data_loader import data_loader
from loss import make_loss
from optimizer import make_optimizer
from scheduler import make_scheduler
from logger import make_logger
from evaluation import evaluation
from datasets import PersonReID_Dataset_Downloader
from utils import check_jupyter_run
if check_jupyter_run():
    from tqdm import tqdm_notebook as tqdm
else:
    from tqdm import tqdm

config_file = "./config/softmax.yaml"
cfg.merge_from_file(config_file)
cfg.freeze()

PersonReID_Dataset_Downloader('./datasets',cfg.DATASETS.NAMES)

output_dir = cfg.OUTPUT_DIR
if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

logger = make_logger("reid_baseline", output_dir)
logger.info("Using {} GPUS".format(1))
logger.info("Loaded configuration file {}".format(config_file))
logger.info("Running with config:\n{}".format(cfg))

train_loader, val_loader, num_query, num_classes = data_loader(cfg)
model = getattr(models, cfg.MODEL.NAME)(num_classes)
optimizer = make_optimizer(cfg, model)
scheduler = make_scheduler(cfg,optimizer)
loss_fn = make_loss(cfg)

log_period = cfg.SOLVER.LOG_PERIOD
checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
eval_period = cfg.SOLVER.EVAL_PERIOD
output_dir = cfg.OUTPUT_DIR
device = torch.device(cfg.MODEL.DEVICE)
epochs = cfg.SOLVER.MAX_EPOCHS
logger.info("Start training")

since = time.time()
for epoch in tqdm(range(epochs), desc='Epoch'):
    count = 0
    running_loss = 0.0
    running_acc = 0
    for data in tqdm(train_loader, desc='Iteration', leave=False):
        model.train()
        images, labels = data
        
        if device:
            model.to(device)
            images, labels = images.to(device), labels.to(device)
        
        optimizer.zero_grad()
                
        scores, feats = model(images)
        loss = loss_fn(scores, feats, labels)
        
        loss.backward()
        optimizer.step()
        
        count = count + 1
        running_loss += loss.item()
        running_acc += (scores.max(1)[1] == labels).float().mean().item()
        
        if count % log_period == 0:
            logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                                .format(epoch+1, count, len(train_loader),
                                running_loss/count, running_acc/count,
                                scheduler.get_lr()[0]))
            
    if epoch % checkpoint_period == 0:
        model.save(output_dir,epoch+1)
    
    # Validation
    if epoch % eval_period == 0:
        all_feats = []
        all_pids = []
        all_camids = []
        
        for data in tqdm(val_loader, desc='Feature Extraction', leave=False):
            model.eval()
            with torch.no_grad():
                images, pids, camids = data

                if device:
                    model.to(device)
                    images = images.to(device)

                feats = model(images)

            all_feats.append(feats)
            all_pids.extend(np.asarray(pids))
            all_camids.extend(np.asarray(camids))

        all_feats = torch.cat(all_feats, dim=0)
        # query
        qf = all_feats[:num_query]
        q_pids = np.asarray(all_pids[:num_query])
        q_camids = np.asarray(all_camids[:num_query])
        # gallery
        gf = all_feats[num_query:]
        g_pids = np.asarray(all_pids[num_query:])
        g_camids = np.asarray(all_camids[num_query:])
        
        m, n = qf.shape[0], gf.shape[0]
        distmat = torch.pow(qf, 2).sum(dim=1, keepdim=True).expand(m, n) + \
                  torch.pow(gf, 2).sum(dim=1, keepdim=True).expand(n, m).t()
        distmat.addmm_(1, -2, qf, gf.t())
        distmat = distmat.cpu().numpy()
        cmc, mAP = evaluation(distmat, q_pids, g_pids, q_camids, g_camids)
        logger.info("Validation Results - Epoch: {}".format(epoch))
        logger.info("mAP: {:.1%}".format(mAP))
        for r in [1, 5, 10]:
            logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(r, cmc[r - 1]))
        

time_elapsed = time.time() - since
logger.info('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
logger.info('-' * 10)

Dataset Check Success: Market1501 exists!
2019-02-04 07:40:30,204 reid_baseline INFO: Using 1 GPUS
2019-02-04 07:40:30,205 reid_baseline INFO: Loaded configuration file ./config/softmax.yaml
2019-02-04 07:40:30,206 reid_baseline INFO: Running with config:
DATALOADER:
  NUM_INSTANCE: 16
  NUM_WORKERS: 8
  SAMPLER: softmax
DATASETS:
  NAMES: Market1501
  STORE_DIR: ./datasets
INPUT:
  PADDING: 10
  PIXEL_MEAN: [0.485, 0.456, 0.406]
  PIXEL_STD: [0.229, 0.224, 0.225]
  PROB: 0.5
  SIZE_TEST: [384, 128]
  SIZE_TRAIN: [384, 128]
MODEL:
  DEVICE: cuda:7
  LAST_STRIDE: 1
  NAME: ResNet50
  PRETRAIN_PATH: 
OUTPUT_DIR: ./checkpoints/Market1501/Softmax_BS64_384x128
SOLVER:
  BASE_LR: 0.00035
  BIAS_LR_FACTOR: 1
  CHECKPOINT_PERIOD: 20
  EVAL_PERIOD: 1
  GAMMA: 0.1
  IMS_PER_BATCH: 64
  LOG_PERIOD: 100
  MARGIN: 0.3
  MAX_EPOCHS: 120
  MOMENTUM: 0.9
  OPTIMIZER_NAME: Adam
  STEP: 40
  WARMUP: True
  WARMUP_FACTOR: 0.01
  WARMUP_ITERS: 5
  WARMUP_METHOD: linear
  WARMUP_STEPS: [40, 70]
  WEIGHT_DE

HBox(children=(IntProgress(value=0, description='Epoch', max=120), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Iteration', max=203), HTML(value='')))

2019-02-04 07:41:40,302 reid_baseline INFO: Epoch[1] Iteration[100/203] Loss: 6.615, Acc: 0.003, Base Lr: -6.58e-05
2019-02-04 07:42:44,016 reid_baseline INFO: Epoch[1] Iteration[200/203] Loss: 6.605, Acc: 0.008, Base Lr: -6.58e-05
Model:resnet50_epo1.pth saves successfully


HBox(children=(IntProgress(value=0, description='Feature Extraction', max=76), HTML(value='')))

Traceback (most recent call last):
  File "/home/linshan/anaconda3/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/home/linshan/anaconda3/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/linshan/anaconda3/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/home/linshan/anaconda3/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/home/linshan/anaconda3/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/home/linshan/anaconda3/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/linshan/anaconda3/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
   

KeyboardInterrupt: 