## SimCLR: A Simple Framework for Contrastive Learning of Visual Representations

This colab demonstrates how to load pretrained/finetuned SimCLR models from checkpoints or hub modules. It contains two parts:

* Part I - Load checkpoints and print parameters (count)
* Part II - Load hub module for inference

The checkpoints are accessible in the following Google Cloud Storage folders.

* Pretrained SimCLRv2 models with a linear classifier: [gs://simclr-checkpoints/simclrv2/pretrained](https://console.cloud.google.com/storage/browser/simclr-checkpoints/simclrv2/pretrained)
* Fine-tuned SimCLRv2 models on 1% of labels: [gs://simclr-checkpoints/simclrv2/finetuned_1pct](https://console.cloud.google.com/storage/browser/simclr-checkpoints/simclrv2/finetuned_1pct)
* Fine-tuned SimCLRv2 models on 10% of labels: [gs://simclr-checkpoints/simclrv2/finetuned_10pct](https://console.cloud.google.com/storage/browser/simclr-checkpoints/simclrv2/finetuned_10pct)
* Fine-tuned SimCLRv2 models on 100% of labels: [gs://simclr-checkpoints/simclrv2/finetuned_100pct](https://console.cloud.google.com/storage/browser/simclr-checkpoints/simclrv2/finetuned_100pct)
* Supervised models with the same architectures: [gs://simclr-checkpoints/simclrv2/pretrained](https://console.cloud.google.com/storage/browser/simclr-checkpoints/simclrv2/pretrained)

Use the corresponding checkpoint / hub-module paths for accessing the model. For example, to use a pre-trained model (with a linear classifier) with ResNet-152 (2x+SK), set the path to `gs://simclr-checkpoints/simclrv2/pretrained/r152_2x_sk1`.

# 1. Imagenet Dataset 다운로드 / 전처리

In [1]:
# from __future__ import absolute_import
# from __future__ import division
# from __future__ import print_function
import sys
import os
import yaml
import re
import numpy as np

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
import tensorflow_hub as hub
import tensorflow_datasets as tfds

import matplotlib
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn import preprocessing

import importlib.util
from PIL import Image

# # import data as data_li# b
# import model as model_# lib
# import model_util as model_util

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
pwd

'/home/user/Desktop/pky/simclr'

In [3]:
import torch, torchvision
from torchvision import transforms as T
# Load details
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device:", device)

Using device: cpu


In [4]:
# 1-3. Load data
dataType = 'val' # datatype (train / val)
dataHome = '/SSD_data/Imagenet2012' # Imagenet2012 path (classification)
dataPath = os.path.join(dataHome, dataType)
print('dataPath:', dataPath)

transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()])
data_total = 1000

def _load_imagenet(prefix='train'):
    imagenet_data = torchvision.datasets.ImageNet(dataPath,split=prefix,download=False,transform=transform)
    data_loader = torch.utils.data.DataLoader(imagenet_data,batch_size=data_total,shuffle=True,num_workers=16)
    for data in data_loader:
        return data

print(dataType, ": Load images...")
X_data, labels = _load_imagenet(dataType)
X_data, labels = X_data.numpy(), labels.numpy()
X_data = X_data.transpose(0,2,3,1) # HWC
print("Load Finished!")

dataPath: /SSD_data/Imagenet2012/val
val : Load images...
Load Finished!


In [5]:
print(X_data.shape, labels.shape)

(1000, 224, 224, 3) (1000,)


In [6]:
import time
savePath = './save/Imagenet/' # save path
timestr = time.strftime("%Y%m%d-%H%M%S") # time stamp
print('savePath:', savePath)
print('timeStamp:', timestr)

savePath: ./save/Imagenet/
timeStamp: 20200706-110039


In [9]:
model_name = 'pretrained_r50_1x_sk0'
hub_path = 'gs://simclr-checkpoints/simclrv2/pretrained/r50_1x_sk0/hub/' # self-supervised
# hub_path = 'gs://simclr-checkpoints/simclrv2/finetuned_1pct/r50_1x_sk0/hub/' # 1% fine-tuned
# hub_path = 'gs://simclr-checkpoints/simclrv2/finetuned_10pct/r50_1x_sk0/hub/' # 10% fine-tuned
# hub_path = 'gs://simclr-checkpoints/simclrv2/finetuned_100pct/r50_1x_sk0/hub/' # 100% fine-tuned
# hub_path = 'gs://simclr-checkpoints/simclrv2/supervised/r50_1x_sk0/hub/' # supervised
module = hub.Module(hub_path, trainable=False)

In [39]:
batch_size = 5
dataset = {'image':X_data, 'label':labels}
dataset_batch = tf.data.Dataset.from_tensor_slices(dataset).batch(batch_size)
dataset_batch = tf.data.make_one_shot_iterator(dataset_batch).get_next()

In [40]:
keys = module(inputs=dataset_batch['image'], signature="default", as_dict=True)
features = keys['default']
logits = keys['logits_sup']
features, logits

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


(<tf.Tensor 'module_apply_default_4/base_model/final_avg_pool:0' shape=(?, 2048) dtype=float32>,
 <tf.Tensor 'module_apply_default_4/head_supervised/linear_layer/linear_layer_out:0' shape=(?, 1000) dtype=float32>)

In [41]:
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
sess.run(tf.global_variables_initializer())

In [42]:
from tqdm import tqdm_notebook
device = '/gpu:2'
features_total = np.empty((0, 2048))
logits_total = np.empty((0, 1000))

iter = data_total // batch_size
with tf.device(device):
    for _ in tqdm_notebook(range(3), desc='Extract Features'):
        features_, logits_ = sess.run((features, logits))
        features_total = np.append(features_total, features_, axis=0)
        logits_total = np.append(logits_total, logits_, axis=0)

print(features_total.shape, logits_total.shape)

HBox(children=(FloatProgress(value=0.0, description='Extract Features', max=3.0, style=ProgressStyle(descripti…


(15, 2048) (15, 1000)


In [43]:
print(features_total.shape, logits_total.shape)

(15, 2048) (15, 1000)


In [48]:
#### 저장
model_name = 'pretrained_r50_1x_sk0'
np.save(savePath+dataType+'_'+model_name+'_features_'+timestr, features_total)
np.save(savePath+dataType+'_'+model_name+'_logits_'+timestr, logits_total)

In [None]:
 = np.load("./save/STL10/train_simclr.npy")
X_test_feature_simclr = np.load("./save/STL10/test_simclr.npy")
X_train_feature_pre = np.load("./save/STL10/train_pretrained.npy")
X_test_feature_pre = np.load("./save/STL10/test_pretrained.npy")

In [2]:
pwd

'/home/user/Desktop/pky/simclr'

In [10]:
import numpy as np
a = np.load('save/Imagenet/dataset/train_labels_3.npy')
a.shape

(10047,)

In [None]:
import numpy as np

a = np.load('save/Imagenet/data')


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os, yaml
from pandas import DataFrame
import pickle




In [9]:
with open("imagenet1000_classes.txt", 'r') as file:
    classname = eval(file.read())
n_classes = len(classname)
classname_new = [list(classname.values())[i].split(',')[0] for i in range(1000)]

subset_dict = {
    1: 'goldfish',
    8: 'hen',
    153: 'Maltese dog',
    281: 'tabby cat',
    292: 'tiger',
    532: 'dining table',
    559: 'folding chair',
    681: 'notebook',
    817: 'sports car',
    895: 'military plane',
}
subset = np.array(list(subset_dict.items()))
subset_keys = subset[:,0].astype(int)
subset_vals = subset[:,1]

In [31]:
dataType = 'val'
dataPath = '/home/user/Desktop/pky/simclr/save/Imagenet/dataset/'
batch_size = 1000

def _load_imagenet():
    for file_num in range(5):
        data = np.load(dataPath+dataType+'_images_{}.npy'.format(file_num))
        label = np.load(dataPath+dataType+'_labels_{}.npy'.format(file_num))
        for i in range(0, data.shape[0], batch_size):
            yield data[i: i+batch_size], label[i: i+batch_size]


new_set = np.empty((0,224,224,3))
new_labels = np.array([])
print("batch computation start...")
for batch_data, batch_label in _load_imagenet():
    for key in subset_keys:
        idx = np.where(batch_label == key)
        new_set = np.append(new_set, batch_data[idx], axis=0, axis=0)
        new_labels = np.append(new_labels, batch_la, axis=0bel[idx], axis=0)
print("batch Finished!")

# 저장
savePath = '/home/user/Desktop/pky/simclr/save/Imagenet/'
np.save(savePath+dataType+'_images10_diff', new_set)
np.save(savePath+dataType+'_labels10_diff', new_labels)

SyntaxError: invalid token (<ipython-input-31-95ddd540c955>, line 20)

In [32]:
dataType = 'val'
dataPath = '/home/user/Desktop/pky/simclr/save/Imagenet/dataset/'
batch_size = 1000

def _load_imagenet():
    for file_num in range(5):
        data = np.load(dataPath+dataType+'_images_{}.npy'.format(file_num))
        label = np.load(dataPath+dataType+'_labels_{}.npy'.format(file_num))
        for i in range(0, data.shape[0], batch_size):
            yield data[i: i+batch_size], label[i: i+batch_size]


new_set = np.empty((0,224,224,3))
new_labels = np.array([])
print("batch computation start...")
for batch_data, batch_label in _load_imagenet():
    for key in subset_keys:
        idx = np.where(batch_label == key)
        new_set = np.append(new_set, batch_data[idx], axis=0)
        new_labels = np.append(new_labels, batch_label[idx], axis=0)
print("batch Finished!")

# 저장
savePath = '/home/user/Desktop/pky/simclr/save/Imagenet/'
np.save(savePath+dataType+'_images10_diff', new_set)
np.save(savePath+dataType+'_labels10_diff', new_labels)

batch computation start...
batch Finished!


In [34]:
a = np.load('save/Imagenet/val_images10_diff.npy')
a.shape

(500, 224, 224, 3)

'/home/user/Desktop/pky/simclr'