In [1]:
import copy
import time
from pathlib import Path
import os


import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from torchvision import transforms
import torch.optim as optim
from torch.optim import lr_scheduler

from multiprocessing import Pool
import numpy as np
import pandas as pd

import torchvision
from torchvision import datasets, models, transforms

from PIL import Image

import matplotlib as mpl
mpl_params = {
    'figure.figsize': (10, 5),
    'figure.dpi': 300,
}
from matplotlib import pyplot as plt
mpl.rcParams.update(mpl_params)

import seaborn as sns
sns.set()

**Inception V2/V3 paper**

https://arxiv.org/pdf/1512.00567v3.pdf

**PyTorch InceptionV3 source**

https://github.com/pytorch/vision/blob/master/torchvision/models/inception.py

**InceptionV3 Tranfer Learning Gist (Some of the notebook from here)** 

https://gist.github.com/Prakashvanapalli/fba135778219c37bacc744d8dbfb43b1

**PyTorch: Deep Learning (PyData Berlin 2018) (Most of notebook based off this talk/notebooks)** 

https://github.com/sotte/pytorch_tutorial/blob/master/notebooks/00_index.ipynb

**InceptionV1 and InceptionV3 available from PyTorch**

**InceptionV2 and InceptionV4 available from**

https://github.com/Cadene/pretrained-models.pytorch


# TODO:

Learn about this:

    from sklearn.preprocessing import MultiLabelBinarizer
    
Pillow SIMD Install in Conda env:

    https://gist.github.com/soumith/01da3874bf014d8a8c53406c2b95d56b


# Device

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

# Dataset

Required to implement `__len__` and `__getitem__`. 

+ labels convenience attributes
+ image display helpers
+ ...

In [3]:
TRAIN_DIR = Path('../input/train')

In [4]:
LABELS = {
    0: 'Nucleoplasm', 
    1: 'Nuclear membrane',   
    2: 'Nucleoli',   
    3: 'Nucleoli fibrillar center' ,  
    4: 'Nuclear speckles',
    5: 'Nuclear bodies',
    6: 'Endoplasmic reticulum',   
    7: 'Golgi apparatus',
    8: 'Peroxisomes',
    9: 'Endosomes',
    10: 'Lysosomes',
    11: 'Intermediate filaments',   
    12: 'Actin filaments',
    13: 'Focal adhesion sites',   
    14: 'Microtubules',
    15: 'Microtubule ends',   
    16: 'Cytokinetic bridge',   
    17: 'Mitotic spindle',
    18: 'Microtubule organizing center',  
    19: 'Centrosome',
    20: 'Lipid droplets',   
    21: 'Plasma membrane',   
    22: 'Cell junctions', 
    23: 'Mitochondria',
    24: 'Aggresome',
    25: 'Cytosol',
    26: 'Cytoplasmic bodies',   
    27: 'Rods & rings'
}

LABEL_NAMES = list(LABELS.values())
LABEL_KEYS = list(LABELS.keys())

In [5]:
df = pd.read_csv('../input/train.csv')
df.head()

Unnamed: 0,Id,Target
0,00070df0-bbc3-11e8-b2bc-ac1f6b6435d0,16 0
1,000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0,7 1 2 0
2,000a9596-bbc4-11e8-b2bc-ac1f6b6435d0,5
3,000c99ba-bba4-11e8-b2b9-ac1f6b6435d0,1
4,001838f8-bbca-11e8-b2bc-ac1f6b6435d0,18


In [6]:
[int(x) for x in df.iloc[1].Target.split()]

[7, 1, 2, 0]

In [7]:
ls '../input/train/' | head

00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_blue.png
00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_green.png
00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_red.png
00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_yellow.png
000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_blue.png
000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_green.png
000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_red.png
000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_yellow.png
000a9596-bbc4-11e8-b2bc-ac1f6b6435d0_blue.png
000a9596-bbc4-11e8-b2bc-ac1f6b6435d0_green.png


In [42]:
class ProteinDataset(Dataset):

    def __init__(self, train_df, images_dir, transform=None):            
        self.df = train_df.copy()  # TODO: Rename. Actualy replace df maybe?
        self._dir = images_dir # TODO: PIL check?
        self.transform = transform

    def __len__(self):
        return len(self.df)
                                      
    def __getitem__(self, key):
        id_ = self.df.iloc[key].Id
        
        #  TODO: Clean this up...
        r = np.array(Image.open(self._dir / f'{id_}_red.png'), np.uint8)
        g = np.array(Image.open(self._dir / f'{id_}_green.png'), np.uint8)
        b = np.array(Image.open(self._dir / f'{id_}_blue.png'), np.uint8)
        y = np.array(Image.open(self._dir / f'{id_}_yellow.png'), np.uint8)
        
        rgb = np.stack([
            r // 2 + y // 2,
            g // 2 + y // 2,
            b // 2
        ], axis=2)
        
        rgb = np.array(Image.fromarray(rgb).resize((299, 299)))  # InceptionV3 input
        rgb = np.array(rgb) / 255.0  # 0.0 - 1.0 floats Better way?
        
        y = [int(x) for x in self.df.iloc[key].Target.split()]
        
        if transform:
            X = self.transform(rgb)
        else:
            X = rgb
            
        return X, y  # TODO: Generator...

# Transforms

*Get pillow-simd*  https://github.com/uploadcare/pillow-simd /  http://python-pillow.org/pillow-perf/

1. How to combine images
2. PIL -> Tensor
3. Normalize?

+ Augment labels with least samples?
+ ...

In [9]:
transform = transforms.Compose([
    transforms.transforms.ToTensor()
])

In [43]:
val_ds = ProteinDataset(df, Path('../input/train'), transform=transform)

In [11]:
val_ds[0]

(tensor([[[0.0157, 0.0000, 0.0000,  ..., 0.0039, 0.0078, 0.0000],
          [0.0000, 0.0000, 0.0235,  ..., 0.0039, 0.0078, 0.0275],
          [0.0078, 0.0157, 0.0000,  ..., 0.0118, 0.0549, 0.0275],
          ...,
          [0.0078, 0.0039, 0.0118,  ..., 0.0314, 0.0000, 0.0588],
          [0.0196, 0.0431, 0.0275,  ..., 0.0431, 0.0667, 0.0000],
          [0.0000, 0.0078, 0.0000,  ..., 0.0157, 0.0039, 0.0039]],
 
         [[0.0157, 0.0510, 0.0000,  ..., 0.0000, 0.0078, 0.0000],
          [0.1020, 0.0706, 0.0510,  ..., 0.0039, 0.0353, 0.0000],
          [0.0275, 0.0431, 0.0706,  ..., 0.0314, 0.0431, 0.0196],
          ...,
          [0.0078, 0.0000, 0.0314,  ..., 0.0431, 0.0000, 0.0039],
          [0.0235, 0.0275, 0.0235,  ..., 0.0039, 0.0157, 0.0039],
          [0.0000, 0.0039, 0.0000,  ..., 0.0118, 0.0000, 0.0000]],
 
         [[0.0000, 0.0392, 0.0078,  ..., 0.0235, 0.0157, 0.0000],
          [0.0235, 0.0275, 0.0196,  ..., 0.0078, 0.0000, 0.0000],
          [0.0275, 0.0000, 0.0157,  ...,

# DataLoader

In [50]:
val_dl = DataLoader(
    val_ds,
    batch_size=32,
    shuffle=False,
    num_workers=2,
)

# Sampler

# PreTrained Model

In [13]:
model = torchvision.models.inception_v3(pretrained=True).double()

In [14]:
# model.to(device)

In [15]:
# Freeze all layers, change fully connected layer from 1000 outputs to our 28 label output, unfreeze last layer
# How to change this from single classification to multilabel classification?
# 

#for name, param in model.named_parameters():
#     param.requires_grad = False
# n_features = model.fc.in_features
# model.fc = nn.Linear(n_features, len(LABELS))

# Feature Extraction 

### Forward Hook for Feature Extraction

https://discuss.pytorch.org/t/how-to-get-separate-conv-feature-map-from-pretrained-resnet/3479/4?u=justusschock

In [16]:
# features1a = []
# def hook1a(module, input, output):
#     features1a.extend(output)

# model.Conv2d_1a_3x3.register_forward_hook(hook1a)

# features2b = []
# def hook2b(module, input, output):
#     features2b.extend(output)

# model.Conv2d_2b_3x3.register_forward_hook(hook2b)

# features3b = []
# def hook3b(module, input, output):
#     features3b.extend(output)

# model.Conv2d_3b_1x1.register_forward_hook(hook3b)

# features4a = []
# def hook4a(module, input, output):
#     features4a.extend(output)

# model.Conv2d_4a_3x3.register_forward_hook(hook4a)

#     def __init__(self, num_classes=1000, aux_logits=True, transform_input=False):
#         super(Inception3, self).__init__()
#         self.aux_logits = aux_logits
#         self.transform_input = transform_input
#         self.Conv2d_1a_3x3 = BasicConv2d(3, 32, kernel_size=3, stride=2)
#         self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3)
#         self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1)
#         self.Conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1)
#         self.Conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3)
#         self.Mixed_5b = InceptionA(192, pool_features=32)
#         self.Mixed_5c = InceptionA(256, pool_features=64)
#         self.Mixed_5d = InceptionA(288, pool_features=64)
#         self.Mixed_6a = InceptionB(288)
#         self.Mixed_6b = InceptionC(768, channels_7x7=128)
#         self.Mixed_6c = InceptionC(768, channels_7x7=160)
#         self.Mixed_6d = InceptionC(768, channels_7x7=160)
#         self.Mixed_6e = InceptionC(768, channels_7x7=192)
#         if aux_logits:
#             self.AuxLogits = InceptionAux(768, num_classes)
#         self.Mixed_7a = InceptionD(768)
#         self.Mixed_7b = InceptionE(1280)
#         self.Mixed_7c = InceptionE(2048)
#         self.fc = nn.Linear(2048, num_classes)

## Visualize layers

In [17]:
import itertools

def grid(layer_weights):
    n, rows, cols = layer_weights.shape
    grid_size = int(np.ceil(np.sqrt(n)))
    grid_total = grid_size * grid_size
    n_empty = grid_total - n
    layer_weights = np.concatenate([layer_weights, np.zeros((n_empty, rows, cols))])
    layer_weights = layer_weights.reshape((grid_size, grid_size, rows, cols))
    im_array = np.vstack([np.hstack([col for col in row]) for row in layer_weights])
    im_array *= 255.0
    pil_im = Image.fromarray(im_array)
    return pil_im

layers = itertools.cycle([
    'Conv2d_1a_3x3',
    'Conv2d_2a_3x3',
    'Conv2d_2b_3x3',
    'Conv2d_3b_1x1',
    'Conv2d_4a_3x3',
    'Mixed_5b',
    'Mixed_5c',
    'Mixed_5d',
    'Mixed_6a',
    'Mixed_6b',
    'Mixed_6c',
    'Mixed_6d',
    'Mixed_6e',
    'Mixed_7a',
    'Mixed_7b',
    'Mixed_7c'
])

label_dir = ''


def hook(module, input, output):
    grid(output.squeeze()).convert('L').save(f'{label_dir}/{next(layers)}.png')

model.Conv2d_1a_3x3.register_forward_hook(hook)
# 149 x 149 x 32

model.Conv2d_2a_3x3.register_forward_hook(hook)
# 147 x 147 x 32

model.Conv2d_2b_3x3.register_forward_hook(hook)
# 147 x 147 x 64

model.Conv2d_3b_1x1.register_forward_hook(hook)
# 73 x 73 x 80

model.Conv2d_4a_3x3.register_forward_hook(hook)
# 71 x 71 x 192

model.Mixed_5b.register_forward_hook(hook)
# 35 x 35 x 256

model.Mixed_5c.register_forward_hook(hook)
# 35 x 35 x 288

model.Mixed_5d.register_forward_hook(hook)
# 35 x 35 x 288

model.Mixed_6a.register_forward_hook(hook)
# 17 x 17 x 768

model.Mixed_6b.register_forward_hook(hook)
# 17 x 17 x 768

model.Mixed_6c.register_forward_hook(hook)
# 17 x 17 x 768

model.Mixed_6d.register_forward_hook(hook)
# 17 x 17 x 768

model.Mixed_6e.register_forward_hook(hook)
# 17 x 17 x 768

model.Mixed_7a.register_forward_hook(hook)
# 8 x 8 x 1280

model.Mixed_7b.register_forward_hook(hook)
# 8 x 8 x 2048

model.Mixed_7c.register_forward_hook(hook)
# 8 x 8 x 2048


<torch.utils.hooks.RemovableHandle at 0x12b64e0b8>

In [18]:
start_ts = time.time()
model.eval()
labels_collected = []
with torch.no_grad():
    for X, y in val_dl:
        if len(y) > 1 and 10 not in y and 15 not in y and 17 not in y:
            continue
            
        if 10 in y:
            single_label = 10
        elif 15 in y:
            single_label = 15
        elif 17 in y:
            single_label = 17
        else:
            single_label = int(y[0])

        if single_label in labels_collected:
            continue
            
        labels_collected.append(single_label)

        label_dir = f'{str(single_label)}_{LABELS[single_label]}'
        Path(label_dir).mkdir(parents=True, exist_ok=True)

        t1 = time.time()
        model(X)
        print(f'Created {label_dir}') # Processed: {len(features2b)} Total: {len(val_ds)} {time.time() - t1}')
print(time.time() - start_ts)

Created 5_Nuclear bodies
Created 1_Nuclear membrane
Created 18_Microtubule organizing center
Created 0_Nucleoplasm
Created 7_Golgi apparatus
Created 23_Mitochondria
Created 21_Plasma membrane
Created 25_Cytosol
Created 11_Intermediate filaments
Created 13_Focal adhesion sites
Created 12_Actin filaments
Created 2_Nucleoli
Created 20_Lipid droplets
Created 3_Nucleoli fibrillar center
Created 14_Microtubules
Created 17_Mitotic spindle
Created 19_Centrosome
Created 6_Endoplasmic reticulum
Created 4_Nuclear speckles
Created 10_Lysosomes
Created 26_Cytoplasmic bodies
Created 22_Cell junctions
Created 8_Peroxisomes
Created 24_Aggresome
Created 16_Cytokinetic bridge
Created 9_Endosomes
Created 15_Microtubule ends
Created 27_Rods & rings
701.5315017700195


# Criterion

# Optimizer

# Train Loop

# Evaluation

In [51]:
%%timeit
count = 0
with torch.no_grad():
    t1 = time.time()
    for X, y in val_dl:
        print(f'Loaded in {time.time() - t1}')
        count += 1
        if count > 0:
            break

Loaded in 0.661773681640625
Loaded in 0.7015519142150879
Loaded in 0.7658672332763672
Loaded in 0.8106927871704102
Loaded in 0.9024658203125


Process Process-97:
Traceback (most recent call last):
  File "/Users/david.wagner/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/david.wagner/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/david.wagner/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 106, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/Users/david.wagner/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 106, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "<ipython-input-42-720500edf817>", line 23, in __getitem__
    b // 2
KeyboardInterrupt
Process Process-98:
Traceback (most recent call last):


KeyboardInterrupt: 

  File "/Users/david.wagner/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/david.wagner/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/david.wagner/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 106, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/Users/david.wagner/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 187, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/Users/david.wagner/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 187, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/Users/david.wagner/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 164, in default_collate
    return torch.stack(batch, 0, out=out)
K

In [74]:
def mp_load(fn):
    pil_im = Image.open(fn)
    return np.array(pil_im, np.uint8)

from multiprocessing.dummy import Pool
p = Pool()

In [120]:
t1 = time.time()
for _ in range(32):
    id_ = '00070df0-bbc3-11e8-b2bc-ac1f6b6435d0'
    key = 0
    images_dir = Path('../input/train')

    #  TODO: Clean this up...
#     r = np.array(Image.open(images_dir / f'{id_}_red.png'), np.uint8)
#     g = np.array(Image.open(images_dir / f'{id_}_green.png'), np.uint8)
#     b = np.array(Image.open(images_dir / f'{id_}_blue.png'), np.uint8)
#     y = np.array(Image.open(images_dir / f'{id_}_yellow.png'), np.uint8)

    r, g, b, y = p.map(mp_load, [(images_dir / f'{id_}_red.png'),
                                 (images_dir / f'{id_}_green.png'), 
                                 (images_dir / f'{id_}_blue.png'), 
                                 (images_dir / f'{id_}_yellow.png'),])

    rgb = np.stack([
        r // 2 + y // 2,
        g // 2 + y // 2,
        b // 2
    ], axis=2)

    rgb = np.array(Image.fromarray(rgb).resize((299, 299)))  # InceptionV3 input
    rgb = np.array(rgb) / 255.0  # 0.0 - 1.0 floats Better way?

#     y = [int(x) for x in df.iloc[key].Target.split()]

    X = transform(rgb)
print(time.time() - t1)

0.33272600173950195


In [98]:
t1 = time.time()
id_ = '00070df0-bbc3-11e8-b2bc-ac1f6b6435d0'
key = 0
images_dir = Path('../input/train')
p.map(mp_load, [(images_dir / f'{id_}_red.png'),
                (images_dir / f'{id_}_green.png'), 
                (images_dir / f'{id_}_blue.png'), 
                (images_dir / f'{id_}_yellow.png'),])
time.time() - t1

0.007842063903808594

In [168]:
t1 = time.time()
Image.open(images_dir / f'{df.loc[12].Id}_red.png')
time.time() - t1

0.0060882568359375

In [169]:
t1 = time.time()
rgb = np.stack([
    r // 2 + y // 2,
    g // 2 + y // 2,
    b // 2
], axis=2)
time.time() - t1

0.004925966262817383

In [170]:
t1 = time.time()
test = np.array(Image.fromarray(rgb).resize((299, 299)))  # InceptionV3 input
test2 = np.array(test) / 255.0  # 0.0 - 1.0 floats Better way?
time.time() - t1

0.0017452239990234375

In [178]:
np.save('test', r)

In [179]:
ls

1512.00567v3.pdf           draft_vis.ipynb            [34minception_visualization[m[m/   kernel.ipynb               test.npy
InceptionLayers.zip        explore.ipynb              inceptionv3.ipynb          row.png                    visualize_inception.ipynb


In [171]:
x = torch.Tensor(r)

In [192]:
df.Target.tolist()[1]

'7 1 2 0'

In [188]:
next_list = iter(df.Target.tolist())

In [191]:
next(next_list)

'7 1 2 0'

In [183]:
%time
torch.save(x, 'test_tensor')

CPU times: user 6 µs, sys: 37 µs, total: 43 µs
Wall time: 15.3 µs


In [175]:
pickle.dump(x, 'test')

[0;31mSignature:[0m [0mpickle[0m[0;34m.[0m[0mdump[0m[0;34m([0m[0mobj[0m[0;34m,[0m [0mfile[0m[0;34m,[0m [0mprotocol[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0;34m*[0m[0;34m,[0m [0mfix_imports[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Write a pickled representation of obj to the open file object file.

This is equivalent to ``Pickler(file, protocol).dump(obj)``, but may
be more efficient.

The optional *protocol* argument tells the pickler to use the given
protocol supported protocols are 0, 1, 2, 3 and 4.  The default
protocol is 3; a backward-incompatible protocol designed for Python 3.

Specifying a negative protocol version selects the highest protocol
version supported.  The higher the protocol used, the more recent the
version of Python needed to read the pickle produced.

The *file* argument must have a write() method that accepts a single
bytes argument.  It can thus be a file object opened for binary
writing, an io