In [1]:
import copy
import itertools
import time
from pathlib import Path
import os

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from torchvision import transforms
import torch.optim as optim
from torch.optim import lr_scheduler

from multiprocessing.dummy import Pool
import numpy as np
import pandas as pd

import torchvision
from torchvision import datasets, models, transforms

from PIL import Image

import matplotlib as mpl
# Notebook-wide figure defaults: 10x5 inch figures rendered at 300 dpi.
mpl_params = {
    'figure.figsize': (10, 5),
    'figure.dpi': 300,
}
from matplotlib import pyplot as plt
mpl.rcParams.update(mpl_params)

import seaborn as sns
# NOTE(review): sns.set() runs after the rcParams update above; depending on
# the installed seaborn version it may override some of those values — confirm.
sns.set()

**Inception V2/V3 paper**

https://arxiv.org/pdf/1512.00567v3.pdf

**PyTorch InceptionV3 source**

https://github.com/pytorch/vision/blob/master/torchvision/models/inception.py

**InceptionV3 Transfer Learning Gist (some of this notebook comes from here)**

https://gist.github.com/Prakashvanapalli/fba135778219c37bacc744d8dbfb43b1

**PyTorch: Deep Learning (PyData Berlin 2018) (most of this notebook is based on this talk and its notebooks)**

https://github.com/sotte/pytorch_tutorial/blob/master/notebooks/00_index.ipynb

**InceptionV1 and InceptionV3 available from PyTorch**

**InceptionV2 and InceptionV4 available from**

https://github.com/Cadene/pretrained-models.pytorch


# Config

In [2]:
# When True, the "PreTrained Model" cell further down freezes every model
# parameter and replaces model.fc with a len(LABELS)-output linear layer.
GENERATE_FEATURES = False

# TODO:

Learn about this:

    from sklearn.preprocessing import MultiLabelBinarizer
    
Pillow SIMD Install in Conda env:

    https://gist.github.com/soumith/01da3874bf014d8a8c53406c2b95d56b


# Device

In [3]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cpu')

# Dataset

A PyTorch `Dataset` subclass is required to implement `__len__` and `__getitem__`.

+ labels convenience attributes
+ image display helpers
+ ...

In [4]:
# List the competition input directory.
# NOTE(review): the recorded output shows train/ and train.csv directly under
# ../input, while later cells read from
# ../input/human-protein-atlas-image-classification/ — confirm which layout is current.
ls ../input

[34mFeatures[m[m/              [34mtest[m[m/                  train.csv
sample_submission.csv  [34mtrain[m[m/


In [8]:
# Directory holding the training images, stored as one PNG per color channel
# and named "<Id>_<color>.png".
TRAIN_DIR = Path('../input/human-protein-atlas-image-classification/train')

In [9]:
# Class names in label-id order: the position of a name in this list is its
# integer id (the Target column of train.csv appears to use these ids).
LABEL_NAMES = [
    'Nucleoplasm',
    'Nuclear membrane',
    'Nucleoli',
    'Nucleoli fibrillar center',
    'Nuclear speckles',
    'Nuclear bodies',
    'Endoplasmic reticulum',
    'Golgi apparatus',
    'Peroxisomes',
    'Endosomes',
    'Lysosomes',
    'Intermediate filaments',
    'Actin filaments',
    'Focal adhesion sites',
    'Microtubules',
    'Microtubule ends',
    'Cytokinetic bridge',
    'Mitotic spindle',
    'Microtubule organizing center',
    'Centrosome',
    'Lipid droplets',
    'Plasma membrane',
    'Cell junctions',
    'Mitochondria',
    'Aggresome',
    'Cytosol',
    'Cytoplasmic bodies',
    'Rods & rings',
]

# Integer label id -> class name.
LABELS = dict(enumerate(LABEL_NAMES))

In [10]:
# Load the training manifest: one row per sample, with its Id and a Target
# string of space-separated label ids (see the preview below).
df = pd.read_csv('../input/human-protein-atlas-image-classification/train.csv')
df.head()

Unnamed: 0,Id,Target
0,00070df0-bbc3-11e8-b2bc-ac1f6b6435d0,16 0
1,000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0,7 1 2 0
2,000a9596-bbc4-11e8-b2bc-ac1f6b6435d0,5
3,000c99ba-bba4-11e8-b2b9-ac1f6b6435d0,1
4,001838f8-bbca-11e8-b2bc-ac1f6b6435d0,18


In [11]:
# Targets are stored as one space-separated string per row; split() yields the
# individual label ids, still as strings.
df.iloc[1].Target.split()

['7', '1', '2', '0']

In [12]:
# Peek at the training image files: each sample Id has one PNG per color channel.
ls '../input/human-protein-atlas-image-classification/train/' | head

00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_blue.png
00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_green.png
00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_red.png
00070df0-bbc3-11e8-b2bc-ac1f6b6435d0_yellow.png
000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_blue.png
000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_green.png
000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_red.png
000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_yellow.png
000a9596-bbc4-11e8-b2bc-ac1f6b6435d0_blue.png
000a9596-bbc4-11e8-b2bc-ac1f6b6435d0_green.png


In [13]:
class ProteinDataset(Dataset):
    """Map-style dataset that composes one RGB image per sample from four channel PNGs.

    Each sample is identified by the ``Id`` column of ``train_df``. Its four
    single-channel images are read from ``images_dir`` as ``<Id>_<color>.png``
    for the colors red, green, blue and yellow.

    Args:
        train_df: DataFrame with an ``Id`` column and a ``Target`` column of
            space-separated label strings. A copy is stored on the instance.
        images_dir: Directory containing the channel PNGs. Must support the
            ``/`` operator (e.g. a ``pathlib.Path``).
        transform: Optional callable applied to the float image array before
            it is returned from ``__getitem__``.
    """

    # Channel suffixes, in the order they are unpacked in __getitem__.
    COLORS = ('red', 'green', 'blue', 'yellow')
    # Size the composed image is resized to (InceptionV3 input).
    IMAGE_SIZE = (299, 299)

    def __init__(self, train_df, images_dir, transform=None):
        self.df = train_df.copy()
        self._dir = images_dir
        self.transform = transform
        # Thread pool used to read the four channel files of a sample concurrently.
        self.p = Pool()

    def __len__(self):
        """Return the number of samples (rows of the stored DataFrame)."""
        return len(self.df)

    def close(self):
        """Shut down the loader thread pool; the dataset must not be indexed afterwards."""
        self.p.close()
        self.p.join()

    def mp_load(self, path):
        """Read the image at ``path`` and return its pixels as a uint8 array."""
        # The context manager closes the underlying file once the pixels are copied.
        with Image.open(path) as pil_im:
            return np.array(pil_im, np.uint8)

    def __getitem__(self, key):
        """Return ``(X, labels)`` for the sample at positional index ``key``.

        ``X`` is the composed image scaled to floats in [0.0, 1.0], passed
        through ``self.transform`` when one was given. ``labels`` is the list
        of label-id strings from the sample's ``Target`` entry.

        NOTE(review): ``labels`` has a different length per sample, so it may
        not batch cleanly with the default DataLoader collate function — confirm
        before relying on the labels of a batch.
        """
        row = self.df.iloc[key]

        image_paths = [self._dir / f'{row.Id}_{c}.png' for c in self.COLORS]
        r, g, b, yellow = self.p.map(self.mp_load, image_paths)

        # Fold the yellow channel into red and green. Every term is halved
        # before adding so the uint8 sums stay within 0-254.
        rgb = np.stack([
            r // 2 + yellow // 2,
            g // 2 + yellow // 2,
            b // 2
        ], axis=2)

        rgb = np.array(Image.fromarray(rgb).resize(self.IMAGE_SIZE))
        rgb = rgb / 255.0  # float array scaled to 0.0 - 1.0

        labels = row.Target.split()

        # Use the instance's transform, not whatever global happens to be
        # named ``transform`` in the notebook namespace.
        if self.transform is not None:
            X = self.transform(rgb)
        else:
            X = rgb

        return X, labels

# Transforms

*Get pillow-simd*  https://github.com/uploadcare/pillow-simd /  http://python-pillow.org/pillow-perf/

1. How to combine images
2. PIL -> Tensor
3. Normalize?

+ Augment labels with least samples?
+ ...

In [14]:
# Single-step pipeline: turn the image array returned by the dataset into a tensor.
to_tensor_step = transforms.ToTensor()
transform = transforms.Compose([to_tensor_step])

In [15]:
# Wrap the full train.csv frame (no hold-out split is made here, despite the
# "val" name) with the tensor-conversion transform defined above.
val_ds = ProteinDataset(
    train_df=df,
    images_dir=TRAIN_DIR,
    transform=transform
)

In [16]:
# Smoke test: fetch the first sample as an (image, labels) pair.
val_ds[0]

(tensor([[[0.0157, 0.0000, 0.0000,  ..., 0.0039, 0.0078, 0.0000],
          [0.0000, 0.0000, 0.0235,  ..., 0.0039, 0.0078, 0.0275],
          [0.0078, 0.0157, 0.0000,  ..., 0.0118, 0.0549, 0.0275],
          ...,
          [0.0078, 0.0039, 0.0118,  ..., 0.0314, 0.0000, 0.0588],
          [0.0196, 0.0431, 0.0275,  ..., 0.0431, 0.0667, 0.0000],
          [0.0000, 0.0078, 0.0000,  ..., 0.0157, 0.0039, 0.0039]],
 
         [[0.0157, 0.0510, 0.0000,  ..., 0.0000, 0.0078, 0.0000],
          [0.1020, 0.0706, 0.0510,  ..., 0.0039, 0.0353, 0.0000],
          [0.0275, 0.0431, 0.0706,  ..., 0.0314, 0.0431, 0.0196],
          ...,
          [0.0078, 0.0000, 0.0314,  ..., 0.0431, 0.0000, 0.0039],
          [0.0235, 0.0275, 0.0235,  ..., 0.0039, 0.0157, 0.0039],
          [0.0000, 0.0039, 0.0000,  ..., 0.0118, 0.0000, 0.0000]],
 
         [[0.0000, 0.0392, 0.0078,  ..., 0.0235, 0.0157, 0.0000],
          [0.0235, 0.0275, 0.0196,  ..., 0.0078, 0.0000, 0.0000],
          [0.0275, 0.0000, 0.0157,  ...,

# DataLoader

In [17]:
# Batches of 256 in dataset order. The feature filenames generated further
# down step by 256 and the loaded features are later zipped with df.Id, so
# both the batch size and shuffle=False matter for keeping rows aligned.
val_dl = DataLoader(
    val_ds,
    batch_size=256,
    shuffle=False,
    num_workers=0,  # load in the main process
)

# Sampler

# PreTrained Model

In [18]:
# Pretrained InceptionV3 cast to float64. The dataset divides a NumPy array by
# 255.0, which yields float64 inputs — presumably why .double() is used here;
# confirm before changing either side.
model = torchvision.models.inception_v3(pretrained=True).double()

In [19]:
# Move the model onto the selected device; the cell output is the module repr.
model.to(device)

Inception3(
  (Conv2d_1a_3x3): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2a_3x3): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2b_3x3): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_3b_1x1): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_4a_3x3): BasicConv2d(
    (conv): Conv2d(80, 192, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, t

In [20]:
# When GENERATE_FEATURES is set: freeze every pretrained weight, then replace
# the final fully connected layer with a fresh len(LABELS)-output layer. The
# new layer's parameters are created with requires_grad=True, so it is the only
# trainable part of the network.
# Open question: how to change this from single-label to multilabel classification?

if GENERATE_FEATURES:
    for param in model.parameters():
        param.requires_grad = False

    old_fc = model.fc
    n_features = old_fc.in_features
    # A new nn.Linear is float32 on the CPU. Match the dtype and device of the
    # layer it replaces (the model was cast with .double() and moved with
    # .to(device)), otherwise the forward pass fails on a dtype/device mismatch.
    model.fc = nn.Linear(n_features, len(LABELS)).to(
        device=old_fc.weight.device,
        dtype=old_fc.weight.dtype,
    )

# Feature Extraction 

### Forward Hook for Feature Extraction

https://discuss.pytorch.org/t/how-to-get-separate-conv-feature-map-from-pretrained-resnet/3479/4?u=justusschock

In [21]:
# model

In [22]:
# One output filename per batch of 256 rows, "<first_row>-<first_row + 255>.pth",
# consumed in order (via next()) by the forward hook defined below.
files = iter([
    '{}-{}.pth'.format(start, start + 255)
    for start in range(0, len(df.Target.tolist()), 256)
])

In [23]:
# Local output directory for the per-batch feature files; created (with any
# missing parents) if it does not exist yet.
feature_dir = Path('Features')
feature_dir.mkdir(parents=True, exist_ok=True)

In [24]:
# features = []
# Thread pool so the torch.save calls are queued instead of run inline in the hook.
save_pool = Pool(32)


def hook(module, input, output):
    """Forward hook for ``model.fc``: save the layer's input batch to disk.

    Each call takes the next filename from the ``files`` iterator and queues a
    ``torch.save`` of the batch into ``feature_dir``.

    Args:
        module: The layer the hook is registered on (unused).
        input: Tuple of positional inputs to the layer; ``input[0]`` is the
            feature batch.
        output: The layer's output (unused).

    Raises:
        StopIteration: If the model is run for more batches than there are
            names left in ``files``.
    """
    # Keep the batch dimension as-is: squeezing would collapse a one-sample
    # batch to a 1-D tensor, and the loader iterates each saved file per sample.
    # Detach and copy to the CPU up front so queued saves do not pin device
    # memory or autograd state.
    features = input[0].detach().cpu()
    # NOTE(review): apply_async results are never checked, so a failed save is silent.
    save_pool.apply_async(torch.save, args=(features, feature_dir / next(files)))


# Keep the handle so the hook can be removed later with fc_hook_handle.remove();
# re-running this cell without removing it registers a second hook.
fc_hook_handle = model.fc.register_forward_hook(hook)
fc_hook_handle

#     def __init__(self, num_classes=1000, aux_logits=True, transform_input=False):
#         super(Inception3, self).__init__()
#         self.aux_logits = aux_logits
#         self.transform_input = transform_input
#         self.Conv2d_1a_3x3 = BasicConv2d(3, 32, kernel_size=3, stride=2)
#         self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3)
#         self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1)
#         self.Conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1)
#         self.Conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3)
#         self.Mixed_5b = InceptionA(192, pool_features=32)
#         self.Mixed_5c = InceptionA(256, pool_features=64)
#         self.Mixed_5d = InceptionA(288, pool_features=64)
#         self.Mixed_6a = InceptionB(288)
#         self.Mixed_6b = InceptionC(768, channels_7x7=128)
#         self.Mixed_6c = InceptionC(768, channels_7x7=160)
#         self.Mixed_6d = InceptionC(768, channels_7x7=160)
#         self.Mixed_6e = InceptionC(768, channels_7x7=192)
#         if aux_logits:
#             self.AuxLogits = InceptionAux(768, num_classes)
#         self.Mixed_7a = InceptionD(768)
#         self.Mixed_7b = InceptionE(1280)
#         self.Mixed_7c = InceptionE(2048)
#         self.fc = nn.Linear(2048, num_classes)

<torch.utils.hooks.RemovableHandle at 0x134262be0>

### This cell iterates over the entire dataset and serializes the last layer of features to disk. This has already been run and the output is loaded in the Draft Environment section from that previous kernel.

In [25]:
# model.eval()
# with torch.no_grad():
#     for X, y in val_dl:
#         X = X.to(device)
#         model(X)

# Load Serialized Features

In [27]:
# Read the saved per-batch feature files back in batch order and flatten them
# into one list holding a NumPy vector per sample.
# NOTE(review): torch.load unpickles the files — only point this at trusted output.
# (features_dir is the read location; feature_dir above is the write location.)
features_dir = Path('../input/inceptionv3-wip/Features/')


def _batch_start(path):
    # Filenames look like "<start>-<end>.pth"; order numerically by <start>.
    return int(path.name[:path.name.find('-')])


features_list = []
for path in sorted(features_dir.iterdir(), key=_batch_start):
    batch = torch.load(path, map_location='cpu')
    features_list.extend([features.numpy() for features in batch])

In [28]:
# Sanity check: exactly one feature vector was loaded per row of train.csv.
len(features_list) == len(df)

True

In [33]:
# Sanity check: every loaded feature is a flat (1-D) vector of length 2048.
all(len(x) == 2048 and len(x.shape) == 1 for x in features_list)

True

In [None]:
# Sample Id -> feature vector, pairing rows of train.csv with the loaded
# features by position.
all_features = dict(zip(df.Id.tolist(), features_list))

In [41]:
#np.savez_compressed('features.npz', **all_features)

# Criterion

# Optimizer

# Train Loop

# Evaluation