# Alaska2 Multiclass Classifier with Catalyst

This is a fork of a great [kernel](https://www.kaggle.com/meaninglesslives/alaska2-cnn-multiclass-classifier) by [@meaninglesslives](https://www.kaggle.com/meaninglesslives). Rewritten using beautiful [Catalyst](https://github.com/catalyst-team/catalyst) framework. I've also made some minor improvements.

# Load Libraries

In [1]:
!pip install -q efficientnet_pytorch
from efficientnet_pytorch import EfficientNet
from albumentations.pytorch import ToTensorV2
from albumentations import (
    Compose, HorizontalFlip,
    ToFloat, VerticalFlip
)
from catalyst.dl.callbacks.metrics import AccuracyCallback, AUCCallback
from catalyst.dl import SupervisedRunner
from catalyst.utils import get_one_hot
import os
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from glob import glob
import torchvision
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from skimage.io import imread
import torch.nn.functional as F
from scipy.special import softmax

  from pandas import Panel

numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject


Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working


Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working



# Create dataset for training and Validation

In [2]:
data_dir = '../input/alaska2-image-steganalysis'
folder_names = ['JMiPOD/', 'JUNIWARD/', 'UERD/']
class_names = ['Normal', 'JMiPOD_75', 'JMiPOD_90', 'JMiPOD_95', 
               'JUNIWARD_75', 'JUNIWARD_90', 'JUNIWARD_95',
                'UERD_75', 'UERD_90', 'UERD_95']
class_labels = { name: i for i, name in enumerate(class_names)}

In [3]:
train_df = pd.read_csv('../input/alaska2trainvalsplit/alaska2_train_df.csv')

train_df = train_df.sample(40000).reset_index(drop=True) # Delete this line for good training =)

val_df = pd.read_csv('../input/alaska2trainvalsplit/alaska2_val_df.csv')

In [4]:
class Alaska2Dataset(Dataset):
    def __init__(self, df, augmentations=None, test = False):
        self.data = df
        self.augment = augmentations
        self.test = test

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.test:
            fn = self.data.loc[idx][0]
        else:
            fn, label = self.data.loc[idx]
        im = imread(fn)
        if self.augment:
            im = self.augment(image=im)
        if self.test:
            item = {'features': im['image']}
        else:
            item = {'features': im['image'], 'targets':label, 'bool_targets': get_one_hot(label, 10)}

        return item


AUGMENTATIONS_TRAIN = Compose([
    VerticalFlip(p=0.5),
    HorizontalFlip(p=0.5),
    ToFloat(max_value=255),
    ToTensorV2()
], p=1)


AUGMENTATIONS_TEST = Compose([
    ToFloat(max_value=255),
    ToTensorV2()
], p=1)

In [5]:
batch_size = 32
num_workers = 8

train_dataset = Alaska2Dataset(train_df, augmentations=AUGMENTATIONS_TRAIN)
valid_dataset = Alaska2Dataset(val_df.sample(5000).reset_index(drop=True), augmentations=AUGMENTATIONS_TEST)

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           num_workers=num_workers,
                                           shuffle=True)

valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                           batch_size=batch_size*2,
                                           num_workers=num_workers,
                                           shuffle=False)

# CNN Model for multiclass classification

In [6]:
class Net(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.model = EfficientNet.from_name('efficientnet-b0')
        self.dense_output = nn.Linear(1280, num_classes)

    def forward(self, x):
        feat = self.model.extract_features(x)
        feat = F.avg_pool2d(feat, feat.size()[2:]).reshape(-1, 1280)
        return self.dense_output(feat)

In [7]:
loaders = {
    "train": train_loader,
    "valid": valid_loader
}

model = Net(num_classes=len(class_labels))
model.load_state_dict(torch.load('../input/alaska2trainvalsplit/epoch_5_val_loss_3.75_auc_0.833.pth'))

#optimizer = torch.optim.AdamW(model.parameters(), lr=0.01)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0003)
criterion = torch.nn.CrossEntropyLoss()
callbacks = [
    AccuracyCallback(),
    AUCCallback(input_key='bool_targets', num_classes = 1) # I was too lazy to implement weighted AUC. It is strongly correlated with regular AUC. But it should be easy to make your own catalyst "meter" for weighted AUC 
]

runner = SupervisedRunner()

In [8]:
#runner.train(
#    model=model,
#    criterion=criterion,
#    optimizer=optimizer,
#    loaders=loaders,
#    num_epochs=10,
#    verbose=True,
#    callbacks=callbacks,
#    logdir="logs",
#    main_metric="auc/class_0",
#    minimize_metric = False,
#)

# Create Inference Dataset

In [9]:
test_filenames = sorted(glob(f"{data_dir}/Test/*.jpg"))
test_df = pd.DataFrame({'ImageFileName': list(
    test_filenames)}, columns=['ImageFileName'])

batch_size = 16
num_workers = 4
test_dataset = Alaska2Dataset(test_df, augmentations=AUGMENTATIONS_TEST, test=True)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=batch_size,
                                          num_workers=num_workers,
                                          shuffle=False,
                                          drop_last=False)

# Do inference

In [10]:
#model.load_state_dict(torch.load('logs/checkpoints/best.pth')["model_state_dict"])
model.cuda()
preds = []
for outputs in tqdm(runner.predict_loader(loader=test_loader, model=model)):
    preds.append(softmax(outputs))

preds = np.array(preds)

test_df['Id'] = test_df['ImageFileName'].apply(lambda x: x.split(os.sep)[-1])
test_df['Label'] = 1-preds[:, 0]

test_df = test_df.drop('ImageFileName', axis=1)
test_df.to_csv('submission.csv', index=False)
print(test_df.head())

HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


         Id     Label
0  0001.jpg  0.729291
1  0002.jpg  0.670964
2  0003.jpg  0.718036
3  0004.jpg  0.531977
4  0005.jpg  0.619327


In [11]:
torch.save(model.state_dict(), 'efficientnet_0')

In [12]:
model.state_dict()

OrderedDict([('model._conv_stem.weight',
              tensor([[[[-2.4737e-02, -9.3575e-03, -1.8632e-02],
                        [-1.3360e-01, -7.0349e-02,  1.0306e-01],
                        [-2.1624e-01, -3.4004e-01,  2.4232e-02]],
              
                       [[-3.1219e-02, -1.6772e-02, -8.0486e-02],
                        [-8.0047e-02, -1.9756e-02,  2.6649e-02],
                        [-2.4762e-01, -4.5952e-01,  6.7515e-02]],
              
                       [[ 4.1735e-02, -1.5183e-03, -9.6227e-03],
                        [-8.7247e-02, -8.5713e-02,  1.1767e-01],
                        [-7.0021e-02, -1.2315e-01,  3.2387e-02]]],
              
              
                      [[[ 7.1282e-01, -8.9188e-01,  4.5034e-02],
                        [ 4.4676e-01, -1.4991e-01, -5.7209e-02],
                        [ 1.5093e-01, -2.0580e-01, -5.7241e-03]],
              
                       [[ 1.1975e+00, -1.7536e+00, -2.4069e-02],
                        [ 9.2786e-