In [1]:
% matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas
import torch, torch.utils.data, torchvision
import PIL
import os.path
import time
# import skimage, skimage.io
import time
import copy
from my_utils import *

In [2]:
# Load the complete pickled model (architecture + weights).
# map_location='cpu' makes the load succeed on machines without CUDA as well;
# the file name suggests the checkpoint was written from a GPU session, and
# without remapping torch.load would try to restore the tensors onto that GPU.
# The model is moved to the actual compute device afterwards.
# NOTE: torch.load unpickles arbitrary Python objects -- only load checkpoints
# that come from a trusted source.
full_model = torch.load('full_model_new_whale_as_extra_class_gpu.pt',
                        map_location='cpu')

In [3]:
# Pick the compute device: the first CUDA GPU when one is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Move the model to that device (trailing ';' hides the module repr in the notebook).
full_model.to(device);

In [4]:
# Table of the test images that will be scored by the CNN.
test_csv_path = 'data/test_no_leakage.csv'
df_test_no_leakage = pandas.read_csv(test_csv_path)

In [5]:
# Per-channel normalisation statistics expected by the pretrained resnet18.
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]

# Preprocessing pipeline without any data augmentation: fixed-size resize,
# tensor conversion and normalisation, as expected by the pretrained network.
_preprocessing_steps = [
    torchvision.transforms.Resize((224, 224)),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(MEAN, STD),
]
transforms = torchvision.transforms.Compose(_preprocessing_steps)

In [6]:
# Dataset over the test images; only the 'Image' column is handed to it.
test_image_column = df_test_no_leakage.reindex(columns=['Image'])
test_data = TestDataset(test_image_column, './data/test/', transform=transforms)

# Batches are served in file order (no shuffling, no custom sampler).
test_dataloader = torch.utils.data.DataLoader(
    test_data,
    batch_size=256,
    shuffle=False,
    num_workers=4,
    sampler=None
)

In [7]:
# The training table is only needed here to recover the class list of the
# training dataset, which maps predicted class indices back to whale ids.
df_train = pandas.read_csv('data/train_no_duplicates.csv')
train_labels = df_train.reindex(columns=['Image', 'Id'])
categories = WhaleDataset(train_labels, './data/').categories

In [70]:
# Guesses whose probability is below this value are ranked after 'new_whale'.
threshhold = 0.2
new_whale_index = categories.index('new_whale')


def _rank_with_new_whale(class_ids, class_probs, min_prob, fallback_id):
    """Insert `fallback_id` in front of the first low-confidence guess.

    Parameters
    ----------
    class_ids : list of int
        Top-k class indices, most probable first.
    class_probs : list of float
        Probabilities matching `class_ids` position by position.
    min_prob : float
        Guesses with a probability below this value count as low confidence.
    fallback_id : int
        Class index to insert (the 'new_whale' class).

    Returns
    -------
    list of int
        A new list with the same length as `class_ids`. It is an unchanged
        copy when every guess reaches `min_prob`, or when `fallback_id` is
        already ranked above the first low-confidence guess.
    """
    for cut, p in enumerate(class_probs):
        if p < min_prob:
            break
    else:
        # every guess is confident enough: nothing to insert
        return list(class_ids)

    confident = list(class_ids[:cut])
    if fallback_id in confident:
        # already ranked above the cut; inserting it again would only
        # waste one of the k slots on a duplicate label
        return list(class_ids)

    # drop a later occurrence of the fallback so that it is not listed twice
    remainder = [c for c in class_ids[cut:] if c != fallback_id]
    return (confident + [fallback_id] + remainder)[:len(class_ids)]


prob = []
Id = []
image_name = []

full_model.eval()  # inference behaviour for dropout / batch-norm layers
with torch.no_grad():  # gradients are not needed for prediction
    for image, im_name in test_dataloader:
        image = image.to(device)

        # exponentiate the model output and rescale every row to sum to one
        out = torch.exp(full_model(image))
        norm = torch.norm(out, p=1, dim=1, keepdim=True)
        out = out / norm

        prob_tmp, Id_tmp = torch.topk(out, k=5, dim=1)
        prob_tmp = prob_tmp.tolist()
        Id_tmp = Id_tmp.tolist()

        # Iterate over the rows actually present in this batch; the final
        # batch may be shorter than batch_size.
        for ids_j, probs_j, name_j in zip(Id_tmp, prob_tmp, im_name):
            ranked = _rank_with_new_whale(ids_j, probs_j,
                                          threshhold, new_whale_index)
            Id.append([categories[c] for c in ranked])
            # `prob` keeps the raw top-5 probabilities of the model; it is
            # not re-aligned with the inserted 'new_whale' entry.
            prob.append(probs_j)
            image_name.append(name_j)

In [71]:
# One row per test image; the five predicted labels are joined into the
# space-separated string used in the submission file.
df_prediction = pandas.DataFrame({'Image': image_name, 'Id': Id})
df_prediction['Id'] = df_prediction['Id'].apply(' '.join)

# Display with 'Image' as the first column.
df_prediction.reindex(columns=['Image', 'Id'])

Unnamed: 0,Image,Id
0,bf2de0b3.jpg,new_whale w_7311fe4 w_69a9f72 w_d19a884 w_fe5e78b
1,97545d3c.jpg,new_whale w_8fab53d w_7c943ab w_ace8c54 w_b7d5069
2,9c814a76.jpg,new_whale w_b7d5069 w_759b647 w_3197568 w_307065e
3,bead0b7e.jpg,new_whale w_7554f44 w_ff7630a w_47d2bc6 w_434ad6a
4,1f35985e.jpg,new_whale w_66c1b54 w_861cc1c w_21e178f w_d9adb4f
5,0b8448c8.jpg,new_whale w_143b201 w_ee948c6 w_99c07e8 w_e64c9a6
6,696cdb1f.jpg,new_whale w_43be268 w_434ad6a w_71764b4 w_7285eb3
7,9c165fe1.jpg,new_whale w_73d5489 w_edf5f77 w_3aa2073 w_5a81425
8,d56d47b4.jpg,new_whale w_43be268 w_3af4e73 w_53064a6 w_e61dd6d
9,395b47a3.jpg,new_whale w_ee948c6 w_29f00ae w_2111212 w_73d5489


In [72]:
# Read the predictions that were obtained with phash.
phash_csv_path = 'my_submission_phash_only.csv'
phash_pred = pandas.read_csv(phash_csv_path)

In [73]:
# Keep only the phash rows for images that the CNN did not predict, so that
# no image appears twice once both tables are combined.
# `isin` performs one hashed membership test for the whole column instead of
# scanning the full CNN image array once per row.
phash_pred = phash_pred[~phash_pred.Image.isin(df_prediction.Image)]

In [74]:
# Stack the CNN predictions on top of the remaining phash predictions and
# renumber the rows. pandas.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
df_combined = pandas.concat([df_prediction, phash_pred], ignore_index=True)

In [75]:
# Fix the column order of the submission and write it without the row index;
# the threshold value is encoded in the file name with two decimals.
df_combined = df_combined.reindex(columns=['Image', 'Id'])
submission_name = 'my_submission_new_whale_as_extra_class_threshhold_{:.2f}.csv'.format(threshhold)
df_combined.to_csv(submission_name, index=False)