In [None]:
from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import copy
import os
import pandas as pd


from util import *

from PIL import ImageFile, Image
# Tell PIL to tolerate truncated image files when loading instead of raising on them
ImageFile.LOAD_TRUNCATED_IMAGES = True
    
## for inceptionresnetv2
# from model_zoo.inceptionresnetv2.pytorch_load import inceptionresnetv2

# Optional: automatically reload edited modules (uncomment both magics to enable)
# %load_ext autoreload

# %autoreload 2
    
plt.ion()   # interactive mode

# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
## NORMALIZATION

# Per-channel statistics of the cervix images, given on the raw pixel scale and
# divided by 256 here. The resulting values are
#   means: [0.38526953125, 0.358421875, 0.47320703125]
#   stds:  [0.231203125, 0.24841796875, 0.2252578125]
cervix_means = [channel / 256 for channel in (98.629, 91.756, 121.141)]
cervix_stds = [channel / 256 for channel in (59.188, 63.595, 57.666)]

# ImageNet per-channel statistics
imagenet_means = [0.485, 0.456, 0.406]
imagenet_stds = [0.229, 0.224, 0.225]

# Statistics actually handed to transforms.Normalize further down
means, stds = imagenet_means, imagenet_stds

print(means, stds, sep='\n')

In [None]:
## TRANSFORMS and PHASES

# Dataset splits to process in this run
# phases = ['train', 'val', 'test','test_kaggle', 'additional']
# phases = ['test']
phases = ['test', 'test_kaggle']

# Only tensor conversion followed by channel-wise normalisation; no augmentation
eval_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(means, stds),
])

# Every phase shares the very same transform object; the 'test_kaggle' entry is
# always present, whether or not it is listed in `phases`
data_transforms = dict.fromkeys(['test_kaggle'] + list(phases), eval_transform)

In [None]:
## DATA LOADER
home = os.path.expanduser('~')

# Root folder that holds one sub-directory per phase
data_dir = '../../data/all_data_scaled_alt' # alternative validation

# Tag used to name the output folder for the predictions of this run
# exp_str = 'preaug'
exp_str = 'crop224_phase1_submission'

batch_size = 64

# Per-phase dataset, loader and sample count, keyed by phase name
dsets = {}
dset_loaders = {}
dset_sizes = {}
for x in phases:
    dsets[x] = datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x])
    # shuffle=False keeps the batches in dataset order, so predictions can be
    # matched with `dataset.imgs` afterwards
    dset_loaders[x] = torch.utils.data.DataLoader(
        dsets[x],
        batch_size=batch_size,
        shuffle=False,
        num_workers=16,
        pin_memory=False,
    )
    dset_sizes[x] = len(dsets[x])
# # dset_classes = {x: dsets[x].classes for x in phases}

In [None]:
## model names
# some_squeezenet =  'squeeze11lr0.0072272183267_wd3.05847296262e-06_decayep24_decayconst0.7stamp_1496955129.2932725'
# best_res18 = 'resnet18lr0.000120762743793_wd0.0208152639471_decayep11_decayconst0.4stamp_1496392713.9479108'
# best_res34 = 'resnet34lr0.000385913914401_wd8.70321287782e-05_decayep12_decayconst0.7stamp_1496458609.4664185'

## read names from DF
# CSV file (inside ../performance/) listing the models of each known ensemble.
# An explicit table replaces the former substring test, which selected the
# top-5 list for every name containing 'top' and hit an undefined variable
# for 'best_in_class'.
# poster_all = 'best_val_loss_post_poster.csv'
top5 = 'top5_names_all.csv'
model_list_csvs = {
    'top10': 'top10_names_all.csv',
    'top5': top5,
    'best_in_class': 'select_best_model_name_all.csv',
}

## SETTINGS #################
is_ensemble = True
# ensemble_name = 'top10'
ensemble_name = 'top5'
# ensemble_name = 'best_in_class'
####################################

if ensemble_name not in model_list_csvs:
    raise ValueError(
        'Unknown ensemble_name %r; expected one of %s'
        % (ensemble_name, sorted(model_list_csvs))
    )
model_list_csv = model_list_csvs[ensemble_name]

model_list_df = pd.read_csv('../performance/' + model_list_csv, sep=',', header=0)
# `df` stays a plain NumPy array, as the cells below expect.
# DataFrame.as_matrix() no longer exists in current pandas; `.values` returns
# the same array on old and new versions alike.
df = model_list_df.values
print(df)

In [None]:
## MODELS
# For every phase: run each checkpointed model over the phase's data loader,
# average its scores per image name and write them to CSV; for the 'test' phase
# also compute and save statistics from the scores and labels. When
# `is_ensemble` is set, the per-model results are additionally stacked and
# averaged over all models.

num_models = len(df)
#### SET for topX
# num_models = 10
######################
exp_name = 'phase_1_submission'
weights_dir = '../model_weights/experiments/' + exp_name + '/'
checkpoint_ext = '.chkpt'

# os.listdir() returns file names *including* their extension and in arbitrary
# order. Keep only the checkpoint files, strip the extension (it is appended
# again when the checkpoint path is built) and sort, so that model index i
# always refers to the same checkpoint.
# NOTE(review): the model-list CSV only determines the number of models here,
# the descriptions come from the directory content -- confirm this is intended.
description_str_list = sorted(
    entry[:-len(checkpoint_ext)]
    for entry in os.listdir(weights_dir)
    if entry.endswith(checkpoint_ext)
)
if num_models > len(description_str_list):
    raise ValueError(
        'Expected %d checkpoints in %r but found only %d'
        % (num_models, weights_dir, len(description_str_list))
    )

# Supported architectures, keyed by the prefix of the description string
model_builders = {
    'resnet18': models.resnet18,
    'resnet34': models.resnet34,
}
score_columns = ['Type_1', 'Type_2', 'Type_3']

## Cuda?
dtype = torch.cuda.FloatTensor

for phase in phases:
    print('PHASE: ', phase)

    ## make folders (parent first, then the per-phase folder)
    predictions_dir = '../test_predictions/' + exp_str + '/'
    maybe_makedir(predictions_dir)
    predictions_dir = predictions_dir + phase + '/'
    maybe_makedir(predictions_dir)

    ## loader
    dset_loader = dset_loaders[phase]

    # Image name per sample: base name of the file, without extension and without
    # anything after the first '_' (the "_descripts" suffix), plus '.jpg'.
    # Identical for every model, hence computed once per phase.
    img_names_jpg = [
        os.path.splitext(os.path.basename(img[0]))[0].split('_')[0] + '.jpg'
        for img in dset_loader.dataset.imgs
    ]

    # Per-model results of this phase; stacked into the ensemble tables below
    score_frames = []
    stats_frames = []

    for i in range(num_models):

        ## for top5
        description_string = description_str_list[i]
        model_name_full = exp_name + str(i)

        print(exp_name)
        path_checkpoint = weights_dir + description_string + checkpoint_ext
        # The architecture name is everything before the first 'lr'
        model_name = description_string.split('lr')[0]
        print(model_name)

        ## Select Model architecture
        # Fail loudly instead of silently reusing the model of the previous iteration
        if model_name not in model_builders:
            raise ValueError(
                'Unsupported architecture %r (from %r); supported: %s'
                % (model_name, description_string, sorted(model_builders))
            )
        model = model_builders[model_name](pretrained=False)

        ## Replace Classifier (3 output classes)
        model.fc = nn.Linear(model.fc.in_features, 3)

        ## reload state
        # Drop the 'module.' prefix from the parameter names (presumably added by
        # nn.DataParallel when the checkpoint was saved -- TODO confirm)
        checkpoint = torch.load(path_checkpoint)
        sd = {k.replace('module.', ''): v for k, v in checkpoint.items()}
        model.load_state_dict(sd)

        ## do not train
        for param in model.parameters():
            param.requires_grad = False

        model.eval()  # Set model to evaluate mode
        model = model.cuda()

        ## PREDICT
        print('MODEL: ', model_name_full)
        probs, labels = predict_on_test2(model, dset_loader, dtype)

        ## make df
        predictions_df = pd.DataFrame(data=probs, columns=score_columns)
        predictions_df.insert(0, column='image_name', value=img_names_jpg)
        predictions_df.insert(4, column='labels', value=labels)

        ## group by image name, take average
        grouped_df = predictions_df.groupby('image_name', as_index=False).mean()

        ## Save predictions (the 'test_kaggle' output carries no labels column)
        pred_out_df = grouped_df.copy()
        if phase == 'test_kaggle':
            pred_out_df = pred_out_df.drop('labels', axis=1)
        filename = 'scores_' + model_name_full + description_string + '.csv'
        pred_out_df.to_csv(path_or_buf=predictions_dir + filename, index=False)

        if is_ensemble:
            score_frames.append(grouped_df)

        ## compute Loss and stats
        if phase == 'test':
            probs = grouped_df[score_columns].values.astype(float)
            labels = grouped_df['labels'].values.astype(int)

            ## collect the stats and loss
            df_stats = stats_from_probs(probs, labels, description_string, phase)
            df_stats['model_name_full'] = model_name_full

            ## save
            filename = 'test_stats_' + model_name_full + description_string + '.csv'
            df_stats.to_csv(path_or_buf=predictions_dir + filename, index=False)

            if is_ensemble:
                stats_frames.append(df_stats)

    if is_ensemble and score_frames:
        print('-'*10  + 'Computing Ensemble' + '-'*10 )

        ## save individual scores
        # One concat per phase replaces the removed DataFrame.append
        ensemble_scores = pd.concat(score_frames, ignore_index=True)
        filename = 'scores_ensemble_individual_' + ensemble_name + '.csv'
        ensemble_scores.to_csv(path_or_buf=predictions_dir + filename, index=False)

        ## save stats
        if phase == 'test':
            ensemble_stats = pd.concat(stats_frames, ignore_index=True)
            filename = 'test_stats_ensemble_individual_' + ensemble_name + '.csv'
            ensemble_stats.to_csv(path_or_buf=predictions_dir + filename, index=False)

        # average scores over all models, per image
        ensemble_scores_grouped = ensemble_scores.groupby('image_name', as_index=False).mean()
        ensemble_scores_out = ensemble_scores_grouped.copy()
        if phase == 'test_kaggle':
            ensemble_scores_out = ensemble_scores_out.drop('labels', axis=1)

        ## save average scores
        filename = 'scores_ensemble_mean_' + ensemble_name + '.csv'
        ensemble_scores_out.to_csv(path_or_buf=predictions_dir + filename, index=False)

        ## collect the stats and loss
        if phase == 'test':
            probs = ensemble_scores_grouped[score_columns].values.astype(float)
            labels = ensemble_scores_grouped['labels'].values.astype(int)

            df_stats = stats_from_probs(probs, labels, ensemble_name, 'test')
            df_stats['model_name_full'] = ensemble_name

            ## save collected stats
            filename = 'test_stats_ensemble_mean_' + ensemble_name + '.csv'
            df_stats.to_csv(path_or_buf=predictions_dir + filename, index=False)