In [48]:
from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import copy
import os
import pandas as pd


from util import *

from PIL import ImageFile, Image
ImageFile.LOAD_TRUNCATED_IMAGES = True
    
## for inceptionresnetv2
# from model_zoo.inceptionresnetv2.pytorch_load import inceptionresnetv2

# %load_ext autoreload

# %autoreload 2
    
plt.ion()   # interactive mode

%matplotlib inline

In [49]:
## NORMALIZATION

# Per-channel statistics measured on the cervix images (presumably RGB on a
# 0-255 pixel scale -- TODO confirm), rescaled below by dividing by 256.
cervix_means = [98.629, 91.756, 121.141]  # -> [0.38526953125, 0.358421875, 0.47320703125]
cervix_stds = [59.188, 63.595, 57.666]    # -> [0.231203125, 0.24841796875, 0.2252578125]
cervix_means = [channel_mean / 256 for channel_mean in cervix_means]
cervix_stds = [channel_std / 256 for channel_std in cervix_stds]

# Alternative statistics, named after ImageNet.
imagenet_means = [0.485, 0.456, 0.406]
imagenet_stds = [0.229, 0.224, 0.225]

# Statistics actually handed to transforms.Normalize further down.
means, stds = imagenet_means, imagenet_stds

for selected_stats in (means, stds):
    print(selected_stats)

[0.485, 0.456, 0.406]
[0.229, 0.224, 0.225]


In [50]:
## TRANSFORMS and PHASES

# Each phase names a sub-folder of the data directory that gets predicted on.
# Other runs of this notebook used
#   ['train', 'val', 'test', 'test_kaggle', 'additional']  or  ['test'].
phases = ['test_stg2_scaled224']

# `transforms.Scale` is the old name of `transforms.Resize`; newer torchvision
# releases only ship `Resize`, older ones only `Scale`. Pick whichever this
# installation provides so the cell runs on both.
_resize_transform = getattr(transforms, 'Resize', None) or transforms.Scale

# Deterministic evaluation pipeline: resize to 224, centre-crop to 224x224,
# convert to a tensor and normalise with the statistics selected above.
_eval_transform = transforms.Compose([
    _resize_transform(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(means, stds),
])

data_transforms = {'test_stg2_scaled224': _eval_transform}

# Every requested phase shares the same (single) evaluation pipeline object.
for phase in phases:
    data_transforms[phase] = _eval_transform

In [51]:
## DATA LOADER
home = os.path.expanduser('~')

# Root folder holding one sub-folder per phase.
data_dir = '../../data/test_stg2_all' # alternative validation

# Experiment tag, used for the output folder of the predictions.
# (an earlier run used 'preaug')
exp_str = 'phase2_submission_exp1'

batch_size = 64

# One ImageFolder dataset per phase, read from <data_dir>/<phase> with the
# transform pipeline registered for that phase.
dsets = {
    phase_name: datasets.ImageFolder(
        os.path.join(data_dir, phase_name),
        data_transforms[phase_name],
    )
    for phase_name in phases
}

# Loaders keep the dataset order (shuffle=False) so that predictions can be
# matched back to `dataset.imgs` by position.
dset_loaders = {
    phase_name: torch.utils.data.DataLoader(
        phase_dset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=16,
        pin_memory=False,
    )
    for phase_name, phase_dset in dsets.items()
}

# Number of samples per phase.
dset_sizes = {phase_name: len(phase_dset) for phase_name, phase_dset in dsets.items()}

In [52]:
## MODEL LIST / ENSEMBLE SETTINGS

# Individual checkpoints that were used in earlier runs (kept for reference):
#   some_squeezenet = 'squeeze11lr0.0072272183267_wd3.05847296262e-06_decayep24_decayconst0.7stamp_1496955129.2932725'
#   best_res18      = 'resnet18lr0.000120762743793_wd0.0208152639471_decayep11_decayconst0.4stamp_1496392713.9479108'
#   best_res34      = 'resnet34lr0.000385913914401_wd8.70321287782e-05_decayep12_decayconst0.7stamp_1496458609.4664185'

# CSV files (under ../performance/) that list the models to load.
# Other lists used previously: 'best_val_loss_post_poster.csv',
# 'top10_names_all.csv', 'select_best_model_name_all.csv'.
top5 = 'top5_names_all.csv'

# Ensemble switch and the tag written into the ensemble output file names
# (other tags used previously: 'top10', 'best_in_class').
is_ensemble = True
ensemble_name = 'top5'

# The model list is currently fixed to the top-5 file regardless of
# `ensemble_name`.
model_list_csv = top5

performance_csv_path = '../performance/' + model_list_csv
df = pd.read_csv(performance_csv_path, header=0, sep=',')
print(df)

                                         description  Epoch      loss  \
0  resnet18lr0.00016488533004_wd3.16552084027e-08...     12  0.670640   
1  resnet18lr0.000595705150116_wd1.99122445693e-0...     17  0.674302   
2  resnet18lr0.000120762743793_wd0.0208152639471_...     22  0.687929   
3  resnet34lr0.000385913914401_wd8.70321287782e-0...     16  0.689479   
4  resnet34lr0.000155365752892_wd0.00317909033787...      7  0.691991   

    model_name           experiment augmentations phase        lr  \
0  resnet18-FT  exp6_stephen_altval         basic   val  0.000165   
1  resnet18-FT                 exp3         basic   val  0.000596   
2  resnet18-FT          exp-poster2         basic   val  0.000121   
3  resnet34-FT         exp5_stephen         basic   val  0.000386   
4  resnet34-FT         exp5_stephen         basic   val  0.000155   

   weight_decay  pretrained  loss_model  loss_txt  
0  3.165521e-08        True    0.670640     0.671  
1  1.991224e-06        True    0.674302   

In [53]:
# Pull the per-model columns out of the model table as plain Python lists;
# entry i of each list describes the i-th model to load.
num_models = df.shape[0]
description_str_list, exp_name_list, model_name_list = (
    df[column_name].tolist()
    for column_name in ('description', 'experiment', 'model_name')
)
print(description_str_list)

['resnet18lr0.00016488533004_wd3.16552084027e-08_decayep12_decayconst0.3stamp_1496692512.7544453', 'resnet18lr0.000595705150116_wd1.99122445693e-06_decayep8_decayconst0.5stamp_1496419169.2827077', 'resnet18lr0.000120762743793_wd0.0208152639471_decayep11_decayconst0.4stamp_1496392713.9479108', 'resnet34lr0.000385913914401_wd8.70321287782e-05_decayep12_decayconst0.7stamp_1496458609.4664185', 'resnet34lr0.000155365752892_wd0.00317909033787_decayep12_decayconst0.6stamp_1496463117.337411']


In [54]:
# Released stage-1 solution table; its rows are appended to the averaged
# ensemble scores before the final CSV is written.
phase1_submission_path = '../../data/solution_stg1_release.csv'
phase_1_submission = pd.read_csv(
    phase1_submission_path,
    header=0,
    sep=',',
)

In [None]:
## MODELS
#
# For every phase: load each checkpoint listed in the model table, predict on
# the phase's data loader, average the scores of rows that map to the same
# image name, and write one CSV per model. When `is_ensemble` is set, the
# per-model scores are additionally stacked, averaged per image and written
# out together with the phase-1 submission rows.

num_models = len(df)
#### SET for topX
# num_models = 10
######################


def _build_model(model_name, num_classes=3):
    """Create an un-initialised ResNet with a `num_classes`-way classifier.

    Args:
        model_name: architecture label; must contain 'resnet18' or 'resnet34'.
        num_classes: number of outputs of the replaced final linear layer.

    Returns:
        The torchvision model with `fc` replaced by a fresh `nn.Linear`.

    Raises:
        ValueError: if `model_name` names no supported architecture. Without
            this check an unknown name would silently reuse the model object
            left over from the previous loop iteration.
    """
    constructor = None
    # The last matching architecture wins, as with two independent `if`s.
    for arch, arch_constructor in (('resnet18', models.resnet18),
                                   ('resnet34', models.resnet34)):
        if arch in model_name:
            constructor = arch_constructor
    if constructor is None:
        raise ValueError('Unsupported model architecture: %r' % (model_name,))

    model = constructor(pretrained=False)
    # Replace the classifier head.
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model


def _image_names_jpg(dataset):
    """Return one '<id>.jpg' name per sample of an ImageFolder-style dataset.

    The id is the file's base name without extension, cut at the first '_'
    (which strips any trailing description suffix), so several files may map
    to the same image name.
    """
    names = []
    for sample in dataset.imgs:
        stem = os.path.splitext(os.path.basename(sample[0]))[0]
        names.append(stem.split('_')[0] + '.jpg')
    return names


def _ensure_predictions_dir(phase):
    """Return '../test_predictions/<exp_str>/<phase>/' after calling
    `maybe_makedir` on the experiment folder and then on the phase folder
    (helper from `util`; presumably creates the folder when missing)."""
    experiment_dir = '../test_predictions/' + exp_str + '/'
    maybe_makedir(experiment_dir)
    phase_dir = experiment_dir + phase + '/'
    maybe_makedir(phase_dir)
    return phase_dir


def _probs_and_labels(scores_df):
    """Split a scores table into (float probabilities, int labels).

    Expects the column layout image_name, Type_1, Type_2, Type_3, labels.
    """
    values = scores_df.values
    return values[:, 1:4].astype(float), values[:, 4].astype(int)


for phase in phases:
    print('PHASE: ', phase)

    # Only the 'test' phase is scored against labels; its stats code reads a
    # 'labels' column, so that column is added for this phase only.
    has_labels = (phase == 'test')

    predictions_dir = _ensure_predictions_dir(phase)

    ## loader
    dset_loader = dset_loaders[phase]

    # Image names depend only on the dataset, not on the model.
    img_names_jpg = _image_names_jpg(dset_loader.dataset)

    # Collected per-model frames; concatenated once after the model loop.
    per_model_scores = []
    per_model_stats = []

    for i in range(num_models):
        print(i)
        ## for top5
        description_string = description_str_list[i]
        exp_name = exp_name_list[i]
        model_name_full = exp_name + str(i)

        print(exp_name)
        path_checkpoint = '../model_weights/experiments/' + exp_name + '/' + description_string + '.chkpt'
        model_name = model_name_list[i]
        print(model_name)

        model = _build_model(model_name)

        ## reload state
        # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
        checkpoint = torch.load(path_checkpoint)
        # Strip the 'module.' fragment from parameter names (presumably left
        # by a DataParallel wrapper at training time -- TODO confirm).
        sd = {k.replace('module.', ''): v for k, v in checkpoint.items()}
        model.load_state_dict(sd)

        ## do not train
        for param in model.parameters():
            param.requires_grad = False

        model.eval()  # Set model to evaluate mode

        ## Cuda?
        model = model.cuda()
        dtype = torch.cuda.FloatTensor

        ## PREDICT
        print('MODEL: ', model_name_full)

        # predict_on_test2 comes from `util`; assumed to return per-sample
        # scores with three columns plus the matching labels -- TODO confirm.
        probs, labels = predict_on_test2(model, dset_loader, dtype)

        ## make df
        predictions_df = pd.DataFrame(data=probs, columns=['Type_1', 'Type_2', 'Type_3'])
        predictions_df.insert(0, column='image_name', value=img_names_jpg)
        if has_labels:
            predictions_df.insert(4, column='labels', value=labels)

        ## group by image name, take average
        grouped_df = predictions_df.groupby('image_name', as_index=False).mean()

        ## Save predictions
        pred_out_df = grouped_df.copy()
        filename = 'scores_' + model_name_full + description_string + '.csv'
        pred_out_df.to_csv(path_or_buf=predictions_dir + filename, index=False)

        if is_ensemble:
            per_model_scores.append(grouped_df)

        ## compute Loss and stats
        if has_labels:
            probs, labels = _probs_and_labels(grouped_df)

            ## collect the stats and loss
            df_stats = stats_from_probs(probs, labels, description_string, phase)
            df_stats['model_name_full'] = model_name_full

            ## save
            filename = 'test_stats_' + model_name_full + description_string + '.csv'
            df_stats.to_csv(path_or_buf=predictions_dir + filename, index=False)

            if is_ensemble:
                per_model_stats.append(df_stats)

    # Skip the ensemble step when no model produced scores (empty model list).
    if is_ensemble and per_model_scores:
        print('-'*10  + 'Computing Ensemble' + '-'*10 )

        ## save individual scores
        ensemble_scores = pd.concat(per_model_scores, ignore_index=True)
        filename = 'scores_ensemble_individual_' + ensemble_name + '.csv'
        ensemble_scores.to_csv(path_or_buf=predictions_dir + filename, index=False)

        ## save stats
        if has_labels:
            ensemble_stats = pd.concat(per_model_stats, ignore_index=True)
            filename = 'test_stats_ensemble_individual_' + ensemble_name + '.csv'
            ensemble_stats.to_csv(path_or_buf=predictions_dir + filename, index=False)

        # average scores over all models, per image
        ensemble_scores_grouped = ensemble_scores.groupby('image_name', as_index=False).mean()

        # Append the phase 1 scores
        ensemble_scores_out = pd.concat(
            [ensemble_scores_grouped, phase_1_submission], ignore_index=True)

        ## save average scores
        filename = 'scores_ensemble_mean_' + ensemble_name + '.csv'
        ensemble_scores_out.to_csv(path_or_buf=predictions_dir + filename, index=False)

        ## collect the stats and loss
        if has_labels:
            probs, labels = _probs_and_labels(ensemble_scores_grouped)

            df_stats = stats_from_probs(probs, labels, ensemble_name, 'test')
            df_stats['model_name_full'] = ensemble_name

            ## save collected stats
            filename = 'test_stats_ensemble_mean_' + ensemble_name + '.csv'
            df_stats.to_csv(path_or_buf=predictions_dir + filename, index=False)

PHASE:  test_stg2_scaled224
0
exp6_stephen_altval
resnet18-FT
MODEL:  exp6_stephen_altval0
1
exp3
resnet18-FT
MODEL:  exp31
2
exp-poster2
resnet18-FT
MODEL:  exp-poster22
3
exp5_stephen
resnet34-FT
MODEL:  exp5_stephen3
4
exp5_stephen
resnet34-FT
MODEL:  exp5_stephen4
