In [1]:
import sys
sys.path.append('../lib/')

In [4]:
import gc
import time
from datetime import datetime
from pathlib import Path
from uuid import uuid4

import numpy as np
import pandas as pd
import tez
import torch
from scipy import stats
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset

from configuration import load_configuration
from constants import ColumnNames
from datasets import PetfinderImageSet
from image_transforms import (
    cnn_training_transform,
    cnn_inferencing_transform
)
from models.resnet import PawpularResnetModel

# Setup and configuration

## Settings not currently in the configuration file

In [21]:
IS_KAGLE = False  # flag to note whether we're running in Kaggle and should set paths etc. accordingly
TRAIN = True  # if False will just load and make submission
# TRAIN must always be False when running in Kaggle. The previous `TRAIN &= IS_KAGLE`
# did the opposite: it switched training off locally and left it on in Kaggle.
TRAIN = TRAIN and not IS_KAGLE

LOAD_MODEL = "pawpular_model_2021_11_08_18"  # only needed when TRAIN is False

# how many epochs with no improvement before we stop training.
# This has reached a point where it should be added to the config soon
EARLY_STOPPING_PATIENCE = 11
N_FOLDS = 10  # should probably be added to configuration.yaml at some point
# for internal testing we should use a hold out dataset to better measure improvements,
# but if we are training for submission we should use all the data
USE_HOLDOUT = True
# only needed if we add new features; eventually might make it into our config file but for now set here
FEATURE_RESCALLING = False

# the scheduler and optimizer are hard ones to get into the configuration
# these might be things that are specific to an individual notebook / experiment
# so these settings will only exist within notebooks using cosine annealing
COSINE_ANNEALING_T0 = 5
COSINE_ANNEALING_Tmulit = 1
COSINE_ANNEALING_eta_min = 1e-7

configuration_file = "configuration_exp_20.yaml"

## Setting the save folder and paths (shouldn't need to change anything here most of the time)

In [11]:
# One output folder per configuration file and per hour of the run start time.
save_date = datetime.now().strftime("%Y_%m_%d_%H")
# Path.stem drops the extension whatever its length (the old [:-5] slice assumed ".yaml")
save_folder = Path('../') / "experiments" / f'conf_{Path(configuration_file).stem}_{save_date}'
# parents=True so a fresh checkout without an "experiments" directory still works
save_folder.mkdir(parents=True, exist_ok=True)
print(f"saving in {save_folder}")

saving in ../experiments/conf_configuration_exp_20_2021_11_20_23


In [17]:
# Resolve every input / output location once, depending on where the notebook runs.
if not IS_KAGLE:
    # local run: everything is addressed relative to the parent directory
    base_dir = Path('../')
    test_img_dir = base_dir / 'input/test'
    test_data_csv = base_dir / 'input/test.csv'
    train_img_dir = base_dir / 'input/train'
    train_data_csv = base_dir / 'input/train.csv'
    sumbission_csv = base_dir / 'input/sample_submission.csv'
    store_dir = base_dir / 'experiments'
    test_submission_path = base_dir/ 'input/pawpularity-submission/submission.csv'
    # locally, models are read back from the folder this run saves into
    load_dir = save_folder
    config = load_configuration(
        configuration_path = base_dir / f'lib/{configuration_file}'
    )
else:
    # Kaggle run: competition data and the configuration live under /kaggle/input
    base_dir = Path('/kaggle')
    test_img_dir = base_dir / 'input/petfinder-pawpularity-score/test'
    test_data_csv = base_dir / 'input/petfinder-pawpularity-score/test.csv'
    train_img_dir = base_dir / 'input/petfinder-pawpularity-score/train'
    train_data_csv = base_dir / 'input/petfinder-pawpularity-score/train.csv'
    sumbission_csv = base_dir / 'input/petfinder-pawpularity-score/sample_submission.csv'
    # NOTE(review): temp_dir is only defined in this branch; local runs have no temp_dir
    temp_dir = base_dir / 'temp'
    store_dir = base_dir / 'working'
    test_submission_path = base_dir/ 'input/pawpularity-submission/submission.csv'
    # NOTE(review): LOAD_MODEL already starts with "pawpular_model_", so the prefix is
    # doubled in this path - confirm that this matches the uploaded folder name
    load_dir =  base_dir/ f'pawpular_model_{LOAD_MODEL}'
    config = load_configuration(
        configuration_path = base_dir / f'input/kapet-v003/{configuration_file}'
    )
    

# prepping the data

In [15]:

def add_image_path(data, img_dir):
    """Attach the image file location of every row to ``data``.

    The image name column (``ColumnNames.image_name``) is suffixed with
    ".jpg" and joined onto ``img_dir``; the result is stored in the image
    path column (``ColumnNames.image_path``).

    Args:
        data: frame holding the image name column. It is modified in place.
        img_dir: directory the images live in; must support ``/`` with a
            string (presumably a ``pathlib.Path`` - verify against callers).

    Returns:
        The same ``data`` object, now carrying the image path column.
    """
    def to_image_path(image_name):
        return img_dir / (image_name + ".jpg")

    image_names = data[ColumnNames.image_name.value]
    data[ColumnNames.image_path.value] = image_names.map(to_image_path)
    return data

# read the tabular metadata for both splits
train_data = pd.read_csv(train_data_csv)
test_data = pd.read_csv(test_data_csv)

# add the location of each image next to its metadata
train_data = add_image_path(train_data, train_img_dir)
test_data = add_image_path(test_data, test_img_dir)

# NOTE(review): not the last expression of the cell, so this preview is presumably never rendered
train_data.head()

# sanity check on what is actually on disk: any file for train, only jpgs for test
train_file_count = sum(1 for _ in train_img_dir.glob('*.*'))
test_jpg_count = sum(1 for _ in test_img_dir.glob('*.jpg'))
print(f"total files in image train dir {train_file_count}")
print(f"total jpgs in image test dir {test_jpg_count}")

total files in image train dir 9912
total jpgs in image test dir 8


## Generate data splits
If `USE_HOLDOUT` is set, a hold-out data set is split off from the training data.
Ideally, when training a final model for submission with hyperparameters we already trust, we would skip this step and train on all of the data.

In [22]:
# labels for every training row
y = train_data[ColumnNames.label.value]

if not USE_HOLDOUT:
    # no secondary evaluation set: the folds are drawn from all of the training data
    X = train_data
else:
    # X_hold_out / y_hold_out serve as a secondary assessment of fold performance.
    # They are distinct from the per-fold validation sets, which drive early stopping.
    X, X_hold_out, y, y_hold_out = model_selection.train_test_split(
        train_data,
        y,
        train_size=0.9,
        shuffle=True,
        stratify=y,
        random_state=2019,
    )
    


# Training

In [None]:
# K-fold training: one model per fold, early-stopped on the fold's validation
# split and (optionally) scored on the shared hold out set afterwards.
if TRAIN:
    # per-fold results, appended in fold order
    holdout_losses = []
    validation_losses = []
    training_losses = []
    timing_list = []
    k_folds = model_selection.KFold(n_splits=N_FOLDS, random_state=2019, shuffle=True)
    for i_fold, (train_index, test_index) in enumerate(k_folds.split(X)):
        tic = time.time()
        # explicit copies: the rescaling below assigns whole columns and must not
        # write through to X (nor trigger pandas' SettingWithCopyWarning)
        X_train = X.iloc[train_index].copy()
        X_validation = X.iloc[test_index].copy()
        y_train = y.iloc[train_index]
        y_validation = y.iloc[test_index]
        # a fresh copy of the hold out frame for every fold: rescaling X_hold_out
        # itself would apply a new scaler on top of already scaled values each fold
        X_hold_out_fold = X_hold_out.copy() if USE_HOLDOUT else None

        # rescale features specific to this fold
        if FEATURE_RESCALLING:
            feature_columns = config.regression_config.features_to_use
            # fitted on the training split only; the other splits are just transformed
            scaler = StandardScaler().fit(X_train[feature_columns])
            X_train[feature_columns] = scaler.transform(X_train[feature_columns])
            X_validation[feature_columns] = scaler.transform(X_validation[feature_columns])
            if USE_HOLDOUT:
                X_hold_out_fold[feature_columns] = scaler.transform(
                    X_hold_out_fold[feature_columns]
                )

        # build data sets
        dataset_train = PetfinderImageSet(
            config, train_img_dir, images_df=X_train, transform=cnn_training_transform(config)
        )
        dataset_validation = PetfinderImageSet(
            config, train_img_dir, images_df=X_validation, transform=cnn_inferencing_transform(config)
        )
        if USE_HOLDOUT:
            # created up front so its size can be reported before training starts
            dataset_hold_out = PetfinderImageSet(
                config, train_img_dir, images_df=X_hold_out_fold, transform=cnn_inferencing_transform(config)
            )

        # NOTE(review): batch counts are reported with config.cnn_config.batch_size while
        # fit() and the hold out loader below use a hard-coded batch size of 8 - confirm they agree
        print(f'train size {len(dataset_train)} batches {len(dataset_train) / config.cnn_config.batch_size}')
        print(f'validation size {len(dataset_validation)} batches {len(dataset_validation) / config.cnn_config.batch_size}')
        if USE_HOLDOUT:
            print(f'internal testing size {len(dataset_hold_out)} batches {len(dataset_hold_out) / config.cnn_config.batch_size}')

        model = PawpularResnetModel(
            config.cnn_config.model_configuration,
            learning_rate=config.cnn_config.learning_rate,
            number_of_latent_image_features=128,
            number_of_additional_features=len(config.regression_config.features_to_use),
            number_of_intermediate_regression_variables=64,
            regression_dropout=0.2,
            model_drop_rate=0.1
        )

        # optimizer and scheduler are attached here because they are specific to this notebook
        model.optimizer = torch.optim.Adam(model.parameters(), lr=config.cnn_config.learning_rate)
        model.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            model.optimizer,
            T_0=COSINE_ANNEALING_T0,
            T_mult=COSINE_ANNEALING_Tmulit,
            eta_min=COSINE_ANNEALING_eta_min,
            last_epoch=-1
        )
        model.step_scheduler_after = "epoch"

        # early stopping callback, you can also write your own callback
        early_stopping = tez.callbacks.EarlyStopping(
            monitor="valid_loss",
            model_path=save_folder / f"best_model_fold_{i_fold}.bin",
            patience=EARLY_STOPPING_PATIENCE
        )

        # train model using the tez framework
        model.fit(
            train_dataset=dataset_train,
            valid_dataset=dataset_validation,
            train_bs=8,
            valid_bs=8,
            device="cuda",
            fp16=True,
            epochs=150,
            callbacks=[early_stopping],
            accumulation_steps=2  # needed for memory constraints
        )

        # save model (with optimizer and scheduler for future!)
        model.save(save_folder / f"final_model_{i_fold}.bin")

        # reload the best model version
        model.load(save_folder / f"best_model_fold_{i_fold}.bin")

        # get the best predictions for both validation and train datasets
        validation_loss = model.validate_one_epoch(
            data_loader=model.valid_loader
        )
        training_loss = model.validate_one_epoch(
            data_loader=model.train_loader
        )

        # store
        validation_losses.append(validation_loss)
        training_losses.append(training_loss)

        # run time store (taken before the hold out evaluation, as before)
        timing = time.time() - tic
        timing_list.append(timing)

        # some clean up
        del dataset_validation
        del dataset_train
        gc.collect()
        gc.collect()
        torch.cuda.empty_cache()

        # test the holdout scores
        if USE_HOLDOUT:
            hold_out_loader = torch.utils.data.DataLoader(
                dataset_hold_out,
                batch_size=8
            )
            holdout_loss = model.validate_one_epoch(
                data_loader=hold_out_loader
            )
            holdout_losses.append(holdout_loss)
            print(f"Finished fold {i_fold} in {timing} with holdout loss {holdout_loss}")

            del hold_out_loader
            del dataset_hold_out

        # even more clean up
        del model
        gc.collect()
        gc.collect()
        torch.cuda.empty_cache()