# Create a submission using pretrained UNet models

In this notebook we will perform the following actions:
* Create a valid submission for the core-competition (R1-3) using pretrained UNets per region i.e. individual models per region
* Create a valid submission for the transfer-learning-competition (R4-6) using a single UNet trained on region R1
* Use the ensemble of models trained in regions R1-3 to generate a valid submission for the transfer-learning-competition (R4-6) by averaging their predictions


Dependencies required:
* torch 
* pytorch_lightning
* numpy

The model is defined in weather4cast/benchmarks:
* unet.py: architecture definition 
* FeaturesSysUNet.py:  Inherits from pytorch_lightning.LightningModule. In this notebook we only use it for the forward pass

Please, refer to those files if you want to know more about the architecture. You can also read the [docs of pytorch_lightning](https://pytorch-lightning.readthedocs.io/en/latest/), but it is not necessary in order to understand how to produce predictions, which is the main topic of this notebook.


## Let us first define the functions that will perform the main tasks:
* create the submission directory structure
* load the data & the model 
* compute predictions per day in the test split for a given region

In [13]:
%load_ext autoreload
%autoreload 2

from torch.utils.data import DataLoader

from pathlib import Path
import sys
import os
sys.path.append(str(Path('.').absolute().parent))

import data_utils
import config as cf
from w4c_dataloader import create_dataset
from benchmarks.FeaturesSysUNet import FeaturesSysUNet as Model

# ------------
# 1. create folder structures for the submission
# ------------
def create_directory_structure(root, folder_name='submission'):
    """
    Create the competition output directory structure at the given root path.

    For every challenge and each of its regions the directory
    ``<root>/<folder_name>/<challenge>/<region>/test`` is created, together
    with any missing parent directories.

    Args:
        root: base path under which the submission folder is created.
        folder_name: name of the submission folder placed inside `root`.

    Returns:
        None. One line is printed per region path reporting the outcome.
    """
    challenges = {'w4c-core-stage-1': ['R1', 'R2', 'R3'], 'w4c-transfer-learning-stage-1': ['R4', 'R5', 'R6']}

    for f_name, regions in challenges.items():
        for region in regions:
            r_path = os.path.join(root, folder_name, f_name, region, 'test')
            try:
                os.makedirs(r_path)
            except OSError:
                # Best effort: an existing folder (or any other filesystem error)
                # is reported and the loop moves on. Only OSError is caught so
                # that KeyboardInterrupt / SystemExit are no longer swallowed.
                print(f'failed to create directory structure, maybe they already exist: {r_path}')
            else:
                print(f'created path: {r_path}')

# ------------
# 2. load data & model
# ------------
def get_data_iterator(region_id='R1', data_split='test', collapse_time=True,
                      batch_size=32, shuffle=False, num_workers=0):
    """ Build an iterator over the 'data_split' partition of region 'region_id'.

        The region parameters are fetched with cf.get_params and their
        'collapse_time' data parameter is overwritten with the given value.

        Returns a tuple (iterator, test_dates, params):
            iterator: iter() over a DataLoader wrapping the created dataset
            test_dates: sorted id_date values of the rows whose split is 'test'
                        (always the test split, whatever 'data_split' is)
            params: the region parameters, including the 'collapse_time' override
    """
    params = cf.get_params(region_id=region_id)
    data_params = params['data_params']
    data_params['collapse_time'] = collapse_time

    loader = DataLoader(
        create_dataset(data_split, data_params),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

    # only the table of splits is needed; the second returned value is unused
    splits, _ = data_utils.read_splits(data_params['train_splits'], data_params['test_splits'])
    in_test_split = splits.split == 'test'
    test_dates = splits[in_test_split].id_date.sort_values().values

    return iter(loader), test_dates, params

def load_model(Model, params, checkpoint_path='', device=None):
    """ Return a model restored from 'checkpoint_path', or a fresh one if it is ''.

        Model: class to instantiate; must offer load_from_checkpoint(path)
        params: dict whose 'model_params' and 'data_params' entries are used
                to build the model from scratch (not read when a checkpoint is given)
        checkpoint_path: checkpoint to restore; '' means build from scratch
        device: gpu id; when not None the model is switched to eval mode and
                moved to that gpu, otherwise it is returned untouched
    """
    if checkpoint_path != '':
        print("model:", Model)
        print(f'-> Loading model checkpoint: {checkpoint_path}')
        net = Model.load_from_checkpoint(checkpoint_path)
    else:
        net = Model(params['model_params'], **params['data_params'])

    if device is None:
        return net
    return net.eval().cuda(device)

# ------------
# 3. make predictions & loop over regions
# ------------
def get_preds(model, batch, device=None):
    """ Compute the output of the model on a batch.

        model: callable applied to the input sequence
        batch: triple (in_seq, out, metadata); only in_seq and metadata are used
        device: gpu id; when not None the input is moved to that gpu first

        Returns (y_hat, day_in_year):
            y_hat: the model output as a numpy array on the cpu
            day_in_year: value read from metadata['in']['day_in_year'][0][0],
                         i.e. only the first entry of the batch is inspected
                         (presumably all samples of a batch share the same day --
                         TODO confirm against the dataloader)
    """
    # local import: the surrounding notebook only imports torch.utils.data
    import torch

    in_seq, _, metadata = batch
    day_in_year = metadata['in']['day_in_year'][0][0].item()

    if device is not None:
        in_seq = in_seq.cuda(device=device)

    # inference only: disabling autograd avoids building a graph that is never used
    with torch.no_grad():
        y_hat = model(in_seq)
    y_hat = y_hat.detach().cpu().numpy()

    return y_hat, day_in_year

def predictions_per_day(test_dates, model, ds_iterator, device, file_path, data_params):
    """ Predict every date in 'test_dates' and write each result to disk.

        One batch is drawn from 'ds_iterator' per date. The prediction is
        clamped to [0, 1], post-processed with data_utils.postprocess_fn and
        written to '<file_path>/<predicted_day>.h5'.

        The iterator is expected to yield the dates in the same order as
        'test_dates'; a mismatch trips the assertion.
    """
    for target_date in test_dates:
        print(f'generating submission for date: {target_date}...')
        preds, predicted_day = get_preds(model, next(ds_iterator), device)

        # clamp out-of-range values so the data stays in the valid [0, 1] range
        above_one = preds > 1
        preds[above_one] = 1
        below_zero = preds < 0
        preds[below_zero] = 0

        # the dataloader serves batches sorted by date, so both dates must agree
        assert predicted_day==target_date, f"Error, the loaded date {predicted_day} is different than the target: {target_date}"

        out_file = os.path.join(file_path, f'{predicted_day}.h5')
        target_vars = data_params['target_vars']
        source = data_params['preprocess']['source']
        preds = data_utils.postprocess_fn(preds, target_vars, source)
        data_utils.write_data(preds, out_file)
        print(f'--> saved in: {out_file}')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Let us now generate and save the predictions for the core-competition:

In [None]:
# 1. Checkpoint of the pretrained model of each core region, the regions of each task
#    and the id of the gpu used for inference
root_to_ckps = '~/projects/weather4cast/lightning_logs'
checkpoint_paths = dict(
    R1=f'{root_to_ckps}/version_21/checkpoints/epoch=03-val_loss_epoch=0.027697.ckpt',
    R2=f'{root_to_ckps}/version_19/checkpoints/epoch=01-val_loss_epoch=0.042129.ckpt',
    R3=f'{root_to_ckps}/version_20/checkpoints/epoch=06-val_loss_epoch=0.058147.ckpt',
)
challenges = {
    'w4c-core-stage-1': ['R1', 'R2', 'R3'],
    'w4c-transfer-learning-stage-1': ['R4', 'R5', 'R6'],
}
device = 1  # gpu id

# 2. Root path and name of the submission; the folder structure is created right away
root = '/iarai/home/pedro.herruzo/projects/weather4cast/weather4cast/utils/submission_examples'
folder_name = 'UNet_submission'
create_directory_structure(root, folder_name=folder_name)

In [None]:
# 3. compute and save predictions for each region for all dates in the test split
task_name = 'w4c-core-stage-1'
for region in challenges[task_name]:
    # load data and model: each core region is predicted with its own checkpoint
    ds_iterator, test_dates, params = get_data_iterator(region_id=region)
    model = load_model(Model, params, checkpoint_path=checkpoint_paths[region], device=device)

    # predictions of this region go to <root>/<folder_name>/<task_name>/<region>/test
    r_path = os.path.join(root, folder_name, task_name, region, 'test')
    predictions_per_day(test_dates, model, ds_iterator, device, r_path, params['data_params']) 

We have just computed a valid submission for all regions using a pretrained UNet model. 

Now we should submit a zip containing all regions and we are done. Please, follow the instructions in weather4cast/README.md to generate the zip file.

## Transfer Learning competition submission

### Let us first generate the predictions using only a single model from the core-competition

In [14]:
folder_name = 'UNet_submission'

# define the model to be used to generate predictions
backbone_region = 'R1'
# Fetch the parameters of the backbone region explicitly instead of reusing whatever
# `params` a previously executed cell left behind (undefined in a fresh kernel, and
# otherwise belonging to the last region that cell happened to process).
params = cf.get_params(region_id=backbone_region)
model = load_model(Model, params, checkpoint_path=checkpoint_paths[backbone_region], device=device)

# 3. compute and save predictions for each region for all dates in the test split
task_name = 'w4c-transfer-learning-stage-1'
for region in challenges[task_name]:
    # load the data of the target region; the model stays the one of the backbone region
    ds_iterator, test_dates, params = get_data_iterator(region_id=region)

    # predictions of this region go to <root>/<folder_name>/<task_name>/<region>/test
    r_path = os.path.join(root, folder_name, task_name, region, 'test')
    predictions_per_day(test_dates, model, ds_iterator, device, r_path, params['data_params'])

model: <class 'benchmarks.FeaturesSysUNet.FeaturesSysUNet'>
-> Loading model checkpoint: ~/projects/weather4cast/lightning_logs/version_21/checkpoints/epoch=03-val_loss_epoch=0.027697.ckpt
Using data for region R4 | size: 256 | Central
Maghreb
generating submission for date: 2019047...
--> saved in: /iarai/home/pedro.herruzo/projects/weather4cast/weather4cast/utils/submission_examples/UNet_submission/w4c-transfer-learning-stage-1/R4/test/2019047.h5
generating submission for date: 2019073...
--> saved in: /iarai/home/pedro.herruzo/projects/weather4cast/weather4cast/utils/submission_examples/UNet_submission/w4c-transfer-learning-stage-1/R4/test/2019073.h5
generating submission for date: 2019082...
--> saved in: /iarai/home/pedro.herruzo/projects/weather4cast/weather4cast/utils/submission_examples/UNet_submission/w4c-transfer-learning-stage-1/R4/test/2019082.h5
generating submission for date: 2019092...
--> saved in: /iarai/home/pedro.herruzo/projects/weather4cast/weather4cast/utils/submi

### Let us use an ensemble of the models learned in regions R1-3 by getting individual predictions and just averaging across them

In [None]:
# create a new folder structure for the ensemble submission (note that we will only use R4-6);
# the helper creates the folders of both challenges, so the core ones (R1-3) stay unused here
folder_name = 'transfer-ensample'
create_directory_structure(root, folder_name=folder_name)

In [11]:
import numpy as np

def predictions_per_day_ensamble(test_dates, models, ds_iterator, device, file_path, data_params):
    """ Compute ensemble predictions for all dates and save them to disk.

        For each date in 'test_dates' one batch is drawn from 'ds_iterator' and every
        model predicts on that same batch. Each prediction is clamped to [0, 1], the
        element-wise mean across models is post-processed with data_utils.postprocess_fn
        and written to '<file_path>/<predicted_day>.h5'.

        test_dates: dates expected from the iterator, in iteration order
        models (list): list of models to be used in the ensemble
        ds_iterator: iterator yielding one batch per date
        device: gpu id forwarded to get_preds (None keeps the input where it is)
        file_path: directory where the .h5 files are written
        data_params: dict providing 'target_vars' and 'preprocess'['source']

        Raises:
            ValueError: if 'models' is empty (there is nothing to average)
            AssertionError: if a loaded batch's date differs from the expected one
    """
    models = list(models)
    if not models:
        # without this check the mean of an empty list is computed and the function
        # then fails with an unrelated UnboundLocalError on 'predicted_day'
        raise ValueError('predictions_per_day_ensamble needs at least one model to average over')

    for target_date in test_dates:
        print(f'generating submission for date: {target_date}...')
        batch = next(ds_iterator)

        ensamble = []
        for model in models:
            y_hat, predicted_day = get_preds(model, batch, device)

            # force data to be in the valid range
            np.clip(y_hat, 0, 1, out=y_hat)

            # batches are sorted by date for the dataloader, that's why they coincide
            assert predicted_day==target_date, f"Error, the loaded date {predicted_day} is different than the target: {target_date}"

            ensamble.append(y_hat)

        # average the per-model predictions element-wise
        y_hat = np.mean(np.asarray(ensamble), axis=0)

        f_path = os.path.join(file_path, f'{predicted_day}.h5')
        y_hat = data_utils.postprocess_fn(y_hat, data_params['target_vars'], data_params['preprocess']['source'])
        data_utils.write_data(y_hat, f_path)
        print(f'--> saved in: {f_path}')
        
# load all 3 models into a list; each one is loaded with the parameters of its own
# region instead of whatever `params` a previously executed cell left behind
models = [
    load_model(Model, cf.get_params(region_id=reg_id), checkpoint_path=checkpoint_paths[reg_id], device=device)
    for reg_id in challenges['w4c-core-stage-1']
]

# compute and save averaged predictions with the ensemble of models
task_name = 'w4c-transfer-learning-stage-1'
for region in challenges[task_name]:
    # load data
    ds_iterator, test_dates, params = get_data_iterator(region_id=region)

    # compute predictions in the ensemble of models and save them to disk
    r_path = os.path.join(root, folder_name, task_name, region, 'test')
    predictions_per_day_ensamble(test_dates, models, ds_iterator, device, r_path, params['data_params'])