# Multimodal deep learning 


This notebook employs the spectral and tabular deep learning models as modules, which are trained together to inform the grain yield prediction.

In [None]:
# Import libraries
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.vision.all import *
import fastai
from fastai.tabular.all import *
from fastai.data.load import _FakeLoader, _loaders
import torch
from ipywidgets import IntProgress
from glob import glob

import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import os

# Custom functions
from msi_utils import *
from fold_utils import * 
from multimodal_utisl import *
from multimodal_model import *

# Prefer the first CUDA GPU and fall back to the CPU when CUDA is unavailable
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

The fastai, fastcore and torch versions have to be above the following (the versions installed at the time of writing are given in parentheses):

fastai : 2.1.2 (at the moment 2.3.0)
fastcore: 1.3.1 (1.3.19)
torch : 1.7.0 ('1.8.1+cu102')

## Reference tables

In [None]:
# Root folder of the input data
# (presumably where the spectral arrays live — confirm against get_npy)
path = Path('/data/g2f_data/input_data/')

# Field reference tables: the train/validation rows and the holdout (test) rows
df_train_val = pd.read_csv('/data/fielddata/df_train_val.csv')
df_test = pd.read_csv('/data/fielddata/df_test.csv')

In [None]:
# Use random splitter function from fastai
# (seeded, so the same train/validation partition is produced on every run)
splitter = RandomSplitter(seed=42)
# Apply it to the row positions of df_train_val; `splitter` itself is reused
# further down as the DataBlock splitter for the spectral data
splits = splitter(range_of(df_train_val))
splits

# Tab Dataloaders

In [None]:
# Pre-processing steps and column roles for the tabular module
procs = [Categorify, Normalize, FillMissing]
cat_names = ['Parental 1', 'Parental 2', 'Planting', 'Stock', 'Fertilizer']
cont_names = ['Days_after_sowing']

# Tabular dataset with 'Yield' as a regression target, split with `splits`
to = TabularPandas(
    df_train_val,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names='Yield',
    y_block=RegressionBlock(),
    splits=splits,
)

In [None]:
# Build the train/validation tabular dataloaders with a batch size of 8
tab_dl = to.dataloaders(bs=8)

In [None]:
# Display a sample batch of the tabular data as a sanity check
tab_dl.show_batch()

# Spectral Dataloaders

In [None]:
# Spectral pipeline: items come from get_npy, the regression target from get_y,
# and the same seeded `splitter` as the tabular data provides the split
dblock = DataBlock(
    blocks=(MSITensorBlock, RegressionBlock),
    get_items=get_npy,
    get_y=get_y,
    splitter=splitter,
)

# Train/validation spectral dataloaders with a batch size of 8
msi_dls = dblock.dataloaders(df_train_val, bs=8)

In [None]:
# Preview a batch of spectral samples; `channels` is forwarded to the project's
# show_batch implementation (presumably how many bands are rendered — confirm in msi_utils)
msi_dls.show_batch(channels=3)

In [None]:
# Same preview with channels=9
# (presumably renders more of the spectral bands — confirm in msi_utils)
msi_dls.show_batch(channels=9)

# Mixed Dataloader

In [None]:
# Check that the tabular dataset is aligned with the spectral dataset:
# pair the two training loaders in a MixedDL ...
mixed_dl = MixedDL(tab_dl[0], msi_dls[0])
# ... then inspect the first 10 sample indices of the spectral training loader.
# NOTE(review): only the spectral ids are displayed here; to actually compare them,
# tab_dl[0].get_idxs()[:10] would have to be shown as well — confirm intent.
msi_dls[0].get_idxs()[:10]

In [None]:
# Multimodal input: pair the tabular and spectral loaders split by split,
# wrap the two pairs in a DataLoaders object and move it to the GPU
train_mixed_dl = MixedDL(tab_dl.train, msi_dls.train)
valid_mixed_dl = MixedDL(tab_dl.valid, msi_dls.valid)
mixed_dls = DataLoaders(train_mixed_dl, valid_mixed_dl)
mixed_dls = mixed_dls.cuda()

In [None]:
# Display a sample batch from the combined (tabular + spectral) dataloaders
mixed_dls.show_batch()

# Fusion at feature level - Training modules from scratch

## Kfold 

In [None]:
# K-FOLD VALIDATION
# For each of the 5 folds: rebuild the tabular, spectral and mixed dataloaders,
# warm up both modules for one epoch, train the fused TabVis model and collect
# its predictions on the fold's validation loader.
#
# NOTE(review): the loop rebinds `splits`, `to`, `tab_dl` and `mixed_dls`, so any
# cell executed afterwards sees the objects of the LAST fold — confirm that this
# is what the later holdout-training cell is meant to use.

# NOTE(review): 'predictions' is never filled in below; the column is kept only
# so that the layout of the exported CSV does not change.
kfold_preds = pd.DataFrame(columns=['predictions', 'target_yield'])

# tab variables
procs = [Categorify, Normalize, FillMissing]
cat_names = ['Parental 1', 'Parental 2', 'Planting', 'Stock', 'Fertilizer']
cont_names = ['Days_after_sowing']

split_list = kfold_splitter(df=df_train_val)

# Mixed model variables
# Set weights for each loss
tab_w, vis_w, tv_w = 0.1, 0.55, 0.35

# Initialise Loss
gb_loss = myGradientBlending(tab_weight=tab_w, visual_weight=vis_w, tab_vis_weight=tv_w,
                             loss_scale=1.0)

# METRICS
metrics = [t_rmse, v_rmse, tv_rmse, weighted_RMSEp]
# append=True: all folds log into the same CSV file
csvlogger = CSVLogger('/data/results/multimodal_5fold_metrics.csv', append=True)
cbs = [csvlogger]

for i in range(5):
    # The same fold getter drives the tabular and the spectral split
    getter = get_fold(split_list, fold=i)
    splits = getter(range_of(df_train_val))

    to = TabularPandas(df_train_val,
                       procs,
                       cat_names,
                       cont_names=cont_names,
                       y_names='Yield',
                       y_block=RegressionBlock(),
                       splits=splits)
    tab_dl = to.dataloaders(bs=8)

    # Call MSI dataloader
    msi_fold = DataBlock(blocks=(MSITensorBlock, RegressionBlock),
                         get_items=get_npy,
                         get_y=get_y,
                         splitter=getter)
    msi_dl = msi_fold.dataloaders(df_train_val, bs=8)

    # Make mixed dls
    train_mixed_dl = MixedDL(tab_dl[0], msi_dl[0])
    valid_mixed_dl = MixedDL(tab_dl[1], msi_dl[1])
    mixed_dls = DataLoaders(train_mixed_dl, valid_mixed_dl).cuda()

    # Modules: each one is fitted on its own for a single epoch before fusion
    config = tabular_config(ps=0.5, embed_p=0.5)
    learn_tab = tabular_learner(tab_dl,
                                config=config,
                                layers=[200, 100],
                                metrics=[rmse, R2Score()],
                                opt_func=ranger,
                                y_range=[0, 20],
                                wd=0.3)
    learn_tab.fit_one_cycle(1, 1e-3)

    model_msi = xresnet18(n_out=1, c_in=13, pretrained=False, sa=True, p=0.5, ndim=2)
    learn_msi = Learner(msi_dl,
                        model_msi,
                        opt_func=Adam,
                        loss_func=root_mean_squared_error,
                        metrics=[rmse, R2Score()])
    learn_msi.fit_one_cycle(1, 1e-3)

    multi_model = TabVis(learn_tab.model, learn_msi.model)
    multi_learn = Learner(mixed_dls, multi_model, gb_loss, cbs=cbs, metrics=metrics)

    # Disable the fastai progress bar and the console logging.
    # Both context managers must be listed separated by a comma: `a and b`
    # evaluates to `b` alone, so only no_logging() would be entered.
    with multi_learn.no_bar(), multi_learn.no_logging():
        multi_learn.fit_one_cycle(35, lr_max=1e-3)

    # Validation predictions of this fold: one entry per output of the fused model,
    # read here as tabular / spectral / mixed
    preds, targs = multi_learn.get_preds(dl=valid_mixed_dl)
    pred_mixed_df = pd.DataFrame()
    tab_pred = preds[0].flatten()
    vis_pred = preds[1].flatten()
    mixed_pred = preds[2].flatten()

    pred_mixed_df['tab_pred'] = tab_pred
    pred_mixed_df['msi_pred'] = vis_pred
    pred_mixed_df['mixed_pred'] = mixed_pred
    pred_mixed_df['target_yield'] = targs
    # pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0
    kfold_preds = pd.concat([kfold_preds, pred_mixed_df])

# NOTE(review): the file name contains a space ("for metrics"), unlike the
# pretrained counterpart; left unchanged so existing consumers still find it.
kfold_preds.to_csv('/data/results/multimodal_5fold_prediction_for metrics.csv')

## Train the model to predict the holdout dataset

In [None]:
# Train a model for test prediction
# Steps: fit the tabular module and the spectral module for one epoch each,
# fuse them in TabVis and train the fused model with the gradient-blending loss.
#
# NOTE(review): `tab_dl` and `mixed_dls` were last assigned inside the k-fold
# loop above (i.e. they hold the loaders of the last fold), whereas `msi_dls`
# still comes from the seeded random split — confirm this mix is intended.
config = tabular_config(ps=0.5, embed_p=0.5)
learn_tab = tabular_learner(tab_dl,
                            config=config,
                            layers=[200,100],
                            metrics=[rmse, R2Score()],
                            opt_func=ranger,
                            y_range=[0,20],
                            wd=0.3)
learn_tab.fit_one_cycle(1, 1e-3)

# Spectral module: 13 input channels, a single regression output, no pretrained weights
model_msi = xresnet18(n_out=1, c_in=13, pretrained=False, sa=True, p=0.5, ndim=2)
learn_msi = Learner(msi_dls, 
                model_msi,
                opt_func=Adam, 
                loss_func=root_mean_squared_error,
                metrics=[rmse, R2Score()])
learn_msi.fit_one_cycle(1, 1e-3)

# Fused model built from the two modules fitted above
multi_model = TabVis(learn_tab.model, learn_msi.model)
# NOTE(review): these weights (0.05, 0.5, 0.35) differ from the 0.1, 0.55, 0.35
# used in the k-fold cell and do not sum to 1 — confirm they are intentional.
tab_w, vis_w, tv_w = 0.05, 0.5, 0.35 # Set weights for each loss

# Initialise Loss
gb_loss = myGradientBlending(tab_weight=tab_w, visual_weight=vis_w, tab_vis_weight=tv_w, 
                             loss_scale=1.0)
# Define metrics weights
metrics = [t_rmse, v_rmse, tv_rmse, weighted_RMSEp]
# Stop once valid_loss has not improved by at least 0.01 for 3 epochs
early_stopping = EarlyStoppingCallback(monitor='valid_loss', patience=3, min_delta=0.01)

# Up to 60 epochs; early stopping may end the run sooner
multi_learn = Learner(mixed_dls, multi_model, gb_loss, metrics=metrics, cbs=early_stopping)
multi_learn.fit_one_cycle(60, lr_max=1e-3)

In [None]:
# Plot the loss curves recorded while fitting the multimodal learner
multi_learn.recorder.plot_loss()

In [None]:
# Save the trained multimodal model
multi_learn.save('/data/model_weights/multimodal_colearning')

# Use this to load the model
# multi_learn.load('/data/model_weights/multimodal_colearning')

### Holdout dataset results

In [None]:
# Spectral dataloaders over the holdout table, without shuffling.
# `dblock` still carries its splitter, so the holdout rows end up in two subsets;
# the following cells process them as "1st half" and "2nd half".
test_msi_dls = dblock.dataloaders(df_test, shuffle=False)

# Labelled, unshuffled test loader for the first subset, registered as an
# additional loader on the spectral learner's DataLoaders
first_part_items = test_msi_dls.items
learn_msi.dls.loaders.append(
    msi_dls.test_dl(first_part_items, with_labels=True, shuffle=False)
)

In [None]:
# 1st half -
# Sample names in the order the MSI test DL serves them: keep the last path
# component of every item and drop the '.npy' extension
fnames_MSIorder = [
    str(item).split(sep='/')[-1].replace('.npy', '')
    for item in test_msi_dls.items
]

# Reorder the df_test rows (indexed by 'Barcode') to reflect this order
df_test1 = df_test.set_index('Barcode').reindex(fnames_MSIorder)

# Matching tabular test loader, appended to the tabular learner's loaders
learn_tab.dls.loaders.append(
    tab_dl.test_dl(df_test1, with_labels=True, shuffle=False)
)

# Pair the loader at position 2 of each learner.
# NOTE(review): position 2 assumes no other test loader was appended before — verify on re-runs.
test_mixed_dl = MixedDL(learn_tab.dls[2], learn_msi.dls[2])
test_mixed_dl.show_batch()

In [None]:
# Run the fused model on the first part of the holdout set
preds, targs = multi_learn.get_preds(dl=test_mixed_dl)

# One flattened vector per model output, read here as tabular / spectral / mixed
tab_pred, vis_pred, mixed_pred = preds[0].flatten(), preds[1].flatten(), preds[2].flatten()

# Attach the predictions to a copy of the reordered holdout rows
mixed_results = df_test1.copy()
for column, values in (('tab_pred', tab_pred),
                       ('msi_pred', vis_pred),
                       ('mixed_pred', mixed_pred)):
    mixed_results[column] = values

len(mixed_results)

In [None]:
# 2nd half -
# Sample names in the order the MSI test DL serves the second subset: keep the
# last path component of every item and drop the '.npy' extension
fnames_MSIorder = []
for fname in test_msi_dls[1].items:
    fname = str(fname)
    fname = fname.split(sep='/')[-1]
    fname = fname.replace('.npy', '')
    fnames_MSIorder.append(fname)

# Reorder the df_test rows (indexed by 'Barcode') to reflect this order
df_test2 = df_test.set_index('Barcode').reindex(fnames_MSIorder)

# Build both test loaders for this subset and keep direct references to them
tab_test_dl2 = tab_dl.test_dl(df_test2, with_labels=True, shuffle=False)
msi_test_dl2 = msi_dls.test_dl(test_msi_dls[1].items, with_labels=True, shuffle=False)

# Still registered on the learners' DataLoaders, as before
learn_tab.dls.loaders.append(tab_test_dl2)
learn_msi.dls.loaders.append(msi_test_dl2)

# Pair the loaders created above instead of looking them up by position: the
# previous lookup used index 4 on the tabular side, which does not exist after a
# single top-to-bottom run (train, valid and two appended test loaders = 0..3).
test_mixed_dl = MixedDL(tab_test_dl2, msi_test_dl2)
test_mixed_dl.show_batch()

In [None]:
# Run the fused model on the second part of the holdout set
preds, targs = multi_learn.get_preds(dl=test_mixed_dl)

# One flattened vector per model output, read here as tabular / spectral / mixed
tab_pred, vis_pred, mixed_pred = preds[0].flatten(), preds[1].flatten(), preds[2].flatten()

# Attach the predictions to a copy of the reordered holdout rows
mixed_results2 = df_test2.copy()
for column, values in (('tab_pred', tab_pred),
                       ('msi_pred', vis_pred),
                       ('mixed_pred', mixed_pred)):
    mixed_results2[column] = values

len(mixed_results2)

In [None]:
# Combine the predictions of both holdout parts and export them.
# The combined frame used to be overwritten with `mixed_results` alone, so the
# second part never reached the CSV; pd.concat also replaces DataFrame.append,
# which is deprecated and removed in pandas 2.0.
ff_GB_results = pd.concat([mixed_results, mixed_results2])
ff_GB_results.to_csv('/data/results/multimodal_colearning_prediction_on_holdout_dataset.csv')

# Fusion at feature level - Pretrained modules 

## Kfold 

In [None]:
# K-FOLD VALIDATION WITH PRETRAINED MODULES
# Same procedure as the from-scratch k-fold run, except that both modules load
# previously saved weights before their one-epoch fit.
#
# NOTE(review): the loop rebinds `splits`, `to`, `tab_dl` and `mixed_dls`, so any
# cell executed afterwards sees the objects of the LAST fold — confirm that this
# is what the later holdout-training cells are meant to use.

# NOTE(review): 'predictions' is never filled in below; the column is kept only
# so that the layout of the exported CSV does not change.
kfold_preds = pd.DataFrame(columns=['predictions', 'target_yield'])

# tab variables
procs = [Categorify, Normalize, FillMissing]
cat_names = ['Parental 1', 'Parental 2', 'Planting', 'Stock', 'Fertilizer']
cont_names = ['Days_after_sowing']

split_list = kfold_splitter(df=df_train_val)

# Mixed model variables
# Set weights for each loss
tab_w, vis_w, tv_w = 0.1, 0.55, 0.35

# Initialise Loss
gb_loss = myGradientBlending(tab_weight=tab_w, visual_weight=vis_w, tab_vis_weight=tv_w,
                             loss_scale=1.0)

# METRICS
metrics = [t_rmse, v_rmse, tv_rmse, weighted_RMSEp]
# append=True: all folds log into the same CSV file
csvlogger = CSVLogger('/data/results/multimodal_pretrained_5fold_metrics.csv', append=True)
cbs = [csvlogger]

for i in range(5):
    # The same fold getter drives the tabular and the spectral split
    getter = get_fold(split_list, fold=i)
    splits = getter(range_of(df_train_val))

    to = TabularPandas(df_train_val,
                       procs,
                       cat_names,
                       cont_names=cont_names,
                       y_names='Yield',
                       y_block=RegressionBlock(),
                       splits=splits)
    tab_dl = to.dataloaders(bs=8)

    # Call MSI dataloader
    msi_fold = DataBlock(blocks=(MSITensorBlock, RegressionBlock),
                         get_items=get_npy,
                         get_y=get_y,
                         splitter=getter)
    msi_dl = msi_fold.dataloaders(df_train_val, bs=8)

    # Make mixed dls
    train_mixed_dl = MixedDL(tab_dl[0], msi_dl[0])
    valid_mixed_dl = MixedDL(tab_dl[1], msi_dl[1])
    mixed_dls = DataLoaders(train_mixed_dl, valid_mixed_dl).cuda()

    # Modules: load the saved weights, then fit each module for one epoch
    config = tabular_config(ps=0.5, embed_p=0.5)
    learn_tab = tabular_learner(tab_dl,
                                config=config,
                                layers=[200, 100],
                                metrics=[rmse, R2Score()],
                                opt_func=ranger,
                                y_range=[0, 20],
                                wd=0.3)
    learn_tab.load('/data/model_weights/DNN_model')
    learn_tab.fit_one_cycle(1, 1e-3)

    model_msi = xresnet18(n_out=1, c_in=13, pretrained=False, sa=True, p=0.5, ndim=2)
    learn_msi = Learner(msi_dl,
                        model_msi,
                        opt_func=Adam,
                        loss_func=root_mean_squared_error,
                        metrics=[rmse, R2Score()])
    learn_msi.load('/data/model_weights/xresnet18_model')
    learn_msi.fit_one_cycle(1, 1e-3)

    multi_model = TabVis(learn_tab.model, learn_msi.model)
    multi_learn = Learner(mixed_dls, multi_model, gb_loss, cbs=cbs, metrics=metrics)

    # Disable the fastai progress bar and the console logging.
    # Both context managers must be listed separated by a comma: `a and b`
    # evaluates to `b` alone, so only no_logging() would be entered.
    with multi_learn.no_bar(), multi_learn.no_logging():
        multi_learn.fit_one_cycle(35, lr_max=1e-3)

    # Validation predictions of this fold: one entry per output of the fused model,
    # read here as tabular / spectral / mixed
    preds, targs = multi_learn.get_preds(dl=valid_mixed_dl)

    pred_mixed_df = pd.DataFrame()
    tab_pred = preds[0].flatten()
    vis_pred = preds[1].flatten()
    mixed_pred = preds[2].flatten()

    pred_mixed_df['tab_pred'] = tab_pred
    pred_mixed_df['msi_pred'] = vis_pred
    pred_mixed_df['mixed_pred'] = mixed_pred

    pred_mixed_df['target_yield'] = targs

    # pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0
    kfold_preds = pd.concat([kfold_preds, pred_mixed_df])

kfold_preds.to_csv('/data/results/multimodal_pretrained_5fold_prediction_for_metrics.csv')

## Train the model to predict the holdout dataset

In [None]:
# TAB pretrained version
# Tabular learner with the same configuration as in the k-fold runs, initialised
# from previously saved weights.
# NOTE(review): `tab_dl` was last assigned inside the k-fold loop above, i.e. it
# holds the loaders of the last fold — confirm this is the intended split.
config = tabular_config(ps=0.5, embed_p=0.5)
preTAB_learner = tabular_learner(tab_dl,
                            config=config,
                            layers=[200,100],
                            metrics=[rmse, R2Score()],
                            opt_func=ranger,
                            y_range=[0,20],
                            wd=0.3)

preTAB_learner.load('/data/model_weights/DNN_model')
# test that the model is working
# (this is a one-epoch fit, so it also updates the loaded weights)
preTAB_learner.fit_one_cycle(1, 1e-3)

In [None]:
# Spectral pretrained version
# xresnet18 with 13 input channels and a single regression output; the
# architecture is created without pretrained weights and the saved project
# weights are loaded right after.
model_msi = xresnet18(n_out=1, c_in=13, pretrained=False, sa=True, p=0.5, ndim=2)
preVIS_learner = Learner(msi_dls, 
                model_msi,
                opt_func=Adam, 
                loss_func=root_mean_squared_error,
                metrics=[rmse, R2Score()])

preVIS_learner.load('/data/model_weights/xresnet18_model')
# test that the model is working
# (this is a one-epoch fit, so it also updates the loaded weights)
preVIS_learner.fit_one_cycle(1, 1e-3)

In [None]:
# Train the multimodal model
# Fuse the two pretrained modules and train them jointly.
multi_model = TabVis(preTAB_learner.model, preVIS_learner.model)

# Set weights for each loss
tab_w, vis_w, tv_w = 0.1, 0.55, 0.35

# Initialise Loss
gb_loss = myGradientBlending(tab_weight=tab_w, visual_weight=vis_w, tab_vis_weight=tv_w, 
                             loss_scale=1.0)

# Define metrics weights
metrics = [t_rmse, v_rmse, tv_rmse, weighted_RMSEp]
# NOTE(review): `mixed_dls` was last assigned inside the k-fold loop above (loaders
# of the last fold), and no early-stopping callback is attached here, unlike the
# from-scratch training cell — confirm both are intended.
multi_learn = Learner(mixed_dls, multi_model, gb_loss, metrics=metrics)

multi_learn.fit_one_cycle(60, lr_max=1e-3)
# multi_learn.remove_my_hooks()

In [None]:
# Save the trained multimodal model
multi_learn.save('/data/model_weights/multimodal_pretrained')
# Use this to load the saved model
# multi_learn.load('/data/model_weights/multimodal_pretrained')

### Holdout dataset results

In [None]:
# Spectral dataloaders over the holdout table, without shuffling.
# `dblock` still carries its splitter, so the holdout rows end up in two subsets;
# the following cells process them as "1st half" and "2nd half".
test_msi_dls = dblock.dataloaders(df_test, shuffle=False)

# Labelled, unshuffled test loader for the first subset, registered as an
# additional loader on the pretrained spectral learner's DataLoaders
first_part_items = test_msi_dls.items
preVIS_learner.dls.loaders.append(
    msi_dls.test_dl(first_part_items, with_labels=True, shuffle=False)
)

In [None]:
# 1st half -
# Sample names in the order the MSI test DL serves them: keep the last path
# component of every item and drop the '.npy' extension
fnames_MSIorder = [
    str(item).split(sep='/')[-1].replace('.npy', '')
    for item in test_msi_dls.items
]

# Reorder the df_test rows (indexed by 'Barcode') to reflect this order
df_test1 = df_test.set_index('Barcode').reindex(fnames_MSIorder)

# Matching tabular test loader, appended to the pretrained tabular learner's loaders
preTAB_learner.dls.loaders.append(
    tab_dl.test_dl(df_test1, with_labels=True, shuffle=False)
)

# Pair the loader at position 2 of each learner.
# NOTE(review): position 2 depends on how many loaders were appended to these
# DataLoaders objects earlier in the session — verify on re-runs.
test_mixed_dl = MixedDL(preTAB_learner.dls[2], preVIS_learner.dls[2])
test_mixed_dl.show_batch()

In [None]:
# Run the fused pretrained model on the first part of the holdout set
preds, targs = multi_learn.get_preds(dl=test_mixed_dl)

# One flattened vector per model output, read here as tabular / spectral / mixed
tab_pred, vis_pred, mixed_pred = preds[0].flatten(), preds[1].flatten(), preds[2].flatten()

# Attach the predictions to a copy of the reordered holdout rows
mixed_results = df_test1.copy()
for column, values in (('tab_pred', tab_pred),
                       ('msi_pred', vis_pred),
                       ('mixed_pred', mixed_pred)):
    mixed_results[column] = values

mixed_results

In [None]:
# 2nd half -
# Sample names in the order the MSI test DL serves the second subset: keep the
# last path component of every item and drop the '.npy' extension
fnames_MSIorder = []
for fname in test_msi_dls[1].items:
    fname = str(fname)
    fname = fname.split(sep='/')[-1]
    fname = fname.replace('.npy', '')
    fnames_MSIorder.append(fname)

# Reorder the df_test rows (indexed by 'Barcode') to reflect this order
df_test2 = df_test.set_index('Barcode').reindex(fnames_MSIorder)

# Build both test loaders for this subset and keep direct references to them
tab_test_dl2 = tab_dl.test_dl(df_test2, with_labels=True, shuffle=False)
msi_test_dl2 = msi_dls.test_dl(test_msi_dls[1].items, with_labels=True, shuffle=False)

# Still registered on the learners' DataLoaders, as before
preTAB_learner.dls.loaders.append(tab_test_dl2)
preVIS_learner.dls.loaders.append(msi_test_dl2)

# Pair the loaders created above instead of looking them up by position:
# position 3 depends on how many loaders earlier cells appended to the shared
# DataLoaders objects, so it can silently point at a loader from another section.
test_mixed_dl = MixedDL(tab_test_dl2, msi_test_dl2)
test_mixed_dl.show_batch()

In [None]:
# Run the fused pretrained model on the second part of the holdout set
preds, targs = multi_learn.get_preds(dl=test_mixed_dl)

# One flattened vector per model output, read here as tabular / spectral / mixed
tab_pred, vis_pred, mixed_pred = preds[0].flatten(), preds[1].flatten(), preds[2].flatten()

# Attach the predictions to a copy of the reordered holdout rows
mixed_results2 = df_test2.copy()
for column, values in (('tab_pred', tab_pred),
                       ('msi_pred', vis_pred),
                       ('mixed_pred', mixed_pred)):
    mixed_results2[column] = values

mixed_results2

In [None]:
# Combine the predictions of both holdout parts and export them.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
ff_GB_results = pd.concat([mixed_results, mixed_results2])
ff_GB_results.to_csv('/data/results/multimodal_pretrained_prediction_on_holdout_dataset.csv')