In [1]:
from mil_framework import *
from split_indices import *
import numpy as np
import pandas as pd
import torch
import random

# Seed + Configuration

In [2]:
def set_seed(seed: int):
    """
    Seed the random number generators of `random`, numpy and torch with one value.

    Calling this with the same seed resets all three generators to the same
    state, so the random draws that follow are repeatable.

    :param seed: value handed to each generator's seeding function
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

class Config:
    """
    Plain container for the settings used by this notebook.

    An instance is passed to `load_data`, `build_model`, `TrainManager`,
    `load_checkpoint` and `infer`; how each of them interprets the individual
    fields is defined in `mil_framework`, not here.
    """

    def __init__(self, name_model):
        """
        :param name_model: name stored on the config as `name_model`
                           (the inference cell passes None)
        """
        # Identity of the model this configuration belongs to.
        self.name_model = name_model

        # Locations.
        self.model_dir = "/content/drive/MyDrive/DLMI_Challenge/"
        self.data_path = "data"

        # Reproducibility and hardware.
        self.seed = 42
        self.use_cuda = True

        # Training hyper-parameters.
        self.epochs = 100
        self.batch_size = 32
        self.lr = 0.1

        # Model dimensions.
        self.features_dim = 2560
        self.num_heads = 8

def add_augmentation(train_indices, val_indices):
    """
    Extend both index lists with their '_horizontal' and '_vertical' variants.

    For each input list the result is: every key suffixed with '_horizontal',
    then the original keys, then every key suffixed with '_vertical'.

    :param train_indices: list of string keys for the training split
    :param val_indices: list of string keys for the validation split
    :return: tuple (extended_train_indices, extended_val_indices)
    """
    def with_suffixed_variants(indices):
        horizontal = [key + '_horizontal' for key in indices]
        vertical = [key + '_vertical' for key in indices]
        return horizontal + indices + vertical

    return with_suffixed_variants(train_indices), with_suffixed_variants(val_indices)

# Training

In [3]:
# One (train, validation) index pair per split.
splits = [
    (Train_0, Val_0),
    (Train_1, Val_1),
    (Train_2, Val_2),
    (Train_3, Val_3),
    (Train_4, Val_4),
]
# Model name used for the split at the same position in `splits`.
names_model = [
    'lympho_efficient',
    'lympho_efficient_1',
    'lympho_efficient_2',
    'lympho_efficient_3',
    'lympho_efficient_4',
]

# Train one model per split; every run starts from the same seed.
for idx, name_model in enumerate(names_model):
    split_train, split_val = splits[idx]

    cfg = Config(name_model)
    set_seed(cfg.seed)

    # Add the '_horizontal' / '_vertical' variants before loading the data.
    train_indices, val_indices = add_augmentation(split_train, split_val)
    train_data, dev_data = load_data(cfg, train_indices, val_indices)

    trainer = TrainManager(build_model(cfg), cfg)
    trainer.train_and_validate(train_data, dev_data)



FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/DLMI_Challenge/data/files_efficient.train'

# Inference

In [None]:
# Configuration without a model name; it is passed to load_data here and reused
# by the inference loop in the next cell.
cfg = Config(None)
set_seed(cfg.seed)
# With the extra positional True, load_data is unpacked into three values here
# (two during training); only the third one is kept, as test_data.
# NOTE(review): train_indices and val_indices are whatever the last iteration of
# the training loop left in the namespace, so this cell fails on a fresh kernel
# unless training ran first — confirm load_data does not need them for the test set.
_, _, test_data  = load_data(cfg, train_indices, val_indices, True)

In [None]:
# Run each trained model on the test data and keep what `infer` returns.
model_outputs = []
for name_model in names_model:
    # Build a fresh model and load the weights saved under this model name.
    model = build_model(cfg)
    checkpoint = load_checkpoint(cfg, name_model)
    model.load_state_dict(checkpoint['model_state'])

    # `infer` returns a (prediction, name) pair.
    model_outputs.append(infer(model, test_data, cfg))

# One prediction entry per model, in the order of `names_model`.
predictions = [prediction for prediction, _ in model_outputs]

# Every model returns the same list of patient ids in the same order (the data
# is not shuffled), so only the list returned by the first model is kept.
names = model_outputs[0][1]

In [None]:
# For each patient, gather the predicted probabilities produced by every model.
# The part of each returned name before the first '_' is used as the patient id,
# so all entries sharing that prefix end up in the same list.
probs_per_patient = {id_patient: [] for id_patient in test_indices}
for model_predictions in predictions:
    for idx, name in enumerate(names):
        id_patient = name.split('_')[0]
        probs_per_patient[id_patient].append(model_predictions[idx])

# Summarise each patient's probabilities as the tuple
# (mean, std, max, min), computed over all values collected for that patient.
# `ids_patient` and `statistics_per_patient` are filled in lockstep so that
# position k of both lists refers to the same patient.
statistics_per_patient = []
ids_patient = []
for id_patient, probs in probs_per_patient.items():
    stack_probs = np.vstack(probs)
    statistics_patient = (
        stack_probs.mean(),
        stack_probs.std(),
        stack_probs.max(),
        stack_probs.min(),
    )
    statistics_per_patient.append(statistics_patient)
    ids_patient.append(id_patient)

In [None]:
def compare_min_max(max_prob, min_prob):
    """
    Pick a label by comparing the two most extreme predictions.

    `max_prob` is taken as the strongest evidence for label 1 and
    `1 - min_prob` as the strongest evidence for label 0.

    :param max_prob: largest predicted probability
    :param min_prob: smallest predicted probability
    :return: 1 if `max_prob` is strictly greater than `1 - min_prob`, else 0
             (a tie therefore yields 0)
    """
    strongest_for_zero = 1 - min_prob
    return 1 if max_prob > strongest_for_zero else 0

# Turn each patient's statistics into a hard label with the following rules:
#   * mean >= 0.5 and mean - std > 0.5  -> 1 (the models agree on label 1)
#   * mean <  0.5 and mean + std < 0.5  -> 0 (the models agree on label 0)
#   * otherwise                         -> decided by compare_min_max(max, min)
predictions_final = []
names_final = []
for id_patient, statistic_patient in zip(ids_patient, statistics_per_patient):
    # Each entry is the tuple (mean, std, max, min).
    mean, std, max_, min_ = statistic_patient
    if mean >= 0.5:
        if mean - std > 0.5:
            label = 1
        else:
            label = compare_min_max(max_, min_)
    else:
        # A plain `else` guarantees that `label` is assigned on every iteration,
        # even if `mean` is NaN, instead of reusing the previous patient's label.
        if mean + std < 0.5:
            label = 0
        else:
            label = compare_min_max(max_, min_)
    predictions_final.append(label)
    names_final.append(id_patient)

In [None]:
# Tag inserted into the file name of this submission.
names_submissions = '2560'
submission_path = f"/content/drive/MyDrive/DLMI_Challenge/submissions_{names_submissions}.csv"

# One row per patient: its id and the final predicted label.
submissions = pd.DataFrame({"ID": names_final, "Predicted": predictions_final})
submissions.to_csv(submission_path, index=False)