In [3]:
from features_extractor import *
import torchvision.transforms as transforms
import torch

# Load Data Locally (to speed up things)

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle
from google.colab import files
# Here you should upload your Kaggle API key (see : https://www.kaggle.com/docs/api (Authentification paragraph))
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list
! kaggle competitions download -c 3md3070-dlmi
! unzip /content/3md3070-dlmi.zip

# Generate Data

In [None]:
# Clinical table: rows with LABEL > -0.5 form the training set, rows with
# LABEL < -0.5 the test set (presumably test rows carry a negative placeholder
# label — TODO confirm).
df = pd.read_csv("/content/drive/MyDrive/DLMI_Challenge/clinical_data_clean.csv")
train_df = df.loc[df.LABEL > -0.5]
test_df = df.loc[df.LABEL < -0.5]

# Every image of every training patient (one sub-folder per patient ID).
train_path = [
    '/content/trainset/' + name + '/' + img_name
    for name in train_df.ID
    for img_name in os.listdir('/content/trainset/' + name)
]
# Same for the test patients.
test_path = [
    '/content/testset/' + name + '/' + img_name
    for name in test_df.ID
    for img_name in os.listdir('/content/testset/' + name)
]
# Training images first, then test images.
all_path = train_path + test_path

# Training

In [None]:
# ---- Image size ----
res_size = 224        # EfficientNet image size (value handed to transforms.Resize)

# ---- Training parameters ----
epochs = 80           # training epochs
batch_size = 16
learning_rate = 0.002
log_interval = 10     # interval for displaying training info

# ---- Where the model is saved ----
save_model_path = '/content/drive/MyDrive/DLMI_Challenge/'

# ---- Device selection: GPU when available, CPU otherwise ----
use_cuda = torch.cuda.is_available()
device_name = "cuda" if use_cuda else "cpu"
device = torch.device(device_name)

# ---- Data augmentation ----
# The angle list goes to MyRotateTransform (imported from features_extractor);
# presumably one angle is picked at random per image, so the repeated 0 would
# favour "no rotation" — TODO confirm against its implementation.
augmentation_steps = [
    transforms.Resize(res_size),
    MyRotateTransform([0, 90, 180, 0, 270, 360, 0]),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.ToTensor(),
]
transform = transforms.Compose(augmentation_steps)

# ---- Dataset over all images (train + test) and its shuffled loader ----
data = LymphoDataset(all_path, transform)
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=8)
data_loader = torch.utils.data.DataLoader(dataset=data, **loader_options)

In [None]:
# Build the auto-encoder and move it to the selected device
lympho_AE = LymphoAutoEncoder()
lympho_AE = lympho_AE.to(device)
# Adam optimizer over every parameter of the model
model_params = [param for param in lympho_AE.parameters()]
optimizer = torch.optim.Adam(params=model_params, lr=learning_rate)

In [None]:
# Begin Training
# MSE loss, summed over all elements rather than averaged.
criterion = nn.MSELoss(reduction='sum')
# Lowest mean epoch loss seen so far; handed to train() on every epoch.
best = np.inf
# start training
for epoch in range(epochs):
    # Train the model that `optimizer` was built for (lympho_AE).
    # NOTE(review): the first argument (50) is presumably train()'s logging
    # interval; `log_interval` from the config cell is not used here — confirm
    # which value is intended.
    train_losses = train(50, lympho_AE, device, data_loader, optimizer, epoch, criterion, best, save_model_path)
    # Mean of the losses returned for this epoch.
    err = sum(train_losses)/len(train_losses)
    if err < best:
        best = err
    print(f"Epoch : {epoch} , Mean Error : {err}")

# Generate Features Files

In [None]:
# Load Model
# Resolve the device inside this cell so it does not depend on the training
# cell having been run first.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")   # use CPU or GPU

lympho_AE = LymphoAutoEncoder()
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can still be loaded on a CPU-only runtime.
state_dict = torch.load("/content/drive/MyDrive/DLMI_Challenge/model_vae_efficient.pth", map_location=device)
lympho_AE.load_state_dict(state_dict)
lympho_AE.to(device)

# Mode Eval
lympho_AE.eval()

In [None]:
df = pd.read_csv("/content/drive/MyDrive/DLMI_Challenge/clinical_data_clean.csv")
# Min-max normalize the two numeric clinical columns; the minimum and maximum
# are taken over all rows (train and test together).
numeric_cols = ['LYMPH_COUNT', 'AGE']
col_min = df[numeric_cols].min()
col_max = df[numeric_cols].max()
df.loc[:, numeric_cols] = (df[numeric_cols] - col_min)/(col_max - col_min)
# Split on the LABEL column and renumber the rows of each part from 0
train_df = df.loc[df.LABEL > -0.5].reset_index(drop=True)
test_df = df.loc[df.LABEL < -0.5].reset_index(drop=True)

## Train Data

In [None]:
# Two parallel lists with one entry per image: the image path and the ID of
# the patient whose folder it was found in.
train_path, names_file = [], []
for name in train_df.ID:
    patient_dir = '/content/trainset/' + name
    listfiles = os.listdir(patient_dir)
    train_path.extend(patient_dir + '/' + img_name for img_name in listfiles)
    names_file.extend([name] * len(listfiles))

# Dictionary keys used for the flipped copies of every patient
horizontal_flip = [patient_id + '_horizontal' for patient_id in train_df.ID]
vertical_flip = [patient_id + '_vertical' for patient_id in train_df.ID]

In [None]:
# One entry per patient ID and per flipped variant of it; each entry starts
# with two empty lists that the inference cells fill batch by batch.
train_keys = train_df.ID.tolist() + horizontal_flip + vertical_flip
train_data = {k: {'features': [], 'features_reduced': []} for k in train_keys}

In [None]:
# Normal Picture (no data augmentation)
res_size = 224
transform = transforms.Compose([transforms.Resize(res_size),
                                transforms.ToTensor()])
Train = InferLymphoDataset(train_path, names_file, transform)
data_loader = DataLoader(dataset=Train, batch_size=10, shuffle=False, num_workers=8)
# Inference only: no_grad() avoids recording an autograd graph for each batch.
with torch.no_grad():
    # Wrapping the loader itself (not enumerate) lets tqdm see the number of batches.
    for X, names in tqdm(data_loader):
        # distribute data to device
        X = X.to(device)
        _, pooled_reduced, pooled = lympho_AE(X)
        pooled = pooled.detach().cpu()
        pooled_reduced = pooled_reduced.detach().cpu()
        # names[0] is iterated as the per-sample patient IDs of the batch
        # (presumably the dataset wraps each ID in a 1-element sequence — TODO confirm).
        for idx, name in enumerate(names[0]):
            # Store each sample's features as a single row (1, -1).
            train_data[name]['features'].append(pooled[idx].squeeze(1).reshape(1,-1))
            train_data[name]['features_reduced'].append(pooled_reduced[idx].squeeze(1).reshape(1,-1))

In [None]:
# Picture being flipped horizontally (p=1.0: the flip is always applied)
transform = transforms.Compose([transforms.Resize(res_size),
                                transforms.RandomHorizontalFlip(p=1.0),
                                transforms.ToTensor()])
train = InferLymphoDataset(train_path, names_file, transform)
data_train = DataLoader(dataset=train, batch_size=10, shuffle=False, num_workers=8)
# Inference only: no_grad() avoids recording an autograd graph for each batch.
with torch.no_grad():
    # Wrapping the loader itself (not enumerate) lets tqdm see the number of batches.
    for X, names in tqdm(data_train):
        # distribute data to device
        X = X.to(device)
        _, pooled_reduced, pooled = lympho_AE(X)
        pooled = pooled.detach().cpu()
        pooled_reduced = pooled_reduced.detach().cpu()
        # names[0] is iterated as the per-sample patient IDs of the batch
        # (presumably the dataset wraps each ID in a 1-element sequence — TODO confirm).
        for idx, name in enumerate(names[0]):
            # Features of the flipped image go under the '<ID>_horizontal' key.
            train_data[name + '_horizontal']['features'].append(pooled[idx].squeeze(1).reshape(1,-1))
            train_data[name + '_horizontal']['features_reduced'].append(pooled_reduced[idx].squeeze(1).reshape(1,-1))

In [None]:
# Picture being flipped vertically (p=1.0: the flip is always applied)
transform = transforms.Compose([transforms.Resize(res_size),
                                transforms.RandomVerticalFlip(p=1.0),
                                transforms.ToTensor()])
Train = InferLymphoDataset(train_path, names_file, transform)
data_loader = DataLoader(dataset=Train, batch_size=10, shuffle=False, num_workers=8)
# Inference only: no_grad() avoids recording an autograd graph for each batch.
with torch.no_grad():
    # Wrapping the loader itself (not enumerate) lets tqdm see the number of batches.
    for X, names in tqdm(data_loader):
        # distribute data to device
        X = X.to(device)
        _, pooled_reduced, pooled = lympho_AE(X)
        pooled = pooled.detach().cpu()
        pooled_reduced = pooled_reduced.detach().cpu()
        # names[0] is iterated as the per-sample patient IDs of the batch
        # (presumably the dataset wraps each ID in a 1-element sequence — TODO confirm).
        for idx, name in enumerate(names[0]):
            # Features of the flipped image go under the '<ID>_vertical' key.
            train_data[name + '_vertical']['features'].append(pooled[idx].squeeze(1).reshape(1,-1))
            train_data[name + '_vertical']['features_reduced'].append(pooled_reduced[idx].squeeze(1).reshape(1,-1))

In [None]:
# Stack the per-image feature rows of every entry into one tensor (along dim 0)
for entry in train_data.values():
    entry['features'] = torch.cat(entry['features'], dim=0)
    entry['features_reduced'] = torch.cat(entry['features_reduced'], dim=0)

# Attach label, age and lymphocyte count to each patient and to both of its
# flipped variants; every entry gets its own freshly built tensors.
for _, row in train_df.iterrows():
    for suffix in ('', '_horizontal', '_vertical'):
        entry = train_data[row['ID'] + suffix]
        entry['label'] = torch.as_tensor([row['LABEL']], dtype=torch.int64)
        entry['age'] = torch.as_tensor([row['AGE']], dtype=torch.float32)
        entry['concentration'] = torch.as_tensor([row['LYMPH_COUNT']], dtype=torch.float32)

# Persist the whole dictionary with save() (imported from features_extractor)
save(train_data, "/content/drive/MyDrive/DLMI_Challenge/data/files_efficient.train")

## Test Data

In [None]:
# Two parallel lists with one entry per image: the image path and the ID of
# the patient whose folder it was found in.
test_path, names_file = [], []
for name in test_df.ID:
    patient_dir = '/content/testset/' + name
    listfiles = os.listdir(patient_dir)
    test_path.extend(patient_dir + '/' + img_name for img_name in listfiles)
    names_file.extend([name] * len(listfiles))

# Dictionary keys used for the flipped copies of every patient
horizontal_flip = [patient_id + '_horizontal' for patient_id in test_df.ID]
vertical_flip = [patient_id + '_vertical' for patient_id in test_df.ID]

In [None]:
# One entry per patient ID and per flipped variant of it; each entry starts
# with two empty lists that the inference cells fill batch by batch.
test_keys = test_df.ID.tolist() + horizontal_flip + vertical_flip
test_data = {k: {'features': [], 'features_reduced': []} for k in test_keys}

In [None]:
# Normal Picture (no data augmentation)
res_size = 224
transform = transforms.Compose([transforms.Resize(res_size),
                                transforms.ToTensor()])
Test = InferLymphoDataset(test_path, names_file, transform)
data_loader = DataLoader(dataset=Test, batch_size=10, shuffle=False, num_workers=8)
# Inference only: no_grad() avoids recording an autograd graph for each batch.
with torch.no_grad():
    # Wrapping the loader itself (not enumerate) lets tqdm see the number of batches.
    for X, names in tqdm(data_loader):
        # distribute data to device
        X = X.to(device)
        _, pooled_reduced, pooled = lympho_AE(X)
        pooled = pooled.detach().cpu()
        pooled_reduced = pooled_reduced.detach().cpu()
        # names[0] is iterated as the per-sample patient IDs of the batch
        # (presumably the dataset wraps each ID in a 1-element sequence — TODO confirm).
        for idx, name in enumerate(names[0]):
            # Store each sample's features as a single row (1, -1).
            test_data[name]['features'].append(pooled[idx].squeeze(1).reshape(1,-1))
            test_data[name]['features_reduced'].append(pooled_reduced[idx].squeeze(1).reshape(1,-1))

In [None]:
# Horizontal Picture (p=1.0: the flip is always applied)
transform = transforms.Compose([transforms.Resize(res_size),
                                transforms.RandomHorizontalFlip(p=1.0),
                                transforms.ToTensor()])
Test = InferLymphoDataset(test_path, names_file, transform)
data_loader = DataLoader(dataset=Test, batch_size=10, shuffle=False, num_workers=8)
# Inference only: no_grad() avoids recording an autograd graph for each batch.
with torch.no_grad():
    # Wrapping the loader itself (not enumerate) lets tqdm see the number of batches.
    for X, names in tqdm(data_loader):
        # distribute data to device
        X = X.to(device)
        _, pooled_reduced, pooled = lympho_AE(X)
        pooled = pooled.detach().cpu()
        pooled_reduced = pooled_reduced.detach().cpu()
        # names[0] is iterated as the per-sample patient IDs of the batch
        # (presumably the dataset wraps each ID in a 1-element sequence — TODO confirm).
        for idx, name in enumerate(names[0]):
            # Features of the flipped image go under the '<ID>_horizontal' key.
            test_data[name + '_horizontal']['features'].append(pooled[idx].squeeze(1).reshape(1,-1))
            test_data[name + '_horizontal']['features_reduced'].append(pooled_reduced[idx].squeeze(1).reshape(1,-1))

In [None]:
# Vertical Picture (p=1.0: the flip is always applied)
transform = transforms.Compose([transforms.Resize(res_size),
                                transforms.RandomVerticalFlip(p=1.0),
                                transforms.ToTensor()])
Test = InferLymphoDataset(test_path, names_file, transform)
data_loader = DataLoader(dataset=Test, batch_size=10, shuffle=False, num_workers=8)
# Inference only: no_grad() avoids recording an autograd graph for each batch.
with torch.no_grad():
    # Wrapping the loader itself (not enumerate) lets tqdm see the number of batches.
    for X, names in tqdm(data_loader):
        # distribute data to device
        X = X.to(device)
        _, pooled_reduced, pooled = lympho_AE(X)
        pooled = pooled.detach().cpu()
        pooled_reduced = pooled_reduced.detach().cpu()
        # names[0] is iterated as the per-sample patient IDs of the batch
        # (presumably the dataset wraps each ID in a 1-element sequence — TODO confirm).
        for idx, name in enumerate(names[0]):
            # Features of the flipped image go under the '<ID>_vertical' key.
            test_data[name + '_vertical']['features'].append(pooled[idx].squeeze(1).reshape(1,-1))
            test_data[name + '_vertical']['features_reduced'].append(pooled_reduced[idx].squeeze(1).reshape(1,-1))

In [None]:
# Stack the per-image feature rows of every entry into one tensor (along dim 0)
for entry in test_data.values():
    entry['features'] = torch.cat(entry['features'], dim=0)
    entry['features_reduced'] = torch.cat(entry['features_reduced'], dim=0)

# Attach label, age and lymphocyte count to each patient and to both of its
# flipped variants; every entry gets its own freshly built tensors.
for _, row in test_df.iterrows():
    for suffix in ('', '_horizontal', '_vertical'):
        entry = test_data[row['ID'] + suffix]
        entry['label'] = torch.as_tensor([row['LABEL']], dtype=torch.int64)
        entry['age'] = torch.as_tensor([row['AGE']], dtype=torch.float32)
        entry['concentration'] = torch.as_tensor([row['LYMPH_COUNT']], dtype=torch.float32)

# Persist the whole dictionary with save() (imported from features_extractor)
save(test_data, "/content/drive/MyDrive/DLMI_Challenge/data/files_efficient.test")