In [1]:
import os
current_directory = os.getcwd()
folder_name = os.path.basename(current_directory)
number = folder_name

In [2]:
CONFIG = {
    'data_main': 'C:/Users/Николай/PycharmProjects/CIBMTR/D.Data/main/',
    'data_train_process': 'C:/Users/Николай/PycharmProjects/CIBMTR/D.Data/train_process/',
    'data_train_split': 'C:/Users/Николай/PycharmProjects/CIBMTR/D.Data/train_split/',
    'data_host_process': 'C:/Users/Николай/PycharmProjects/CIBMTR/D.Data/host_process/',
    'train_path': 'train.csv',
    'folds_path': 'v1.csv', 

    'DEVICE' : 'cuda',
    'SEED' : 42,
    'BATCH_SIZE' : 32,
    'LR' : 0.001,
    'EPOCHS': 500,
    'output_dim' : 1
}

In [3]:
# Import libs
import numpy as np
from tqdm import tqdm
import pandas as pd 
import torch
import torch.nn as nn
from torch.optim import Adam
import torch.nn.init as init
from metric import score

In [4]:
# Device, torch decimal places and seed for reproducibility
device = torch.device(CONFIG['DEVICE'] if torch.cuda.is_available() else "cpu")
torch.set_printoptions(precision=40) 

torch.manual_seed(CONFIG['SEED'])  
torch.cuda.manual_seed_all(CONFIG['SEED'])  
np.random.seed(CONFIG['SEED'])  

In [5]:
train = pd.read_csv(f"{CONFIG['data_main']}{CONFIG['train_path']}")
host_3 = pd.read_csv(f"{CONFIG['data_host_process']}3/v1")
train = train.fillna('-1')

def map_dri_score(dri_score):
    if dri_score in ['Low', 'Intermediate']:
        return 'Low/intermediate'
    elif dri_score in ['High', 'Very high']:
        return 'High/very high'
    else:
        return '-1'
train['dri_gp_score'] = train['dri_score'].apply(map_dri_score)

def map_comorbidity_score(comorbidity_score):
    if comorbidity_score >= 3:
        return '3+'
    elif comorbidity_score == 2 or comorbidity_score == 1:
        return '1-2'
    elif comorbidity_score == 0:
        return '0'
    else:
        return '-1'
train['comorbidity_gp_score'] = train['comorbidity_score'].apply(map_dri_score)

train['year_hct'] = train['year_hct'].astype(int)
def map_year_hct(year_hct):
    if year_hct == 2008:
        return '2006-2008'
    elif year_hct > 2008 and year_hct <= 2011:
        return '2009-2011'
    elif year_hct > 2011 and year_hct <= 2014:
        return '2011-2014'
    else:
        return '-1'
train['year_gp_hct'] = train['year_hct'].apply(map_year_hct)

def map_sex_match(sex_match):
    if sex_match == 'F/M':
        return 'F/M'
    else:
        return '-1'
train['sex_gp_match'] = train['sex_match'].apply(map_sex_match)

def map_sex(sex_match):
    if '-M' in sex_match:
        return 'M'
    elif '-F' in sex_match:
        return 'F'
    else:
        return '-1'
train['sex'] = train['sex_match'].apply(map_sex)

def map_karnofsky_score(karnofsky_score):
    if karnofsky_score == '90.0' or karnofsky_score == '100.0':
        return '>=90'
    elif karnofsky_score == '-1':
        return '-1'
    else:
        return '<90'
train['karnofsky_gp_score'] = train['karnofsky_score'].apply(map_karnofsky_score)
host_3.drop(columns=['hla_gp_match'], inplace=True)

In [6]:
train = pd.concat([train, host_3], ignore_index=True)

In [7]:
# Load and prepare data

for col in ['donor_age', 'age_at_hct']:
    train[col] = train[col].astype(int)
    
train["y"] = train.efs_time.values
mx = train.loc[train.efs==1,"efs_time"].max()
mn = train.loc[train.efs==0,"efs_time"].min()
train.loc[train.efs==0,"y"] = train.loc[train.efs==0,"y"] + mx - mn
train.y = train.y.rank()
train.loc[train.efs==0,"y"] += 2*len(train)
train.y = train.y / train.y.max()
train.y = np.log( train.y )
train.y -= train.y.mean()
train.y *= -1.0

# train['efs_time'] = (train['efs_time'] - train['efs_time'].min()) / (train['efs_time'].max() - train['efs_time'].min())

cat_columns = [col for col in train.columns if col not in ['efs', 'efs_time', 'y', 'ID']]
train[cat_columns] = train[cat_columns].astype(str)

In [8]:
train

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,hla_low_res_10,efs,efs_time,dri_gp_score,comorbidity_gp_score,year_gp_hct,sex_gp_match,sex,karnofsky_gp_score,y
0,0.0,N/A - non-malignant indication,No,-1,No,-1,-1,No TBI,No,6.0,...,10.0,0.0,42.356,-1,-1,-1,-1,F,<90,-1.461133
1,1.0,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,10.0,1.0,4.672,Low/intermediate,-1,2006-2008,-1,F,<90,1.329045
2,2.0,N/A - non-malignant indication,No,-1,No,2.0,8.0,No TBI,No,6.0,...,10.0,0.0,19.793,-1,-1,-1,-1,M,<90,-1.383611
3,3.0,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,10.0,0.0,102.349,High/very high,-1,2009-2011,-1,M,<90,-1.517746
4,4.0,High,No,-1,No,2.0,8.0,No TBI,No,6.0,...,10.0,0.0,16.223,High/very high,-1,-1,-1,F,<90,-1.375527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30911,24630461.0,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,1.0,4.380,Low/intermediate,1-2,2012-2014,-1,M,<90,1.491224
30912,24633158.0,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,1.0,2.960,Low/intermediate,3+,2012-2014,-1,F,<90,2.508494
30913,24633924.0,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,1.0,0.790,Low/intermediate,1-2,2012-2014,-1,F,-1,4.260963
30914,24635360.0,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,0.0,0.230,-1,3+,2012-2014,-1,F,<90,-1.361068


In [9]:
folds = pd.read_csv(f"{CONFIG['data_train_split']}{CONFIG['folds_path']}")
len_host_3 = len(host_3)

# Создаем DataFrame с длиной host_3 и значением 1 для 'fold'
new_folds = pd.DataFrame({
    'fold': [1] * len_host_3
})

# Расширяем folds, добавляя new_folds
folds = pd.concat([folds, new_folds], ignore_index=True)
val = train[folds['fold'] == 0].copy(deep=True)
train = train[folds['fold'].isin([1, 2, 3, 4])].copy(deep=True)
val.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)

In [10]:
train_one_hot = pd.get_dummies(train[cat_columns], drop_first=True)
val_one_hot = pd.get_dummies(val[cat_columns], drop_first=True)
val_one_hot = val_one_hot.reindex(columns=train_one_hot.columns, fill_value=False)

In [11]:
input_dim = train_one_hot.shape[1]

In [12]:
# Model definition
class Model(nn.Module):
    def __init__(self, input_dim, output_dim=CONFIG['output_dim']):
        super(Model, self).__init__()
    
        self.fc1 = nn.Linear(input_dim, input_dim)
        self.fc2 = nn.Linear(input_dim, input_dim)
        self.fc3 = nn.Linear(input_dim, input_dim)
        self.fc4 = nn.Linear(input_dim, output_dim)
        
        self.relu = nn.ReLU()
        self.do = nn.Dropout(p=0.5)
        
        self.init_weights()
    
    def init_weights(self):
        init.xavier_uniform_(self.fc1.weight)
        init.xavier_uniform_(self.fc2.weight)
        init.xavier_uniform_(self.fc3.weight)
        init.xavier_uniform_(self.fc4.weight)
    
        self.fc1.weight.data = torch.clamp(self.fc1.weight.data, 0, 1)
        self.fc2.weight.data = torch.clamp(self.fc2.weight.data, 0, 1)
        self.fc3.weight.data = torch.clamp(self.fc3.weight.data, 0, 1)
        self.fc4.weight.data = torch.clamp(self.fc4.weight.data, 0, 1)
        
        if self.fc1.bias is not None:
            init.zeros_(self.fc1.bias)
        if self.fc2.bias is not None:
            init.zeros_(self.fc2.bias)
        if self.fc3.bias is not None:
            init.zeros_(self.fc3.bias)
        if self.fc4.bias is not None:
            init.zeros_(self.fc4.bias)
        
    def forward(self, x):

        x = self.relu(self.fc1(x))
        x = self.do(x)
        x = self.relu(self.fc2(x))
        x = self.do(x)
        x = self.relu(self.fc3(x))
        x = self.do(x)
        x = self.fc4(x)
        
        return x

In [13]:
# Model, criterion and optimizer
model = Model(input_dim).to(device)
criterion = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=CONFIG['LR'])

In [14]:
train_tensors = torch.tensor(train_one_hot.values, device=device, dtype=torch.float32)

train_targets = torch.tensor(train['y'].values, device=device, dtype=torch.float32)

val_tensors = torch.tensor(val_one_hot.values, device=device, dtype=torch.float32)

val_targets = torch.tensor(val['y'].values, device=device, dtype=torch.float32)

In [None]:
# Training
train_num_samples = len(train)
train_num_batches = (train_num_samples + CONFIG['BATCH_SIZE'] - 1) // CONFIG['BATCH_SIZE']

val_num_samples = len(val)
val_num_batches = (val_num_samples + CONFIG['BATCH_SIZE'] - 1) // CONFIG['BATCH_SIZE']
best_score = float('-inf')
for epoch in range(CONFIG['EPOCHS']):
    ##################################################################TRAIN##################################################################
    model.train()
    train_running_loss = 0.0

    with tqdm(range(train_num_batches), desc=f"Epoch {epoch + 1}/{CONFIG['EPOCHS']}", unit="batch") as t:
        for batch_idx in t:
            start_idx = batch_idx * CONFIG['BATCH_SIZE']
            end_idx = min(start_idx + CONFIG['BATCH_SIZE'], train_num_samples)

            # Извлечение батча данных
            batch_inputs = train_tensors[start_idx:end_idx]
            batch_targets = train_targets[start_idx:end_idx]
            
            # Обнуление градиентов
            optimizer.zero_grad()

            # Прямой проход
            outputs = model(batch_inputs)

            # Вычисление функции потерь
            batch_loss = criterion(outputs, batch_targets.unsqueeze(1))

            # Обратный проход
            batch_loss.backward()
            optimizer.step()

            train_running_loss += batch_loss.item()
            t.set_postfix(train_mean_loss=f"{train_running_loss / (batch_idx + 1):.6f}")
    
    ###EVAL
    model.eval()
    val_running_loss = 0.0
    
    outputs_list = []
    
    with torch.no_grad():
        with tqdm(range(val_num_batches), desc=f"Epoch {epoch + 1}/{CONFIG['EPOCHS']}", unit="batch") as v:
            for batch_idx in v:
                start_idx = batch_idx * CONFIG['BATCH_SIZE']
                end_idx = min(start_idx + CONFIG['BATCH_SIZE'], val_num_samples)
                
                batch_inputs = val_tensors[start_idx:end_idx]
                batch_targets = val_targets[start_idx:end_idx]
    
                # Прямой проход
                outputs = model(batch_inputs)
                
                batch_loss = criterion(outputs, batch_targets.unsqueeze(1))
                val_running_loss += batch_loss.item()
                v.set_postfix(val_mean_loss=f"{val_running_loss / (batch_idx + 1):.6f}")
        
                outputs_list.extend(outputs.cpu().numpy().flatten())
    
    ###SAVE
    row_id_column_name = "ID"
    y_pred = val[['ID']].copy(deep=True)
    y_pred["prediction"] = outputs_list
    y_true = val[['ID', 'efs', 'efs_time', 'race_group']].copy(deep=True)
    current_score = score(y_true.copy(), y_pred.copy(), row_id_column_name)
    if current_score > best_score:  # Для минимизации score (если это метрика типа ошибки)
        best_score = current_score
        best_epoch = epoch + 1
        print(f"New Best Score: {best_score}, Epoch: {epoch + 1}")

Epoch 1/500: 100%|██████████| 787/787 [00:02<00:00, 312.76batch/s, train_mean_loss=6423.696341] 
Epoch 1/500: 100%|██████████| 180/180 [00:00<00:00, 730.91batch/s, val_mean_loss=2.026062]


New Best Score: 0.5145202288736127, Epoch: 1


Epoch 2/500: 100%|██████████| 787/787 [00:02<00:00, 317.24batch/s, train_mean_loss=11.304232]
Epoch 2/500: 100%|██████████| 180/180 [00:00<00:00, 737.73batch/s, val_mean_loss=2.084354]


New Best Score: 0.52784445541362, Epoch: 2


Epoch 3/500: 100%|██████████| 787/787 [00:02<00:00, 317.73batch/s, train_mean_loss=7.659294]
Epoch 3/500: 100%|██████████| 180/180 [00:00<00:00, 765.93batch/s, val_mean_loss=2.093512]


New Best Score: 0.5400412631412076, Epoch: 3


Epoch 4/500: 100%|██████████| 787/787 [00:02<00:00, 279.70batch/s, train_mean_loss=5.194554]
Epoch 4/500: 100%|██████████| 180/180 [00:00<00:00, 801.58batch/s, val_mean_loss=2.109370]


New Best Score: 0.552962207833304, Epoch: 4


Epoch 5/500: 100%|██████████| 787/787 [00:03<00:00, 231.35batch/s, train_mean_loss=3.835294]
Epoch 5/500: 100%|██████████| 180/180 [00:00<00:00, 578.82batch/s, val_mean_loss=2.085985]


New Best Score: 0.5672983271283342, Epoch: 5


Epoch 6/500: 100%|██████████| 787/787 [00:03<00:00, 225.07batch/s, train_mean_loss=3.039939]
Epoch 6/500: 100%|██████████| 180/180 [00:00<00:00, 725.72batch/s, val_mean_loss=2.030792]


New Best Score: 0.5842906711820415, Epoch: 6


Epoch 7/500: 100%|██████████| 787/787 [00:03<00:00, 226.81batch/s, train_mean_loss=2.599604]
Epoch 7/500: 100%|██████████| 180/180 [00:00<00:00, 760.23batch/s, val_mean_loss=1.963716]


New Best Score: 0.6014880970898935, Epoch: 7


Epoch 8/500:  43%|████▎     | 335/787 [00:01<00:01, 226.51batch/s, train_mean_loss=2.088144]

In [None]:
# 5_folds
# fold_0, epoch_151 , score_0.6634
# fold_1, epoch_150 , score_0.6627
# fold_2, epoch_138 , score_0.6600
# fold_3, epoch_143 , score_0.665
# fold_4, epoch_147 , score_0.6692

# 4_folds
# fold_0, epoch_192 , score_0.6623
# fold_1, epoch_191 , score_0.6586
# fold_2, epoch_183 , score_0.6615
# fold_3, epoch_196 , score_0.6584

In [None]:
best_score

In [None]:
best_epoch

In [None]:
train

In [None]:
train['cyto_score']

In [None]:
len_host_3

In [None]:
train.tail(len_host_3)