In [1]:
import os
# Name of the folder the notebook is running from; `number` is an alias of it.
current_directory = os.getcwd()
_, folder_name = os.path.split(current_directory)
number = folder_name

In [2]:
# Root folder holding the `main/`, `train_process/` and `train_split/` data
# directories. Set the CIBMTR_DATA_ROOT environment variable to run the
# notebook on another machine; when it is unset (or empty) the original
# location is used, so the resulting paths are unchanged.
DATA_ROOT = (
    os.environ.get('CIBMTR_DATA_ROOT')
    or 'C:/Users/Николай/PycharmProjects/CIBMTR/D.Data'
).rstrip('/\\')

CONFIG = {
    # Data locations (directories end with '/', file names are appended to them).
    'data_main': f'{DATA_ROOT}/main/',
    'data_train_process': f'{DATA_ROOT}/train_process/',
    'data_train_split': f'{DATA_ROOT}/train_split/',
    'train_path': 'train.csv',
    'folds_path': 'v1.csv',

    # Training settings.
    'DEVICE': 'cuda',   # requested device; the notebook falls back to CPU without CUDA
    'SEED': 42,         # seed passed to torch and numpy
    'BATCH_SIZE': 32,
    'LR': 0.001,        # learning rate handed to Adam
    'EPOCHS': 7,
    'output_dim': 1     # default size of the model's output layer
}

In [3]:
# Import libs
import numpy as np
from tqdm import tqdm
import pandas as pd 
import torch
import torch.nn as nn
from torch.optim import Adam
import torch.nn.init as init
from metric import score

In [4]:
# Pick the compute device: the configured one when CUDA is available, CPU otherwise.
device_name = CONFIG['DEVICE'] if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Print tensors with 40 decimal places.
torch.set_printoptions(precision=40)

# Seed torch (CPU), torch (all CUDA devices) and numpy, in that order,
# with the same configured value.
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed):
    seed_fn(CONFIG['SEED'])

In [5]:
# Load the training table and replace every missing value with the string '-1'.
train = pd.read_csv(CONFIG['data_main'] + CONFIG['train_path']).fillna('-1')

# Cast both age columns to integers.
for col in ('donor_age', 'age_at_hct'):
    train[col] = train[col].astype(int)

# Row masks reused throughout the target construction below.
efs_is_one = train['efs'] == 1
efs_is_zero = train['efs'] == 0

# Build the regression target `y` from `efs_time`:
#   1. shift the efs == 0 rows by (mx - mn), so the smallest of them equals mx;
#   2. replace values by their rank and push efs == 0 rows a further 2 * len(train) up;
#   3. scale by the maximum, take the log, centre on the mean and flip the sign.
train['y'] = train['efs_time'].values
mx = train.loc[efs_is_one, 'efs_time'].max()
mn = train.loc[efs_is_zero, 'efs_time'].min()
train.loc[efs_is_zero, 'y'] = train.loc[efs_is_zero, 'y'] + mx - mn
train['y'] = train['y'].rank()
train.loc[efs_is_zero, 'y'] += 2 * len(train)
train['y'] = train['y'] / train['y'].max()
log_y = np.log(train['y'])
train['y'] = -(log_y - log_y.mean())

# Disabled alternative (min-max scaling of efs_time), kept for reference:
# train['efs_time'] = (train['efs_time'] - train['efs_time'].min()) / (train['efs_time'].max() - train['efs_time'].min())

# Every column except the id, the outcome columns and the target is treated
# as a categorical feature and stored as strings.
non_feature_columns = {'efs', 'efs_time', 'y', 'ID'}
cat_columns = [name for name in train.columns if name not in non_feature_columns]
train[cat_columns] = train[cat_columns].astype(str)

In [6]:
# Split the prepared table into training and validation rows using the
# precomputed fold assignment file.
VAL_FOLD = 4                # fold held out for validation
TRAIN_FOLDS = [0, 1, 2, 3]  # folds used for fitting

folds = pd.read_csv(f"{CONFIG['data_train_split']}{CONFIG['folds_path']}")

# The fold file is matched to `train` row by row, so both must have the same
# number of rows. This also catches re-running the cell after `train` has
# already been reduced to the training folds.
if len(folds) != len(train):
    raise ValueError(
        f"Fold file has {len(folds)} rows but train has {len(train)} rows; "
        "reload the training data before splitting."
    )

# Positional mask: row i of `folds` describes row i of `train`.
# assumes both files list the samples in the same order — TODO confirm
fold_ids = folds['fold'].to_numpy()

# `val` is taken first because `train` is overwritten on the next line.
val = train[fold_ids == VAL_FOLD].reset_index(drop=True)
train = train[np.isin(fold_ids, TRAIN_FOLDS)].reset_index(drop=True)

In [7]:
# One-hot encode the categorical features.
# The column layout, including which level of each feature `drop_first`
# removes, is decided on the training rows only.
train_one_hot = pd.get_dummies(train[cat_columns], drop_first=True)

# Validation rows are encoded with ALL of their levels and then aligned to the
# training layout. Applying drop_first to `val` on its own would drop the first
# level present in `val`; when that differs from the level dropped for `train`,
# the reindex below would re-create its column as all-False and those rows
# would become indistinguishable from the baseline level.
# The reindex discards levels unseen in training and fills training-only
# levels with False.
val_one_hot = pd.get_dummies(val[cat_columns]).reindex(
    columns=train_one_hot.columns, fill_value=False
)

In [8]:
# Number of one-hot columns, used as the width of the model's input layer.
input_dim = len(train_one_hot.columns)

In [9]:
# Model definition
class Model(nn.Module):
    """Fully connected network with two hidden layers.

    Layer widths are ``input_dim -> input_dim // 2 -> input_dim // 4 -> output_dim``.
    Each hidden layer is followed by ReLU and dropout (p=0.5); the output
    layer is linear.

    Args:
        input_dim: number of input features.
        output_dim: number of outputs; defaults to ``CONFIG['output_dim']``
            as read when the class is defined.
    """

    def __init__(self, input_dim, output_dim=CONFIG['output_dim']):
        super().__init__()

        half_width = input_dim // 2
        quarter_width = input_dim // 4

        self.fc1 = nn.Linear(input_dim, half_width)
        self.fc2 = nn.Linear(half_width, quarter_width)
        self.fc3 = nn.Linear(quarter_width, output_dim)

        self.relu = nn.ReLU()
        self.do = nn.Dropout(p=0.5)

        self.init_weights()

    def init_weights(self):
        """Re-initialise all three linear layers.

        Weights are drawn with Xavier-uniform and then clamped to [0, 1];
        biases, when present, are set to zero.

        NOTE(review): the clamp replaces every negative Xavier weight with 0,
        so all initial weights are non-negative — confirm this is intended.
        """
        for layer in (self.fc1, self.fc2, self.fc3):
            init.xavier_uniform_(layer.weight)
            layer.weight.data = torch.clamp(layer.weight.data, 0, 1)
            if layer.bias is not None:
                init.zeros_(layer.bias)

    def forward(self, x):
        """Run ``x`` through the two hidden layers and return the linear output."""
        hidden = self.do(self.relu(self.fc1(x)))
        hidden = self.do(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

In [10]:
# Instantiate the network on the selected device, with a mean-squared-error
# objective and an Adam optimizer using the configured learning rate.
model = Model(input_dim)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = Adam(params=model.parameters(), lr=CONFIG['LR'])

In [11]:
# Move features and targets to the device as float32 tensors.
tensor_kwargs = dict(device=device, dtype=torch.float32)

train_tensors = torch.tensor(train_one_hot.to_numpy(), **tensor_kwargs)
train_targets = torch.tensor(train['y'].to_numpy(), **tensor_kwargs)

val_tensors = torch.tensor(val_one_hot.to_numpy(), **tensor_kwargs)
val_targets = torch.tensor(val['y'].to_numpy(), **tensor_kwargs)

In [12]:
# Training
batch_size = CONFIG['BATCH_SIZE']
n_epochs = CONFIG['EPOCHS']

# Ceiling division, so a trailing partial batch is still processed.
train_num_samples = len(train)
train_num_batches = -(-train_num_samples // batch_size)

val_num_samples = len(val)
val_num_batches = -(-val_num_samples // batch_size)

# NOTE(review): batches are contiguous row slices taken in the same order in
# every epoch (no shuffling) — confirm this is intended.
for epoch in range(n_epochs):
    epoch_label = f"Epoch {epoch + 1}/{n_epochs}"

    # ------------------------------ TRAIN ------------------------------
    model.train()
    train_running_loss = 0.0

    with tqdm(range(train_num_batches), desc=epoch_label, unit="batch") as t:
        for batch_idx in t:
            # Extract the current mini-batch
            rows = slice(batch_idx * batch_size,
                         min((batch_idx + 1) * batch_size, train_num_samples))
            batch_inputs = train_tensors[rows]
            batch_targets = train_targets[rows]

            # Reset gradients, run the forward pass and compute the loss
            # (targets get a trailing axis to match the model output)
            optimizer.zero_grad()
            outputs = model(batch_inputs)
            batch_loss = criterion(outputs, batch_targets[:, None])

            # Backward pass and parameter update
            batch_loss.backward()
            optimizer.step()

            # Progress bar shows the mean of the per-batch losses so far
            train_running_loss += batch_loss.item()
            t.set_postfix(train_mean_loss=f"{train_running_loss / (batch_idx + 1):.6f}")

    # ------------------------------ EVAL -------------------------------
    model.eval()
    val_running_loss = 0.0

    # Validation predictions of this epoch, in row order
    outputs_list = []

    with torch.no_grad(), tqdm(range(val_num_batches), desc=epoch_label, unit="batch") as v:
        for batch_idx in v:
            rows = slice(batch_idx * batch_size,
                         min((batch_idx + 1) * batch_size, val_num_samples))
            batch_inputs = val_tensors[rows]
            batch_targets = val_targets[rows]

            # Forward pass
            outputs = model(batch_inputs)

            batch_loss = criterion(outputs, batch_targets[:, None])
            val_running_loss += batch_loss.item()
            v.set_postfix(val_mean_loss=f"{val_running_loss / (batch_idx + 1):.6f}")

            outputs_list.extend(outputs.cpu().numpy().ravel())

    # ------------------------------ SCORE ------------------------------
    # Assemble prediction / ground-truth frames keyed by ID and print the
    # value returned by `score` for this epoch.
    row_id_column_name = "ID"
    y_pred = val[[row_id_column_name]].copy(deep=True)
    y_pred["prediction"] = outputs_list

    truth_columns = [row_id_column_name, 'efs', 'efs_time', 'race_group']
    y_true = val[truth_columns].copy(deep=True)
    print(score(y_true.copy(), y_pred.copy(), row_id_column_name))

Epoch 1/7: 100%|██████████| 720/720 [00:01<00:00, 378.75batch/s, train_mean_loss=18.588293]
Epoch 1/7: 100%|██████████| 180/180 [00:00<00:00, 820.67batch/s, val_mean_loss=2.129881]


0.6329182499412365


Epoch 2/7: 100%|██████████| 720/720 [00:01<00:00, 386.70batch/s, train_mean_loss=2.074710]
Epoch 2/7: 100%|██████████| 180/180 [00:00<00:00, 877.81batch/s, val_mean_loss=1.971634]


0.6547586703231209


Epoch 3/7: 100%|██████████| 720/720 [00:01<00:00, 408.39batch/s, train_mean_loss=1.935668]
Epoch 3/7: 100%|██████████| 180/180 [00:00<00:00, 870.56batch/s, val_mean_loss=1.924696]


0.6616678654811642


Epoch 4/7: 100%|██████████| 720/720 [00:01<00:00, 389.99batch/s, train_mean_loss=1.884340]
Epoch 4/7: 100%|██████████| 180/180 [00:00<00:00, 810.48batch/s, val_mean_loss=1.913616]


0.6661412702470068


Epoch 5/7: 100%|██████████| 720/720 [00:01<00:00, 382.94batch/s, train_mean_loss=1.845700]
Epoch 5/7: 100%|██████████| 180/180 [00:00<00:00, 861.32batch/s, val_mean_loss=1.906154]


0.6674319116433822


Epoch 6/7: 100%|██████████| 720/720 [00:02<00:00, 341.80batch/s, train_mean_loss=1.830560]
Epoch 6/7: 100%|██████████| 180/180 [00:00<00:00, 803.55batch/s, val_mean_loss=1.902106]


0.6695741517000044


Epoch 7/7: 100%|██████████| 720/720 [00:02<00:00, 311.98batch/s, train_mean_loss=1.799513]
Epoch 7/7: 100%|██████████| 180/180 [00:00<00:00, 821.95batch/s, val_mean_loss=1.881882]


0.6720195432442079
