# Split Data Training - Sequence LSTM + MLP 기반 CTR 예측

10개로 분할된 데이터를 순차적으로 학습하여 하나의 통합 모델을 생성하는 코드입니다.

## 주요 기능
- 10개 split 데이터를 순차적으로 처리
- 메모리 효율적 관리 (각 처리 후 메모리 해제)
- models 폴더에 모델 저장
- 증분 학습을 통한 통합 모델 생성


## Import


In [13]:
import pandas as pd
import numpy as np
import os
import random
import gc
from datetime import datetime
import glob

from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

import pyarrow as pa
import pyarrow.parquet as pq


## Setting


In [14]:
# Global training configuration read by the training and inference cells.
CFG = {
    'BATCH_SIZE': 4096,
    'EPOCHS_PER_SPLIT': 3,  # number of epochs run on each split file
    'LEARNING_RATE': 1e-3,
    'SEED': 42,
    'DOWNSAMPLE_RATIO': 2,  # how many clicked=0 rows to sample per clicked=1 row
    'SPLIT_DATA_PATH': '../data/processed/split_data/',
    'MODELS_PATH': '../models/',
    'MODEL_NAME': 'ctr_lstm_mlp_model'
}

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Create the models folder if it does not exist yet
os.makedirs(CFG['MODELS_PATH'], exist_ok=True)


🚀 Using device: cuda
⚙️  Enhanced Gradient Descent Configuration:
   Batch Size: 4,096
   Learning Rate: 1.0e-03
   Weight Decay: 1.0e-05
   Gradient Clipping: 1.0
   Scheduler Min LR: 1.0e-06
📁 Model directory: ../models/
🎮 GPU: NVIDIA GeForce RTX 2070
💾 GPU Memory: 8.6 GB


In [15]:
def seed_everything(seed):
    """Seed every random number generator used here and make cuDNN deterministic.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy, and PyTorch (CPU and CUDA). cuDNN autotuning is switched off so
    that repeated runs select the same kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

seed_everything(CFG['SEED'])  # fix the random seed for reproducibility


## Memory Management Utilities


In [16]:
def clear_memory():
    """Run Python garbage collection and, on CUDA machines, free cached GPU memory."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    # Release cached allocator blocks, then wait for pending GPU work to finish.
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

def get_memory_usage():
    """Return a one-line, human-readable summary of current GPU memory usage.

    Returns:
        A string with the allocated and reserved ("cached") CUDA memory in MB,
        or a fixed notice when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return "CPU mode - No GPU memory tracking"

    bytes_per_mb = 1024**2
    allocated = torch.cuda.memory_allocated() / bytes_per_mb
    cached = torch.cuda.memory_reserved() / bytes_per_mb
    return f"GPU Memory - Allocated: {allocated:.1f}MB, Cached: {cached:.1f}MB"


print(get_memory_usage())


GPU Memory - Allocated: 0.0MB, Cached: 0.0MB


## Data Processing Functions


In [17]:
def load_and_downsample_data(file_path, downsample_ratio=2, seed=42):
    """Load one split parquet file and downsample the negative class.

    Every row with ``clicked == 1`` is kept. Rows with ``clicked == 0`` are
    randomly sampled down to ``len(positives) * downsample_ratio`` rows; when
    fewer negatives than that exist, all of them are kept. The combined frame
    is shuffled and re-indexed from 0.

    Args:
        file_path: Path of the parquet file to read (pyarrow engine).
        downsample_ratio: Number of negative rows to keep per positive row.
            May be fractional; the resulting row count is truncated to an int.
        seed: Random state used for both the negative sampling and the final
            shuffle.

    Returns:
        The downsampled, shuffled DataFrame.

    Raises:
        ValueError: If ``downsample_ratio`` is negative.
    """
    if downsample_ratio < 0:
        raise ValueError(f"downsample_ratio must be >= 0, got {downsample_ratio}")

    print(f"Loading data from: {file_path}")

    df = pd.read_parquet(file_path, engine="pyarrow")
    print(f"Original shape: {df.shape}")

    clicked_1 = df[df['clicked'] == 1]
    clicked_0 = df[df['clicked'] == 0]

    # DataFrame.sample(n=...) only accepts integers, so a fractional ratio
    # must be truncated before it is used as a row count.
    clicked_0_count = int(len(clicked_1) * downsample_ratio)

    if len(clicked_0) < clicked_0_count:
        # Not enough negatives to reach the requested ratio: keep them all.
        print(f"Warning: Not enough clicked=0 data. Using all {len(clicked_0)} samples.")
        sampled_0 = clicked_0
    else:
        sampled_0 = clicked_0.sample(n=clicked_0_count, random_state=seed)

    # Concatenate, then shuffle so positives and negatives are interleaved.
    result = (
        pd.concat([clicked_1, sampled_0], axis=0)
        .sample(frac=1, random_state=seed)
        .reset_index(drop=True)
    )

    n_negative = int((result['clicked'] == 0).sum())
    n_positive = int((result['clicked'] == 1).sum())
    print(f"Downsampled shape: {result.shape}")
    print(f"Clicked=0: {n_negative}, Clicked=1: {n_positive}")

    return result

def get_feature_columns(df):
    """Return the names of the plain (non-sequence) feature columns, in order.

    Every column of ``df`` is kept except the target (``clicked``), the
    sequence column (``seq``) and the row identifier (``ID``).
    """
    excluded = ("clicked", "seq", "ID")
    return list(filter(lambda name: name not in excluded, df.columns))


## Dataset & DataLoader


In [18]:
class ClickDataset(Dataset):
    """Dataset yielding tabular features, a variable-length sequence and a label.

    Args:
        df: Source DataFrame; its index is reset so positional access works.
        feature_cols: Columns used as dense numeric features.
        seq_col: Column holding the sequence as a comma-separated string.
        target_col: Label column; only read when ``has_target`` is True.
        has_target: Whether items include the label (training) or not
            (inference).

    Each item is ``(x, seq, y)`` when ``has_target`` is True, else ``(x, seq)``:
    ``x`` is a float32 feature vector, ``seq`` a 1-D float32 tensor with at
    least one element, ``y`` a float32 scalar.
    """

    def __init__(self, df, feature_cols, seq_col, target_col=None, has_target=True):
        self.df = df.reset_index(drop=True)
        self.feature_cols = feature_cols
        self.seq_col = seq_col
        self.target_col = target_col
        self.has_target = has_target

        # Non-sequence features are all treated as continuous values. They are
        # stored as float32 because every item is converted to a float32 tensor
        # anyway; float64 storage would only double the memory footprint.
        self.X = self.df[self.feature_cols].astype(np.float32).fillna(0).values

        # Sequences stay as raw strings and are parsed lazily per item.
        self.seq_strings = self.df[self.seq_col].astype(str).values

        if self.has_target:
            self.y = self.df[self.target_col].astype(np.float32).values

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _parse_sequence(raw):
        """Parse a comma-separated sequence string into a non-empty float32 array."""
        arr = np.array([], dtype=np.float32)
        # Missing values show up as the string 'nan' or, depending on the
        # pandas string dtype, as a non-string missing marker.
        if isinstance(raw, str) and raw and raw != 'nan':
            try:
                arr = np.fromstring(raw, sep=",", dtype=np.float32)
            except (ValueError, DeprecationWarning):
                # NumPy reports malformed text with a DeprecationWarning today
                # (an exception only under strict warning filters) and documents
                # a ValueError for the future; both mean "unparseable".
                arr = np.array([], dtype=np.float32)

        if arr.size == 0:
            # Guard against empty sequences: the LSTM needs at least one step.
            arr = np.array([0.0], dtype=np.float32)
        return arr

    def __getitem__(self, idx):
        x = torch.tensor(self.X[idx], dtype=torch.float)
        seq = torch.from_numpy(self._parse_sequence(self.seq_strings[idx]))  # (seq_len,)

        if not self.has_target:
            return x, seq

        y = torch.tensor(self.y[idx], dtype=torch.float)
        return x, seq, y


In [19]:
def collate_fn_train(batch):
    """Collate ``(features, sequence, label)`` samples into batch tensors.

    Sequences are right-padded with 0.0 to the longest one in the batch.

    Returns:
        ``(xs, seqs_padded, seq_lengths, ys)`` where ``seq_lengths`` holds the
        unpadded length of every sequence, clamped to at least 1.
    """
    features, sequences, labels = zip(*batch)
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0.0)
    # Clamp so a zero-length sequence can never reach the packing step.
    lengths = torch.tensor(list(map(len, sequences)), dtype=torch.long).clamp(min=1)
    return torch.stack(features), padded, lengths, torch.stack(labels)

def collate_fn_infer(batch):
    """Collate label-free ``(features, sequence)`` samples into batch tensors.

    Sequences are right-padded with 0.0 to the longest one in the batch.

    Returns:
        ``(xs, seqs_padded, seq_lengths)`` where ``seq_lengths`` holds the
        unpadded length of every sequence, clamped to at least 1.
    """
    features, sequences = zip(*batch)
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0.0)
    lengths = torch.tensor(list(map(len, sequences)), dtype=torch.long).clamp(min=1)
    return torch.stack(features), padded, lengths


## Model Architecture


In [20]:
class TabularSeqModel(nn.Module):
    """CTR model combining batch-normalized tabular features with an LSTM summary.

    The numeric sequence is fed through a single-layer LSTM (one scalar per
    step); its last hidden state is concatenated with the normalized tabular
    features and passed to an MLP that outputs one logit per sample.

    Args:
        d_features: Number of non-sequence features.
        lstm_hidden: Hidden size of the LSTM.
        hidden_units: Widths of the MLP hidden layers, in order. Any iterable
            of ints (list or tuple) is accepted.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, d_features, lstm_hidden=32, hidden_units=(1024, 512, 256, 128), dropout=0.2):
        super().__init__()
        # Batch norm over all non-sequence features.
        self.bn_x = nn.BatchNorm1d(d_features)
        # seq: numeric sequence -> LSTM (one scalar input per time step).
        self.lstm = nn.LSTM(input_size=1, hidden_size=lstm_hidden, batch_first=True)

        # Final MLP: Linear -> ReLU -> Dropout per hidden layer, then 1 logit.
        input_dim = d_features + lstm_hidden
        layers = []
        for h in hidden_units:
            layers += [nn.Linear(input_dim, h), nn.ReLU(), nn.Dropout(dropout)]
            input_dim = h
        layers += [nn.Linear(input_dim, 1)]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x_feats, x_seq, seq_lengths):
        """Compute click logits.

        Args:
            x_feats: Tabular features, shape ``(B, d_features)``.
            x_seq: Zero-padded sequences, shape ``(B, L)``.
            seq_lengths: Unpadded length of every sequence, shape ``(B,)``;
                each entry must be at least 1.

        Returns:
            Logits of shape ``(B,)``; apply a sigmoid to get probabilities.
        """
        # Non-sequence features.
        x = self.bn_x(x_feats)

        # Sequence -> LSTM. Packing makes the LSTM stop at each sequence's
        # true length, so padded positions never influence the hidden state.
        x_seq = x_seq.unsqueeze(-1)  # (B, L, 1)
        packed = nn.utils.rnn.pack_padded_sequence(
            x_seq, seq_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        h = h_n[-1]                  # (B, lstm_hidden)

        z = torch.cat([x, h], dim=1)
        return self.mlp(z).squeeze(1)  # logits


## Model Save & Load Functions


In [None]:
def save_model(model, model_path, model_config=None):
    """Write a checkpoint with the model weights and its configuration.

    Args:
        model: Module whose ``state_dict()`` is stored.
        model_path: Destination file path.
        model_config: Optional dict of constructor hyper-parameters, stored
            as-is next to the weights.

    The checkpoint also records the save time as an ISO-8601 timestamp.
    """
    payload = dict(
        model_state_dict=model.state_dict(),
        model_config=model_config,
        timestamp=datetime.now().isoformat(),
    )
    torch.save(payload, model_path)
    print(f"Model saved to: {model_path}")

def load_model(model_path, d_features, device='cpu'):
    """Rebuild a ``TabularSeqModel`` from a checkpoint and load its weights.

    Args:
        model_path: Checkpoint file holding ``model_state_dict`` and,
            optionally, ``model_config`` and ``timestamp`` entries.
        d_features: Number of non-sequence features the model was trained with.
        device: Device the tensors are mapped to and the model is moved to.

    Returns:
        The restored model on ``device``.
    """
    checkpoint = torch.load(model_path, map_location=device)

    # A missing or empty 'model_config' falls back to the defaults below;
    # individual keys missing from a stored config fall back the same way.
    config = checkpoint.get('model_config') or {}
    model = TabularSeqModel(
        d_features=d_features,
        lstm_hidden=config.get('lstm_hidden', 64),
        hidden_units=config.get('hidden_units', [256, 128]),
        dropout=config.get('dropout', 0.2),
    )

    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    print(f"Model loaded from: {model_path}")

    if 'timestamp' in checkpoint:
        saved_at = checkpoint['timestamp']
        print(f"Model timestamp: {saved_at}")

    return model


## Split Training Function


In [None]:
def _run_epoch(model, loader, criterion, device, desc, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    Performs gradient updates when ``optimizer`` is given, otherwise only
    evaluates. The caller sets ``model.train()`` / ``model.eval()`` and the
    ``torch.no_grad()`` context. Returns NaN when no batch was processed.
    """
    training = optimizer is not None
    total_loss = 0.0
    n_batches = 0

    for xs, seqs, seq_lens, ys in tqdm(loader, desc=desc):
        if training and xs.size(0) < 2:
            # nn.BatchNorm1d raises in train mode when a batch holds a single
            # sample (no batch statistics), which happens for a trailing batch
            # when the row count is one more than a multiple of the batch
            # size. Skipping that one sample keeps the epoch alive.
            continue

        xs, seqs, seq_lens, ys = xs.to(device), seqs.to(device), seq_lens.to(device), ys.to(device)

        if training:
            optimizer.zero_grad()
        logits = model(xs, seqs, seq_lens)
        loss = criterion(logits, ys)
        if training:
            loss.backward()
            optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    # Avoid ZeroDivisionError on an empty loader.
    return total_loss / n_batches if n_batches else float("nan")


def train_on_split(model, train_df, feature_cols, seq_col, target_col,
                   batch_size=512, epochs=3, lr=1e-3, device="cuda"):
    """Train ``model`` on a single split and report the loss after every epoch.

    The split is divided 80/20 into train and validation parts (fixed random
    state 42). A fresh Adam optimizer is created on every call, so optimizer
    state is not carried over between splits.

    Args:
        model: Model to continue training; updated in place.
        train_df: DataFrame of one (already downsampled) split.
        feature_cols: Names of the non-sequence feature columns.
        seq_col: Name of the sequence column.
        target_col: Name of the label column.
        batch_size: Mini-batch size for both loaders.
        epochs: Number of passes over the training part.
        lr: Adam learning rate.
        device: Device the batches are moved to.

    Returns:
        The same ``model`` instance after training.
    """
    print(f"Training on split data with {len(train_df)} samples")

    # Train/validation split
    tr_df, va_df = train_test_split(train_df, test_size=0.2, random_state=42, shuffle=True)
    print(f"Train: {len(tr_df)}, Validation: {len(va_df)}")

    # Dataset & DataLoader
    train_dataset = ClickDataset(tr_df, feature_cols, seq_col, target_col, has_target=True)
    val_dataset = ClickDataset(va_df, feature_cols, seq_col, target_col, has_target=True)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_train)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn_train)

    # Loss & optimizer: the model outputs logits, so the sigmoid lives in the loss.
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(1, epochs + 1):
        model.train()
        avg_train_loss = _run_epoch(
            model, train_loader, criterion, device, f"Train Epoch {epoch}", optimizer
        )

        model.eval()
        with torch.no_grad():
            avg_val_loss = _run_epoch(
                model, val_loader, criterion, device, f"Val Epoch {epoch}"
            )

        print(f"[Epoch {epoch}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        # Memory monitoring
        print(f"Memory usage: {get_memory_usage()}")

    return model


## Main Training Loop


In [None]:
def train_all_splits():
    """Train one model incrementally over every split file, then save it.

    Split files matching ``part_*.parquet`` under ``CFG['SPLIT_DATA_PATH']``
    are processed in sorted order. The first file is loaded once up front only
    to derive the feature columns. A checkpoint is written after every third
    split and the final model is written at the end. A split that raises is
    logged and skipped so the remaining splits are still trained.

    Returns:
        ``(model, feature_cols)``. When no split file is found the result is
        ``(None, None)``, so callers can always unpack two values.
    """
    print("="*60)
    print("Starting Split Data Training")
    print("="*60)

    # Collect the split files
    split_files = sorted(glob.glob(os.path.join(CFG['SPLIT_DATA_PATH'], "part_*.parquet")))
    print(f"Found {len(split_files)} split files:")
    for f in split_files:
        print(f"  - {os.path.basename(f)}")

    if not split_files:
        print("No split files found!")
        # Keep the (model, feature_cols) shape: callers unpack the result.
        return None, None

    # Inspect the first file to learn the feature columns
    print("\n" + "="*40)
    print("Analyzing first split for feature info...")
    first_df = load_and_downsample_data(split_files[0], CFG['DOWNSAMPLE_RATIO'])
    feature_cols = get_feature_columns(first_df)
    seq_col = "seq"
    target_col = "clicked"

    print(f"Number of features: {len(feature_cols)}")
    print(f"Sequence column: {seq_col}")
    print(f"Target column: {target_col}")

    # Free the probe frame before training starts
    del first_df
    clear_memory()

    # Model initialisation
    print("\n" + "="*40)
    print("Initializing model...")
    model_config = {
        'lstm_hidden': 64,
        'hidden_units': [256, 128],
        'dropout': 0.2
    }

    model = TabularSeqModel(
        d_features=len(feature_cols),
        lstm_hidden=model_config['lstm_hidden'],
        hidden_units=model_config['hidden_units'],
        dropout=model_config['dropout']
    ).to(device)

    print(f"Model initialized with {sum(p.numel() for p in model.parameters())} parameters")
    print(f"Initial memory: {get_memory_usage()}")

    # Train on each split in turn
    failed_splits = []
    for i, split_file in enumerate(split_files, 1):
        print(f"\n{'='*60}")
        print(f"Processing Split {i}/{len(split_files)}: {os.path.basename(split_file)}")
        print(f"{'='*60}")

        # Bound before the try so the cleanup below never has to probe locals().
        split_df = None
        try:
            # Load and preprocess
            split_df = load_and_downsample_data(split_file, CFG['DOWNSAMPLE_RATIO'])

            # Train
            model = train_on_split(
                model=model,
                train_df=split_df,
                feature_cols=feature_cols,
                seq_col=seq_col,
                target_col=target_col,
                batch_size=CFG['BATCH_SIZE'],
                epochs=CFG['EPOCHS_PER_SPLIT'],
                lr=CFG['LEARNING_RATE'],
                device=device
            )

            # Intermediate checkpoint after every third split
            if i % 3 == 0:
                checkpoint_path = os.path.join(CFG['MODELS_PATH'], f"{CFG['MODEL_NAME']}_checkpoint_split_{i:02d}.pth")
                save_model(model, checkpoint_path, model_config)

            print(f"Completed split {i}/{len(split_files)}")

        except Exception as e:
            # Deliberately best-effort: one bad split must not abort the run,
            # but it is remembered so the failure is visible in the summary.
            print(f"Error processing split {i}: {str(e)}")
            failed_splits.append(os.path.basename(split_file))

        finally:
            # Release the split data before the next file is loaded
            del split_df
            clear_memory()
            print(f"Memory after cleanup: {get_memory_usage()}")

    if failed_splits:
        print(f"Warning: {len(failed_splits)}/{len(split_files)} splits failed: {failed_splits}")
        if len(failed_splits) == len(split_files):
            print("Warning: every split failed; the saved model is untrained.")

    # Save the final model
    print(f"\n{'='*60}")
    print("Saving Final Model")
    print(f"{'='*60}")

    final_model_path = os.path.join(CFG['MODELS_PATH'], f"{CFG['MODEL_NAME']}_final.pth")
    save_model(model, final_model_path, model_config)

    print("Training completed!")
    print(f"Final model saved to: {final_model_path}")
    print(f"Final memory usage: {get_memory_usage()}")

    return model, feature_cols


## Execute Training


In [24]:
# Run training over all split files and report the elapsed wall-clock time.
start_time = datetime.now()
print(f"Training started at: {start_time}")

try:
    trained_model, feature_columns = train_all_splits()
    
    end_time = datetime.now()
    training_time = end_time - start_time
    print(f"\nTraining completed!")
    print(f"Training time: {training_time}")
    print(f"Final memory usage: {get_memory_usage()}")
    
except Exception as e:
    # Log the failure, then re-raise so the notebook cell still shows the traceback.
    print(f"Training failed with error: {str(e)}")
    raise


Training started at: 2025-09-19 16:22:04.724861
🚀 Starting Enhanced Split Data Training with Advanced Gradient Descent
📁 Found 10 split files:
   - part_01.parquet
   - part_02.parquet
   - part_03.parquet
   - part_04.parquet
   - part_05.parquet
   - part_06.parquet
   - part_07.parquet
   - part_08.parquet
   - part_09.parquet
   - part_10.parquet

🔍 Analyzing first split for feature info...
Loading data from: ../data/processed/split_data\part_01.parquet
Original shape: (1070417, 119)
Downsampled shape: (61524, 119)
Clicked=0: 41016, Clicked=1: 20508
📊 Number of features: 117
📈 Sequence column: seq
🎯 Target column: clicked

🧠 Initializing model & optimization strategy...
✅ Model initialized with 97,003 parameters
⚙️  Optimizer: Adam with LR=1.0e-03, weight_decay=1e-5
📈 Scheduler: CosineAnnealingWarmRestarts
💾 Initial memory: GPU Memory - Allocated: 0.4MB, Cached: 2.0MB

🎯 Processing Split 1/10: part_01.parquet
Loading data from: ../data/processed/split_data\part_01.parquet
Original 

Split 1 Epoch 1: 100%|██████████| 13/13 [00:15<00:00,  1.20s/it]


   [Split 1 Epoch 1] Train: 0.6361 | Val: 0.6328
   LR: 0.001000 | Memory: GPU Memory - Allocated: 18.2MB, Cached: 5500.0MB


Split 1 Epoch 2: 100%|██████████| 13/13 [00:16<00:00,  1.29s/it]


   [Split 1 Epoch 2] Train: 0.6056 | Val: 0.6136
   LR: 0.001000 | Memory: GPU Memory - Allocated: 18.4MB, Cached: 8216.0MB


Split 1 Epoch 3: 100%|██████████| 13/13 [00:15<00:00,  1.21s/it]


   [Split 1 Epoch 3] Train: 0.5997 | Val: 0.6024
   LR: 0.001000 | Memory: GPU Memory - Allocated: 18.2MB, Cached: 8216.0MB
   📈 LR Scheduler: 0.001000 -> 0.000750
✅ Split 1 completed! Final validation loss: 0.6024
✅ Split 1/10 completed!
🧹 Memory after cleanup: GPU Memory - Allocated: 17.7MB, Cached: 58.0MB

🎯 Processing Split 2/10: part_02.parquet
Loading data from: ../data/processed/split_data\part_02.parquet
Original shape: (1070417, 119)
Downsampled shape: (63441, 119)
Clicked=0: 42294, Clicked=1: 21147
🔄 Training on Split 2 with 63441 samples
📊 Current learning rate: 0.000750
   Train: 50752, Validation: 12689
🔍 Initial validation loss: 0.6002


Split 2 Epoch 1: 100%|██████████| 13/13 [00:18<00:00,  1.42s/it]


   [Split 2 Epoch 1] Train: 0.5900 | Val: 0.5907
   LR: 0.000750 | Memory: GPU Memory - Allocated: 35.0MB, Cached: 10454.0MB


Split 2 Epoch 2: 100%|██████████| 13/13 [00:20<00:00,  1.60s/it]


   [Split 2 Epoch 2] Train: 0.5845 | Val: 0.5849
   LR: 0.000750 | Memory: GPU Memory - Allocated: 33.7MB, Cached: 15594.0MB


Split 2 Epoch 3: 100%|██████████| 13/13 [00:21<00:00,  1.62s/it]


   [Split 2 Epoch 3] Train: 0.5831 | Val: 0.5828
   LR: 0.000750 | Memory: GPU Memory - Allocated: 35.5MB, Cached: 15594.0MB
   📈 LR Scheduler: 0.000750 -> 0.000251
✅ Split 2 completed! Final validation loss: 0.5828
✅ Split 2/10 completed!
📈 Validation Loss Change: +3.25%
🧹 Memory after cleanup: GPU Memory - Allocated: 17.7MB, Cached: 60.0MB

🎯 Processing Split 3/10: part_03.parquet
Loading data from: ../data/processed/split_data\part_03.parquet
Original shape: (1070417, 119)
Downsampled shape: (63387, 119)
Clicked=0: 42258, Clicked=1: 21129
🔄 Training on Split 3 with 63387 samples
📊 Current learning rate: 0.000251
   Train: 50709, Validation: 12678
🔍 Initial validation loss: 0.5835


Split 3 Epoch 1: 100%|██████████| 13/13 [02:21<00:00, 10.90s/it]


   [Split 3 Epoch 1] Train: 0.5795 | Val: 0.5815
   LR: 0.000251 | Memory: GPU Memory - Allocated: 35.6MB, Cached: 10440.0MB


Split 3 Epoch 2: 100%|██████████| 13/13 [02:12<00:00, 10.21s/it]


   [Split 3 Epoch 2] Train: 0.5774 | Val: 0.5807
   LR: 0.000251 | Memory: GPU Memory - Allocated: 37.0MB, Cached: 10440.0MB


Split 3 Epoch 3: 100%|██████████| 13/13 [01:40<00:00,  7.77s/it]


   [Split 3 Epoch 3] Train: 0.5777 | Val: 0.5803
   LR: 0.000251 | Memory: GPU Memory - Allocated: 37.3MB, Cached: 10440.0MB
   📈 LR Scheduler: 0.000251 -> 0.001000
✅ Split 3 completed! Final validation loss: 0.5803
💾 Checkpoint saved: ../models/ctr_lstm_mlp_enhanced_checkpoint_split_03.pth
✅ Split 3/10 completed!
📈 Validation Loss Change: +0.42%
🧹 Memory after cleanup: GPU Memory - Allocated: 17.7MB, Cached: 62.0MB

🎯 Processing Split 4/10: part_04.parquet
Loading data from: ../data/processed/split_data\part_04.parquet
Original shape: (1070417, 119)
Downsampled shape: (57975, 119)
Clicked=0: 38650, Clicked=1: 19325
🔄 Training on Split 4 with 57975 samples
📊 Current learning rate: 0.001000
   Train: 46380, Validation: 11595
🔍 Initial validation loss: 0.5766


Split 4 Epoch 1: 100%|██████████| 12/12 [00:34<00:00,  2.87s/it]


   [Split 4 Epoch 1] Train: 0.5820 | Val: 0.5765
   LR: 0.001000 | Memory: GPU Memory - Allocated: 31.1MB, Cached: 10350.0MB


Split 4 Epoch 2: 100%|██████████| 12/12 [00:39<00:00,  3.30s/it]


   [Split 4 Epoch 2] Train: 0.5794 | Val: 0.5769
   LR: 0.001000 | Memory: GPU Memory - Allocated: 31.4MB, Cached: 10350.0MB


Split 4 Epoch 3: 100%|██████████| 12/12 [00:39<00:00,  3.31s/it]


   [Split 4 Epoch 3] Train: 0.5761 | Val: 0.5750
   LR: 0.001000 | Memory: GPU Memory - Allocated: 33.1MB, Cached: 10350.0MB
   📈 LR Scheduler: 0.001000 -> 0.000750
✅ Split 4 completed! Final validation loss: 0.5750
✅ Split 4/10 completed!
📈 Validation Loss Change: +0.92%
🧹 Memory after cleanup: GPU Memory - Allocated: 17.7MB, Cached: 58.0MB

🎯 Processing Split 5/10: part_05.parquet
Loading data from: ../data/processed/split_data\part_05.parquet
Original shape: (1070417, 119)
Downsampled shape: (57609, 119)
Clicked=0: 38406, Clicked=1: 19203
🔄 Training on Split 5 with 57609 samples
📊 Current learning rate: 0.000750
   Train: 46087, Validation: 11522
🔍 Initial validation loss: 0.5761


Split 5 Epoch 1: 100%|██████████| 12/12 [00:41<00:00,  3.47s/it]


   [Split 5 Epoch 1] Train: 0.5826 | Val: 0.5760
   LR: 0.000750 | Memory: GPU Memory - Allocated: 28.2MB, Cached: 13190.0MB


Split 5 Epoch 2: 100%|██████████| 12/12 [00:39<00:00,  3.27s/it]


   [Split 5 Epoch 2] Train: 0.5788 | Val: 0.5740
   LR: 0.000750 | Memory: GPU Memory - Allocated: 28.4MB, Cached: 15840.0MB


Split 5 Epoch 3: 100%|██████████| 12/12 [00:37<00:00,  3.10s/it]


   [Split 5 Epoch 3] Train: 0.5777 | Val: 0.5738
   LR: 0.000750 | Memory: GPU Memory - Allocated: 29.2MB, Cached: 15840.0MB
   📈 LR Scheduler: 0.000750 -> 0.000251
✅ Split 5 completed! Final validation loss: 0.5738
✅ Split 5/10 completed!
📈 Validation Loss Change: +0.21%
🧹 Memory after cleanup: GPU Memory - Allocated: 17.7MB, Cached: 60.0MB

🎯 Processing Split 6/10: part_06.parquet
Loading data from: ../data/processed/split_data\part_06.parquet
Original shape: (1070417, 119)
Downsampled shape: (61506, 119)
Clicked=0: 41004, Clicked=1: 20502
🔄 Training on Split 6 with 61506 samples
📊 Current learning rate: 0.000251
   Train: 49204, Validation: 12302
🔍 Initial validation loss: 0.5743


Split 6 Epoch 1: 100%|██████████| 13/13 [00:35<00:00,  2.70s/it]


   [Split 6 Epoch 1] Train: 0.5710 | Val: 0.5757
   LR: 0.000251 | Memory: GPU Memory - Allocated: 18.1MB, Cached: 9406.0MB


Split 6 Epoch 2: 100%|██████████| 13/13 [00:26<00:00,  2.00s/it]


   [Split 6 Epoch 2] Train: 0.5754 | Val: 0.5746
   LR: 0.000251 | Memory: GPU Memory - Allocated: 18.1MB, Cached: 9406.0MB


Split 6 Epoch 3: 100%|██████████| 13/13 [00:25<00:00,  1.96s/it]


   [Split 6 Epoch 3] Train: 0.5754 | Val: 0.5745
   LR: 0.000251 | Memory: GPU Memory - Allocated: 18.1MB, Cached: 12058.0MB
   📈 LR Scheduler: 0.000251 -> 0.001000
✅ Split 6 completed! Final validation loss: 0.5745
💾 Checkpoint saved: ../models/ctr_lstm_mlp_enhanced_checkpoint_split_06.pth
✅ Split 6/10 completed!
📈 Validation Loss Change: -0.12%
🧹 Memory after cleanup: GPU Memory - Allocated: 17.7MB, Cached: 60.0MB

🎯 Processing Split 7/10: part_07.parquet
Loading data from: ../data/processed/split_data\part_07.parquet
Original shape: (1070417, 119)
Downsampled shape: (62148, 119)
Clicked=0: 41432, Clicked=1: 20716
🔄 Training on Split 7 with 62148 samples
📊 Current learning rate: 0.001000
   Train: 49718, Validation: 12430
🔍 Initial validation loss: 0.5585


Split 7 Epoch 1: 100%|██████████| 13/13 [00:18<00:00,  1.39s/it]


   [Split 7 Epoch 1] Train: 0.5773 | Val: 0.5590
   LR: 0.001000 | Memory: GPU Memory - Allocated: 23.9MB, Cached: 7876.0MB


Split 7 Epoch 2: 100%|██████████| 13/13 [01:46<00:00,  8.20s/it]


   [Split 7 Epoch 2] Train: 0.5777 | Val: 0.5612
   LR: 0.001000 | Memory: GPU Memory - Allocated: 24.2MB, Cached: 10460.0MB


Split 7 Epoch 3: 100%|██████████| 13/13 [02:56<00:00, 13.60s/it]


   [Split 7 Epoch 3] Train: 0.5731 | Val: 0.5584
   LR: 0.001000 | Memory: GPU Memory - Allocated: 23.1MB, Cached: 10460.0MB
   📈 LR Scheduler: 0.001000 -> 0.000750
✅ Split 7 completed! Final validation loss: 0.5584
✅ Split 7/10 completed!
📈 Validation Loss Change: +2.80%
🧹 Memory after cleanup: GPU Memory - Allocated: 17.7MB, Cached: 60.0MB

🎯 Processing Split 8/10: part_08.parquet
Loading data from: ../data/processed/split_data\part_08.parquet
Original shape: (1070417, 119)
Downsampled shape: (62640, 119)
Clicked=0: 41760, Clicked=1: 20880
🔄 Training on Split 8 with 62640 samples
📊 Current learning rate: 0.000750
   Train: 50112, Validation: 12528
🔍 Initial validation loss: 0.5577


Split 8 Epoch 1: 100%|██████████| 13/13 [00:32<00:00,  2.49s/it]


   [Split 8 Epoch 1] Train: 0.5766 | Val: 0.5554
   LR: 0.000750 | Memory: GPU Memory - Allocated: 28.8MB, Cached: 9914.0MB


Split 8 Epoch 2: 100%|██████████| 13/13 [00:32<00:00,  2.47s/it]


   [Split 8 Epoch 2] Train: 0.5734 | Val: 0.5554
   LR: 0.000750 | Memory: GPU Memory - Allocated: 28.7MB, Cached: 14798.0MB


Split 8 Epoch 3: 100%|██████████| 13/13 [00:24<00:00,  1.87s/it]


   [Split 8 Epoch 3] Train: 0.5723 | Val: 0.5537
   LR: 0.000750 | Memory: GPU Memory - Allocated: 28.5MB, Cached: 4654.0MB
   📈 LR Scheduler: 0.000750 -> 0.000251
✅ Split 8 completed! Final validation loss: 0.5537
✅ Split 8/10 completed!
📈 Validation Loss Change: +0.84%
🧹 Memory after cleanup: GPU Memory - Allocated: 17.7MB, Cached: 60.0MB

🎯 Processing Split 9/10: part_09.parquet
Loading data from: ../data/processed/split_data\part_09.parquet
Original shape: (1070417, 119)
Downsampled shape: (60402, 119)
Clicked=0: 40268, Clicked=1: 20134
🔄 Training on Split 9 with 60402 samples
📊 Current learning rate: 0.000251
   Train: 48321, Validation: 12081
🔍 Initial validation loss: 0.5718


Split 9 Epoch 1: 100%|██████████| 12/12 [01:51<00:00,  9.33s/it]


   [Split 9 Epoch 1] Train: 0.5751 | Val: 0.5726
   LR: 0.000251 | Memory: GPU Memory - Allocated: 62.6MB, Cached: 15714.0MB


Split 9 Epoch 2: 100%|██████████| 12/12 [04:14<00:00, 21.19s/it]


   [Split 9 Epoch 2] Train: 0.5732 | Val: 0.5715
   LR: 0.000251 | Memory: GPU Memory - Allocated: 55.9MB, Cached: 15714.0MB


Split 9 Epoch 3: 100%|██████████| 12/12 [04:28<00:00, 22.36s/it]


   [Split 9 Epoch 3] Train: 0.5735 | Val: 0.5713
   LR: 0.000251 | Memory: GPU Memory - Allocated: 62.6MB, Cached: 18366.0MB
   📈 LR Scheduler: 0.000251 -> 0.001000
✅ Split 9 completed! Final validation loss: 0.5713
💾 Checkpoint saved: ../models/ctr_lstm_mlp_enhanced_checkpoint_split_09.pth
✅ Split 9/10 completed!
📈 Validation Loss Change: -3.18%
🧹 Memory after cleanup: GPU Memory - Allocated: 17.7MB, Cached: 60.0MB

🎯 Processing Split 10/10: part_10.parquet
Loading data from: ../data/processed/split_data\part_10.parquet
Original shape: (1064179, 119)
Downsampled shape: (61575, 119)
Clicked=0: 41050, Clicked=1: 20525
🔄 Training on Split 10 with 61575 samples
📊 Current learning rate: 0.001000
   Train: 49260, Validation: 12315
🔍 Initial validation loss: 0.5921


Split 10 Epoch 1: 100%|██████████| 13/13 [00:24<00:00,  1.91s/it]


   [Split 10 Epoch 1] Train: 0.5700 | Val: 0.5916
   LR: 0.001000 | Memory: GPU Memory - Allocated: 18.6MB, Cached: 11880.0MB


Split 10 Epoch 2: 100%|██████████| 13/13 [00:30<00:00,  2.34s/it]


   [Split 10 Epoch 2] Train: 0.5695 | Val: 0.5884
   LR: 0.001000 | Memory: GPU Memory - Allocated: 18.7MB, Cached: 11880.0MB


Split 10 Epoch 3: 100%|██████████| 13/13 [00:35<00:00,  2.69s/it]


   [Split 10 Epoch 3] Train: 0.5718 | Val: 0.5949
   LR: 0.001000 | Memory: GPU Memory - Allocated: 18.7MB, Cached: 11880.0MB
   📈 LR Scheduler: 0.001000 -> 0.000750
✅ Split 10 completed! Final validation loss: 0.5949
✅ Split 10/10 completed!
📈 Validation Loss Change: -4.13%
🧹 Memory after cleanup: GPU Memory - Allocated: 17.7MB, Cached: 60.0MB

📊 Training Summary
Split-wise Performance:
   Split  1: Val Loss = 0.6024, LR = 0.000750
   Split  2: Val Loss = 0.5828, LR = 0.000251
   Split  3: Val Loss = 0.5803, LR = 0.001000
   Split  4: Val Loss = 0.5750, LR = 0.000750
   Split  5: Val Loss = 0.5738, LR = 0.000251
   Split  6: Val Loss = 0.5745, LR = 0.001000
   Split  7: Val Loss = 0.5584, LR = 0.000750
   Split  8: Val Loss = 0.5537, LR = 0.000251
   Split  9: Val Loss = 0.5713, LR = 0.001000
   Split 10: Val Loss = 0.5949, LR = 0.000750

🎯 Overall Improvement: +1.24%
📈 Initial Loss: 0.6024 → Final Loss: 0.5949

💾 Saving Final Model with Full State
✅ Training completed successfully!
💾

## Load Test Data


In [25]:
# Load the test data
print("Loading test data...")
test_df = pd.read_parquet("../data/raw/test.parquet", engine="pyarrow")

print(f"Test data shape: {test_df.shape}")
print("Test data columns:", test_df.columns.tolist())

# Keep the ID column separately (needed for the submission)
test_ids = test_df['ID'].copy()

# Drop the ID column so it is not used as a feature
test_df = test_df.drop(columns=['ID'])

print(f"Test data shape after removing ID: {test_df.shape}")
print(f"Memory usage: {get_memory_usage()}")


Loading test data...
Test data shape: (1527298, 119)
Test data columns: ['ID', 'gender', 'age_group', 'inventory_id', 'day_of_week', 'hour', 'seq', 'l_feat_1', 'l_feat_2', 'l_feat_3', 'l_feat_4', 'l_feat_5', 'l_feat_6', 'l_feat_7', 'l_feat_8', 'l_feat_9', 'l_feat_10', 'l_feat_11', 'l_feat_12', 'l_feat_13', 'l_feat_14', 'l_feat_15', 'l_feat_16', 'l_feat_17', 'l_feat_18', 'l_feat_19', 'l_feat_20', 'l_feat_21', 'l_feat_22', 'l_feat_23', 'l_feat_24', 'l_feat_25', 'l_feat_26', 'l_feat_27', 'feat_e_1', 'feat_e_2', 'feat_e_3', 'feat_e_4', 'feat_e_5', 'feat_e_6', 'feat_e_7', 'feat_e_8', 'feat_e_9', 'feat_e_10', 'feat_d_1', 'feat_d_2', 'feat_d_3', 'feat_d_4', 'feat_d_5', 'feat_d_6', 'feat_c_1', 'feat_c_2', 'feat_c_3', 'feat_c_4', 'feat_c_5', 'feat_c_6', 'feat_c_7', 'feat_c_8', 'feat_b_1', 'feat_b_2', 'feat_b_3', 'feat_b_4', 'feat_b_5', 'feat_b_6', 'feat_a_1', 'feat_a_2', 'feat_a_3', 'feat_a_4', 'feat_a_5', 'feat_a_6', 'feat_a_7', 'feat_a_8', 'feat_a_9', 'feat_a_10', 'feat_a_11', 'feat_a_12', 'f

## Inference with Saved Model


In [None]:
def perform_inference(model_path, test_df, feature_cols, batch_size=4096):
    """
    Load a saved model and run inference on the test data.

    Args:
        model_path: Path to the saved model file.
        test_df: Test data handed to ``ClickDataset`` (built with
            ``has_target=False``).
        feature_cols: Feature column names; ``len(feature_cols)`` is passed
            to ``load_model``.
        batch_size: Batch size for the test ``DataLoader``.

    Returns:
        NumPy array holding the sigmoid of the model outputs, concatenated in
        loader order (the loader does not shuffle); an empty float32 array
        when the loader yields no batches; ``None`` when ``model_path`` does
        not exist.
    """
    print(f"Loading model from: {model_path}")

    # Missing file: report it and signal the caller with None.
    if not os.path.exists(model_path):
        print(f"Model file not found: {model_path}")
        return None

    model = load_model(model_path, len(feature_cols), device)
    try:
        model.eval()

        print("Model loaded successfully!")
        print(f"Memory after model loading: {get_memory_usage()}")

        # Build the test dataset and loader.
        seq_col = "seq"
        test_dataset = ClickDataset(test_df, feature_cols, seq_col, has_target=False)
        test_loader = DataLoader(
            test_dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=collate_fn_infer,
        )

        print(f"Test dataset size: {len(test_dataset)}")
        print(f"Number of batches: {len(test_loader)}")

        # Run inference batch by batch, collecting probabilities on the CPU.
        predictions = []

        with torch.no_grad():
            for batch_idx, (xs, seqs, lens) in enumerate(tqdm(test_loader, desc="Inference")):
                xs, seqs, lens = xs.to(device), seqs.to(device), lens.to(device)

                logits = model(xs, seqs, lens)
                probs = torch.sigmoid(logits)

                predictions.append(probs.cpu())

                # Periodic memory cleanup (every 100 batches, skipping batch 0).
                if batch_idx % 100 == 0 and batch_idx > 0:
                    clear_memory()

        # torch.cat rejects an empty list, so handle "no batches" explicitly.
        if predictions:
            final_predictions = torch.cat(predictions).numpy()
        else:
            final_predictions = np.empty(0, dtype=np.float32)

        print("Inference completed!")
        print(f"Predictions shape: {final_predictions.shape}")
        if final_predictions.size:
            # min()/max() are undefined on an empty array.
            print(f"Prediction range: [{final_predictions.min():.4f}, {final_predictions.max():.4f}]")
        print(f"Final memory usage: {get_memory_usage()}")

        return final_predictions
    finally:
        # Release the model and cached memory even if inference failed.
        del model
        clear_memory()


In [27]:
# Run inference with the saved final model.
final_model_path = os.path.join(CFG['MODELS_PATH'], f"{CFG['MODEL_NAME']}_final.pth")

print("="*60)
print("Starting Inference")
print("="*60)

# Reuse feature_columns when it is already defined in this session;
# otherwise rebuild the feature list from the first split file.
if 'feature_columns' in locals() and feature_columns is not None:
    inference_feature_cols = feature_columns
else:
    print("Extracting feature information from split data...")
    split_files = sorted(glob.glob(os.path.join(CFG['SPLIT_DATA_PATH'], "part_*.parquet")))
    if not split_files:
        print("Error: No split files found for feature extraction!")
        raise FileNotFoundError("Split files not found")

    # pd.read_parquet has no ``nrows`` option (the extra keyword is forwarded
    # to the pyarrow reader, which rejects it), so read only the first record
    # batch of up to 1000 rows through pyarrow instead.
    parquet_file = pq.ParquetFile(split_files[0])
    first_batch = next(parquet_file.iter_batches(batch_size=1000), None)
    if first_batch is None:
        # Zero-row file: an empty frame still carries the column names.
        temp_df = parquet_file.schema_arrow.empty_table().to_pandas()
    else:
        temp_df = first_batch.to_pandas()
    inference_feature_cols = get_feature_columns(temp_df)
    del temp_df, first_batch, parquet_file
    clear_memory()

print(f"Using {len(inference_feature_cols)} features for inference")

# Run inference; the result is None when the model file is missing.
test_predictions = perform_inference(
    model_path=final_model_path,
    test_df=test_df,
    feature_cols=inference_feature_cols,
    batch_size=CFG['BATCH_SIZE']
)



🔮 STARTING ENHANCED INFERENCE WITH TRAINED MODEL
✅ Using feature columns from training session: 117 features
🎯 Final feature count: 117
🔮 Starting Model Inference
🔄 Loading model from: ../models/ctr_lstm_mlp_enhanced_final.pth
📊 Features from checkpoint: 117
🏗️  Model config: LSTM=64, Hidden=[256, 128]
⏰ Model timestamp: 2025-09-19T17:00:18.930169
📚 Trained on 10 splits
🎯 Final validation loss: 0.5949
✅ Model loaded successfully!
✅ Using feature columns from checkpoint: 117 features
💾 Memory after model loading: GPU Memory - Allocated: 17.4MB, Cached: 60.0MB
📊 Test dataset size: 1,527,298
📦 Number of batches: 373
🔧 Batch size: 4,096

🚀 Starting inference...


🔮 Inference Progress:  27%|██▋       | 101/373 [01:36<04:17,  1.06it/s]

   📊 Processed 100 batches | Avg batch time: 0.143s


🔮 Inference Progress:  54%|█████▍    | 201/373 [03:10<02:53,  1.01s/it]

   📊 Processed 200 batches | Avg batch time: 0.143s


🔮 Inference Progress:  81%|████████  | 301/373 [04:42<01:08,  1.06it/s]

   📊 Processed 300 batches | Avg batch time: 0.137s


🔮 Inference Progress: 100%|██████████| 373/373 [05:46<00:00,  1.08it/s]



✅ Inference completed successfully!
⏱️  Total inference time: 346.79 seconds
📊 Average batch time: 0.141 seconds
🔢 Predictions shape: (1527298,)
📈 Prediction statistics:
   Min: 0.004356
   Max: 1.000000
   Mean: 0.304236
   Std: 0.131825
   Median: 0.291847
📊 Prediction distribution:
   0.0-0.1: 27,535 (1.8%)
   0.1-0.2: 363,634 (23.8%)
   0.2-0.3: 407,705 (26.7%)
   0.3-0.4: 377,544 (24.7%)
   0.4-0.5: 216,029 (14.1%)
   0.5-0.6: 102,595 (6.7%)
   0.6-0.7: 27,081 (1.8%)
   0.7-0.8: 4,295 (0.3%)
   0.8-0.9: 698 (0.0%)
   0.9-1.0: 182 (0.0%)
💾 Final memory usage: GPU Memory - Allocated: 71.2MB, Cached: 3230.0MB


## Create Submission


In [31]:
if test_predictions is not None:
    print("="*60)
    print("📋 Creating Submission File (베이스라인 호환 양식)")
    print("="*60)

    # Use the provided sample_submission.csv as the template.
    sample_submission = pd.read_csv('../data/raw/sample_submission.csv')
    print(f"Sample submission shape: {sample_submission.shape}")

    # Overwrite the clicked column with the predictions.
    # NOTE(review): values are assigned positionally; this assumes the rows of
    # sample_submission are in the same order as the test set - confirm.
    submission_df = sample_submission.copy()
    submission_df['clicked'] = test_predictions

    # Summary statistics of what is about to be written.
    print("Submission predictions stats:")
    print(f"  Min: {test_predictions.min():.6f}")
    print(f"  Max: {test_predictions.max():.6f}")
    print(f"  Mean: {test_predictions.mean():.6f}")
    print(f"  Std: {test_predictions.std():.6f}")

    # Make sure the outputs folder exists.
    output_dir = '../outputs'
    os.makedirs(output_dir, exist_ok=True)

    # Pick the next submission number from the files already present.
    existing_files = [f for f in os.listdir(output_dir) if f.startswith('submission_') and f.endswith('.csv')]

    # Only names whose token after "submission_" is numeric take part in the
    # numbering; other matches (e.g. "submission_final.csv") are skipped
    # instead of crashing int().
    number_tokens = (f.split('_')[1].split('.')[0] for f in existing_files)
    nums = [int(token) for token in number_tokens if token.isdecimal()]

    if nums:
        next_num = max(nums) + 1
        print(f"📁 Found {len(existing_files)} existing submission files. Next: submission_{next_num}.csv")
    else:
        next_num = 1
        if existing_files:
            print(f"📁 No numbered submission files among {len(existing_files)} existing file(s). Starting with submission_1.csv")
        else:
            print("📁 No existing submission files found. Starting with submission_1.csv")

    # Save under the new numbered file name.
    output_path = os.path.join(output_dir, f'submission_{next_num}.csv')

    submission_df.to_csv(output_path, index=False)

    print(f"💾 Submission file saved: {output_path}")
    print(f"📊 Submission shape: {submission_df.shape}")
    print(f"📁 File: submission_{next_num}.csv")

    # Final memory cleanup.
    clear_memory()
    print(f"🧹 Final memory usage: {get_memory_usage()}")

else:
    print("Error: No predictions available for submission!")


📋 Creating Submission File (베이스라인 호환 양식)
Sample submission shape: (1527298, 2)
Submission predictions stats:
  Min: 0.004356
  Max: 1.000000
  Mean: 0.304236
  Std: 0.131825
📁 Found 2 existing submission files. Next: submission_3.csv
💾 Submission file saved: ../outputs\submission_3.csv
📊 Submission shape: (1527298, 2)
📁 File: submission_3.csv
🧹 Final memory usage: GPU Memory - Allocated: 17.0MB, Cached: 60.0MB


## Summary

이 노트북은 다음과 같은 작업을 수행합니다:

### 📊 **주요 기능**
1. **Split 데이터 순차 처리**: 10개로 분할된 데이터를 메모리 효율적으로 순차 학습
2. **메모리 관리**: 각 split 처리 후 자동 메모리 해제 및 모니터링
3. **증분 학습**: 이전 학습 결과를 바탕으로 다음 split 데이터 학습
4. **모델 저장**: models 폴더에 체크포인트 및 최종 모델 저장
5. **자동화된 추론**: 저장된 모델로 테스트 데이터 예측

### 🔧 **기술적 특징**
- **LSTM + MLP 하이브리드 모델**: 시퀀스와 테이블 피처를 동시 처리
- **다운샘플링**: 불균형 데이터 해결 (clicked=0 : clicked=1 = 2:1 비율)
- **배치 처리**: 대용량 데이터를 위한 효율적 배치 처리
- **GPU 메모리 최적화**: CUDA 메모리 캐시 정리 및 모니터링

### 📁 **출력 파일**
- **모델**: `../models/ctr_lstm_mlp_model_final.pth`
- **체크포인트**: `../models/ctr_lstm_mlp_model_checkpoint_split_XX.pth` (3번째마다)
- **제출 파일**: `../outputs/submission_N.csv` (N = 기존 제출 파일의 최대 번호 + 1)

### ⚡ **성능 최적화**
- 메모리 사용량 실시간 모니터링
- 각 split 처리 후 자동 가비지 컬렉션
- 배치별 메모리 정리 (100배치마다)
- PyTorch CUDA 캐시 정리

이 방식으로 한 번에 메모리에 올리기 어려운 대용량 데이터도 split 단위로 나누어 안정적으로 학습할 수 있습니다! 🚀
