In [6]:
import os
import re
import math
import random
import unicodedata
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from copy import deepcopy

# Library imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Machine-learning libraries
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# [AMD GPU setup]
# torch-directml is optional: record whether it could be imported so that the
# device-selection code can decide whether DirectML is an option.
try:
    import torch_directml
except ImportError:
    has_directml = False
    print("‚ö†Ô∏è torch-directml ÎØ∏ÏÑ§Ïπò. CPUÎ°ú ÏßÑÌñâÎê©ÎãàÎã§.")
else:
    has_directml = True

# ---------------------------
# 0. Environment setup
# ---------------------------
SEED = 42


def set_seed(seed: int = 42):
    """Seed every random number generator this script relies on.

    Args:
        seed: Value applied to ``PYTHONHASHSEED`` and to the ``random``,
            NumPy and PyTorch generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # The three generators are independent, so the seeding order is irrelevant.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)


set_seed(SEED)

# [Key change] Device selection logic
def get_device():
    """Choose the compute device for the transformer stage.

    Preference order: DirectML (when ``torch_directml`` was importable),
    then CUDA, then Apple MPS, and finally the CPU.

    Returns:
        The device object to pass to ``.to(...)``.
    """
    if has_directml:
        # 1. DirectML (AMD GPU) takes priority.
        print("üöÄ AMD GPU Detected! Using DirectML (dml).")
        chosen = torch_directml.device()
    elif torch.cuda.is_available():
        # 2. CUDA, in case an NVIDIA card is present.
        print("üöÄ NVIDIA GPU Detected! Using CUDA.")
        chosen = torch.device("cuda")
    elif torch.backends.mps.is_available():
        # 3. Apple Silicon (M1/M2/M3).
        chosen = torch.device("mps")
    else:
        print("üê¢ No GPU detected. Using CPU.")
        chosen = torch.device("cpu")
    return chosen


DEVICE = get_device()
print(f"üöÄ Current Device: {DEVICE}")

# ---------------------------
# 1. Data loading
# ---------------------------
def locate_data_dir():
    """Return the directory that contains ``train.csv``.

    The current working directory is checked first, then each of its parents
    (nearest first), so the script also works when started from a sub-folder
    of the project.

    Returns:
        pathlib.Path: The first directory holding a ``train.csv`` file. If
        none is found, the resolved working directory is returned, so the
        subsequent read fails on the expected path.
    """
    cwd = Path(".").resolve()
    for candidate in (cwd, *cwd.parents):
        if (candidate / "train.csv").is_file():
            return candidate
    return cwd

DATA_DIR = locate_data_dir()

# Read the three competition files, in this order, from the data directory.
train, test, sub = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Column holding the row identifier and column holding the label to predict.
ID_COL = "ID"
TARGET = "completed"

# ---------------------------
# 2. ÌÖçÏä§Ìä∏ Ï†ÑÏ≤òÎ¶¨ (RoBERTaÏö©)
# ---------------------------
MISSING_MARKERS = {"", " ", "nan", "none", "null", ".", "-"}
NONE_EQUIV = {"ÏóÜÏùå", "ÏóÜÏäµÎãàÎã§", "Ìï¥ÎãπÏóÜÏùå", "Î¨¥ÏùëÎãµ", "ÎØ∏ÏùëÎãµ"}

def _nfkc(s): return unicodedata.normalize("NFKC", str(s))

def normalize_text(x):
    if pd.isna(x): return ""
    s = _nfkc(x).strip()
    if s.lower() in MISSING_MARKERS: return ""
    if s in NONE_EQUIV: return "ÏóÜÏùå"
    return s

# Every column except the identifier and the label feeds the text input.
feature_cols = [col for col in train.columns if col not in (ID_COL, TARGET)]


def make_sentence(row):
    """Serialise one row into a single ``"column:value"`` string.

    Args:
        row: A row indexable by every name in ``feature_cols``.

    Returns:
        str: Space-joined ``column:value`` segments, in ``feature_cols``
        order, skipping columns whose normalised value is empty.
    """
    normalised = ((col, normalize_text(row[col])) for col in feature_cols)
    return " ".join(f"{col}:{text}" for col, text in normalised if text)


train["full_text"] = train.apply(make_sentence, axis=1)
test["full_text"] = test.apply(make_sentence, axis=1)

# ---------------------------
# 3. [Stage 1] RoBERTa Feature Extraction
# ---------------------------
# A RoBERTa model is trained to produce out-of-fold predictions for the
# training rows and predictions for the test rows. Those values are then
# used as a feature by the next-stage models.

class TextDataset(Dataset):
    """Dataset that tokenises one text per item, on access.

    Args:
        texts: Indexable collection of texts (each item is passed to ``str``).
        labels: Optional indexable collection of integer labels; when given,
            each item also carries a ``'labels'`` tensor of dtype ``long``.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that
            returns a mapping of tensors.
        max_len: Length every text is padded or truncated to.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=256):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # Remove the leading dimension from each tensor the tokenizer returned.
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

def _predict_positive_proba(model, loader):
    """Return the class-1 probability for every row served by ``loader``.

    Any ``'labels'`` entry in a batch is dropped before the forward pass.
    The result follows the loader's iteration order.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for batch in loader:
            inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != 'labels'}
            logits = model(**inputs).logits
            chunks.append(torch.softmax(logits, dim=1)[:, 1].cpu().numpy())
    return np.concatenate(chunks) if chunks else np.zeros(0)


def extract_roberta_features(train_df, test_df, n_splits=5, model_name="klue/roberta-base",
                             batch_size=32, lr=1e-5, epochs=4, max_len=256):
    """Fine-tune a sequence classifier per fold and return its probabilities.

    For each stratified fold a fresh model is loaded, trained on the training
    part, and used to predict the held-out part (out-of-fold predictions) and
    the whole test set (averaged over folds).

    Args:
        train_df: Frame with a ``'full_text'`` column and the ``TARGET`` column.
        test_df: Frame with a ``'full_text'`` column.
        n_splits: Number of stratified folds.
        model_name: Checkpoint name passed to ``from_pretrained``.
        batch_size: Batch size for training and inference.
        lr: AdamW learning rate.
        epochs: Training epochs per fold (kept small to limit overfitting,
            since the output is only used as a feature).
        max_len: Token length every text is padded or truncated to.

    Returns:
        tuple: ``(oof_preds, test_preds)`` — class-1 probabilities, one per
        row of ``train_df`` and ``test_df`` respectively.
    """
    print("\n[Stage 1] Extracting RoBERTa Features...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    y = train_df[TARGET].values
    train_texts = train_df['full_text'].values

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    oof_preds = np.zeros(len(train_df))
    test_preds = np.zeros(len(test_df))

    # The test loader does not depend on the fold, so it is built once.
    te_ds = TextDataset(test_df['full_text'].values, None, tokenizer, max_len)
    te_dl = DataLoader(te_ds, batch_size=batch_size, shuffle=False)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(train_df, y)):
        print(f" - Fold {fold+1}/{n_splits} Processing...")

        # Data setup
        tr_ds = TextDataset(train_texts[tr_idx], y[tr_idx], tokenizer, max_len)
        va_ds = TextDataset(train_texts[va_idx], y[va_idx], tokenizer, max_len)

        tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True)
        va_dl = DataLoader(va_ds, batch_size=batch_size, shuffle=False)

        # Model setup: every fold starts from the pretrained checkpoint.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
        model.to(DEVICE)
        optim = torch.optim.AdamW(model.parameters(), lr=lr)

        # Training
        model.train()
        for ep in range(epochs):
            for batch in tqdm(tr_dl, desc=f"Ep {ep+1}", leave=False):
                batch = {k: v.to(DEVICE) for k, v in batch.items()}
                optim.zero_grad()
                out = model(**batch)
                loss = out.loss
                loss.backward()
                optim.step()

        # Out-of-fold predictions for the held-out rows.
        oof_preds[va_idx] = _predict_positive_proba(model, va_dl)

        # Test predictions, averaged over the folds.
        test_preds += _predict_positive_proba(model, te_dl) / n_splits

        # Free memory before the next fold loads a new model.
        del model, optim, tr_dl, va_dl
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return oof_preds, test_preds

# Run the RoBERTa feature extraction.
roberta_oof, roberta_test = extract_roberta_features(train, test, n_splits=5)

# Attach the generated probabilities to both frames as a new column.
train['roberta_prob'], test['roberta_prob'] = roberta_oof, roberta_test

print("‚úÖ RoBERTa features added to dataset.")

# ---------------------------
# 4. [Stage 2] Main Models (CatBoost + XGBoost)
# ---------------------------
print("\n[Stage 2] Training Main Models with RoBERTa Feature...")

# Preprocessing: split the modelling columns into categorical and numeric.
# roberta_prob is numeric, so it lands in the numeric group.
# The split asks "is this column numeric?" rather than "is its dtype
# 'object'?", so text columns stored with a dedicated string dtype are still
# treated as categorical instead of being zero-filled as numbers.
final_features = [c for c in train.columns if c not in [ID_COL, TARGET, 'full_text']]
cat_features = [c for c in final_features if not pd.api.types.is_numeric_dtype(train[c])]
num_features = [c for c in final_features if pd.api.types.is_numeric_dtype(train[c])]

# Missing-value handling (kept simple): a sentinel category for categorical
# columns, zero for numeric columns.
for c in cat_features:
    train[c] = train[c].fillna("MISSING").astype(str)
    test[c] = test[c].fillna("MISSING").astype(str)
for c in num_features:
    train[c] = train[c].fillna(0)
    test[c] = test[c].fillna(0)

# 4-1. CatBoost Training
# Produces out-of-fold probabilities for train (cb_oof) and fold-averaged
# probabilities for test (cb_test).
print(" - Training CatBoost (CPU)...")
cb_oof = np.zeros(len(train))
cb_test = np.zeros(len(test))

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
y = train[TARGET].values

# Derive the averaging divisor from the splitter so that it cannot drift
# from n_splits, and slice the test matrix once instead of once per fold.
cb_n_folds = skf.get_n_splits()
cb_X_test = test[final_features]

for fold, (tr_idx, va_idx) in enumerate(skf.split(train, y)):
    X_tr, X_val = train[final_features].iloc[tr_idx], train[final_features].iloc[va_idx]
    y_tr, y_val = y[tr_idx], y[va_idx]

    model = CatBoostClassifier(
        iterations=3000,
        learning_rate=0.01,
        depth=4,
        l2_leaf_reg=5,

        eval_metric='F1',
        random_seed=SEED,
        verbose=0,
        early_stopping_rounds=300,
        cat_features=cat_features,
        auto_class_weights='Balanced',
        task_type="CPU",  # stay on the CPU to be safe
        thread_count=-1
    )
    # The held-out fold doubles as the early-stopping evaluation set.
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True)

    cb_oof[va_idx] = model.predict_proba(X_val)[:, 1]
    cb_test += model.predict_proba(cb_X_test)[:, 1] / cb_n_folds

# 4-2. XGBoost Training
# Produces out-of-fold probabilities for train (xgb_oof) and fold-averaged
# probabilities for test (xgb_test), reusing the same splitter as CatBoost.
print(" - Training XGBoost (CPU)...")
# Unlike CatBoost, the categorical columns are encoded before being passed
# to XGBoost here. A simple ordinal encoding is applied; categories not seen
# in train are mapped to -1.
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train_xgb = train[final_features].copy()
test_xgb = test[final_features].copy()

train_xgb[cat_features] = oe.fit_transform(train_xgb[cat_features])
test_xgb[cat_features] = oe.transform(test_xgb[cat_features])

xgb_oof = np.zeros(len(train))
xgb_test = np.zeros(len(test))

# Derive the averaging divisor from the splitter instead of repeating "5".
xgb_n_folds = skf.get_n_splits()

for fold, (tr_idx, va_idx) in enumerate(skf.split(train, y)):
    X_tr, X_val = train_xgb.iloc[tr_idx], train_xgb.iloc[va_idx]
    y_tr, y_val = y[tr_idx], y[va_idx]

    # Class balance of this training fold, counted once with numpy.
    pos_count = y_tr.sum()
    neg_count = len(y_tr) - pos_count

    model = XGBClassifier(
        n_estimators=3000,
        learning_rate=0.015,
        max_depth=4,

        # [Added] options to limit overfitting and improve generalisation
        min_child_weight=2,       # minimum sum of instance weight per child (more robust to noise)
        colsample_bytree=0.8,     # each tree uses a random 80% of the features (diversity)
        subsample=0.8,            # each tree is trained on an 80% sample of the rows (bagging effect)

        eval_metric='logloss',
        random_state=SEED,
        n_jobs=-1,
        early_stopping_rounds=300,
        # scale_pos_weight helps with imbalanced data (count(neg) / count(pos))
        scale_pos_weight=neg_count / pos_count
    )

    # The held-out fold doubles as the early-stopping evaluation set.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    xgb_oof[va_idx] = model.predict_proba(X_val)[:, 1]
    xgb_test += model.predict_proba(test_xgb)[:, 1] / xgb_n_folds

# ---------------------------
# 5. Ensemble & Threshold Optimization
# ---------------------------
print("\n[Final Ensemble]")


def get_best_threshold(y_true, y_prob):
    """Scan decision thresholds and return the one with the highest F1.

    Thresholds ``np.arange(0.1, 0.9, 0.01)`` are tried in increasing order;
    ties keep the lowest threshold.

    Args:
        y_true: Ground-truth binary labels.
        y_prob: Predicted probabilities, compared with ``>=`` per threshold.

    Returns:
        tuple: ``(best_f1, best_thr)``; ``(0, 0.5)`` when no threshold
        scores above zero.
    """
    # Seed the candidates with the fallback so it wins when every score is 0.
    candidates = [(0, 0.5)]
    candidates += [
        (f1_score(y_true, (y_prob >= cut).astype(int)), cut)
        for cut in np.arange(0.1, 0.9, 0.01)
    ]
    # max() returns the first maximal entry, matching a strict ">" update rule.
    top_f1, top_cut = max(candidates, key=lambda pair: pair[0])
    return top_f1, top_cut

# Blend CatBoost and XGBoost: start from 50/50 and search for the best ratio.
# Each candidate weight w (0.00 to 1.00 in steps of 0.05) is scored by the
# best F1 that get_best_threshold finds on the blended out-of-fold
# probabilities.
best_score = 0
best_w = 0.5
best_thr = 0.5

for w in np.arange(0.0, 1.01, 0.05):
    blended_oof = (cb_oof * w) + (xgb_oof * (1 - w))
    f1, thr = get_best_threshold(y, blended_oof)
    # Strict ">" keeps the first (smallest) weight on ties.
    if f1 > best_score:
        best_score = f1
        best_w = w
        best_thr = thr

print(f"‚úÖ Best Weight -> CatBoost: {best_w:.2f}, XGBoost: {1-best_w:.2f}")
print(f"‚úÖ Best Threshold: {best_thr:.3f}")
print(f"‚úÖ Expected F1 Score (OOF): {best_score:.4f}")

# Final Prediction: apply the selected weight and threshold to the test
# probabilities.
final_test_prob = (cb_test * best_w) + (xgb_test * (1 - best_w))
final_pred = (final_test_prob >= best_thr).astype(int)

# Cap Logic (guards against predicting 1 too often): if more than POS_CAP of
# the rows are predicted positive, keep only the POS_CAP share of rows with
# the highest blended probability as positives.
POS_CAP = 0.70
if final_pred.mean() > POS_CAP:
    print(f"‚ö†Ô∏è Applying Positive Cap ({POS_CAP})...")
    n_pos = int(len(final_pred) * POS_CAP)
    # Negating the probabilities makes argsort rank them in descending order.
    top_indices = np.argsort(-final_test_prob)[:n_pos]
    final_pred[:] = 0
    final_pred[top_indices] = 1

# Save the submission file.
# The template is read from DATA_DIR, like every other input file, rather
# than from a path relative to the current working directory.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as test.csv — confirm.
submission = pd.read_csv(DATA_DIR / 'sample_submission.csv')
submission[TARGET] = final_pred
submission.to_csv("submission_stacking_roberta_feat.csv", index=False)
print("\nDone! Saved to 'submission_stacking_roberta_feat.csv'")

üöÄ AMD GPU Detected! Using DirectML (dml).
üöÄ Current Device: privateuseone:0

[Stage 1] Extracting RoBERTa Features...
 - Fold 1/5 Processing...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 197/197 [00:00<00:00, 1013.84it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: klue/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialize

 - Fold 2/5 Processing...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 197/197 [00:00<00:00, 919.26it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: klue/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized

 - Fold 3/5 Processing...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 197/197 [00:00<00:00, 921.50it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: klue/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized

 - Fold 4/5 Processing...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 197/197 [00:00<00:00, 966.71it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: klue/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized

 - Fold 5/5 Processing...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 197/197 [00:00<00:00, 1011.57it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]             
[1mRobertaForSequenceClassification LOAD REPORT[0m from: klue/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized

‚úÖ RoBERTa features added to dataset.

[Stage 2] Training Main Models with RoBERTa Feature...
 - Training CatBoost (CPU)...
 - Training XGBoost (CPU)...

[Final Ensemble]
‚úÖ Best Weight -> CatBoost: 0.90, XGBoost: 0.10
‚úÖ Best Threshold: 0.490
‚úÖ Expected F1 Score (OOF): 0.4694

Done! Saved to 'submission_stacking_roberta_feat.csv'
