In [87]:
import pandas as pd
from sklearn.metrics import roc_auc_score
import numpy as np


def recall_at_k(y_true, y_prob, k=0.1):
    """
    Tahmin edilen olasılıkların en üst k%'sını pozitif etiketleyerek recall değerini hesaplar.

    Parametreler:
        y_true (list): Gerçek ikili etiketler.
        y_prob (list): Tahmin edilen olasılıklar.
        k (float): Pozitif etiketlenecek olasılıkların yüzdelik dilimi (varsayılan 0.1).

    Döndürür:
        float: En iyi k% tahminlerindeki recall oranı.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    n = len(y_true)
    m = max(1, int(np.round(k * n)))
    order = np.argsort(-y_prob, kind="mergesort")
    top = order[:m]

    tp_at_k = y_true[top].sum()
    P = y_true.sum()

    return float(tp_at_k / P) if P > 0 else 0.0


def lift_at_k(y_true, y_prob, k=0.1):
    """
    Tahmin edilen olasılıkların en üst k%'sını pozitif etiketleyerek lift (precision/prevalence) değerini hesaplar.

    Parametreler:
        y_true (list): Gerçek ikili etiketler.
        y_prob (list): Tahmin edilen olasılıklar.
        k (float): Pozitif etiketlenecek olasılıkların yüzdelik dilimi (varsayılan 0.1).

    Döndürür:
        float: En iyi k% tahminlerindeki lift değeri.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    n = len(y_true)
    m = max(1, int(np.round(k * n)))
    order = np.argsort(-y_prob, kind="mergesort")
    top = order[:m]

    tp_at_k = y_true[top].sum()
    precision_at_k = tp_at_k / m
    prevalence = y_true.mean()

    return float(precision_at_k / prevalence) if prevalence > 0 else 0.0


def convert_auc_to_gini(auc):
    """
    ROC AUC skorunu Gini katsayısına dönüştürür.

    Gini katsayısı, ROC AUC skorunun doğrusal bir dönüşümüdür.

    Parametreler:
        auc (float): ROC AUC skoru (0 ile 1 arasında).

    Döndürür:
        float: Gini katsayısı (-1 ile 1 arasında).
    """
    return 2 * auc - 1


def ing_hubs_datathon_metric(y_true, y_prob):
    """
    Gini, recall@10% ve lift@10% metriklerini birleştiren özel bir metrik hesaplar.

    Metrik, her bir skoru bir baseline modelin metrik değerlerine göre oranlar ve aşağıdaki ağırlıkları uygular:
    - Gini: %40
    - Recall@10%: %30
    - Lift@10%: %30

    Parametreler:
        y_true (list): Gerçek ikili etiketler.
        y_prob (list): Tahmin edilen olasılıklar.

    Döndürür:
        float: Ağırlıklandırılmış bileşik skor.
    """
    # final metrik için ağırlıklar
    score_weights = {
        "gini": 0.4,
        "recall_at_10perc": 0.3,
        "lift_at_10perc": 0.3,
    }

    # baseline modelin her bir metrik için değerleri
    baseline_scores = {
        "roc_auc": 0.6925726757936908,
        "recall_at_10perc": 0.18469015795868773,
        "lift_at_10perc": 1.847159286784029,
    }

    # y_prob tahminleri için metriklerin hesaplanması
    roc_auc = roc_auc_score(y_true, y_prob)
    recall_at_10perc = recall_at_k(y_true, y_prob, k=0.1)
    lift_at_10perc = lift_at_k(y_true, y_prob, k=0.1)

    new_scores = {
        "roc_auc": roc_auc,
        "recall_at_10perc": recall_at_10perc,
        "lift_at_10perc": lift_at_10perc,
    }

    # roc auc değerlerinin gini değerine dönüştürülmesi
    baseline_scores["gini"] = convert_auc_to_gini(baseline_scores["roc_auc"])
    new_scores["gini"] = convert_auc_to_gini(new_scores["roc_auc"])

    # baseline modeline oranlama
    final_gini_score = new_scores["gini"] / baseline_scores["gini"]
    final_recall_score = new_scores["recall_at_10perc"] / baseline_scores["recall_at_10perc"]
    final_lift_score = new_scores["lift_at_10perc"] / baseline_scores["lift_at_10perc"]

    # ağırlıklandırılmış metriğin hesaplanması
    final_score = (
        final_gini_score * score_weights["gini"] +
        final_recall_score * score_weights["recall_at_10perc"] + 
        final_lift_score * score_weights["lift_at_10perc"]
    )
    return final_score


In [88]:
import polars as pl

# Dosya yollarını tanımlayalım
train_ref_path = '/kaggle/input/ing-hubs-turkiye-datathon/referance_data.csv'
test_ref_path = '/kaggle/input/ing-hubs-turkiye-datathon/referance_data_test.csv'
customers_path = '/kaggle/input/ing-hubs-turkiye-datathon/customers.csv'
customer_history_path = '/kaggle/input/ing-hubs-turkiye-datathon/customer_history.csv'

# Tüm CSV dosyalarını LazyFrame olarak oku
train_ref_data = pl.scan_csv(train_ref_path)
test_ref_data = pl.scan_csv(test_ref_path)
customers = pl.scan_csv(customers_path)
customer_history = pl.scan_csv(customer_history_path)


In [76]:
customer_history =inflation_adjusted_history.drop([
    'mobile_eft_all_amt',
    'cc_transaction_all_amt',
    'year_month',
    'cpi_index'
])


In [89]:
train_ref_data = train_ref_data.with_columns(split=pl.lit("Train"))

# Test setine, train setiyle aynı şemaya sahip olması için null bir 'churn' kolonu ekle.
# Bu, 'concat' işleminin çalışması için gereklidir.
# 'churn' kolonunun veri tipini (dtype) train setinden dinamik olarak alıyoruz.
test_ref_data = test_ref_data.with_columns(
    split=pl.lit("Test"),
    churn=pl.lit(None, dtype=train_ref_data.schema["churn"])
)

test_ref_data = test_ref_data.select(train_ref_data.columns)
ref_data = pl.concat([train_ref_data, test_ref_data])


ref_data = ref_data.with_columns(
    ref_date=pl.col('ref_date').str.to_date("%Y-%m-%d")
)

  churn=pl.lit(None, dtype=train_ref_data.schema["churn"])
  test_ref_data = test_ref_data.select(train_ref_data.columns)


In [90]:
customers = customers.with_columns([
    pl.col("work_sector").fill_null("Not Working"),
    ((pl.col("tenure") / 365) / pl.col("age")).alias("tenure_per_age")
])

ref_data_v2 = ref_data.join(
    customers,
    on="cust_id",
    how="left"
)

In [91]:
cat_cols = [name for name, dtype in ref_data_v2.schema.items() if dtype == pl.Utf8]

# Lazy ile category (Polars'ta Categorical) dönüşümü
ref_data_v2 = ref_data_v2.with_columns([
    pl.col(col).cast(pl.Categorical) for col in cat_cols
])

  cat_cols = [name for name, dtype in ref_data_v2.schema.items() if dtype == pl.Utf8]


In [92]:
ref_data_v3 = ref_data_v2
ref_data_v4 = ref_data_v3

In [93]:
ref_data_v4 = ref_data_v4.collect().lazy()

In [94]:
import polars as pl
import numpy as np
from datetime import datetime, date

# =========================================================================
# CRITICAL FIX: Parse date column at the very beginning
# =========================================================================
customer_history = customer_history.with_columns(
    pl.col('date').str.strptime(pl.Date, format='%Y-%m-%d').alias('date')
)

# =========================================================================
# PART 1: MONTHLY SUM, MEAN, MIN, MAX FEATURES
# =========================================================================
month_windows = [1,3, 6, 9, 12]
numeric_cols = ['mobile_eft_all_cnt', 'mobile_eft_all_amt', 'cc_transaction_all_amt', 'cc_transaction_all_cnt', 'active_product_category_nbr']

# Başlangıç LazyFrame
customer_last_feats_month = (
    customer_history
    .select(pl.col('cust_id').unique())
)

# Her müşteri için son tarih
last_dates = (
    customer_history
    .group_by('cust_id')
    .agg(pl.col('date').max().alias('last_date'))
)

# Son tarihi müşteri geçmişine ekle
customer_history_with_last = customer_history.join(last_dates, on='cust_id', how='left')

# Her zaman penceresi için feature üret
for m in month_windows:
    print(f"Processing last {m} month(s)")
    
    temp = (
        customer_history_with_last
        .filter(pl.col('date') >= pl.col('last_date').dt.offset_by(f'-{m}mo'))
    )
    
    sum_exprs = [pl.col(col).sum().alias(f"{col}_last_{m}m_sum") for col in numeric_cols]
    mean_exprs = [pl.col(col).mean().alias(f"{col}_last_{m}m_mean") for col in numeric_cols]
    min_exprs = [pl.col(col).min().alias(f"{col}_last_{m}m_min") for col in numeric_cols]
    max_exprs = [pl.col(col).max().alias(f"{col}_last_{m}m_max") for col in numeric_cols]
    
    temp_agg = temp.group_by('cust_id').agg(sum_exprs + mean_exprs + min_exprs + max_exprs)
    
    customer_last_feats_month = (
        customer_last_feats_month
        .join(temp_agg, on='cust_id', how='left')
    )

# --- SUM & MEAN RATIO FEATURES ---
ratio_exprs = []
for col in numeric_cols:
    for i in range(len(month_windows)):
        for j in range(i + 1, len(month_windows)):
            w1, w2 = month_windows[i], month_windows[j]
            ratio_exprs.extend([
                (pl.col(f"{col}_last_{w1}m_sum") / (pl.col(f"{col}_last_{w2}m_sum") + 1e-6))
                .alias(f"{col}_sum_ratio_{w1}_{w2}_m"),
                (pl.col(f"{col}_last_{w1}m_mean") / (pl.col(f"{col}_last_{w2}m_mean") + 1e-6))
                .alias(f"{col}_mean_ratio_{w1}_{w2}_m"),
                (pl.col(f"{col}_last_{w1}m_min") / (pl.col(f"{col}_last_{w2}m_min") + 1e-6))
                .alias(f"{col}_min_ratio_{w1}_{w2}_m"),
                (pl.col(f"{col}_last_{w1}m_max") / (pl.col(f"{col}_last_{w2}m_max") + 1e-6))
                .alias(f"{col}_max_ratio_{w1}_{w2}_m")
            ])
customer_last_feats_month = customer_last_feats_month.with_columns(ratio_exprs)

# Ref data ile birleştir
ref_data_v5 = ref_data_v4.join(customer_last_feats_month, on='cust_id', how='left')
print("Part 1 completed: Monthly sum, mean, min, max features added (with ratios)")

# =========================================================================
# PART 2: AVERAGE, MIN, MAX FEATURES (mobile_eft_avg_amt, cc_transaction_avg_amt)
# =========================================================================
month_windows = [1,3, 6, 9, 12]
numeric_cols = ['mobile_eft_all_cnt', 'mobile_eft_all_amt', 
                'cc_transaction_all_amt', 'cc_transaction_all_cnt']

customer_avg_feats = customer_history.select(pl.col('cust_id').unique())

for m in month_windows:
    print(f"Processing average, min, max features for last {m} month(s)")
    
    temp = (
        customer_history_with_last
        .filter(pl.col('date') >= pl.col('last_date').dt.offset_by(f'-{m}mo'))
    )
    
    temp_agg = (
        temp
        .group_by('cust_id')
        .agg([
            pl.col('mobile_eft_all_amt').sum().alias('mobile_eft_all_amt_sum'),
            pl.col('mobile_eft_all_cnt').sum().alias('mobile_eft_all_cnt_sum'),
            pl.col('cc_transaction_all_amt').sum().alias('cc_transaction_all_amt_sum'),
            pl.col('cc_transaction_all_cnt').sum().alias('cc_transaction_all_cnt_sum'),
            pl.col('mobile_eft_all_amt').min().alias('mobile_eft_all_amt_min'),
            pl.col('mobile_eft_all_amt').max().alias('mobile_eft_all_amt_max'),
            pl.col('cc_transaction_all_amt').min().alias('cc_transaction_all_amt_min'),
            pl.col('cc_transaction_all_amt').max().alias('cc_transaction_all_amt_max'),
        ])
        .with_columns([
            # Average amount features
            (pl.col('mobile_eft_all_amt_sum') / pl.col('mobile_eft_all_cnt_sum'))
            .fill_nan(0).fill_null(0)
            .alias(f'mobile_eft_avg_amt_last_{m}m'),
            
            (pl.col('cc_transaction_all_amt_sum') / pl.col('cc_transaction_all_cnt_sum'))
            .fill_nan(0).fill_null(0)
            .alias(f'cc_transaction_avg_amt_last_{m}m'),
            
            # Min/Max features
            pl.col('mobile_eft_all_amt_min').alias(f'mobile_eft_amt_min_last_{m}m'),
            pl.col('mobile_eft_all_amt_max').alias(f'mobile_eft_amt_max_last_{m}m'),
            pl.col('cc_transaction_all_amt_min').alias(f'cc_transaction_amt_min_last_{m}m'),
            pl.col('cc_transaction_all_amt_max').alias(f'cc_transaction_amt_max_last_{m}m'),
        ])
        .select([
            'cust_id', 
            f'mobile_eft_avg_amt_last_{m}m', 
            f'cc_transaction_avg_amt_last_{m}m',
            f'mobile_eft_amt_min_last_{m}m',
            f'mobile_eft_amt_max_last_{m}m',
            f'cc_transaction_amt_min_last_{m}m',
            f'cc_transaction_amt_max_last_{m}m'
        ])
    )
    
    customer_avg_feats = customer_avg_feats.join(temp_agg, on='cust_id', how='left')

# --- AVG, MIN, MAX RATIO FEATURES ---
avg_ratio_exprs = []
eft_avg_cols = [f"mobile_eft_avg_amt_last_{m}m" for m in month_windows]
cc_avg_cols = [f"cc_transaction_avg_amt_last_{m}m" for m in month_windows]
eft_min_cols = [f"mobile_eft_amt_min_last_{m}m" for m in month_windows]
eft_max_cols = [f"mobile_eft_amt_max_last_{m}m" for m in month_windows]
cc_min_cols = [f"cc_transaction_amt_min_last_{m}m" for m in month_windows]
cc_max_cols = [f"cc_transaction_amt_max_last_{m}m" for m in month_windows]

for i in range(len(month_windows)):
    for j in range(i + 1, len(month_windows)):
        w1, w2 = month_windows[i], month_windows[j]
        avg_ratio_exprs.extend([
            # Average ratios
            (pl.col(f"mobile_eft_avg_amt_last_{w1}m") / (pl.col(f"mobile_eft_avg_amt_last_{w2}m") + 1e-6))
            .alias(f"mobile_eft_avg_ratio_{w1}_{w2}_m"),
            (pl.col(f"cc_transaction_avg_amt_last_{w1}m") / (pl.col(f"cc_transaction_avg_amt_last_{w2}m") + 1e-6))
            .alias(f"cc_transaction_avg_ratio_{w1}_{w2}_m"),
            
            # Min ratios
            (pl.col(f"mobile_eft_amt_min_last_{w1}m") / (pl.col(f"mobile_eft_amt_min_last_{w2}m") + 1e-6))
            .alias(f"mobile_eft_min_ratio_{w1}_{w2}_m"),
            (pl.col(f"cc_transaction_amt_min_last_{w1}m") / (pl.col(f"cc_transaction_amt_min_last_{w2}m") + 1e-6))
            .alias(f"cc_transaction_min_ratio_{w1}_{w2}_m"),
            
            # Max ratios
            (pl.col(f"mobile_eft_amt_max_last_{w1}m") / (pl.col(f"mobile_eft_amt_max_last_{w2}m") + 1e-6))
            .alias(f"mobile_eft_max_ratio_{w1}_{w2}_m"),
            (pl.col(f"cc_transaction_amt_max_last_{w1}m") / (pl.col(f"cc_transaction_amt_max_last_{w2}m") + 1e-6))
            .alias(f"cc_transaction_max_ratio_{w1}_{w2}_m"),
        ])
customer_avg_feats = customer_avg_feats.with_columns(avg_ratio_exprs)

# Ref data merge
ref_data_v6 = ref_data_v5.join(customer_avg_feats, on='cust_id', how='left')
ref_data_v6 = ref_data_v6.collect().lazy()

print("Part 2 completed: Average, min, max amount features added (with ratios)")

# =========================================================================
# PART 3: VOLATILITY FEATURES (Monthly Aggregated)
# =========================================================================
print("\nVolatility hesaplama başlıyor...\n")

month_windows = [1, 3, 6, 9, 12]
numeric_cols = ['mobile_eft_all_cnt', 'mobile_eft_all_amt', 
                'cc_transaction_all_amt', 'cc_transaction_all_cnt']

# Aylık bazda toplama
monthly = (
    customer_history
    .with_columns(
        pl.col('date').dt.truncate('1mo').alias('year_month')
    )
    .group_by(['cust_id', 'year_month'])
    .agg([pl.col(col).sum() for col in numeric_cols])
)

# Son tarihleri ekle
monthly = monthly.join(last_dates, on='cust_id', how='left')

# Boş LazyFrame
customer_vol_feats = monthly.select(pl.col('cust_id').unique())

for m in month_windows:
    print(f"\nProcessing metrics for last {m} month(s)...\n")
    
    temp = (
        monthly
        .filter(pl.col('year_month') >= pl.col('last_date').dt.offset_by(f'-{m}mo'))
        .sort(['cust_id', 'year_month'])
    )
    
    # STD (Volatility)
    std_exprs = [
        pl.col(col).std().alias(f"{col}_std_last_{m}m")
        for col in numeric_cols
    ]
    temp_std = temp.group_by('cust_id').agg(std_exprs)
    
    # Window-adjusted Volatility
    vol_exprs = [
        (pl.col(f"{col}_std_last_{m}m") * np.sqrt(m)).alias(f"{col}_vol_last_{m}m")
        for col in numeric_cols
    ]
    temp_vol = temp_std.select(['cust_id'] + vol_exprs)
    
    # EMA
    ema_exprs = [
        pl.col(col).ewm_mean(span=m, min_samples=1).last().alias(f"{col}_ema_last_{m}m")
        for col in numeric_cols
    ]
    temp_ema = temp.group_by('cust_id').agg(ema_exprs)
    
    # DELTA (Son - Önceki)
    delta_exprs = [
        pl.when(pl.col(col).len() > 1)
        .then(pl.col(col).last() - pl.col(col).slice(-2, 1).first())
        .otherwise(0)
        .alias(f"{col}_delta_last_{m}m")
        for col in numeric_cols
    ]
    temp_delta = temp.group_by('cust_id').agg(delta_exprs)
    
    # GROWTH RATE
    growth_exprs = [
        pl.when(pl.col(col).len() > 1)
        .then(
            (pl.col(col).last() / 
             pl.when(pl.col(col).slice(-2, 1).first() == 0)
             .then(pl.lit(None))
             .otherwise(pl.col(col).slice(-2, 1).first()) - 1)
            .fill_null(0)
        )
        .otherwise(0)
        .alias(f"{col}_growth_rate_last_{m}m")
        for col in numeric_cols
    ]
    temp_growth = temp.group_by('cust_id').agg(growth_exprs)
    
    # TREND DIRECTION
    trend_exprs = [
        pl.when(pl.col(col).ewm_mean(span=3, min_samples=1).len() > 1)
        .then(
            pl.when(
                pl.col(col).ewm_mean(span=3, min_samples=1).last() - 
                pl.col(col).ewm_mean(span=3, min_samples=1).first() > 0
            )
            .then(1)
            .when(
                pl.col(col).ewm_mean(span=3, min_samples=1).last() - 
                pl.col(col).ewm_mean(span=3, min_samples=1).first() < 0
            )
            .then(-1)
            .otherwise(0)
        )
        .otherwise(0)
        .alias(f"{col}_trend_dir_last_{m}m")
        for col in numeric_cols
    ]
    temp_trend = temp.group_by('cust_id').agg(trend_exprs)
    
    # VA (Volatility Adjusted) = std / mean
    mean_exprs_for_va = [
        pl.col(col).mean().alias(f"{col}_mean_temp")
        for col in numeric_cols
    ]
    temp_mean = temp.group_by('cust_id').agg(mean_exprs_for_va)
    
    va_exprs = [
        (pl.col(f"{col}_std_last_{m}m") / pl.col(f"{col}_mean_temp"))
        .alias(f"{col}_va_last_{m}m")
        for col in numeric_cols
    ]
    temp_va = (
        temp_std
        .join(temp_mean, on='cust_id', how='left')
        .select(['cust_id'] + va_exprs)
    )
    
    # VS (Volatility Stability) window=2
    vs_2_exprs = [
        (pl.col(col).rolling_std(window_size=2).diff().last().fill_null(0))
        .alias(f"{col}_vs_2_last_{m}m")
        for col in numeric_cols
    ]
    temp_vs_2 = temp.group_by('cust_id').agg(vs_2_exprs)
    
    # VS (Volatility Stability) window=3
    vs_3_exprs = [
        (pl.col(col).rolling_std(window_size=3).diff().last().fill_null(0))
        .alias(f"{col}_vs_3_last_{m}m")
        for col in numeric_cols
    ]
    temp_vs_3 = temp.group_by('cust_id').agg(vs_3_exprs)
    
    # RANK (Percentile Rank)
    print(f"Calculating rank features for last {m} month(s)...")
    
    rank_cols_exprs = [
        (pl.col(col).rank(method='average').over('year_month') / pl.col(col).count().over('year_month'))
        .alias(f"{col}_rank_pct")
        for col in numeric_cols
    ]
    
    temp_rank = temp.with_columns(rank_cols_exprs)
    
    rank_mean_exprs = [
        pl.col(f"{col}_rank_pct").tail(m).mean().alias(f"{col}_rank_pct_mean_last_{m}m")
        for col in numeric_cols
    ]
    temp_rank_mean = temp_rank.group_by('cust_id').agg(rank_mean_exprs)
    
    # Merge all metrics
    customer_vol_feats = (
        customer_vol_feats
        .join(temp_std, on='cust_id', how='left')
        .join(temp_vol, on='cust_id', how='left')
        .join(temp_ema, on='cust_id', how='left')
        .join(temp_delta, on='cust_id', how='left')
        .join(temp_growth, on='cust_id', how='left')
        .join(temp_trend, on='cust_id', how='left')
        .join(temp_va, on='cust_id', how='left')
        .join(temp_vs_2, on='cust_id', how='left')
        .join(temp_vs_3, on='cust_id', how='left')
        .join(temp_rank_mean, on='cust_id', how='left')
    )

# Final merge
ref_data_v7 = ref_data_v6.join(customer_vol_feats, on='cust_id', how='left')

print("Part 3 completed: Volatility features added")


print("\nEk feature hesaplamaları başlıyor...\n")

# Boş container - sadece cust_id
customer_extra_feats = ref_data_v7.select(pl.col('cust_id').unique())

# TREND / FARK FEATURES - Using available columns from Parts 1-3
month_windows = [1, 3, 6, 9, 12]
trend_feats_list = []

for w in month_windows:
    # Önceki pencereler için window seçimi
    short_w = w
    # Sadece mevcut month_windows içinden bir sonraki window'u seç
    available_longer_windows = [mo for mo in month_windows if mo > w]
    if not available_longer_windows:
        continue  # Son window için karşılaştırma yapma
    long_w = available_longer_windows[0]  # İlk büyük window'u al

    temp_feats = (
        ref_data_v7
        .select([
            'cust_id',
            f'cc_transaction_all_amt_last_{short_w}m_sum',
            f'cc_transaction_all_amt_last_{long_w}m_mean',
            f'mobile_eft_all_cnt_last_{short_w}m_sum',
            f'mobile_eft_all_cnt_last_{long_w}m_mean'
        ])
        .with_columns([
            # Fark
            (pl.col(f'cc_transaction_all_amt_last_{short_w}m_sum') - 
             pl.col(f'cc_transaction_all_amt_last_{long_w}m_mean'))
            .alias(f'cc_transaction_amt_short{short_w}_vs_long{long_w}_diff'),

            # Oran
            (pl.col(f'mobile_eft_all_cnt_last_{short_w}m_sum') / 
             (pl.col(f'mobile_eft_all_cnt_last_{long_w}m_mean') + 1e-6))
            .alias(f'mobile_eft_cnt_short{short_w}_vs_long{long_w}_ratio')
        ])
        .select(['cust_id', f'cc_transaction_amt_short{short_w}_vs_long{long_w}_diff',
                 f'mobile_eft_cnt_short{short_w}_vs_long{long_w}_ratio'])
    )
    trend_feats_list.append(temp_feats)

# Tüm trend features'ları birleştir
from functools import reduce
customer_trend_feats = reduce(lambda left, right: left.join(right, on='cust_id', how='left'), trend_feats_list)

# Final merge
ref_data_v8 = ref_data_v7.join(customer_trend_feats, on='cust_id', how='left')

spike_feats_list = []

for m in month_windows:
    temp = (
        monthly
        .filter(pl.col('year_month') >= pl.col('last_date').dt.offset_by(f'-{m}mo'))
        .sort(['cust_id', 'year_month'])
    )

    for col in numeric_cols:
        spike_count = (
            (pl.col(col).diff().fill_null(0) > 0).sum().alias(f"{col}_spike_count_last_{m}m")
        )

        spike_maxabs = (
            pl.col(col).diff().fill_null(0).abs().max().alias(f"{col}_spike_maxabs_last_{m}m")
        )

        temp_feats = temp.group_by('cust_id').agg([spike_count, spike_maxabs])
        spike_feats_list.append(temp_feats)

# Tüm spikeness features'ları birleştir
from functools import reduce
customer_spike_feats = reduce(lambda left, right: left.join(right, on='cust_id', how='left'), spike_feats_list)

# Final merge
ref_data_v9 = ref_data_v8.join(customer_spike_feats, on='cust_id', how='left')

print("Part 4 completed: Trend and spike features added")

import polars as pl
from functools import reduce
import scipy.stats as stats
import numpy as np

rolling_feats_list = []
window_size = 3  # rolling window

for m in month_windows:
    temp = (
        monthly
        .filter(pl.col('year_month') >= pl.col('last_date').dt.offset_by(f'-{m}mo'))
        .sort(['cust_id', 'year_month'])
    )
    
    for col in numeric_cols:
        # cust_id bazında liste halinde topla ve hepsini tek seferde hesapla
        temp_feats = (
            temp.group_by('cust_id')
            .agg([
                pl.col(col).alias(f"{col}_list")
            ])
            .with_columns([
                # Skew
                pl.col(f"{col}_list")
                .map_elements(
                    lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
                    return_dtype=pl.Float64
                ).alias(f"{col}_roll_skew_last_{m}m"),
                
                # Kurtosis
                pl.col(f"{col}_list")
                .map_elements(
                    lambda x: float(stats.kurtosis(x[-window_size:])) if len(x) >= window_size else 0,
                    return_dtype=pl.Float64
                ).alias(f"{col}_roll_kurt_last_{m}m"),
                
                # Z-Score
                pl.col(f"{col}_list")
                .map_elements(
                    lambda x: (x[-1] - np.mean(x)) / (np.std(x) + 1e-6) if len(x) >= 1 else 0,
                    return_dtype=pl.Float64
                ).alias(f"{col}_zscore_last_{m}m")
            ])
            .drop(f"{col}_list")
            .select(['cust_id', f"{col}_roll_skew_last_{m}m", f"{col}_roll_kurt_last_{m}m", f"{col}_zscore_last_{m}m"])
        )
        
        rolling_feats_list.append(temp_feats)

# Tüm rolling feature'ları birleştir (hepsi lazy)
customer_rolling_feats = reduce(lambda left, right: left.join(right, on='cust_id', how='left'), rolling_feats_list)

# Final merge (burada da lazy kalır, ref_data_v10 ne ise ona göre)
ref_data_v11 = ref_data_v9.join(customer_rolling_feats, on='cust_id', how='left')
print("Part 5 completed: Rolling skew, kurtosis, and z-score features added")

Processing last 1 month(s)
Processing last 3 month(s)
Processing last 6 month(s)
Processing last 9 month(s)
Processing last 12 month(s)
Part 1 completed: Monthly sum, mean, min, max features added (with ratios)
Processing average, min, max features for last 1 month(s)
Processing average, min, max features for last 3 month(s)
Processing average, min, max features for last 6 month(s)
Processing average, min, max features for last 9 month(s)
Processing average, min, max features for last 12 month(s)
Part 2 completed: Average, min, max amount features added (with ratios)

Volatility hesaplama başlıyor...


Processing metrics for last 1 month(s)...

Calculating rank features for last 1 month(s)...

Processing metrics for last 3 month(s)...

Calculating rank features for last 3 month(s)...

Processing metrics for last 6 month(s)...

Calculating rank features for last 6 month(s)...

Processing metrics for last 9 month(s)...

Calculating rank features for last 9 month(s)...

Processing metrics

In [None]:
df=ref_data_v11.collect()

  lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
  lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
  lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
  lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
  lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
  lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
  lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
  lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
  lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
  lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
  lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
  lambda x: float(stats.skew(x[-window_size:])) if len(x) >= window_size else 0,
  lambda x: float(stats.kurt

In [None]:
df

In [None]:
import polars as pl
import numpy as np
import pandas as pd

# =========================================================================
# UYARIYA SEBEP OLAN FEATURE'LARI TESPİT ET
# =========================================================================

def detect_problematic_features(df):
    """
    Skewness ve Kurtosis uyarılarına sebep olan feature'ları tespit eder
    """
    
    # Skew ve Kurtosis feature'larını bul
    skew_cols = [col for col in df.columns if '_roll_skew_last_' in col]
    kurt_cols = [col for col in df.columns if '_roll_kurt_last_' in col]
    zscore_cols = [col for col in df.columns if '_zscore_last_' in col]
    
    print("=" * 80)
    print("PROBLEMATIC FEATURE DETECTION")
    print("=" * 80)
    
    # DataFrame'i collect et (eğer lazy ise)
    if isinstance(df, pl.LazyFrame):
        df = df.collect()
    
    results = {
        'feature': [],
        'null_count': [],
        'inf_count': [],
        'zero_count': [],
        'unique_values': [],
        'mean': [],
        'std': [],
        'min': [],
        'max': [],
        'issue_type': []
    }
    
    all_problem_cols = skew_cols + kurt_cols + zscore_cols
    
    for col in all_problem_cols:
        data = df[col].to_numpy()
        
        null_cnt = np.sum(np.isnan(data))
        inf_cnt = np.sum(np.isinf(data))
        zero_cnt = np.sum(data == 0)
        unique_vals = len(np.unique(data[~np.isnan(data)]))
        
        issue = []
        
        # Problemi tespit et
        if null_cnt > len(data) * 0.5:
            issue.append('HIGH_NULL')
        if inf_cnt > 0:
            issue.append('HAS_INF')
        if zero_cnt > len(data) * 0.8:
            issue.append('MOSTLY_ZERO')
        if unique_vals < 5:
            issue.append('LOW_VARIANCE')
        
        # İstatistikler
        valid_data = data[~np.isnan(data) & ~np.isinf(data)]
        
        if len(valid_data) > 0:
            results['feature'].append(col)
            results['null_count'].append(null_cnt)
            results['inf_count'].append(inf_cnt)
            results['zero_count'].append(zero_cnt)
            results['unique_values'].append(unique_vals)
            results['mean'].append(np.mean(valid_data))
            results['std'].append(np.std(valid_data))
            results['min'].append(np.min(valid_data))
            results['max'].append(np.max(valid_data))
            results['issue_type'].append(','.join(issue) if issue else 'OK')
    
    # Pandas DataFrame olarak döndür
    report = pd.DataFrame(results)
    
    # Sadece problemli olanları filtrele
    problematic = report[report['issue_type'] != 'OK']
    
    print(f"\nTotal Features Analyzed: {len(all_problem_cols)}")
    print(f"Problematic Features Found: {len(problematic)}")
    print("\n" + "=" * 80)
    
    if len(problematic) > 0:
        print("\nPROBLEMATIC FEATURES:")
        print(problematic.to_string())
    else:
        print("\n✅ No problematic features found!")
    
    print("\n" + "=" * 80)
    print("FEATURE STATISTICS SUMMARY:")
    print("=" * 80)
    
    # Kategori bazında özet
    for category in ['skew', 'kurt', 'zscore']:
        cat_cols = [col for col in all_problem_cols if category in col]
        cat_data = report[report['feature'].str.contains(category)]
        
        print(f"\n{category.upper()} Features ({len(cat_cols)} total):")
        print(f"  - Mean null count: {cat_data['null_count'].mean():.1f}")
        print(f"  - Mean zero count: {cat_data['zero_count'].mean():.1f}")
        print(f"  - Mean unique values: {cat_data['unique_values'].mean():.1f}")
        print(f"  - Issues: {cat_data[cat_data['issue_type'] != 'OK'].shape[0]} features")
    
    return report, problematic


def check_identical_values(df, numeric_cols=['mobile_eft_all_cnt', 'mobile_eft_all_amt', 
                                              'cc_transaction_all_amt', 'cc_transaction_all_cnt']):
    """
    Aylık verilerde aynı değerlere sahip müşterileri bulur (uyarının asıl sebebi)
    """
    
    print("\n" + "=" * 80)
    print("IDENTICAL VALUE DETECTION (Root Cause of Warnings)")
    print("=" * 80)
    
    if isinstance(df, pl.LazyFrame):
        df = df.collect()
    
    # Her müşteri için aylık varyasyonu kontrol et
    results = []
    
    for col in numeric_cols:
        # Eğer aylık aggregated data varsa
        if 'monthly' in globals():
            monthly_df = globals()['monthly']
            if isinstance(monthly_df, pl.LazyFrame):
                monthly_df = monthly_df.collect()
            
            # Her müşteri için std hesapla
            variance_check = (
                monthly_df
                .group_by('cust_id')
                .agg([
                    pl.col(col).std().alias(f'{col}_std'),
                    pl.col(col).count().alias(f'{col}_count')
                ])
                .with_columns([
                    (pl.col(f'{col}_std') < 1e-10).alias(f'{col}_identical')
                ])
            )
            
            identical_count = variance_check.filter(pl.col(f'{col}_identical'))[f'{col}_identical'].sum()
            total_customers = variance_check.shape[0]
            
            results.append({
                'column': col,
                'customers_with_identical_values': identical_count,
                'total_customers': total_customers,
                'percentage': f"{(identical_count/total_customers*100):.2f}%"
            })
    
    if results:
        result_df = pd.DataFrame(results)
        print("\nCustomers with Identical Monthly Values:")
        print(result_df.to_string(index=False))
        print("\n⚠️  These customers cause 'catastrophic cancellation' warnings")
        print("   because skew/kurtosis cannot be calculated for constant values.")
    
    return result_df if results else None


# =========================================================================
# KULLANIM
# =========================================================================

# 1. Problemli feature'ları tespit et
report, problematic = detect_problematic_features(ref_data_v11)

# 2. Aynı değerlere sahip müşterileri tespit et (uyarının asıl sebebi)
if 'monthly' in globals():
    identical_report = check_identical_values(ref_data_v11)

print("\n" + "=" * 80)
print("ANALYSIS COMPLETE")
print("=" * 80)

In [16]:
df.write_parquet("ref_data_v8_last_11.parquet")

In [17]:
import shutil, os
kaggle_file = "/kaggle/input/kagglejson/kaggle (2).json"
os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)
shutil.copy(kaggle_file, os.path.expanduser("~/.kaggle/kaggle.json"))
os.chmod(os.path.expanduser("~/.kaggle/kaggle.json"), 0o600)

In [7]:
import polars as pl

ref_final = pl.read_parquet('/kaggle/input/datasetv8final/ref_data_v8_last_12.parquet').to_pandas()

In [8]:
DROP_COLS = ['cust_id','ref_date','split']
train_ref_data = ref_final[ref_final['split'] == "Train"].drop(DROP_COLS,axis=1)
test_ref_data = ref_final[ref_final['split'] == "Test"].drop(DROP_COLS + ['churn'],axis=1)
train_ref_data[train_ref_data.select_dtypes(exclude=['object','category']).columns] = train_ref_data.select_dtypes(exclude=['object','category']).fillna(-9999)
test_ref_data[test_ref_data.select_dtypes(exclude=['object','category']).columns] = test_ref_data.select_dtypes(exclude=['object','category']).fillna(-9999)

In [None]:
from catboost import CatBoostClassifier, Pool, EFeaturesSelectionAlgorithm, EShapCalcType
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import numpy as np
import pandas as pd

FEATURE_SELECTION = True

if FEATURE_SELECTION:
    summaries = []

    # Eğitim verisi
    X = train_ref_data.drop(columns=['churn', #'churn_2',
                                     'split', 'date', 
                                     'cust_id'], 
                            errors="ignore", axis=1)
    y = train_ref_data['churn']

    # Kategorik sütun isimleri
    cat_idx = X.select_dtypes(include=["object", "category"]).columns.tolist()

    # StratifiedKFold tanımı
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, y_train = X.iloc[train_idx].copy(), y.iloc[train_idx].copy()
        X_val,   y_val   = X.iloc[val_idx].copy(),   y.iloc[val_idx].copy()


        params = {
            'iterations': 1000,
            'learning_rate': 0.05,
            'depth': 6,
            'loss_function': 'Logloss',
            'eval_metric': 'AUC',
            'verbose': False,
            'border_count': 256,
            'task_type': 'GPU',
            'auto_class_weights': "Balanced",
            'boosting_type': "Ordered",
            'use_best_model': False
        }

        model = CatBoostClassifier(**params)

        train_pool = Pool(X_train, y_train, cat_features=cat_idx)
        val_pool   = Pool(X_val,   y_val,   cat_features=cat_idx)

        summary = model.select_features(
            train_pool,
            eval_set=val_pool,
            features_for_select=f'0-{X_train.shape[1]-1}',
            num_features_to_select=50,
            steps=5,
            algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
            shap_calc_type=EShapCalcType.Regular,
            train_final_model=False,
            logging_level='Silent',
            plot=False  # Hız için False yapabilirsin
        )
        summaries.append(summary)

    # Her fold için seçilen 25 özelliğin isim listelerini çıkar
    selected_lists = [s["selected_features_names"] for s in summaries]

    # Tüm fold'lardaki seçimleri tek listeye açıp say
    all_selected = [feat for sublist in selected_lists for feat in sublist]
    counts = Counter(all_selected)

    # 1+, 2+, 3+ listeleri (en az 1, 2, 3 defa ilk 25'e girenler)
    top25_any   = sorted([f for f, c in counts.items() if c >= 1])
    top25_2plus = sorted([f for f, c in counts.items() if c >= 2])
    top25_3plus = sorted([f for f, c in counts.items() if c >= 3])

    # Özet tablo
    top25_summary_df = pd.DataFrame({
        "feature": list(counts.keys()),
        "selected_in_folds": list(counts.values())
    }).sort_values("selected_in_folds", ascending=False).reset_index(drop=True)

In [32]:
top25_summary_df.to_csv('top_features.csv',index=False)

In [17]:
top25_summary_df = pd.read_csv('/kaggle/working/top_features.csv')

In [18]:
features = top25_summary_df.query('selected_in_folds >= 2').feature.tolist()

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import xgboost as xgb
import ray
from ray import tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import ASHAScheduler
import optuna

# ---- Veri ----
target_col = 'churn'
cat_features = ['gender', 'work_type', 'province', 'religion', 'work_sector']
cat_features = [c for c in cat_features if c in features]
features = list(set(features + cat_features))
df = train_ref_data.copy()
X_test_orig = test_ref_data.copy()[features]
X_orig = df.drop(target_col, axis=1)[features]
y = df[target_col]

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ---- Ray Tune Fonksiyonu (GPU) ----
def train_model(config, model_type=None):
    oof_preds = np.zeros(len(X_orig))
    
    for train_idx, val_idx in kf.split(X_orig, y):
        X_train, X_val = X_orig.iloc[train_idx], X_orig.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        if model_type == "catboost":
            model = CatBoostClassifier(
                iterations=int(config["iterations"]),
                depth=int(config["depth"]),
                learning_rate=config["learning_rate"],
                l2_leaf_reg=config["l2_leaf_reg"],
                border_count=int(config["border_count"]),
                random_seed=42,
                eval_metric='AUC',
                verbose=0,
                task_type='GPU'
            )
            model.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_val, y_val), early_stopping_rounds=300)
            y_val_pred = model.predict_proba(X_val)[:, 1]

        elif model_type == "xgboost":
            model = xgb.XGBClassifier(
                n_estimators=int(config["n_estimators"]),
                max_depth=int(config["max_depth"]),
                learning_rate=config["learning_rate"],
                subsample=config["subsample"],
                colsample_bytree=config["colsample_bytree"],
                use_label_encoder=False,
                eval_metric='auc',
                tree_method='gpu_hist',
                gpu_id=0,
                random_state=42
            )
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=300, verbose=False)
            y_val_pred = model.predict_proba(X_val)[:, 1]

        oof_preds[val_idx] = y_val_pred

    # Ray Tune'a özel metric olarak raporla
    score = ing_hubs_datathon_metric(y, oof_preds)
    tune.report({"score": score})


# ---- Arama Alanları ----
search_spaces = {
    "catboost": {
        "iterations": tune.choice([500, 1000, 1500, 2000]),
        "depth": tune.randint(3, 10),
        "learning_rate": tune.loguniform(0.005, 0.2),
        "l2_leaf_reg": tune.uniform(1, 10),
        "border_count": tune.choice([64, 128, 254])
    },
    "xgboost": {
        "n_estimators": tune.choice([500, 1000, 1500, 2000]),
        "max_depth": tune.randint(3, 10),
        "learning_rate": tune.loguniform(0.005, 0.2),
        "subsample": tune.uniform(0.6, 1.0),
        "colsample_bytree": tune.uniform(0.6, 1.0)
    }
}

# ---- Optimize Fonksiyonu ----
def optimize_model(model_type):
    # Scheduler artık metric ve mode ile oluşturuluyor
    scheduler = ASHAScheduler(
        max_t=10,
        grace_period=1,
        reduction_factor=2,
        metric="score", 
        mode="max"        
    )

    analysis = tune.run(
        tune.with_parameters(train_model, model_type=model_type),
        config=search_spaces[model_type],
        num_samples=30,
        scheduler=scheduler,
        search_alg=OptunaSearch(metric="score", mode="max"),
        resources_per_trial={"cpu": 4, "gpu": 2},
        verbose=1
    )
    
    best_config = analysis.get_best_config(metric="score", mode="max")
    print(f"Best config for {model_type}: {best_config}")
    return best_config


# ---- Optimize ----
best_cat = optimize_model("catboost")
best_xgb = optimize_model("xgboost")


# ---- Final 5-Fold Eğitim ve Ensemble ----
models_config = {
    "CatBoost": ("catboost", best_cat),
    "XGBoost": ("xgboost", best_xgb)
}

all_model_oof_preds = {name: np.zeros(len(X_orig)) for name in models_config.keys()}
all_model_test_preds = {name: [] for name in models_config.keys()}

for model_name, (model_type, params) in models_config.items():
    print(f"\nTraining {model_name} final folds")
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_orig, y), 1):
        X_train, X_val = X_orig.iloc[train_idx], X_orig.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        if model_type == "catboost":
            model = CatBoostClassifier(
                iterations=int(params["iterations"]),
                depth=int(params["depth"]),
                learning_rate=params["learning_rate"],
                l2_leaf_reg=params["l2_leaf_reg"],
                border_count=int(params["border_count"]),
                random_seed=42,
                eval_metric='AUC',
                verbose=0,
                task_type='GPU'
            )
            model.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_val, y_val), early_stopping_rounds=300)
            y_val_pred = model.predict_proba(X_val)[:,1]

        elif model_type == "xgboost":
            model = xgb.XGBClassifier(
                n_estimators=int(params["n_estimators"]),
                max_depth=int(params["max_depth"]),
                learning_rate=params["learning_rate"],
                subsample=params["subsample"],
                colsample_bytree=params["colsample_bytree"],
                use_label_encoder=False,
                eval_metric='auc',
                tree_method='gpu_hist',
                gpu_id=0,
                random_state=42
            )
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=300, verbose=False)
            y_val_pred = model.predict_proba(X_val)[:,1]

        all_model_oof_preds[model_name][val_idx] = y_val_pred
        all_model_test_preds[model_name].append(model.predict_proba(X_test_orig)[:,1])

    all_model_test_preds[model_name] = np.mean(all_model_test_preds[model_name], axis=0)


# ---- Optuna ile Ensemble Ağırlık Optimizasyonu ----
def ensemble_objective(trial):
    w_cat = trial.suggest_float("w_cat", 0.0, 1.0)
    w_xgb = 1.0 - w_cat
    ensemble_oof = (w_cat*all_model_oof_preds["CatBoost"] +
                    w_xgb*all_model_oof_preds["XGBoost"])
    score = ing_hubs_datathon_metric(y, ensemble_oof)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(ensemble_objective, n_trials=50)

best_weights = (study.best_params["w_cat"], 1 - study.best_params["w_cat"])
best_score = study.best_value

print(f"Best ensemble weights (CatBoost, XGBoost): {best_weights}, Score: {best_score:.4f}")

ensemble_pred = (best_weights[0]*all_model_test_preds["CatBoost"] +
                 best_weights[1]*all_model_test_preds["XGBoost"])

# ---- Ensemble OOF Metrikleri ----
ensemble_oof = (best_weights[0]*all_model_oof_preds["CatBoost"] +
                best_weights[1]*all_model_oof_preds["XGBoost"])

print("\n===== Ensemble OOF Metrics =====")
print(f"OOF AUC: {roc_auc_score(y, ensemble_oof):.4f}")
print(f"OOF Gini: {convert_auc_to_gini(roc_auc_score(y, ensemble_oof)):.4f}")
print(f"OOF Custom Score: {ing_hubs_datathon_metric(y, ensemble_oof):.4f}")


0,1
Current time:,2025-10-19 15:52:36
Running for:,00:15:55.98
Memory:,7.9/31.4 GiB

Trial name,status,loc,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,iter,total time (s),score
train_model_2235c263,TERMINATED,172.19.2.2:8336,0.901514,0.0102229,5,500,0.937309,1,14.4091,1.16577
train_model_c300a264,TERMINATED,172.19.2.2:8421,0.665897,0.0442343,3,1500,0.953045,1,17.2587,1.16375
train_model_67fa5156,TERMINATED,172.19.2.2:8506,0.861552,0.078248,6,1000,0.616479,1,12.9582,1.1664
train_model_743bdf6a,TERMINATED,172.19.2.2:8592,0.909598,0.00794963,4,2000,0.821795,1,36.8506,1.17051
train_model_8838ca6a,TERMINATED,172.19.2.2:8681,0.717821,0.0227352,7,1000,0.962478,1,21.0546,1.17081
train_model_832e4045,TERMINATED,172.19.2.2:8767,0.743082,0.0114851,4,1500,0.672078,1,27.8258,1.17524
train_model_84cb03bb,TERMINATED,172.19.2.2:8853,0.604909,0.022091,8,500,0.974003,1,22.137,1.16448
train_model_5d71ed4e,TERMINATED,172.19.2.2:8939,0.804045,0.0456596,9,1500,0.958218,1,22.8452,1.15757
train_model_87663be4,TERMINATED,172.19.2.2:9025,0.715009,0.0134274,5,500,0.755328,1,14.0863,1.1729
train_model_c8ebe637,TERMINATED,172.19.2.2:9111,0.703639,0.00616272,9,2000,0.831458,1,59.0749,1.16451




[36m(train_model pid=8336)[0m 
[36m(train_model pid=8336)[0m     E.g. tree_method = "hist", device = "cuda"
[36m(train_model pid=8336)[0m 
[36m(train_model pid=8336)[0m 
[36m(train_model pid=8336)[0m     E.g. tree_method = "hist", device = "cuda"
[36m(train_model pid=8336)[0m 
[36m(train_model pid=8336)[0m Potential solutions:
[36m(train_model pid=8336)[0m - Use a data structure that matches the device ordinal in the booster.
[36m(train_model pid=8336)[0m - Set the device for booster before call to inplace_predict.
[36m(train_model pid=8336)[0m 
[36m(train_model pid=8336)[0m 
[36m(train_model pid=8336)[0m 
[36m(train_model pid=8336)[0m     E.g. tree_method = "hist", device = "cuda"
[36m(train_model pid=8336)[0m 
[36m(train_model pid=8336)[0m 
[36m(train_model pid=8336)[0m     E.g. tree_method = "hist", device = "cuda"
[36m(train_model pid=8336)[0m 
[36m(train_model pid=8336)[0m 
[36m(train_model pid=8336)[0m     E.g. tree_method = "hist", device = 

Best config for xgboost: {'n_estimators': 1500, 'max_depth': 4, 'learning_rate': 0.011485145069777995, 'subsample': 0.6720778331515599, 'colsample_bytree': 0.7430816334003801}

Training CatBoost final folds


Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU



Training XGBoost final folds



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

[I 2025-10-19 15:54:42,417] A new study created in memory with name: no-name-9b0067c6-c334-42ab-9063-00e0598cb499
[I 2025-10-19 15:54:42,511] Trial 0 finished with value: 1.1757262487194244 and parameters: {'w_cat': 0.7967228226056815}. Best is trial 0 with value: 1.1757262487194244.
[I 2025-10-19 15:54:42,595] Trial 1 finished with value: 1.17606040719

Best ensemble weights (CatBoost, XGBoost): (0.9444016673550206, 0.055598332644979376), Score: 1.1784

===== Ensemble OOF Metrics =====
OOF AUC: 0.7186
OOF Gini: 0.4373
OOF Custom Score: 1.1784


In [22]:
best_cat

{'iterations': 2000,
 'depth': 6,
 'learning_rate': 0.007484348431669206,
 'l2_leaf_reg': 3.5135289637920772,
 'border_count': 254}

In [23]:
best_xgb

{'n_estimators': 1500,
 'max_depth': 4,
 'learning_rate': 0.011485145069777995,
 'subsample': 0.6720778331515599,
 'colsample_bytree': 0.7430816334003801}

In [34]:
features

['cc_transaction_all_cnt_rank_pct_mean_last_1m',
 'cc_transaction_all_cnt_last_1m_max',
 'active_product_category_nbr_max_ratio_3_12_m',
 'mobile_eft_all_amt_real_min_ratio_1_3_m',
 'active_product_category_nbr_last_6m_sum',
 'mobile_eft_all_amt_real_rank_pct_mean_last_1m',
 'active_product_category_nbr_min_ratio_1_3_m',
 'days_since_last_transaction',
 'active_product_category_nbr_last_9m_max',
 'cc_transaction_all_cnt_delta_last_1m',
 'mobile_eft_all_cnt_last_9m_mean',
 'mobile_eft_all_cnt_ema_last_6m',
 'mobile_eft_all_cnt_va_last_6m',
 'active_product_category_nbr_last_12m_min',
 'cc_transaction_all_cnt_delta_last_9m',
 'mobile_eft_all_cnt_rank_pct_mean_last_1m',
 'mobile_eft_all_cnt_rank_pct_mean_last_3m',
 'mobile_eft_all_cnt_ema_last_3m',
 'active_product_category_nbr_last_12m_max',
 'cc_transaction_all_cnt_last_12m_max',
 'cc_transaction_all_cnt_sum_ratio_1_9_m',
 'cc_transaction_all_cnt_rank_pct_mean_last_12m',
 'cc_transaction_all_cnt_delta_last_6m',
 'mobile_eft_all_amt_real

In [27]:
sub = pd.read_csv('/kaggle/input/ing-hubs-turkiye-datathon/sample_submission.csv')
sub['churn'] = ensemble_pred
sub.to_csv('ensemble_v8_catboost_xgb_ray_tuned.csv',index=False)

In [29]:
import shutil, os
kaggle_file = "/kaggle/input/kagglejson/kaggle (2).json"
os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)
shutil.copy(kaggle_file, os.path.expanduser("~/.kaggle/kaggle.json"))
os.chmod(os.path.expanduser("~/.kaggle/kaggle.json"), 0o600)

!kaggle competitions submit -c ing-hubs-turkiye-datathon -f /kaggle/working/ensemble_v8_catboost_xgb_ray_tuned.csv -m "ray_tune_hill_climbing"

100%|██████████████████████████████████████| 1.08M/1.08M [00:00<00:00, 2.71MB/s]
Successfully submitted to ING Hubs Türkiye Datathon