In [3]:

import os, pandas as pd, numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", None)

In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [4]:
# Veri yükleme (sizin kodunuzdan)
BASE = "/content/sample_data/datathon2025"
OUT = os.path.join(BASE, "outputs")
ARTI = os.path.join(OUT, "artifacts")

df = pd.read_csv(os.path.join(BASE, "train.csv"))
test = pd.read_csv(os.path.join(BASE, "test.csv"))
sub = pd.read_csv(os.path.join(BASE, "sample_submission.csv"))

In [5]:

import os, pandas as pd
pd.set_option("display.max_columns", None)

BASE = "/content/sample_data/datathon2025"
OUT  = os.path.join(BASE, "outputs"); os.makedirs(OUT, exist_ok=True)

df   = pd.read_csv(os.path.join(BASE, "train.csv"))
test = pd.read_csv(os.path.join(BASE, "test.csv"))
sub  = pd.read_csv(os.path.join(BASE, "sample_submission.csv"))

# Zaman alanını düzgünleştir
for d in (df, test):
    d["event_time"] = pd.to_datetime(d["event_time"], utc=True, errors="coerce")

print(df.shape, test.shape, sub.shape)
print(df.head(2))


(141219, 7) (62951, 6) (30789, 2)
                 event_time event_type   product_id category_id      user_id  \
0 2025-06-19 10:23:07+00:00   ADD_CART  PROD_011223   CAT_00054  USER_097562   
1 2025-06-07 21:34:45+00:00   ADD_CART  PROD_005519   CAT_00144  USER_006535   

     user_session  session_value  
0  SESSION_158779          90.29  
1  SESSION_029987          16.39  


In [6]:
sv_per_sess = df.groupby("user_session")["session_value"].nunique(dropna=False)
print("Tekil session_value oranı:", (sv_per_sess==1).mean())
print("Farklı value taşıyan oturum sayısı:", int((sv_per_sess>1).sum()))

Tekil session_value oranı: 1.0
Farklı value taşıyan oturum sayısı: 0


In [7]:
for name, d in [("TRAIN", df), ("TEST", test)]:
    c = d.groupby("user_session").size()
    print(f"{name}  oturum sayısı: {c.shape[0]:,} | toplam event: {d.shape[0]:,}")
    print(c.describe(percentiles=[.1,.25,.5,.75,.9]))

TRAIN  oturum sayısı: 70,736 | toplam event: 141,219
count    70736.000000
mean         1.996423
std          2.579703
min          1.000000
10%          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
90%          4.000000
max        116.000000
dtype: float64
TEST  oturum sayısı: 30,789 | toplam event: 62,951
count    30789.000000
mean         2.044594
std          2.508177
min          1.000000
10%          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
90%          4.000000
max         87.000000
dtype: float64


In [8]:
print("Satır düzeyi sayımlar:")
print(df["event_type"].value_counts())
evt_pivot = df.pivot_table(index="user_session", columns="event_type",
                           values="product_id", aggfunc="count", fill_value=0)
evt_pivot["total"] = evt_pivot.sum(axis=1)

non_view_cols = [c for c in evt_pivot.columns if c not in ["VIEW","total"]]
only_view = ((evt_pivot.get("VIEW",0) > 0) & (evt_pivot[non_view_cols].sum(axis=1)==0)).mean()
has_buy   = (evt_pivot.get("BUY",0) > 0).mean()
print("Sadece VIEW içeren oturum oranı:", round(only_view,4))
print("BUY içeren oturum oranı:", round(has_buy,4))

Satır düzeyi sayımlar:
event_type
VIEW           58829
ADD_CART       42304
REMOVE_CART    25615
BUY            14471
Name: count, dtype: int64
Sadece VIEW içeren oturum oranı: 0.4746
BUY içeren oturum oranı: 0.124


In [9]:
train_users = set(df["user_id"].unique())
test_users  = set(test["user_id"].unique())
print("train∩test user oranı:", round(len(train_users & test_users)/len(test_users), 4))

sess_per_user = df.groupby("user_id")["user_session"].nunique()
print(sess_per_user.describe(percentiles=[.5,.9,.99]))

train∩test user oranı: 0.2038
count    51821.000000
mean         1.365412
std          1.202873
min          1.000000
50%          1.000000
90%          2.000000
99%          6.000000
max         47.000000
Name: user_session, dtype: float64


In [10]:
sv_by_sess = df.groupby("user_session")["session_value"].first()
print(sv_by_sess.describe(percentiles=[.01,.05,.1,.25,.5,.75,.9,.95,.99]))
print("Skewness:", round(sv_by_sess.skew(),3))

count    70736.000000
mean        42.198130
std         47.552369
min          5.380000
1%           5.480000
5%           7.490000
10%          9.890000
25%         18.530000
50%         30.750000
75%         46.620000
90%         86.480000
95%        121.330000
99%        225.401000
max       2328.660000
Name: session_value, dtype: float64
Skewness: 7.99


In [11]:
test_us_unique = pd.Index(test['user_session'].astype(str).unique())
sub_us         = pd.Index(sub['user_session'].astype(str))
print("Aynı set:", set(test_us_unique)==set(sub_us))
print("Aynı uzunluk:", len(test_us_unique)==len(sub_us))
print("Aynı sıra:", test_us_unique.equals(sub_us))

Aynı set: True
Aynı uzunluk: True
Aynı sıra: True


In [12]:
# Train & test'te aynı user_session olanlar (210 adet vardı)
overlap = pd.Index(test["user_session"].unique()).intersection(df["user_session"].unique())
print("Çakışan oturum adedi:", len(overlap))

# Bu oturumların train'deki gerçek hedefleri:
leak_map = (df.loc[df["user_session"].isin(overlap)]
              .groupby("user_session")["session_value"].first())
ARTI = os.path.join(OUT, "artifacts"); os.makedirs(ARTI, exist_ok=True)
leak_path = os.path.join(ARTI, "leak_map_session_value.csv")
leak_map.to_csv(leak_path)
print("leak_map kaydedildi ->", leak_path)

Çakışan oturum adedi: 210
leak_map kaydedildi -> /content/sample_data/datathon2025/outputs/artifacts/leak_map_session_value.csv


In [36]:
def build_session_features_fixed(events, prod_freq=None, cat_freq=None, is_train=True):
    e = events.copy()
    e["event_time"] = pd.to_datetime(e["event_time"], utc=True, errors="coerce")

    # Tür dummies (VIEW / ADD_CART / REMOVE_CART / BUY)
    dummies = pd.get_dummies(e["event_type"])
    e = pd.concat([e[["user_session","user_id","product_id","category_id","event_time"]], dummies], axis=1)

    # Sayım & süre
    agg = e.groupby("user_session").agg(
        user_id=("user_id","first"),
        n_events=("event_time","size"),
        t_min=("event_time","min"),
        t_max=("event_time","max"),
        uniq_products=("product_id","nunique"),
        uniq_categories=("category_id","nunique"),
    )

    # Event type counts
    for col in ["VIEW","ADD_CART","REMOVE_CART","BUY"]:
        if col in e.columns:
            agg[f"n_{col}"] = e.groupby("user_session")[col].sum()
        else:
            agg[f"n_{col}"] = 0

    # Son event tipi
    last_evt = (events.sort_values("event_time")
                      .groupby("user_session")["event_type"].last())
    last_dum = pd.get_dummies(last_evt, prefix="last")
    agg = pd.concat([agg, last_dum], axis=1)

    # === MEVCUT FEATURES ===
    agg["duration_sec"] = (agg["t_max"] - agg["t_min"]).dt.total_seconds().fillna(0)
    agg["share_add"]    = agg["n_ADD_CART"]   / agg["n_events"]
    agg["share_remove"] = agg["n_REMOVE_CART"]/ agg["n_events"]
    agg["share_buy"]    = agg["n_BUY"]        / agg["n_events"]
    agg["repeat_rate"]  = (agg["n_events"] - agg["uniq_products"]) / agg["n_events"]

    # Saat / hafta içi modu
    tmp = e.copy()
    tmp["hour"] = tmp["event_time"].dt.hour
    tmp["wday"] = tmp["event_time"].dt.weekday
    agg["hour_mode"] = tmp.groupby("user_session")["hour"].agg(lambda x: x.mode().iloc[0] if len(x)>0 else 0)
    agg["wday_mode"] = tmp.groupby("user_session")["wday"].agg(lambda x: x.mode().iloc[0] if len(x)>0 else 0)

    # Popülerlik
    if is_train or (prod_freq is None or cat_freq is None):
        prod_freq = e["product_id"].value_counts()
        cat_freq  = e["category_id"].value_counts()
    e["prod_freq"] = e["product_id"].map(prod_freq).fillna(0).astype(float)
    e["cat_freq"]  = e["category_id"].map(cat_freq).fillna(0).astype(float)
    agg["avg_prod_freq"] = e.groupby("user_session")["prod_freq"].mean()
    agg["avg_cat_freq"]  = e.groupby("user_session")["cat_freq"].mean()

    # === FIXED: VALUE-BASED FEATURES KALDIRILDI ===
    # Test'te session_value olmadığı için bu feature'lar train/test mismatch yaratıyor!
    # Onun yerine sadece diğer güçlü feature'ları tutalım

    # 2. TIME-BASED FEATURES (Güvenli!)
    agg["is_morning"] = ((agg["hour_mode"] >= 6) & (agg["hour_mode"] < 12)).astype(int)
    agg["is_afternoon"] = ((agg["hour_mode"] >= 12) & (agg["hour_mode"] < 18)).astype(int)
    agg["is_evening"] = ((agg["hour_mode"] >= 18) & (agg["hour_mode"] < 22)).astype(int)
    agg["is_night"] = ((agg["hour_mode"] >= 22) | (agg["hour_mode"] < 6)).astype(int)
    agg["is_weekend"] = (agg["wday_mode"] >= 5).astype(int)

    # Average event spacing
    agg["avg_event_spacing"] = agg["duration_sec"] / (agg["n_events"] + 0.1)

    # 3. BEHAVIORAL FEATURES (Güvenli!)
    agg["conversion_rate"] = agg["n_BUY"] / (agg["n_ADD_CART"] + agg["n_BUY"] + 0.1)
    agg["exploration_rate"] = agg["uniq_products"] / agg["n_events"]
    agg["cart_efficiency"] = (agg["n_ADD_CART"] - agg["n_REMOVE_CART"]) / (agg["n_ADD_CART"] + 0.1)
    agg["view_to_cart_rate"] = agg["n_ADD_CART"] / (agg["n_VIEW"] + 0.1)

    # Activity intensity
    agg["activity_intensity"] = agg["n_events"] / (agg["duration_sec"] + 1)  # events per second

    # Purchase decision metrics
    agg["purchase_decisiveness"] = agg["n_BUY"] / (agg["n_events"] + 0.1)
    agg["cart_to_buy_ratio"] = agg["n_BUY"] / (agg["n_ADD_CART"] + 0.1)

    # 4. CATEGORY/PRODUCT DIVERSITY
    agg["product_diversity"] = agg["uniq_products"] / (agg["n_events"] + 0.1)
    agg["category_diversity"] = agg["uniq_categories"] / (agg["n_events"] + 0.1)

    # Product repeat behavior
    agg["product_repeat_rate"] = 1 - agg["product_diversity"]

    # 5. SEQUENCE FEATURES
    # İlk event
    first_evt = events.sort_values("event_time").groupby("user_session")["event_type"].first()
    first_dum = pd.get_dummies(first_evt, prefix="first")
    agg = pd.concat([agg, first_dum], axis=1)

    # Journey completion metrics
    agg["started_with_view"] = agg.get("first_VIEW", 0)
    agg["ended_with_buy"] = agg.get("last_BUY", 0)
    agg["complete_journey"] = agg["started_with_view"] * agg["ended_with_buy"]

    # Temizlik
    agg = agg.drop(columns=["t_min","t_max"])
    agg = agg.replace([np.inf, -np.inf], 0).fillna(0)

    return agg.reset_index(), prod_freq, cat_freq

In [37]:
# Sonra bu kodu çalıştır:
print("=== ENHANCED FEATURES ÜRETİLİYOR ===")

# Train ve test özellikleri yeni fonksiyon ile
train_feat, prod_freq, cat_freq = build_session_features_fixed(df, is_train=True)
test_feat, _, _ = build_session_features_fixed(test, prod_freq=prod_freq, cat_freq=cat_freq, is_train=False)

print(f"Train features shape: {train_feat.shape}")
print(f"Test features shape: {test_feat.shape}")
print(f"Feature sayısı: {train_feat.shape[1]} (eski: 22)")
print("Yeni feature'lar eklendi!")
print("\nİlk 5 feature adı:", train_feat.columns[:5].tolist())
print("Son 5 feature adı:", train_feat.columns[-5:].tolist())

=== ENHANCED FEATURES ÜRETİLİYOR ===
Train features shape: (70736, 45)
Test features shape: (30789, 45)
Feature sayısı: 45 (eski: 22)
Yeni feature'lar eklendi!

İlk 5 feature adı: ['user_session', 'user_id', 'n_events', 'uniq_products', 'uniq_categories']
Son 5 feature adı: ['first_REMOVE_CART', 'first_VIEW', 'started_with_view', 'ended_with_buy', 'complete_journey']


In [38]:
# Hedefi oturum seviyesine indir
y = df.groupby("user_session")["session_value"].first().rename("session_value").reset_index()
train_sess = (train_feat.merge(y, on="user_session", how="left", validate="one_to_one"))

# Zaman tabanlı kesim: train oturumlarının son event zamanına göre
last_time = df.groupby("user_session")["event_time"].max()
cutoff = last_time.max() - pd.Timedelta(days=2)  # son 2 günü valid
print("Cutoff:", cutoff)

train_feat_ = train_feat.set_index("user_session").join(last_time.rename("tmax")).reset_index()
tr_idx = train_feat_["tmax"] <= cutoff
va_idx = train_feat_["tmax"] >  cutoff
# YENİ:
drop_cols = ["user_session","tmax","user_id"]  # ham ID'yi çıkar
X_tr = train_feat_.loc[tr_idx].drop(columns=drop_cols)
X_va = train_feat_.loc[va_idx].drop(columns=drop_cols)

# Güvenlik: sadece sayısal kolonları tut ve bool'ları uint8 yap
def make_numeric(X):
    if len(X.select_dtypes(include=["bool"]).columns):
        X = X.astype({c:"uint8" for c in X.select_dtypes(include=["bool"]).columns})
    num_cols = X.select_dtypes(include=["number"]).columns
    return X[num_cols].replace([np.inf, -np.inf], 0).fillna(0)

X_tr = make_numeric(X_tr)
X_va = make_numeric(X_va)

y_all = train_sess.set_index("user_session")["session_value"]
y_tr = y_all.loc[train_feat_.loc[tr_idx,"user_session"]]
y_va = y_all.loc[train_feat_.loc[va_idx,"user_session"]]

print("Train oturum:", X_tr.shape, " | Valid oturum:", X_va.shape)

Cutoff: 2025-06-19 23:59:52+00:00
Train oturum: (64450, 43)  | Valid oturum: (6286, 43)


In [39]:

# Hedef transform (log1p)
ytr = np.log1p(y_tr.values)
yva = np.log1p(y_va.values)


In [40]:

#  XGBoost (mevcut)
xgb1 = XGBRegressor(
    n_estimators=600, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
    tree_method="hist", random_state=42, verbosity=0
)
xgb1.fit(X_tr, ytr)
p_xgb1 = xgb1.predict(X_va)
rmse_xgb1 = np.sqrt(mean_squared_error(yva, p_xgb1))
print(f"XGBoost-1 RMSE: {rmse_xgb1:.4f}")

XGBoost-1 RMSE: 0.4775


In [41]:

# XGBoost (farklı parametreler)
xgb2 = XGBRegressor(
    n_estimators=800, max_depth=5, learning_rate=0.03,
    subsample=0.9, colsample_bytree=0.7, reg_lambda=2.0,
    tree_method="hist", random_state=123, verbosity=0
)
xgb2.fit(X_tr, ytr)
p_xgb2 = xgb2.predict(X_va)
rmse_xgb2 = np.sqrt(mean_squared_error(yva, p_xgb2))
print(f"XGBoost-2 RMSE: {rmse_xgb2:.4f}")

XGBoost-2 RMSE: 0.4746


In [42]:
# LightGBM
lgb = LGBMRegressor(
    n_estimators=700, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.5,
    random_state=42, verbosity=-1
)
lgb.fit(X_tr, ytr)
p_lgb = lgb.predict(X_va)
rmse_lgb = np.sqrt(mean_squared_error(yva, p_lgb))
print(f"LightGBM RMSE: {rmse_lgb:.4f}")


LightGBM RMSE: 0.4762


In [43]:
# CatBoost
cat = CatBoostRegressor(
    iterations=600, depth=6, learning_rate=0.05,
    subsample=0.8, reg_lambda=1.0, random_state=42, verbose=False
)
cat.fit(X_tr, ytr)
p_cat = cat.predict(X_va)
rmse_cat = np.sqrt(mean_squared_error(yva, p_cat))
print(f"CatBoost RMSE: {rmse_cat:.4f}")


CatBoost RMSE: 0.4745


In [44]:

# Random Forest
rf = RandomForestRegressor(
    n_estimators=300, max_depth=12, min_samples_split=5,
    random_state=42, n_jobs=-1
)
rf.fit(X_tr, ytr)
p_rf = rf.predict(X_va)
rmse_rf = np.sqrt(mean_squared_error(yva, p_rf))
print(f"Random Forest RMSE: {rmse_rf:.4f}")

Random Forest RMSE: 0.4784


In [45]:
# Extra Trees
et = ExtraTreesRegressor(
    n_estimators=300, max_depth=12, min_samples_split=5,
    random_state=42, n_jobs=-1
)
et.fit(X_tr, ytr)
p_et = et.predict(X_va)
rmse_et = np.sqrt(mean_squared_error(yva, p_et))
print(f"Extra Trees RMSE: {rmse_et:.4f}")

Extra Trees RMSE: 0.4795


In [46]:
# ENSEMBLE: Basit ortalama
p_ensemble_simple = (p_xgb1 + p_xgb2 + p_lgb + p_cat + p_rf + p_et) / 6
rmse_ensemble_simple = np.sqrt(mean_squared_error(yva, p_ensemble_simple))
print(f"Basit Ortalama RMSE: {rmse_ensemble_simple:.4f}")

#ENSEMBLE: Ağırlıklı ortalama (RMSE'ye göre)
rmse_scores = np.array([rmse_xgb1, rmse_xgb2, rmse_lgb, rmse_cat, rmse_rf, rmse_et])
weights = 1 / rmse_scores
weights = weights / weights.sum()  # normalize et

model_names = ["XGB1", "XGB2", "LGB", "CAT", "RF", "ET"]
for name, w in zip(model_names, weights):
    print(f"{name}: {w:.3f}")

p_ensemble_weighted = (p_xgb1 * weights[0] + p_xgb2 * weights[1] +
                      p_lgb * weights[2] + p_cat * weights[3] +
                      p_rf * weights[4] + p_et * weights[5])
rmse_ensemble_weighted = np.sqrt(mean_squared_error(yva, p_ensemble_weighted))
print(f"Ağırlıklı Ortalama RMSE: {rmse_ensemble_weighted:.4f}")

#ENSEMBLE: Stacking (Ridge ile)

stack_X = np.column_stack([p_xgb1, p_xgb2, p_lgb, p_cat, p_rf, p_et])
ridge = Ridge(alpha=1.0, random_state=42)
ridge.fit(stack_X, yva)
p_ensemble_stack = ridge.predict(stack_X)
rmse_ensemble_stack = np.sqrt(mean_squared_error(yva, p_ensemble_stack))
print(f"Stacking (Ridge) RMSE: {rmse_ensemble_stack:.4f}")

print(f"Stacking katsayıları: {ridge.coef_}")

# En iyi ensemble seç
best_rmse = min(rmse_ensemble_simple, rmse_ensemble_weighted, rmse_ensemble_stack)
if best_rmse == rmse_ensemble_simple:
    best_method = "simple"
    print(f"\nEn iyi method: Basit Ortalama (RMSE: {best_rmse:.4f})")
elif best_rmse == rmse_ensemble_weighted:
    best_method = "weighted"
    print(f"\nEn iyi method: Ağırlıklı Ortalama (RMSE: {best_rmse:.4f})")
else:
    best_method = "stacking"
    print(f"\nEn iyi method: Stacking (RMSE: {best_rmse:.4f})")

Basit Ortalama RMSE: 0.4749
XGB1: 0.166
XGB2: 0.167
LGB: 0.167
CAT: 0.167
RF: 0.166
ET: 0.166
Ağırlıklı Ortalama RMSE: 0.4749
Stacking (Ridge) RMSE: 0.4737
Stacking katsayıları: [-0.24879839  0.69575129  0.02535835  0.49452387 -0.00885122  0.03803709]

En iyi method: Stacking (RMSE: 0.4737)


In [47]:
# === ENSEMBLE MODEL İYİLEŞTİRMESİ ===
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

print("=== ENSEMBLE MODEL EĞİTİMİ BAŞLIYOR ===")

# Hedef transform (log1p)
ytr = np.log1p(y_tr.values)
yva = np.log1p(y_va.values)

#  XGBoost
xgb1 = XGBRegressor(
    n_estimators=600, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
    tree_method="hist", random_state=42, verbosity=0
)
xgb1.fit(X_tr, ytr)
p_xgb1 = xgb1.predict(X_va)
rmse_xgb1 = np.sqrt(mean_squared_error(yva, p_xgb1))
print(f"XGBoost-1 RMSE: {rmse_xgb1:.4f}")

# XGBoost (farklı parametreler)
xgb2 = XGBRegressor(
    n_estimators=800, max_depth=5, learning_rate=0.03,
    subsample=0.9, colsample_bytree=0.7, reg_lambda=2.0,
    tree_method="hist", random_state=123, verbosity=0
)
xgb2.fit(X_tr, ytr)
p_xgb2 = xgb2.predict(X_va)
rmse_xgb2 = np.sqrt(mean_squared_error(yva, p_xgb2))
print(f"XGBoost-2 RMSE: {rmse_xgb2:.4f}")

#LightGBM
lgb = LGBMRegressor(
    n_estimators=700, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.5,
    random_state=42, verbosity=-1
)
lgb.fit(X_tr, ytr)
p_lgb = lgb.predict(X_va)
rmse_lgb = np.sqrt(mean_squared_error(yva, p_lgb))
print(f"LightGBM RMSE: {rmse_lgb:.4f}")

#CatBoost
cat = CatBoostRegressor(
    iterations=600, depth=6, learning_rate=0.05,
    subsample=0.8, reg_lambda=1.0, random_state=42, verbose=False
)
cat.fit(X_tr, ytr)
p_cat = cat.predict(X_va)
rmse_cat = np.sqrt(mean_squared_error(yva, p_cat))
print(f"CatBoost RMSE: {rmse_cat:.4f}")

#Random Forest
rf = RandomForestRegressor(
    n_estimators=300, max_depth=12, min_samples_split=5,
    random_state=42, n_jobs=-1
)
rf.fit(X_tr, ytr)
p_rf = rf.predict(X_va)
rmse_rf = np.sqrt(mean_squared_error(yva, p_rf))
print(f"Random Forest RMSE: {rmse_rf:.4f}")

#Extra Trees
et = ExtraTreesRegressor(
    n_estimators=300, max_depth=12, min_samples_split=5,
    random_state=42, n_jobs=-1
)
et.fit(X_tr, ytr)
p_et = et.predict(X_va)
rmse_et = np.sqrt(mean_squared_error(yva, p_et))
print(f"Extra Trees RMSE: {rmse_et:.4f}")

#ENSEMBLE: Basit ortalama
p_ensemble_simple = (p_xgb1 + p_xgb2 + p_lgb + p_cat + p_rf + p_et) / 6
rmse_ensemble_simple = np.sqrt(mean_squared_error(yva, p_ensemble_simple))
print(f"Basit Ortalama RMSE: {rmse_ensemble_simple:.4f}")

# ENSEMBLE: Ağırlıklı ortalama (RMSE'ye göre)
rmse_scores = np.array([rmse_xgb1, rmse_xgb2, rmse_lgb, rmse_cat, rmse_rf, rmse_et])
weights = 1 / rmse_scores
weights = weights / weights.sum()  # normalize et

print("Model ağırlıkları:")
model_names = ["XGB1", "XGB2", "LGB", "CAT", "RF", "ET"]
for name, w in zip(model_names, weights):
    print(f"{name}: {w:.3f}")

p_ensemble_weighted = (p_xgb1 * weights[0] + p_xgb2 * weights[1] +
                      p_lgb * weights[2] + p_cat * weights[3] +
                      p_rf * weights[4] + p_et * weights[5])
rmse_ensemble_weighted = np.sqrt(mean_squared_error(yva, p_ensemble_weighted))
print(f"Ağırlıklı Ortalama RMSE: {rmse_ensemble_weighted:.4f}")

# ENSEMBLE: Stacking (Ridge ile)
stack_X = np.column_stack([p_xgb1, p_xgb2, p_lgb, p_cat, p_rf, p_et])
ridge = Ridge(alpha=1.0, random_state=42)
ridge.fit(stack_X, yva)
p_ensemble_stack = ridge.predict(stack_X)
rmse_ensemble_stack = np.sqrt(mean_squared_error(yva, p_ensemble_stack))
print(f"Stacking (Ridge) RMSE: {rmse_ensemble_stack:.4f}")

print(f"Stacking katsayıları: {ridge.coef_}")

# En iyi ensemble seç
best_rmse = min(rmse_ensemble_simple, rmse_ensemble_weighted, rmse_ensemble_stack)
if best_rmse == rmse_ensemble_simple:
    best_method = "simple"
    print(f"\nEn iyi method: Basit Ortalama (RMSE: {best_rmse:.4f})")
elif best_rmse == rmse_ensemble_weighted:
    best_method = "weighted"
    print(f"\nEn iyi method: Ağırlıklı Ortalama (RMSE: {best_rmse:.4f})")
else:
    best_method = "stacking"
    print(f"\nEn iyi method: Stacking (RMSE: {best_rmse:.4f})")

# FULL TRAIN: Tüm veri ile eğit
print("\n=== TÜM VERİ İLE EĞİTİM ===")
y_full = y_all.loc[train_feat_["user_session"]].values
y_full_log = np.log1p(y_full)

# X_full ve X_te hazırla
drop_cols = ["user_session", "tmax", "user_id"]
X_full = train_feat_.drop(columns=drop_cols)
X_te = test_feat.drop(columns=["user_session", "user_id"], errors="ignore")

# Sayısal hale getir
def make_numeric(X):
    bool_cols = X.select_dtypes(include=["bool"]).columns
    if len(bool_cols):
        X = X.astype({c: "uint8" for c in bool_cols})
    num_cols = X.select_dtypes(include=["number"]).columns
    X = X[num_cols].replace([np.inf, -np.inf], 0).fillna(0)
    return X

X_full = make_numeric(X_full)
X_te = make_numeric(X_te)
X_te = X_te.reindex(columns=X_full.columns, fill_value=0)

print(f"X_full shape: {X_full.shape}, X_te shape: {X_te.shape}")

# Modelleri tekrar eğit
xgb1_full = XGBRegressor(n_estimators=600, max_depth=6, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0, tree_method="hist", random_state=42, verbosity=0)
xgb2_full = XGBRegressor(n_estimators=800, max_depth=5, learning_rate=0.03, subsample=0.9, colsample_bytree=0.7, reg_lambda=2.0, tree_method="hist", random_state=123, verbosity=0)
lgb_full = LGBMRegressor(n_estimators=700, max_depth=6, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, reg_lambda=1.5, random_state=42, verbosity=-1)
cat_full = CatBoostRegressor(iterations=600, depth=6, learning_rate=0.05, subsample=0.8, reg_lambda=1.0, random_state=42, verbose=False)
rf_full = RandomForestRegressor(n_estimators=300, max_depth=12, min_samples_split=5, random_state=42, n_jobs=-1)
et_full = ExtraTreesRegressor(n_estimators=300, max_depth=12, min_samples_split=5, random_state=42, n_jobs=-1)

models_full = [xgb1_full, xgb2_full, lgb_full, cat_full, rf_full, et_full]
for i, model in enumerate(models_full):
    print(f"Model {i+1} eğitiliyor...")
    model.fit(X_full, y_full_log)

# Test tahminleri
print("Test tahminleri yapılıyor...")
test_preds = []
for model in models_full:
    pred = model.predict(X_te)
    test_preds.append(pred)

# Ensemble test tahmini
if best_method == "simple":
    p_te_ensemble = sum(test_preds) / len(test_preds)
elif best_method == "weighted":
    p_te_ensemble = sum(pred * w for pred, w in zip(test_preds, weights))
else:  # stacking
    stack_X_test = np.column_stack(test_preds)
    p_te_ensemble = ridge.predict(stack_X_test)

=== ENSEMBLE MODEL EĞİTİMİ BAŞLIYOR ===
XGBoost-1 RMSE: 0.4775
XGBoost-2 RMSE: 0.4746
LightGBM RMSE: 0.4762
CatBoost RMSE: 0.4745
Random Forest RMSE: 0.4784
Extra Trees RMSE: 0.4795
Basit Ortalama RMSE: 0.4749
Model ağırlıkları:
XGB1: 0.166
XGB2: 0.167
LGB: 0.167
CAT: 0.167
RF: 0.166
ET: 0.166
Ağırlıklı Ortalama RMSE: 0.4749
Stacking (Ridge) RMSE: 0.4737
Stacking katsayıları: [-0.24879839  0.69575129  0.02535835  0.49452387 -0.00885122  0.03803709]

En iyi method: Stacking (RMSE: 0.4737)

=== TÜM VERİ İLE EĞİTİM ===
X_full shape: (70736, 43), X_te shape: (30789, 43)
Model 1 eğitiliyor...
Model 2 eğitiliyor...
Model 3 eğitiliyor...
Model 4 eğitiliyor...
Model 5 eğitiliyor...
Model 6 eğitiliyor...
Test tahminleri yapılıyor...


In [48]:

# Log'dan geri dönüş
p_te_final = np.expm1(p_te_ensemble)

In [49]:
# === SUBMISSION HAZIRLA ===
print("\n=== SUBMISSION HAZIRLIĞI ===")
preds_ensemble = pd.DataFrame({
    "user_session": test_feat["user_session"].values,
    "session_value": p_te_final
})

# Sub sırasına hizala
submission = sub[["user_session"]].merge(
    preds_ensemble, on="user_session", how="left", validate="one_to_one"
)
submission["session_value"] = pd.to_numeric(submission["session_value"], errors="coerce").astype("float64")

# 210 overlap doldur
leak_csv = os.path.join(ARTI, "leak_map_session_value.csv")
if os.path.exists(leak_csv):
    leak_df = pd.read_csv(leak_csv)
    if {"user_session", "session_value"}.issubset(leak_df.columns):
        leak_map = pd.Series(leak_df["session_value"].values, index=leak_df["user_session"].astype(str).values)
    else:
        leak_df = pd.read_csv(leak_csv, index_col=0)
        leak_map = leak_df.iloc[:, 0]
        leak_map.index = leak_map.index.astype(str)

    leak_map = pd.to_numeric(leak_map, errors="coerce").astype("float64")
    hits = submission["user_session"].astype(str).isin(leak_map.index)
    mapped = submission.loc[hits, "user_session"].astype(str).map(leak_map).to_numpy(dtype="float64")
    submission.loc[hits, "session_value"] = mapped
    print("Overlap fill uygulanan satır:", int(hits.sum()))

# Temizle ve kaydet
submission["session_value"] = submission["session_value"].fillna(0.0)
submission["session_value"] = np.clip(submission["session_value"].values, 0.0, None).astype("float64")
submission["session_value"] = submission["session_value"].round(5)

out_csv = os.path.join(OUT, "submission_ensemble2.csv")
submission.to_csv(out_csv, index=False)
print(f"Ensemble submission yazıldı -> {out_csv} | satır: {submission.shape[0]}")
print(f"Kullanılan method: {best_method}")
print(f"Validation RMSE: {best_rmse:.4f}")


=== SUBMISSION HAZIRLIĞI ===
Overlap fill uygulanan satır: 210
Ensemble submission yazıldı -> /content/sample_data/datathon2025/outputs/submission_ensemble2.csv | satır: 30789
Kullanılan method: stacking
Validation RMSE: 0.4737


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

xgb = XGBRegressor(tree_method="hist", random_state=123, verbosity=0)

param_dist = {
    'n_estimators': [800, 1000, 1200],
    'max_depth': [4, 5, 6],
    'learning_rate': [0.01, 0.03, 0.05],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_lambda': [1.0, 2.0, 3.0],
    'reg_alpha': [0.0, 0.1, 0.5]
}

search = RandomizedSearchCV(
    xgb, param_distributions=param_dist,
    n_iter=20, scoring='neg_root_mean_squared_error', cv=3, verbose=1, n_jobs=-1
)

search.fit(X_tr, ytr)
print(search.best_params_)
print(-search.best_score_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
