In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
import pandas as pd, numpy as np, gc

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
orig = pd.read_csv('Exam_Score_Prediction.csv')

print("train_shape:",train.shape)
print("test.shape:",test.shape)
print("orig.shape:",orig.shape)

orig

# 今後のためにリストを作る
target = 'exam_score'
base = [col for col in train.columns if col not in ['id', target]]
categories = train.select_dtypes('object').columns.to_list()
nums = [col for col in base if col not in categories]
print(f'{len(base)} Base Features:{base}')

train_shape: (630000, 13)
test.shape: (270000, 12)
orig.shape: (20000, 13)
11 Base Features:['age', 'gender', 'course', 'study_hours', 'class_attendance', 'internet_access', 'sleep_hours', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']


In [3]:
ORIG = []

# 外部データの各カラムのユニークごとの平均値というカラムを追加する。
for col in base:
    # 一つの列に対してgroupbyで固有の値をまとめる。それらのtargetをそれぞれ平均する
    mean_map = orig.groupby(col)[target].mean() 
    new_mean_col_name = f"orig_mean_{col}"
    mean_map.name = new_mean_col_name
    
    train = train.merge(mean_map, on=col, how='left') # colをキーにして
    test = test.merge(mean_map, on=col, how='left')
    ORIG.append(new_mean_col_name)
    
# 外部データの各カラムのユニークごとのサイズというカラムを追加する。
    new_count_col_name = f"orig_count_{col}"
    count_map = orig.groupby(col).size().reset_index(name=new_count_col_name)
    
    train = train.merge(count_map, on=col, how='left')
    test = test.merge(count_map, on=col, how='left')
    ORIG.append(new_count_col_name)

print(f'{len(ORIG)} ORIG Features Created.')

22 ORIG Features Created.


In [4]:
# origには存在するが、trainには存在しないカテゴリを全体平均で埋める
for col in ORIG:
    if 'mean' in col:
        train[col] = train[col].fillna(orig[target].mean())
        test[col] = test[col].fillna(orig[target].mean())
    else:
        train[col] = train[col].fillna(0)
        test[col] = test[col].fillna(0)

In [5]:
# reduce_mem_usage はここに定義（そのままでOK）

features = base + ORIG

# まず X, y を作る（これが先）
X = train[features].copy()
y = train[target].copy()

# test側も、モデルに入れる列だけにそろえる（重要）
X_test = test[features].copy()

gc.collect()


23

In [6]:
from sklearn.model_selection import KFold
import numpy as np

# =========================
# Target Encoding (OOFでリーク防止) + 列選別条件つき
# =========================

def select_te_cols(
    df_train, df_test, cols,
    min_unique=3,              # unique <=2 は除外
    max_unique_abs=5000,       # 高カーディナリティ除外
    max_unique_ratio=0.30,     # unique/行数 が大きすぎる列は除外（ID化）
    max_missing=0.60,          # 欠損率が高い列は除外
    rare_thr=5,                # レア判定（出現回数<=5）
    max_rare_points_ratio=0.80,# レアカテゴリが占める割合が大きい列は除外
    max_unseen_ratio=0.20      # testにしかないカテゴリが多い列は除外
):
    n = len(df_train)
    chosen = []
    for col in cols:
        s_tr = df_train[col]
        s_te = df_test[col]

        # 欠損
        if s_tr.isna().mean() > max_missing:
            continue

        # unique
        nunq = s_tr.nunique(dropna=True)
        if nunq < min_unique:
            continue
        if nunq > max_unique_abs:
            continue
        if nunq / n > max_unique_ratio:
            continue

        # レアカテゴリ比率
        vc = s_tr.value_counts(dropna=True)
        rare_points_ratio = (s_tr.map(vc).fillna(0) <= rare_thr).mean()
        if rare_points_ratio > max_rare_points_ratio:
            continue

        # unseen比率（testにあるがtrainにないカテゴリの比率）
        tr_set = set(s_tr.dropna().unique())
        te_set = set(s_te.dropna().unique())
        if len(te_set) > 0:
            unseen_ratio = len(te_set - tr_set) / len(te_set)
            if unseen_ratio > max_unseen_ratio:
                continue

        chosen.append(col)
    return chosen


def add_target_encoding_oof(train_df, test_df, y, te_cols, n_splits=5, seed=42, smoothing=20):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for col in te_cols:
        te_name = f"te_{col}"
        train_te = np.zeros(len(train_df), dtype=np.float64)
        test_te_folds = []

        for tr_idx, va_idx in kf.split(train_df):
            X_tr = train_df.iloc[tr_idx]
            y_tr = y.iloc[tr_idx]
            X_va = train_df.iloc[va_idx]

            prior = y_tr.mean()

            stats = (
                pd.DataFrame({col: X_tr[col].values, "y": y_tr.values})
                .groupby(col)["y"]
                .agg(["mean", "count"])
            )

            smooth_map = (stats["count"] * stats["mean"] + smoothing * prior) / (stats["count"] + smoothing)

            train_te[va_idx] = X_va[col].map(smooth_map).fillna(prior).astype(np.float64).values
            test_te_folds.append(test_df[col].map(smooth_map).fillna(prior).astype(np.float64).values)

        train_df[te_name] = train_te
        test_df[te_name] = np.mean(np.vstack(test_te_folds), axis=0)

    return train_df, test_df


# ---- ここがあなたのコードの差し替え部分 ----

# TE対象を「object列のうち、条件を満たす列」に絞る
TE_COLS_RAW = categories
TE_COLS = select_te_cols(
    train, test, TE_COLS_RAW,
    min_unique=3,
    max_unique_abs=5000,
    max_unique_ratio=0.30,
    max_missing=0.60,
    rare_thr=5,
    max_rare_points_ratio=0.80,
    max_unseen_ratio=0.20
)
print(f"Target Encoding applied to {len(TE_COLS)} features.")
print("TE_COLS:", TE_COLS)

# OOF TE作成
train, test = add_target_encoding_oof(train, test, y, TE_COLS, n_splits=5, seed=42, smoothing=20)

TE_FEATURES = [f"te_{c}" for c in TE_COLS]
features = base + ORIG + TE_FEATURES

X = train[features].copy()
y = train[target].copy()
X_test = test[features].copy()

gc.collect()

print(X.columns)
print(X_test.columns)


Target Encoding applied to 6 features.
TE_COLS: ['gender', 'course', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']
Index(['age', 'gender', 'course', 'study_hours', 'class_attendance',
       'internet_access', 'sleep_hours', 'sleep_quality', 'study_method',
       'facility_rating', 'exam_difficulty', 'orig_mean_age', 'orig_count_age',
       'orig_mean_gender', 'orig_count_gender', 'orig_mean_course',
       'orig_count_course', 'orig_mean_study_hours', 'orig_count_study_hours',
       'orig_mean_class_attendance', 'orig_count_class_attendance',
       'orig_mean_internet_access', 'orig_count_internet_access',
       'orig_mean_sleep_hours', 'orig_count_sleep_hours',
       'orig_mean_sleep_quality', 'orig_count_sleep_quality',
       'orig_mean_study_method', 'orig_count_study_method',
       'orig_mean_facility_rating', 'orig_count_facility_rating',
       'orig_mean_exam_difficulty', 'orig_count_exam_difficulty', 'te_gender',
       'te_course', 'te_sleep_q

In [7]:
def reduce_mem_usage_safe(df):
    df = df.copy()
    for col in df.columns:
        if df[col].dtype == np.float64:
            df[col] = df[col].astype(np.float32)
        elif df[col].dtype == np.int64:
            df[col] = df[col].astype(np.int32)
    return df

In [8]:
# =========================
# one-hot（fold外で1回）
# =========================

# 線形モデル用
X_lin = pd.get_dummies(train[features], drop_first=False)
test_lin_X = pd.get_dummies(test[features], drop_first=False)
X_lin, test_lin_X = X_lin.align(test_lin_X, join="left", axis=1, fill_value=0)

# 残差XGB用（同じでOKだが分けておく）
X_xgb = pd.get_dummies(train[features], drop_first=False)
test_xgb = pd.get_dummies(test[features], drop_first=False)
X_xgb, test_xgb = X_xgb.align(test_xgb, join="left", axis=1, fill_value=0)

y = train[target]

print("X_lin:", X_lin.shape)
print("X_xgb:", X_xgb.shape)
print("y:", y.shape)


X_lin: (630000, 58)
X_xgb: (630000, 58)
y: (630000,)


In [10]:
# ============================================================
# 重み付き Pseudo Label 版（xgb.train / 古いxgboost対応）
# 線形モデル + 残差XGB
# - Model1: train真残差で学習 → val/test残差を予測（擬似ラベル）
# - Model2: train(真) + val/test(擬似) で学習（擬似には小さめ重み）
# - early stopping は train fold の真データから切った holdout のみで実施（leak回避）
# CV → test予測 → submission 作成まで全部
# ============================================================

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, Lasso, ElasticNet, HuberRegressor
import numpy as np
import xgboost as xgb
import pandas as pd

# =========================
# 前提:
# X_lin, X_xgb, test_lin_X, test_xgb, y, test が既に定義済み
# =========================

MODEL_NAME = "Ridge"  # Ridge / Lasso / ElasticNet / Huber
N_SPLITS = 5
SEED = 42

# 擬似ラベル重み（ここが主な調整ノブ）
W_PSEUDO_VAL  = 0.3
W_PSEUDO_TEST = 0.3

# early stopping 用 holdout（真データからのみ）
ES_FRAC = 0.10

def make_linear(name, seed):
    if name == "Ridge":
        return Ridge(alpha=1.0, random_state=seed)
    if name == "Lasso":
        return Lasso(alpha=1e-3, random_state=seed, max_iter=5000)
    if name == "ElasticNet":
        return ElasticNet(alpha=1e-3, l1_ratio=0.5, random_state=seed, max_iter=5000)
    if name == "Huber":
        return HuberRegressor(alpha=1e-4, epsilon=1.35, max_iter=1000)
    raise ValueError("invalid MODEL_NAME")

def dm(X, y=None, w=None):
    d = xgb.DMatrix(X, label=y) if y is not None else xgb.DMatrix(X)
    if w is not None:
        d.set_weight(w)
    return d

# xgb.train params
params = dict(
    objective="reg:squarederror",
    eta=0.02,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=SEED,
    tree_method="hist",
)

NUM_BOOST_ROUND = 20000
EARLY_STOP = 200

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

oof_m1 = np.zeros(len(X_lin))
oof_m2 = np.zeros(len(X_lin))
test_pred_folds_m2 = []

for fold, (tr_idx, va_idx) in enumerate(kf.split(X_lin), 1):
    # split
    X_tr_lin, X_va_lin = X_lin.iloc[tr_idx], X_lin.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    X_tr_xgb, X_va_xgb = X_xgb.iloc[tr_idx], X_xgb.iloc[va_idx]

    # 1) 線形
    lin = make_linear(MODEL_NAME, SEED)
    lin.fit(X_tr_lin, y_tr)

    tr_lin = lin.predict(X_tr_lin)
    va_lin = lin.predict(X_va_lin)
    te_lin = lin.predict(test_lin_X)

    # 2) 残差（真）
    y_tr_res = (y_tr.values - tr_lin)
    y_va_res_true = (y_va.values - va_lin)

    # -------------------------
    # Model1: 残差XGB（真の残差で学習）
    # -------------------------
    dtr1 = dm(X_tr_xgb, y_tr_res)
    dva1 = dm(X_va_xgb, y_va_res_true)
    dte  = dm(test_xgb)

    bst1 = xgb.train(
        params=params,
        dtrain=dtr1,
        num_boost_round=NUM_BOOST_ROUND,
        evals=[(dtr1, "train"), (dva1, "valid")],
        early_stopping_rounds=EARLY_STOP,
        verbose_eval=False,
    )

    va_res_m1 = bst1.predict(dva1, iteration_range=(0, bst1.best_iteration + 1))
    te_res_m1 = bst1.predict(dte,  iteration_range=(0, bst1.best_iteration + 1))

    va_pred_m1 = va_lin + va_res_m1
    oof_m1[va_idx] = va_pred_m1
    rmse1 = np.sqrt(mean_squared_error(y_va, va_pred_m1))

    # -------------------------
    # 擬似ラベル（残差）
    # -------------------------
    y_va_res_pseudo = va_res_m1
    y_te_res_pseudo = te_res_m1

    # -------------------------
    # Model2: 真(train) + 擬似(val/test) を重み付きで学習
    # early stopping は train真データから切った holdout のみ
    # -------------------------
    idx_all = np.arange(len(tr_idx))
    idx_fit, idx_es = train_test_split(
        idx_all, test_size=ES_FRAC, random_state=SEED + fold, shuffle=True
    )

    # fit 用（真 train の一部 + val擬似 + test擬似）
    X_fit_true = X_tr_xgb.iloc[idx_fit]
    y_fit_true = y_tr_res[idx_fit]
    w_fit_true = np.ones(len(idx_fit), dtype=np.float32)

    X_fit_all = pd.concat([X_fit_true, X_va_xgb, test_xgb], axis=0, ignore_index=True)
    y_fit_all = np.concatenate([y_fit_true, y_va_res_pseudo, y_te_res_pseudo], axis=0)

    w_fit_all = np.concatenate([
        w_fit_true,
        np.full(len(X_va_xgb),  W_PSEUDO_VAL,  dtype=np.float32),
        np.full(len(test_xgb),  W_PSEUDO_TEST, dtype=np.float32),
    ], axis=0)

    # early stop 用（真 train の holdout）
    X_es_true = X_tr_xgb.iloc[idx_es]
    y_es_true = y_tr_res[idx_es]
    w_es_true = np.ones(len(idx_es), dtype=np.float32)

    dfit2 = dm(X_fit_all, y_fit_all, w_fit_all)
    des2  = dm(X_es_true, y_es_true, w_es_true)

    bst2 = xgb.train(
        params=params,
        dtrain=dfit2,
        num_boost_round=NUM_BOOST_ROUND,
        evals=[(dfit2, "train"), (des2, "valid")],
        early_stopping_rounds=EARLY_STOP,
        verbose_eval=False,
    )

    va_res_m2 = bst2.predict(dva1, iteration_range=(0, bst2.best_iteration + 1))
    te_res_m2 = bst2.predict(dte,  iteration_range=(0, bst2.best_iteration + 1))

    va_pred_m2 = va_lin + va_res_m2
    oof_m2[va_idx] = va_pred_m2

    te_pred_m2 = te_lin + te_res_m2
    test_pred_folds_m2.append(te_pred_m2)

    rmse2 = np.sqrt(mean_squared_error(y_va, va_pred_m2))

    print(f"[{MODEL_NAME}] Fold {fold} | RMSE Model1: {rmse1:.5f} | RMSE Model2(pseudo,w): {rmse2:.5f}")

# OOF
rmse_oof_m1 = np.sqrt(mean_squared_error(y, oof_m1))
rmse_oof_m2 = np.sqrt(mean_squared_error(y, oof_m2))
print(f"[{MODEL_NAME}] OOF RMSE Model1: {rmse_oof_m1:.5f}")
print(f"[{MODEL_NAME}] OOF RMSE Model2(pseudo,w): {rmse_oof_m2:.5f}")

# test（fold平均）
test_pred = np.mean(np.vstack(test_pred_folds_m2), axis=0)
test_pred = np.clip(test_pred, y.min(), y.max())

# submission
sub = pd.DataFrame({"id": test["id"], "exam_score": test_pred})
out_path = f"submission_{MODEL_NAME}_pseudo_weighted.csv"
sub.to_csv(out_path, index=False)

print(sub.head())
print(f"saved: {out_path}")


[Ridge] Fold 1 | RMSE Model1: 8.67853 | RMSE Model2(pseudo,w): 8.68340
[Ridge] Fold 2 | RMSE Model1: 8.68095 | RMSE Model2(pseudo,w): 8.68183
[Ridge] Fold 3 | RMSE Model1: 8.67886 | RMSE Model2(pseudo,w): 8.68156
[Ridge] Fold 4 | RMSE Model1: 8.69098 | RMSE Model2(pseudo,w): 8.69702
[Ridge] Fold 5 | RMSE Model1: 8.70321 | RMSE Model2(pseudo,w): 8.71120
[Ridge] OOF RMSE Model1: 8.68651
[Ridge] OOF RMSE Model2(pseudo,w): 8.69101
       id  exam_score
0  630000   70.640470
1  630001   70.106281
2  630002   89.464373
3  630003   55.330491
4  630004   45.911533
saved: submission_Ridge_pseudo_weighted.csv
