In [77]:
import warnings
# Suppress FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

In [78]:
import pandas as pd, numpy as np, gc

# Read the train/test splits and the external ("orig") dataset; paths are
# relative to the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
orig = pd.read_csv('Exam_Score_Prediction.csv')

print("train_shape:",train.shape)
print("test.shape:",test.shape)
print("orig.shape:",orig.shape)

# NOTE(review): a bare expression is only rendered when it is the last
# statement of a cell, so this line displays nothing here.
orig

# Column-name lists that the later cells reuse.
target = 'exam_score'
# base: every column of train except the row id and the target.
base = [col for col in train.columns if col not in ['id', target]]
# categories: the object-dtype columns of train.
categories = train.select_dtypes('object').columns.to_list()
# nums: the base columns that are not in `categories`.
nums = [col for col in base if col not in categories]
print(f'{len(base)} Base Features:{base}')

train_shape: (630000, 13)
test.shape: (270000, 12)
orig.shape: (20000, 13)
11 Base Features:['age', 'gender', 'course', 'study_hours', 'class_attendance', 'internet_access', 'sleep_hours', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']


In [79]:
ORIG = []

# For every base feature, attach two statistics looked up from the external
# (orig) dataset, keyed by the feature's value:
#   orig_mean_<col>  : mean target of the orig rows that share this value
#   orig_count_<col> : number of orig rows that share this value
# Values that never occur in orig get NaN here.
#
# The lookup is done with Series.map instead of DataFrame.merge: assigning a
# column by name is idempotent, so re-running this cell overwrites the same
# columns rather than producing "_x"/"_y" suffixed duplicates, and the full
# train/test frames are not copied on every iteration.
for col in base:
    # Mean of the target for each distinct value of `col` in orig.
    new_mean_col_name = f"orig_mean_{col}"
    mean_map = orig.groupby(col)[target].mean()

    train[new_mean_col_name] = train[col].map(mean_map)
    test[new_mean_col_name] = test[col].map(mean_map)
    ORIG.append(new_mean_col_name)

    # Number of orig rows for each distinct value of `col`.
    new_count_col_name = f"orig_count_{col}"
    count_map = orig.groupby(col).size()

    train[new_count_col_name] = train[col].map(count_map)
    test[new_count_col_name] = test[col].map(count_map)
    ORIG.append(new_count_col_name)

print(f'{len(ORIG)} ORIG Features Created.')

22 ORIG Features Created.


In [80]:
# Values that occur in train/test but never in the external (orig) dataset
# have no looked-up statistic and are NaN. Fill the mean features with the
# overall orig target mean and the count features with 0.
orig_global_mean = orig[target].mean()  # loop-invariant, computed once

for col in ORIG:
    # Decide by the feature-name prefix; a plain substring test ('mean' in col)
    # would also match a count column whose base feature name contains "mean".
    fill_value = orig_global_mean if col.startswith("orig_mean_") else 0
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

In [81]:
# Model inputs: the raw base columns followed by the external-data statistics.
features = [*base, *ORIG]

# Training matrix and target, copied so later edits do not touch `train`.
X = train.loc[:, features].copy()
y = train.loc[:, target].copy()

# The test matrix is restricted to exactly the same columns as X.
X_test = test.loc[:, features].copy()

gc.collect()


22

In [82]:
from sklearn.model_selection import KFold
import numpy as np

# =========================
# Target Encoding (OOFでリーク防止) + 列選別条件つき
# =========================

def select_te_cols(
    df_train, df_test, cols,
    min_unique=3,
    max_unique_abs=5000,
    max_unique_ratio=0.30,
    max_missing=0.60,
    rare_thr=5,
    max_rare_points_ratio=0.80,
    max_unseen_ratio=0.20
):
    """Return the entries of ``cols`` that pass every target-encoding filter.

    A column is kept only if all of the following hold (checked in this order):

    * its share of missing values in ``df_train`` is at most ``max_missing``;
    * its number of distinct non-missing train values is at least
      ``min_unique``, at most ``max_unique_abs``, and at most
      ``max_unique_ratio`` times the number of train rows;
    * the share of train rows whose value occurs ``rare_thr`` times or fewer
      is at most ``max_rare_points_ratio`` (missing values get a count of 0);
    * the share of distinct non-missing test values that never occur in train
      is at most ``max_unseen_ratio`` (skipped when test has no values).

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that both contain every column named in ``cols``.
    cols : iterable of str
        Candidate column names.

    Returns
    -------
    list
        The accepted column names, in the order given by ``cols``.
    """
    n_rows = len(df_train)

    def _is_eligible(col):
        train_col = df_train[col]
        test_col = df_test[col]

        # Missing-value share.
        if train_col.isna().mean() > max_missing:
            return False

        # Cardinality: neither near-constant nor id-like.
        n_levels = train_col.nunique(dropna=True)
        if n_levels < min_unique or n_levels > max_unique_abs:
            return False
        if n_levels / n_rows > max_unique_ratio:
            return False

        # Share of rows that sit in rarely occurring levels.
        level_counts = train_col.value_counts(dropna=True)
        in_rare_level = train_col.map(level_counts).fillna(0) <= rare_thr
        if in_rare_level.mean() > max_rare_points_ratio:
            return False

        # Share of test levels that train has never seen.
        test_levels = set(test_col.dropna().unique())
        if test_levels:
            train_levels = set(train_col.dropna().unique())
            unseen_share = len(test_levels - train_levels) / len(test_levels)
            if unseen_share > max_unseen_ratio:
                return False

        return True

    return [col for col in cols if _is_eligible(col)]


def add_target_encoding_oof(train_df, test_df, y, te_cols, n_splits=5, seed=42, smoothing=20):
    """Add smoothed out-of-fold target-encoding columns named ``te_<col>``.

    For each column in ``te_cols`` and each K-fold split, the per-level target
    mean is estimated on the fold's training part and shrunk towards that
    part's overall mean (the prior)::

        (count * mean + smoothing * prior) / (count + smoothing)

    Rows of the fold's validation part receive the encoding from that map;
    levels absent from the map (including missing values) receive the prior.
    The test encoding is the average of the per-fold encodings.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing every column in ``te_cols``. Both are modified in
        place (the ``te_<col>`` columns are added) and also returned.
    y : pandas.Series
        Target, positionally aligned with ``train_df`` (accessed via ``.iloc``).
    te_cols : iterable of str
        Columns to encode.
    n_splits, seed : int
        Passed to ``KFold(n_splits, shuffle=True, random_state=seed)``.
    smoothing : float
        Weight of the prior in the smoothed mean.

    Returns
    -------
    tuple
        ``(train_df, test_df)``, the same objects that were passed in.

    Raises
    ------
    ValueError
        If ``y`` and ``train_df`` differ in length.
    """
    if len(y) != len(train_df):
        # Positional indexing would otherwise pair rows with the wrong targets.
        raise ValueError(
            f"y has {len(y)} rows but train_df has {len(train_df)} rows"
        )

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # With an integer random_state every call to split() yields the same
    # folds, so they are computed once and reused for all columns.
    fold_indices = list(kf.split(train_df))

    for col in te_cols:
        te_name = f"te_{col}"
        train_te = np.zeros(len(train_df), dtype=np.float64)
        test_te_folds = []

        for tr_idx, va_idx in fold_indices:
            X_tr = train_df.iloc[tr_idx]
            y_tr = y.iloc[tr_idx]
            X_va = train_df.iloc[va_idx]

            # Fallback value for levels not seen in this fold's training part.
            prior = y_tr.mean()

            stats = (
                pd.DataFrame({col: X_tr[col].values, "y": y_tr.values})
                .groupby(col)["y"]
                .agg(["mean", "count"])
            )

            # Shrink each level mean towards the prior; small levels move most.
            smooth_map = (stats["count"] * stats["mean"] + smoothing * prior) / (stats["count"] + smoothing)

            train_te[va_idx] = X_va[col].map(smooth_map).fillna(prior).astype(np.float64).values
            test_te_folds.append(test_df[col].map(smooth_map).fillna(prior).astype(np.float64).values)

        train_df[te_name] = train_te
        test_df[te_name] = np.mean(np.vstack(test_te_folds), axis=0)

    return train_df, test_df


# ---- ここがあなたのコードの差し替え部分 ----

# TE対象を「object列のうち、条件を満たす列」に絞る
# Candidates are the object-dtype columns; keep those that pass the filters.
TE_COLS_RAW = categories
TE_COLS = select_te_cols(
    df_train=train,
    df_test=test,
    cols=TE_COLS_RAW,
    min_unique=3,
    max_unique_abs=5000,
    max_unique_ratio=0.30,
    max_missing=0.60,
    rare_thr=5,
    max_rare_points_ratio=0.80,
    max_unseen_ratio=0.20,
)
print(f"Target Encoding applied to {len(TE_COLS)} features.")
print("TE_COLS:", TE_COLS)

# Build the out-of-fold encodings; `y` is the target Series created earlier.
train, test = add_target_encoding_oof(
    train_df=train,
    test_df=test,
    y=y,
    te_cols=TE_COLS,
    n_splits=5,
    seed=42,
    smoothing=20,
)

# Extend the feature list with the new te_<col> columns and rebuild the
# model matrices from the updated frames.
TE_FEATURES = [f"te_{name}" for name in TE_COLS]
features = [*base, *ORIG, *TE_FEATURES]

X = train.loc[:, features].copy()
y = train.loc[:, target].copy()
X_test = test.loc[:, features].copy()

gc.collect()

print(X.columns)
print(X_test.columns)


Target Encoding applied to 6 features.
TE_COLS: ['gender', 'course', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']
Index(['age', 'gender', 'course', 'study_hours', 'class_attendance',
       'internet_access', 'sleep_hours', 'sleep_quality', 'study_method',
       'facility_rating', 'exam_difficulty', 'orig_mean_age', 'orig_count_age',
       'orig_mean_gender', 'orig_count_gender', 'orig_mean_course',
       'orig_count_course', 'orig_mean_study_hours', 'orig_count_study_hours',
       'orig_mean_class_attendance', 'orig_count_class_attendance',
       'orig_mean_internet_access', 'orig_count_internet_access',
       'orig_mean_sleep_hours', 'orig_count_sleep_hours',
       'orig_mean_sleep_quality', 'orig_count_sleep_quality',
       'orig_mean_study_method', 'orig_count_study_method',
       'orig_mean_facility_rating', 'orig_count_facility_rating',
       'orig_mean_exam_difficulty', 'orig_count_exam_difficulty', 'te_gender',
       'te_course', 'te_sleep_q

In [83]:
def reduce_mem_usage_safe(df):
    """Return a copy of ``df`` with 64-bit numeric columns narrowed to 32 bits.

    * float64 columns are cast to float32.
    * int64 columns are cast to int32 only when every value fits in the int32
      range; otherwise the column is left as int64, because a plain cast
      would silently wrap out-of-range values.
    * All other columns are left unchanged.

    The input frame is not modified.
    """
    int32_range = np.iinfo(np.int32)
    out = df.copy()
    for col in out.columns:
        series = out[col]
        if series.dtype == np.float64:
            out[col] = series.astype(np.float32)
        elif series.dtype == np.int64:
            # An empty column has no values that could overflow.
            fits_int32 = series.empty or (
                series.min() >= int32_range.min and series.max() <= int32_range.max
            )
            if fits_int32:
                out[col] = series.astype(np.int32)
    return out

In [84]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor

# =========================
# 改良ポイント（過小予測の解消を優先）
# 1) 線形予測値（lin_pred）を残差モデルの特徴量に追加（超効く）
# 2) 高得点側に重みを付けて学習（過小予測を減らす）
# 3) 最終予測を y の範囲にクリップ（破綻防止）
# =========================

# カラム型
# Column roles: string/categorical columns versus everything else.
cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)
num_cols = [name for name in X.columns if name not in cat_cols]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions for the training rows and per-fold test predictions,
# kept separately for the linear stage and the residual stage.
oof_linear = np.zeros(len(X))
oof_resid = np.zeros(len(X))
test_linear_folds, test_resid_folds = [], []

# Stage 1: ridge regression on the numeric columns (passed through as-is)
# plus a one-hot encoding of the categorical columns.
linear_model = Pipeline([
    (
        "pre",
        ColumnTransformer(
            [
                ("num", "passthrough", num_cols),
                ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
            ],
            remainder="drop",
        ),
    ),
    ("ridge", Ridge(alpha=1.0, random_state=42)),
])

# Stage 2: gradient-boosted trees fitted on the residuals of stage 1.
xgb_params = {
    "n_estimators": 3000,          # kept moderate so experiments stay quick
    "learning_rate": 0.03,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 5,         # chosen to reduce overfitting and stabilise training
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": -1,
    "eval_metric": "rmse",
    "early_stopping_rounds": 100,
    "tree_method": "hist",         # CPU histogram method; GPU setup depends on the XGBoost version
}

def factorize_fit_transform(train_df, val_df, test_df, cat_cols):
    """Replace the columns in ``cat_cols`` with shared integer codes.

    For each column the train, validation and test values are stacked and
    factorized together with ``sort=True``, so one value gets the same code in
    all three frames. The codes are then cut back into the three frames by
    position.

    The inputs are not modified; copies are returned as
    ``(train_df, val_df, test_df)``.
    """
    frames = [train_df.copy(), val_df.copy(), test_df.copy()]
    # Positions at which the stacked code vector is cut into train / val / test.
    cut_points = np.cumsum([len(frames[0]), len(frames[1])])

    for name in cat_cols:
        stacked = pd.concat([frame[name] for frame in frames], axis=0)
        codes = pd.factorize(stacked, sort=True)[0]
        for frame, part in zip(frames, np.split(codes, cut_points)):
            frame[name] = part

    return tuple(frames)

def reduce_mem_usage_safe(df):
    """Return a copy of ``df`` with 64-bit numeric columns narrowed to 32 bits.

    * float64 columns are cast to float32.
    * int64 columns are cast to int32 only when every value fits in the int32
      range; otherwise the column is left as int64, because a plain cast
      would silently wrap out-of-range values.
    * All other columns are left unchanged.

    The input frame is not modified.

    NOTE(review): a helper with this name is also defined in an earlier cell;
    this later definition is the one in effect once the cell has run. Keep the
    two in sync or remove one of them.
    """
    int32_range = np.iinfo(np.int32)
    out = df.copy()
    for col in out.columns:
        series = out[col]
        if series.dtype == np.float64:
            out[col] = series.astype(np.float32)
        elif series.dtype == np.int64:
            # An empty column has no values that could overflow.
            fits_int32 = series.empty or (
                series.min() >= int32_range.min and series.max() <= int32_range.max
            )
            if fits_int32:
                out[col] = series.astype(np.int32)
    return out

# Observed target range; the final predictions are clipped to it.
y_min, y_max = y.min(), y.max()

# Bounds for the "distance to the edge" features. Both are loop-invariant,
# so they are defined once instead of once per fold.
HIGH = 100           # ceiling used for dist_to_max
LOW = float(y_min)   # smallest observed target, used for dist_to_min

for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y), 1):
    X_tr, y_tr = X.iloc[tr_idx].copy(), y.iloc[tr_idx].copy()
    X_va, y_va = X.iloc[va_idx].copy(), y.iloc[va_idx].copy()

    # ---- (A) Linear stage ----
    linear_model.fit(X_tr, y_tr)

    # NOTE(review): pred_lin_tr is an in-sample prediction (the model was
    # fitted on X_tr), while pred_lin_va / pred_lin_te are out-of-sample.
    pred_lin_tr = linear_model.predict(X_tr)
    pred_lin_va = linear_model.predict(X_va)
    pred_lin_te = linear_model.predict(X_test)

    oof_linear[va_idx] = pred_lin_va
    test_linear_folds.append(pred_lin_te)

    # ---- (B) Residuals of the linear stage: the target of stage 2 ----
    resid_tr = y_tr - pred_lin_tr
    resid_va_true = y_va - pred_lin_va

    # ---- (C) Inputs of the residual model ----
    # Categorical columns become integer codes shared by train/val/test.
    X_tr2, X_va2, X_te2 = factorize_fit_transform(X_tr, X_va, X_test, cat_cols)

    # Distance of the linear prediction from the ceiling, floored at 0.
    X_tr2["dist_to_max"] = (HIGH - pred_lin_tr).clip(0).astype(np.float32)
    X_va2["dist_to_max"] = (HIGH - pred_lin_va).clip(0).astype(np.float32)
    X_te2["dist_to_max"] = (HIGH - pred_lin_te).clip(0).astype(np.float32)

    # Distance of the linear prediction from the lowest observed score, floored at 0.
    X_tr2["dist_to_min"] = np.clip(pred_lin_tr - LOW, 0, None).astype(np.float32)
    X_va2["dist_to_min"] = np.clip(pred_lin_va - LOW, 0, None).astype(np.float32)
    X_te2["dist_to_min"] = np.clip(pred_lin_te - LOW, 0, None).astype(np.float32)

    # The linear prediction itself is also given to the residual model.
    X_tr2["lin_pred"] = pred_lin_tr.astype(np.float32)
    X_va2["lin_pred"] = pred_lin_va.astype(np.float32)
    X_te2["lin_pred"] = pred_lin_te.astype(np.float32)

    # Narrow 64-bit columns to 32 bits to save memory.
    X_tr2 = reduce_mem_usage_safe(X_tr2)
    X_va2 = reduce_mem_usage_safe(X_va2)
    X_te2 = reduce_mem_usage_safe(X_te2)

    # Mild sample weights that grow linearly with the score (1 + y / max(y)),
    # meant to reduce under-prediction of high scores. Only the training rows
    # are weighted; the eval set below is scored with plain RMSE.
    w_tr = (1.0 + (y_tr / y_tr.max())).astype(np.float32)

    resid_model = XGBRegressor(**xgb_params)
    resid_model.fit(
        X_tr2, resid_tr,
        sample_weight=w_tr,
        eval_set=[(X_va2, resid_va_true)],
        verbose=200
    )

    pred_resid_va = resid_model.predict(X_va2)
    pred_resid_te = resid_model.predict(X_te2)

    oof_resid[va_idx] = pred_resid_va
    test_resid_folds.append(pred_resid_te)

    # ---- (D) Combine both stages and clip to the observed target range ----
    pred_final_va = pred_lin_va + pred_resid_va
    pred_final_va = np.clip(pred_final_va, y_min, y_max)

    rmse = np.sqrt(mean_squared_error(y_va, pred_final_va))
    print(f"Fold {fold} RMSE: {rmse:.5f}")

# Overall out-of-fold score of the combined model.
oof_final = oof_linear + oof_resid
oof_final = np.clip(oof_final, y_min, y_max)

print("-" * 30)
print(f"OOF RMSE: {np.sqrt(mean_squared_error(y, oof_final)):.5f}")

# Test prediction: average each stage over the folds, add, then clip.
test_linear = np.mean(np.vstack(test_linear_folds), axis=0)
test_resid  = np.mean(np.vstack(test_resid_folds), axis=0)
test_pred   = test_linear + test_resid
test_pred   = np.clip(test_pred, y_min, y_max)

# submission（必要なら）
# sub = pd.DataFrame({"id": test["id"], "exam_score": test_pred})
# sub.to_csv("submission.csv", index=False)


[0]	validation_0-rmse:8.85098
[200]	validation_0-rmse:8.80050
[400]	validation_0-rmse:8.77889
[600]	validation_0-rmse:8.76239
[800]	validation_0-rmse:8.74938
[1000]	validation_0-rmse:8.73911
[1200]	validation_0-rmse:8.73109
[1400]	validation_0-rmse:8.72459
[1600]	validation_0-rmse:8.71889
[1800]	validation_0-rmse:8.71331
[2000]	validation_0-rmse:8.70866
[2200]	validation_0-rmse:8.70480
[2400]	validation_0-rmse:8.70153
[2600]	validation_0-rmse:8.69874
[2800]	validation_0-rmse:8.69576
[2999]	validation_0-rmse:8.69394
Fold 1 RMSE: 8.69363
[0]	validation_0-rmse:8.85333
[200]	validation_0-rmse:8.80530
[400]	validation_0-rmse:8.78453
[600]	validation_0-rmse:8.76797
[800]	validation_0-rmse:8.75463
[1000]	validation_0-rmse:8.74425
[1200]	validation_0-rmse:8.73550
[1400]	validation_0-rmse:8.72817
[1600]	validation_0-rmse:8.72140
[1800]	validation_0-rmse:8.71598
[2000]	validation_0-rmse:8.71163
[2200]	validation_0-rmse:8.70799
[2400]	validation_0-rmse:8.70445
[2600]	validation_0-rmse:8.70129
[28

In [85]:
# Build the submission file; the id column name is detected automatically.
pred_col = target  # 'exam_score'

# Use the first of "id" / "Id" that exists in test.
id_col = next((name for name in ("id", "Id") if name in test.columns), None)
if id_col is None:
    raise ValueError("testに id / Id 列が見つかりません。id列名を手動で指定してください。")

# test_pred is the final prediction (linear stage + residual stage).
sub = pd.DataFrame({id_col: test[id_col].values, pred_col: test_pred})

sub.to_csv("submission.csv", index=False)
sub.head()


Unnamed: 0,id,exam_score
0,630000,71.368789
1,630001,69.648101
2,630002,89.487619
3,630003,55.742865
4,630004,46.082344


In [86]:
# =========================
# 前提：
# - train, test, X, y, X_test がある
# - oof_linear がある（線形モデルのOOF予測）
#   ※もし最終OOF(oof_final)で見たいなら、oof_linear を oof_final に置換
# - ORIG: ORIG特徴量名リスト（例: ["orig_mean_x", "orig_count_x", ...]）
# - base: 元の特徴量名リスト
# =========================

import numpy as np
import pandas as pd

# 0) Collect the topN training rows with the largest absolute residual of the
#    linear model's out-of-fold prediction.
resid = y - pd.Series(oof_linear, index=y.index)
resid_abs = resid.abs()

topN = 100
top_idx = resid_abs.sort_values(ascending=False).index[:topN]

# X holds the model features; add the target, the prediction and both
# residual columns for the selected rows.
top = X.loc[top_idx].assign(
    y=y.loc[top_idx],
    pred=pd.Series(oof_linear, index=y.index).loc[top_idx],
    resid=resid.loc[top_idx],
    abs_resid=resid_abs.loc[top_idx],
)

print("Top residual samples shape:", top.shape)
print(top[["y", "pred", "resid", "abs_resid"]].head())


# 1) (1) Are the worst rows concentrated in particular categories?
# Recomputes the list of object/category columns of X.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

print("\n" + "="*80)
print("① カテゴリ偏りチェック（Top100の上位頻出カテゴリ）")
print("="*80)

# Five most frequent levels of every categorical column within `top`.
for c in cat_cols:
    print(f"\n[{c}] Top5")
    print(top[c].value_counts(dropna=False).head(5))

# Extra: difference between the level shares in `top` and in all of X
# (share in top minus share overall).
print("\n" + "="*80)
print("①-追加: Top100と全体の比率差（上位カテゴリ）")
print("="*80)

for c in cat_cols:
    top_ratio = top[c].value_counts(normalize=True, dropna=False)
    all_ratio = X[c].value_counts(normalize=True, dropna=False)
    # The subtraction aligns on the level labels; a level that is missing from
    # `top` yields NaN rather than a negative share.
    diff = (top_ratio - all_ratio).sort_values(ascending=False).head(5)
    print(f"\n[{c}] Top100比率 - 全体比率 (Top5)")
    print(diff)


# 2) (2) Are the errors one-sided? (sign balance of the residuals)
print("\n" + "="*80)
print("② 残差の符号偏り（過小/過大）")
print("="*80)

print("Top100 resid describe:")
print(top["resid"].describe())

pos = (top["resid"] > 0).sum()
neg = (top["resid"] < 0).sum()
zero = (top["resid"] == 0).sum()
# Divide by the number of rows actually selected: `top` holds fewer than topN
# rows when the data has fewer rows than topN. The max() avoids a division by
# zero for an empty selection.
n_top = max(len(top), 1)
print(f"\nTop100 resid sign count:  +:{pos}, -:{neg}, 0:{zero}")
print(f"Top100 resid sign ratio:  +:{pos/n_top:.2%}, -:{neg/n_top:.2%}, 0:{zero/n_top:.2%}")

# Extra: the ten strongest under-predictions (largest positive residual) ...
print("\n過小評価（residが大きい）Top10")
print(top.sort_values("resid", ascending=False)[["y","pred","resid","abs_resid"]].head(10))

# ... and the ten strongest over-predictions (most negative residual).
print("\n過大評価（residが小さい）Top10")
print(top.sort_values("resid", ascending=True)[["y","pred","resid","abs_resid"]].head(10))


# 3) (3) Are the ORIG features extreme for the worst rows?
print("\n" + "="*80)
print("③ ORIG特徴量の極端さチェック")
print("="*80)

# Do not fail when ORIG was never defined in this session.
orig_cols_in_X = [c for c in (ORIG if "ORIG" in globals() else []) if c in X.columns]

if len(orig_cols_in_X) == 0:
    print("ORIG特徴量が X に見つかりません（orig_cols_in_X が空）")
else:
    # Summary statistics of the ORIG columns for the selected rows and for all rows.
    top_desc = top[orig_cols_in_X].describe().T
    all_desc = X[orig_cols_in_X].describe().T

    # Mean over the selected rows minus mean over all rows, largest first.
    diff_mean = (top_desc["mean"] - all_desc["mean"]).sort_values(ascending=False)

    print("\nTop100平均 - 全体平均（差が大きい順 Top20）")
    print(diff_mean.head(20))

    # Look for cases where the count is small although the mean is extreme.
    mean_cols = [c for c in orig_cols_in_X if "orig_mean_" in c]
    cnt_cols  = [c for c in orig_cols_in_X if "orig_count_" in c]

    if len(mean_cols) > 0 and len(cnt_cols) > 0:
        # Pair every mean column with the count column of the same base feature.
        pairs = []
        for m in mean_cols:
            base_name = m.replace("orig_mean_", "")
            cnt_name = f"orig_count_{base_name}"
            if cnt_name in X.columns:
                pairs.append((m, cnt_name))

        if len(pairs) == 0:
            print("\n(mean,count)の対応ペアが見つかりませんでした。")
        else:
            global_mean = y.mean()
            rows = []
            for m, cnt in pairs:
                tmp = top[[m, cnt]].copy()
                # Every pair has different column names. Rename them to a
                # common schema so the concatenated table has one value
                # column and one count column instead of a sparse, NaN-filled
                # column per pair.
                tmp.columns = ["mean_value", "count_value"]
                tmp["abs_mean_dev"] = (tmp["mean_value"] - global_mean).abs()
                tmp["m"] = m
                tmp["cnt"] = cnt
                rows.append(tmp[["m", "mean_value", "cnt", "count_value", "abs_mean_dev"]])

            chk = pd.concat(rows, axis=0)
            # Largest deviation first; among equal deviations the smaller
            # count (the less reliable statistic) comes first.
            chk = chk.sort_values(["abs_mean_dev", "count_value"], ascending=[False, True])

            print("\ncountが小さいのに mean が極端そう（候補 Top20）")
            print(chk.head(20))
    else:
        print("\nmean_cols または cnt_cols が見つかりませんでした。")


# 4) (4) Are the numeric features of the worst rows off-scale?
print("\n" + "="*80)
print("④ 数値特徴量のスケール外れチェック（Top100 vs 全体）")
print("="*80)

# Every column of X that is not in cat_cols is treated as numeric.
num_cols = [c for c in X.columns if c not in cat_cols]

# Columns whose mean differs between the selected rows and all rows.
if len(num_cols) == 0:
    print("数値列が見つかりません。")
else:
    top_mu = top[num_cols].mean(numeric_only=True)
    all_mu = X[num_cols].mean(numeric_only=True)
    # A standard deviation of 0 is replaced by NaN so the division below
    # yields NaN instead of inf.
    # NOTE(review): top_sd is computed but not used anywhere in this block.
    top_sd = top[num_cols].std(numeric_only=True).replace(0, np.nan)
    all_sd = X[num_cols].std(numeric_only=True).replace(0, np.nan)

    # z-score-like shift: (mean of selected rows - overall mean) / overall std.
    z_shift = ((top_mu - all_mu) / all_sd).abs().sort_values(ascending=False)

    print("\nTop100平均との差が大きい数値列（| (TopMean - AllMean)/AllStd | 上位20）")
    print(z_shift.head(20))

    # Minimum and maximum per column, for the selected rows and for all rows.
    top_min = top[num_cols].min(numeric_only=True)
    top_max = top[num_cols].max(numeric_only=True)
    all_min = X[num_cols].min(numeric_only=True)
    all_max = X[num_cols].max(numeric_only=True)

    extreme = pd.DataFrame({
        "top_min": top_min, "all_min": all_min,
        "top_max": top_max, "all_max": all_max,
    })

    # Simple gap measure: absolute distance between the extremes of the
    # selected rows and the extremes of the full data.
    extreme["min_gap"] = (extreme["top_min"] - extreme["all_min"]).abs()
    extreme["max_gap"] = (extreme["top_max"] - extreme["all_max"]).abs()

    print("\nTop100が全体レンジの端っこに寄ってそうな列（max_gap上位20）")
    print(extreme.sort_values("max_gap", ascending=False).head(20))


Top residual samples shape: (100, 43)
            y       pred      resid  abs_resid
553891  100.0  52.207457  47.792543  47.792543
78392   100.0  54.239219  45.760781  45.760781
428160   72.6  29.026905  43.573095  43.573095
526766   93.6  50.835317  42.764683  42.764683
613957   92.9  50.293985  42.606015  42.606015

① カテゴリ偏りチェック（Top100の上位頻出カテゴリ）

[gender] Top5
gender
male      37
other     35
female    28
Name: count, dtype: int64

[course] Top5
course
b.tech    22
b.com     20
bca       18
bba       14
b.sc      12
Name: count, dtype: int64

[internet_access] Top5
internet_access
yes    96
no      4
Name: count, dtype: int64

[sleep_quality] Top5
sleep_quality
poor       35
average    33
good       32
Name: count, dtype: int64

[study_method] Top5
study_method
coaching         25
online videos    23
self-study       23
mixed            19
group study      10
Name: count, dtype: int64

[facility_rating] Top5
facility_rating
high      43
medium    34
low       23
Name: count, dtype: 