In [33]:
import warnings
# Hide FutureWarning messages for the rest of the session so cell outputs stay readable.
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
import pandas as pd, numpy as np, gc

# Read the train/test splits and the additional `orig` dataset from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
orig = pd.read_csv('Exam_Score_Prediction.csv')

print("train_shape:",train.shape)
print("test.shape:",test.shape)
print("orig.shape:",orig.shape)

# NOTE(review): a bare expression is only rendered by Jupyter when it is the last
# statement of its cell — confirm the cell boundary if this table is meant to be shown.
orig

# Column lists reused by the later cells.
target = 'exam_score'
# Every train column except the row id and the target.
base = [col for col in train.columns if col not in ['id', target]]
# Object-dtype columns, taken over all of `train` (not restricted to `base`).
categories = train.select_dtypes('object').columns.to_list()
# Base columns that are not object-dtype.
nums = [col for col in base if col not in categories]
print(f'{len(base)} Base Features:{base}')

train_shape: (630000, 13)
test.shape: (270000, 12)
orig.shape: (20000, 13)


Unnamed: 0,student_id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,1,17,male,diploma,2.78,92.9,yes,7.4,poor,coaching,low,hard,58.9
1,2,23,other,bca,3.37,64.8,yes,4.6,average,online videos,medium,moderate,54.8
2,3,22,male,b.sc,7.88,76.8,yes,8.5,poor,coaching,high,moderate,90.3
3,4,20,other,diploma,0.67,48.4,yes,5.8,average,online videos,low,moderate,29.7
4,5,20,female,diploma,0.89,71.6,yes,9.8,poor,coaching,low,moderate,43.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19997,18,other,bba,6.50,71.3,yes,5.0,good,self-study,low,easy,86.5
19996,19998,18,male,b.com,3.71,41.6,no,5.9,average,coaching,medium,moderate,60.9
19997,19999,19,other,diploma,7.88,68.2,yes,4.6,poor,group study,low,easy,64.5
19998,20000,19,male,bba,4.60,76.3,no,6.1,good,self-study,medium,moderate,79.0


In [38]:
ORIG = []

# For every base column, attach two statistics computed on the external `orig`
# dataset, keyed by the column's value:
#   orig_mean_<col>  : mean target among `orig` rows sharing that value
#   orig_count_<col> : number of `orig` rows sharing that value
#
# The lookup uses Series.map instead of DataFrame.merge so the cell is idempotent:
# re-running it overwrites the same columns rather than producing `_x` / `_y`
# suffixed duplicates. Values with no match in `orig` are left as NaN here.
for col in base:
    new_mean_col_name = f"orig_mean_{col}"
    new_count_col_name = f"orig_count_{col}"

    # Both lookups are Series indexed by the distinct values of `col` in `orig`.
    mean_map = orig.groupby(col)[target].mean()
    count_map = orig.groupby(col).size()

    # assign() appends the columns in the listed order (mean, then count) on first
    # run and replaces them in place on later runs; row order is untouched.
    train = train.assign(**{
        new_mean_col_name: train[col].map(mean_map),
        new_count_col_name: train[col].map(count_map),
    })
    test = test.assign(**{
        new_mean_col_name: test[col].map(mean_map),
        new_count_col_name: test[col].map(count_map),
    })

    ORIG.extend([new_mean_col_name, new_count_col_name])

print(f'{len(ORIG)} ORIG Features Created.')

22 ORIG Features Created.


In [39]:
# Values seen in train/test but never in `orig` have no statistic (NaN) in the ORIG
# columns. Mean-type columns fall back to the overall target mean of `orig`;
# every other ORIG column (the counts) falls back to 0.
orig_target_mean = orig[target].mean()
for col in ORIG:
    fill_value = orig_target_mean if 'mean' in col else 0
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

In [40]:
# NOTE(review): an earlier note said `reduce_mem_usage` is defined here, but no
# definition is present in this cell.

# Model inputs at this stage: the raw base columns plus the orig-derived statistics.
features = base + ORIG

# Take the same feature columns from train and test so both matrices line up,
# then pull the target out of train.
X, X_test = (frame[features].copy() for frame in (train, test))
y = train[target].copy()

gc.collect()


0

In [48]:
from sklearn.model_selection import KFold
import numpy as np

# =========================
# Target Encoding (OOFでリーク防止) + 列選別条件つき
# =========================

def select_te_cols(
    df_train, df_test, cols,
    min_unique=3,
    max_unique_abs=5000,
    max_unique_ratio=0.30,
    max_missing=0.60,
    rare_thr=5,
    max_rare_points_ratio=0.80,
    max_unseen_ratio=0.20
):
    """Filter ``cols`` down to the columns considered suitable for target encoding.

    A column is kept only if it passes every check below, evaluated in this order:

    1. Its missing-value rate in ``df_train`` is at most ``max_missing``.
    2. Its number of distinct non-null train values is at least ``min_unique``,
       at most ``max_unique_abs``, and at most ``max_unique_ratio`` of the number
       of train rows (filters out id-like columns).
    3. The share of train rows whose value occurs ``rare_thr`` times or fewer is at
       most ``max_rare_points_ratio``. Rows whose value is missing are given an
       occurrence count of 0 for this check.
    4. Among the distinct non-null values of ``df_test``, the share that never
       appears in ``df_train`` is at most ``max_unseen_ratio`` (skipped when the
       test column has no non-null value).

    Args:
        df_train: Training frame containing every column in ``cols``.
        df_test: Test frame containing every column in ``cols``.
        cols: Candidate column names.
        min_unique, max_unique_abs, max_unique_ratio, max_missing, rare_thr,
        max_rare_points_ratio, max_unseen_ratio: Thresholds described above.

    Returns:
        list: The accepted column names, in the order they appear in ``cols``.
    """
    n_rows = len(df_train)

    def _is_eligible(name):
        train_col = df_train[name]
        test_col = df_test[name]

        # 1) too many missing values
        if train_col.isna().mean() > max_missing:
            return False

        # 2) cardinality bounds (checked left to right, so the ratio is only
        #    computed once the absolute bounds have passed)
        n_levels = train_col.nunique(dropna=True)
        if (
            n_levels < min_unique
            or n_levels > max_unique_abs
            or n_levels / n_rows > max_unique_ratio
        ):
            return False

        # 3) share of rows that sit in rarely-occurring values
        level_counts = train_col.value_counts(dropna=True)
        row_counts = train_col.map(level_counts).fillna(0)
        if (row_counts <= rare_thr).mean() > max_rare_points_ratio:
            return False

        # 4) share of test values never seen in train
        train_levels = set(train_col.dropna().unique())
        test_levels = set(test_col.dropna().unique())
        if test_levels:
            unseen_share = len(test_levels - train_levels) / len(test_levels)
            if unseen_share > max_unseen_ratio:
                return False

        return True

    return [name for name in cols if _is_eligible(name)]


def add_target_encoding_oof(train_df, test_df, y, te_cols, n_splits=5, seed=42, smoothing=20):
    """Add smoothed, out-of-fold target-encoding columns named ``te_<col>``.

    For each column in ``te_cols`` and each KFold split, per-value statistics are
    computed on the fitting part only and blended with that part's target mean
    (the prior):

        encoding = (count * mean + smoothing * prior) / (count + smoothing)

    Rows of the held-out part receive the encoding learned without them; values
    that have no encoding receive the prior. The test column is encoded once per
    fold and the per-fold encodings are averaged.

    Both input frames are modified in place (the new columns are assigned onto
    them) and the same objects are returned.

    Args:
        train_df: Training frame containing the columns in ``te_cols``.
        test_df: Test frame containing the columns in ``te_cols``.
        y: Target Series, positionally aligned with ``train_df``.
        te_cols: Names of the columns to encode.
        n_splits: Number of KFold splits.
        seed: ``random_state`` handed to the shuffled KFold.
        smoothing: Weight of the prior in the blend.

    Returns:
        tuple: ``(train_df, test_df)`` with the ``te_<col>`` columns added.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for col in te_cols:
        oof_values = np.zeros(len(train_df), dtype=np.float64)
        per_fold_test = []

        for fit_idx, hold_idx in splitter.split(train_df):
            fit_target = y.iloc[fit_idx]
            global_mean = fit_target.mean()

            # Per-value mean and count of the target on the fitting rows only.
            fit_frame = pd.DataFrame(
                {col: train_df[col].iloc[fit_idx].values, "y": fit_target.values}
            )
            stats = fit_frame.groupby(col)["y"].agg(["mean", "count"])
            level_n, level_mean = stats["count"], stats["mean"]
            encoding = (level_n * level_mean + smoothing * global_mean) / (level_n + smoothing)

            # Encode the held-out rows and the whole test column with this fold's
            # mapping; values without an encoding fall back to the fold prior.
            hold_encoded, test_encoded = (
                series.map(encoding).fillna(global_mean).astype(np.float64).values
                for series in (train_df[col].iloc[hold_idx], test_df[col])
            )
            oof_values[hold_idx] = hold_encoded
            per_fold_test.append(test_encoded)

        encoded_name = f"te_{col}"
        train_df[encoded_name] = oof_values
        test_df[encoded_name] = np.mean(np.vstack(per_fold_test), axis=0)

    return train_df, test_df


# ---- Target-encoding step ----

# Candidates are the object-dtype columns; select_te_cols keeps only those that
# pass its cardinality / missingness / rarity / unseen-value thresholds.
TE_COLS_RAW = categories
te_filter_kwargs = {
    "min_unique": 3,
    "max_unique_abs": 5000,
    "max_unique_ratio": 0.30,
    "max_missing": 0.60,
    "rare_thr": 5,
    "max_rare_points_ratio": 0.80,
    "max_unseen_ratio": 0.20,
}
TE_COLS = select_te_cols(train, test, TE_COLS_RAW, **te_filter_kwargs)
print(f"Target Encoding applied to {len(TE_COLS)} features.")
print("TE_COLS:", TE_COLS)

# Out-of-fold target encoding; adds one `te_<col>` column per selected column.
train, test = add_target_encoding_oof(train, test, y, TE_COLS, n_splits=5, seed=42, smoothing=20)

TE_FEATURES = [f"te_{name}" for name in TE_COLS]
features = base + ORIG + TE_FEATURES

# Rebuild the model matrices so they include the new encoded columns.
X, X_test = train[features].copy(), test[features].copy()
y = train[target].copy()

gc.collect()

print(X.columns)
print(X_test.columns)


Target Encoding applied to 6 features.
TE_COLS: ['gender', 'course', 'sleep_quality', 'study_method', 'facility_rating', 'exam_difficulty']
Index(['age', 'gender', 'course', 'study_hours', 'class_attendance',
       'internet_access', 'sleep_hours', 'sleep_quality', 'study_method',
       'facility_rating', 'exam_difficulty', 'orig_mean_age', 'orig_count_age',
       'orig_mean_gender', 'orig_count_gender', 'orig_mean_course',
       'orig_count_course', 'orig_mean_study_hours', 'orig_count_study_hours',
       'orig_mean_class_attendance', 'orig_count_class_attendance',
       'orig_mean_internet_access', 'orig_count_internet_access',
       'orig_mean_sleep_hours', 'orig_count_sleep_hours',
       'orig_mean_sleep_quality', 'orig_count_sleep_quality',
       'orig_mean_study_method', 'orig_count_study_method',
       'orig_mean_facility_rating', 'orig_count_facility_rating',
       'orig_mean_exam_difficulty', 'orig_count_exam_difficulty', 'te_gender',
       'te_course', 'te_sleep_q

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# 非線形モデル例（どれか1つでOK）
from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
# from sklearn.neural_network import MLPRegressor


# =========================
# 0) Prerequisites: X, y and X_test already exist
#   X: training features (DataFrame)
#   y: target (Series)
#   X_test: test features (DataFrame)
# =========================

# Column typing: object/category dtypes are treated as categorical, the rest as numeric.
cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)
num_cols = [name for name in X.columns if name not in cat_cols]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold buffers and per-fold test predictions for stage 1 (linear)
# and stage 2 (residual model).
oof_linear, oof_resid = np.zeros(len(X)), np.zeros(len(X))
test_linear_folds, test_resid_folds = [], []

# =========================
# 1) Linear model (Ridge)
#    Numeric columns pass through unchanged; categoricals are one-hot encoded,
#    with categories unseen at fit time ignored at predict time.
# =========================
linear_preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
    ]
)
linear_model = Pipeline(steps=[
    ("pre", linear_preprocessor),
    ("ridge", Ridge(alpha=1.0, random_state=42)),
])

# =========================
# 2) Residual model (XGB / LGBM / MLP)
#    XGBoost is used here; categoricals are integer-coded by factorizing.
# =========================
xgb_params = {
    "n_estimators": 5000,
    "learning_rate": 0.02,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
    "eval_metric": "rmse",
    "early_stopping_rounds": 200,
}

def factorize_fit_transform(train_df, val_df, test_df, cat_cols):
    """Integer-encode categorical columns consistently across train, val and test.

    For each column in ``cat_cols`` the three frames' values are stacked, factorized
    together with sorted uniques, and the resulting codes are written back to each
    frame. A given value therefore gets the same integer code in all three frames.
    Missing values receive the code ``-1``.

    The inputs are not modified; copies are encoded and returned.

    Args:
        train_df: Training-fold features.
        val_df: Validation-fold features.
        test_df: Test features.
        cat_cols: Names of the columns to encode.

    Returns:
        tuple: ``(train_df, val_df, test_df)`` copies with ``cat_cols`` replaced by
        integer codes.
    """
    parts = [train_df.copy(), val_df.copy(), test_df.copy()]

    # Positional boundaries of each frame inside the stacked column.
    train_end = len(parts[0])
    val_end = train_end + len(parts[1])
    bounds = [(0, train_end), (train_end, val_end), (val_end, None)]

    for c in cat_cols:
        stacked = pd.concat([part[c] for part in parts], axis=0)
        codes = stacked.factorize(sort=True)[0]
        for part, (start, stop) in zip(parts, bounds):
            part[c] = codes[start:stop]

    return tuple(parts)


# =========================
# 3) CV training: linear -> residual -> combine
# =========================
# For every fold: fit the linear pipeline, fit an XGBoost model on what the linear
# model leaves unexplained (the residuals), and add the two predictions together.
for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y), 1):
    X_tr, y_tr = X.iloc[tr_idx].copy(), y.iloc[tr_idx].copy()
    X_va, y_va = X.iloc[va_idx].copy(), y.iloc[va_idx].copy()

    # ---- (A) Linear model: fit -> predict ----
    linear_model.fit(X_tr, y_tr)
    pred_lin_va = linear_model.predict(X_va)
    oof_linear[va_idx] = pred_lin_va

    # Linear prediction on test (averaged over folds after the loop)
    pred_lin_te = linear_model.predict(X_test)
    test_linear_folds.append(pred_lin_te)

    # ---- (B) Build the residuals ----
    # Training residuals are in-sample (the linear model was fit on these rows).
    resid_tr = y_tr - linear_model.predict(X_tr)
    resid_va_true = y_va - pred_lin_va  # validation residuals, used as the eval target

    # ---- (C) Residual model: fit -> predict ----
    # Integer-code the categorical columns for XGBoost (codes consistent across the three frames)
    X_tr2, X_va2, X_te2 = factorize_fit_transform(X_tr, X_va, X_test, cat_cols)

    resid_model = XGBRegressor(**xgb_params)
    # NOTE(review): the validation fold is the eval_set and xgb_params sets
    # early_stopping_rounds, so the same fold is used both to pick the stopping
    # point and to report the fold RMSE — the reported score may be optimistic.
    resid_model.fit(
        X_tr2, resid_tr,
        eval_set=[(X_va2, resid_va_true)],
        verbose=200
    )

    pred_resid_va = resid_model.predict(X_va2)
    oof_resid[va_idx] = pred_resid_va

    pred_resid_te = resid_model.predict(X_te2)
    test_resid_folds.append(pred_resid_te)

    # ---- (D) Combine ----
    pred_final_va = pred_lin_va + pred_resid_va
    rmse = np.sqrt(mean_squared_error(y_va, pred_final_va))
    print(f"Fold {fold} RMSE: {rmse:.5f}")

# Overall out-of-fold score
oof_final = oof_linear + oof_resid
print("-"*30)
print(f"OOF RMSE: {np.sqrt(mean_squared_error(y, oof_final)):.5f}")

# Test prediction: average each stage over the folds, then sum the two stages
test_linear = np.mean(np.vstack(test_linear_folds), axis=0)
test_resid  = np.mean(np.vstack(test_resid_folds), axis=0)
test_pred   = test_linear + test_resid

# 送信用（id列名は適宜）
# sub = pd.DataFrame({"id": test["id"], "exam_score": test_pred})
# sub.to_csv("submission.csv", index=False)


[0]	validation_0-rmse:8.83617
[200]	validation_0-rmse:8.79972
[400]	validation_0-rmse:8.78195
[600]	validation_0-rmse:8.77016
[800]	validation_0-rmse:8.76038
[1000]	validation_0-rmse:8.75169
[1200]	validation_0-rmse:8.74433
[1400]	validation_0-rmse:8.73851
[1600]	validation_0-rmse:8.73309
[1800]	validation_0-rmse:8.72833
[2000]	validation_0-rmse:8.72396
[2200]	validation_0-rmse:8.72059
[2400]	validation_0-rmse:8.71744
[2600]	validation_0-rmse:8.71456
[2800]	validation_0-rmse:8.71177
[3000]	validation_0-rmse:8.70922
[3200]	validation_0-rmse:8.70699
[3400]	validation_0-rmse:8.70481
[3600]	validation_0-rmse:8.70298
[3800]	validation_0-rmse:8.70124
[4000]	validation_0-rmse:8.70004
[4200]	validation_0-rmse:8.69864
[4400]	validation_0-rmse:8.69738
[4600]	validation_0-rmse:8.69593
[4800]	validation_0-rmse:8.69508
[4999]	validation_0-rmse:8.69393
Fold 1 RMSE: 8.69393
[0]	validation_0-rmse:8.83989
[200]	validation_0-rmse:8.80435
[400]	validation_0-rmse:8.78816
[600]	validation_0-rmse:8.77659
[8

In [50]:
# Build the submission file from the final test prediction (linear + residual).
pred_col = target  # 'exam_score'

# Use the first id-like column present in test: "id" is preferred, then "Id".
id_col = next((name for name in ("id", "Id") if name in test.columns), None)
if id_col is None:
    raise ValueError("testに id / Id 列が見つかりません。id列名を手動で指定してください。")

sub = pd.DataFrame({id_col: test[id_col].values, pred_col: test_pred})

sub.to_csv("submission.csv", index=False)
sub.head()


Unnamed: 0,id,exam_score
0,630000,70.958174
1,630001,69.800803
2,630002,88.742019
3,630003,55.147936
4,630004,45.419446
