# Predict CTR
Dataset: https://msnews.github.io/#getting-start

This notebook uses the small version of the dataset (MINDsmall).

## Prepare datasets

In [2]:
# List the working directory to check which files (e.g. the MIND archives) are present.
!ls

MINDsmall_dev.zip   MINDsmall_train.zip Untitled.ipynb


In [3]:
# Create the extraction directories. -p makes the cell safe to re-run:
# directories that already exist are not reported as an error.
!mkdir -p MINDsmall_train MINDsmall_dev

mkdir: MINDsmall_train: File exists
mkdir: MINDsmall_dev: File exists


In [4]:
# Extract the training archive. -o overwrites existing files without prompting,
# so re-running the cell cannot stall on an interactive "replace?" question.
!unzip -o MINDsmall_train.zip -d MINDsmall_train

Archive:  MINDsmall_train.zip
  inflating: MINDsmall_train/entity_embedding.vec  
  inflating: MINDsmall_train/news.tsv  
  inflating: MINDsmall_train/relation_embedding.vec  
  inflating: MINDsmall_train/behaviors.tsv  


In [5]:
# Extract the dev archive. -o overwrites existing files without prompting,
# so re-running the cell cannot stall on an interactive "replace?" question.
!unzip -o MINDsmall_dev.zip -d MINDsmall_dev

Archive:  MINDsmall_dev.zip
  inflating: MINDsmall_dev/behaviors.tsv  
  inflating: MINDsmall_dev/entity_embedding.vec  
  inflating: MINDsmall_dev/news.tsv  
  inflating: MINDsmall_dev/relation_embedding.vec  


In [None]:
# pip install sentence-transformers lightgbm pandas tqdm scikit-learn tables

## Experiments

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
tqdm.pandas()

### Ensemble(text vector + other features -> GBDT)

### E5でベクトル化 → HDF5保存
HDF5を使うことで部分読み込みができるようになり、メモリ使用量を30GB超から約20GBまで削減できた。しかしまだ足りなかったため、データを分割して処理するバッチ学習に変更した。

In [2]:
# Load the news metadata (assumes the layout of MIND's news.tsv); the column
# names are supplied here via `names`.
news = pd.read_table(
    "MINDsmall_train/news.tsv",
    names=[
        "id",
        "cat",
        "subcat",
        "title",
        "abstract",
        "url",
        "title_entities",
        "abstract_entities",
    ],
)

In [None]:
# Load the sentence-embedding model (E5-base is used as the example here).
embedder = SentenceTransformer("intfloat/e5-base")

# Embed every news title with E5. Missing titles are replaced by an empty
# string, and each title gets the "query: " prefix before encoding.
embeddings = embedder.encode(
    ("query: " + news["title"].fillna("")).tolist(),
    show_progress_bar=True,
    batch_size=64,
)

# Wrap the vectors in a DataFrame and put the matching news id in the first column.
vec_df = pd.DataFrame(embeddings)
vec_df.insert(loc=0, column="news_id", value=news["id"].values)

# Persist to HDF5; mode="w" replaces any file already at this path.
vec_df.to_hdf("news_vectors.h5", mode="w", key="vecs")

In [None]:
### CTRデータに出てくる news_id だけ読み込む

In [3]:
# Load the click log (assumes the layout of MIND's behaviors.tsv); the column
# names are supplied here via `names`.
ctr_df = pd.read_table(
    "MINDsmall_train/behaviors.tsv",
    names=[
        "imp_id",
        "user_id",
        "time",
        "history",
        "impressions",
    ],
)

# Expand the "impressions" field into one click record per listed article.
def parse_impressions(row):
    """Split one behaviors row into per-article click records.

    Args:
        row: Mapping-like record (e.g. a pandas Series or a dict) providing
            "imp_id", "user_id" and "impressions", where "impressions" is a
            whitespace-separated string of "<news_id>-<label>" tokens.

    Returns:
        A list of (imp_id, user_id, news_id, label) tuples, one per token,
        in the order the tokens appear, with label converted to int.

    Raises:
        ValueError: If a token contains no "-" separator or its label part
            is not an integer.
    """
    records = []
    for token in row["impressions"].split():
        # Split on the LAST hyphen only, so a news id that itself contains
        # "-" stays intact instead of breaking the two-value unpacking.
        nid, label = token.rsplit("-", 1)
        records.append((row["imp_id"], row["user_id"], nid, int(label)))
    return records

# Flatten every log row into (imp_id, user_id, news_id, label) records.
ctr_data = [
    record
    for _, log_row in ctr_df.iterrows()
    for record in parse_impressions(log_row)
]

# Replace the raw log with the expanded, one-row-per-impression table.
ctr_df = pd.DataFrame(
    ctr_data,
    columns=["imp_id", "user_id", "news_id", "label"],
)

# Collect the news ids that actually occur in the click data.
used_news_ids = ctr_df["news_id"].unique()

# Read the stored vector table, then keep only rows for the ids in use.
# NOTE: read_hdf loads the whole table here; the filtering happens in memory.
vec_df = pd.read_hdf("news_vectors.h5", key="vecs").loc[
    lambda stored: stored["news_id"].isin(used_news_ids)
]

In [None]:
### 特徴量構築
# pd.merge を使わず、news_id → ベクトル の辞書にして map＋list で追加する(map + dict + vstack)方が圧倒的に軽量。
# しかしまだ足りなかったのでデータを分割処理バッチ学習に変更

In [9]:
# # カテゴリ情報のマージ
# news_cat_map = news.set_index("id")["cat"].to_dict()
# ctr_df["cat"] = ctr_df["news_id"].map(news_cat_map)
# cat_dummies = pd.get_dummies(ctr_df["cat"], prefix="cat")

# # ベクトルを辞書化してmapで追加（メモリ効率）
# vec_dict = dict(zip(vec_df["news_id"], vec_df.drop(columns=["news_id"]).values))
# vec_dim = len(next(iter(vec_dict.values())))
# vec_cols = [f'vec_{i}' for i in range(vec_dim)]

# # 欠損処理しながらベクトルを列展開
# vec_filled = [vec_dict.get(nid, np.zeros(vec_dim)) for nid in ctr_df["news_id"]]
# vec_matrix = np.vstack(vec_filled) <-- ここでメモリを消費
# vec_df_expanded = pd.DataFrame(vec_matrix, columns=vec_cols)

# # 最終特徴量
# X = pd.concat([vec_df_expanded, cat_dummies], axis=1)
# y = ctr_df["label"]

In [7]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle

# ---------- Preparation ----------
# news id -> category lookup, plus the fixed, sorted category list that keeps
# the one-hot columns identical for the validation set and every batch.
news_cat_map = news.set_index("id")["cat"].to_dict()
ctr_df["cat"] = ctr_df["news_id"].map(news_cat_map)
all_cats = sorted(ctr_df["cat"].dropna().unique())

# news id -> embedding row, so features are assembled per batch by lookup
# instead of merging the full vector table into the click log.
vec_dict = dict(zip(vec_df["news_id"], vec_df.drop(columns=["news_id"]).values))
first_vec = next(iter(vec_dict.values()))
vec_dim = len(first_vec)
vec_cols = [f"vec_{i}" for i in range(vec_dim)]
# Fallback for ids without a stored vector. It uses the dtype of the stored
# vectors: a default float64 zero vector would make np.vstack upcast the whole
# stacked matrix as soon as a single id is missing.
zero_vec = np.zeros(vec_dim, dtype=first_vec.dtype)


def build_features(frame):
    """Build the model input matrix for a slice of the click log.

    Args:
        frame: DataFrame with a "news_id" column.

    Returns:
        DataFrame indexed like ``frame`` whose columns are the embedding
        columns (``vec_cols``) followed by one float32 one-hot column per
        category in ``all_cats``. News ids without a stored vector get an
        all-zero embedding; ids without a known category get all-zero
        one-hot columns.
    """
    cats = pd.Series(
        pd.Categorical(frame["news_id"].map(news_cat_map), categories=all_cats),
        index=frame.index,
    )
    cat_dummies = pd.get_dummies(cats, prefix="cat").astype("float32")
    vec_matrix = np.vstack([vec_dict.get(nid, zero_vec) for nid in frame["news_id"]])
    vec_frame = pd.DataFrame(vec_matrix, columns=vec_cols, index=frame.index)
    return pd.concat([vec_frame, cat_dummies], axis=1).fillna(0)


batch_size = 100_000
# sort_index() restores the pre-shuffle row order first, so re-running this cell
# reproduces the same permutation instead of shuffling an already-shuffled
# frame (assumes ctr_df still carries the unique index it was created with).
ctr_df = shuffle(ctr_df.sort_index(), random_state=42)

# ---------- Hold out the validation data ----------
# The last `batch_size` shuffled rows are used for validation only.
val_df = ctr_df.iloc[-batch_size:].copy()
train_df = ctr_df.iloc[:-batch_size].copy()

X_val = build_features(val_df)
y_val = val_df["label"]

# ---------- LightGBM parameters ----------
params = {
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "boosting_type": "gbdt",
    "learning_rate": 0.1,
    "num_leaves": 31,
    "seed": 42,
}

model = None  # no initial model before the first batch

# ---------- Batch training loop ----------
# Each batch continues boosting from the model produced by the previous batch
# (init_model), so only one batch of features is materialised at a time.
for i, start in enumerate(range(0, len(train_df), batch_size)):
    batch = train_df.iloc[start:start + batch_size]

    X_batch = build_features(batch)
    y_batch = batch["label"]

    lgb_train = lgb.Dataset(X_batch, label=y_batch, free_raw_data=True)

    model = lgb.train(
        params,
        train_set=lgb_train,
        num_boost_round=100,  # a larger value is fine too
        valid_sets=[lgb_train, lgb.Dataset(X_val, label=y_val)],
        valid_names=["train", "valid"],
        init_model=model,
        callbacks=[
            # Stop once the validation score has not improved for 10 rounds.
            early_stopping(stopping_rounds=10),
            # Log the evaluation metrics every 10 rounds.
            log_evaluation(10)
        ]
    )

    # Report the intermediate validation AUC after this batch.
    y_pred = model.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    print(f"[Batch {i+1}] Validation AUC: {auc:.4f}")

Training until validation scores don't improve for 10 rounds
[10]	train's auc: 0.743098	valid's auc: 0.675925
[20]	train's auc: 0.768825	valid's auc: 0.682023
[30]	train's auc: 0.782209	valid's auc: 0.682605
Early stopping, best iteration is:
[23]	train's auc: 0.773617	valid's auc: 0.68295
[Batch 1] Validation AUC: 0.6830
Training until validation scores don't improve for 10 rounds
[30]	train's auc: 0.727969	valid's auc: 0.688009
[40]	train's auc: 0.759948	valid's auc: 0.690242
[50]	train's auc: 0.776317	valid's auc: 0.69035
Early stopping, best iteration is:
[42]	train's auc: 0.76436	valid's auc: 0.690463
[Batch 2] Validation AUC: 0.6905
Training until validation scores don't improve for 10 rounds
[50]	train's auc: 0.731606	valid's auc: 0.694877
[60]	train's auc: 0.760139	valid's auc: 0.695443
Early stopping, best iteration is:
[58]	train's auc: 0.755822	valid's auc: 0.695577
[Batch 3] Validation AUC: 0.6956
Training until validation scores don't improve for 10 rounds
[60]	train's auc