In [1]:
import os
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib

In [2]:
DATA_FILE = "restaurants_cleaned.csv"
PASS1_MODEL_OUT = "catboost_restaurant_score_pass1.cbm"
FINAL_MODEL_OUT = "catboost_restaurant_score_final.cbm"
PSEUDO_LABEL_WEIGHT = 0.0
RANDOM_STATE = 42

In [3]:
df = pd.read_csv(DATA_FILE)

In [4]:
cat_cols = ["category"]
num_cols = [
    "price_range", "price_range_missing",
    "ratings", "ratings_missing"
]
feat_cols = cat_cols + num_cols

In [5]:
mask_labeled = df["score_missing"] == 0
labeled_df   = df[mask_labeled].copy()

X_lab = labeled_df[feat_cols]
y_lab = labeled_df["score"]

# value counts of each category among *labeled* rows
vc = labeled_df['category'].value_counts()

# categories that appear only once
rare_cats = vc[vc < 2].index

# new column where singletons are grouped under 'other'
strat_col = labeled_df['category'].where(~labeled_df['category'].isin(rare_cats), 'other')

X_train, X_val, y_train, y_val = train_test_split(
    X_lab, y_lab, test_size=0.2, random_state=RANDOM_STATE,
    stratify=strat_col
)

train_pool = Pool(X_train, y_train, cat_features=cat_cols)
val_pool   = Pool(X_val,   y_val,   cat_features=cat_cols)

In [6]:
print("=== Pass 1: training on labeled subset ===")
pass1 = CatBoostRegressor(
    iterations=1200,
    depth=8,
    learning_rate=0.05,
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=RANDOM_STATE,
    verbose=200,
    early_stopping_rounds=50,
)

pass1.fit(train_pool, eval_set=val_pool, use_best_model=True)
rmse_pass1 = pass1.get_best_score()["validation"]["RMSE"]
print(f"Pass‑1 RMSE (labeled val): {rmse_pass1:.4f}")

=== Pass 1: training on labeled subset ===
0:	learn: 0.2986664	test: 0.2970655	best: 0.2970655 (0)	total: 187ms	remaining: 3m 44s
200:	learn: 0.2694855	test: 0.2657767	best: 0.2657489 (155)	total: 9.99s	remaining: 49.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.2657488896
bestIteration = 155

Shrink model to first 156 iterations.
Pass‑1 RMSE (labeled val): 0.2657


In [7]:
mask_unlabeled = df["score_missing"] == 1
if mask_unlabeled.sum() == 0:
    raise ValueError("No rows with missing score found! Check preprocessing.")

pseudo_preds = pass1.predict(Pool(df.loc[mask_unlabeled, feat_cols],
                                  cat_features=cat_cols))
df.loc[mask_unlabeled, "pseudo_score"] = pseudo_preds

In [8]:
df["y_final"] = np.where(mask_unlabeled, df["pseudo_score"], df["score"])
sample_weight = np.where(mask_unlabeled, PSEUDO_LABEL_WEIGHT, 1.0)

final_pool = Pool(
    df[feat_cols],
    df["y_final"],
    weight=sample_weight,
    cat_features=cat_cols
)

# Use same validation set (real labels only) to measure improvement
val_pool_final = Pool(X_val, y_val, cat_features=cat_cols)

In [9]:
print("\n=== Pass 2: self‑training on full dataset ===")
final = CatBoostRegressor(
    iterations=1200,
    depth=8,
    learning_rate=0.05,
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=RANDOM_STATE,
    verbose=200,
    early_stopping_rounds=50,
)

final.fit(final_pool, eval_set=val_pool_final, use_best_model=True)
rmse_final = final.get_best_score()["validation"]["RMSE"]
print(f"Final RMSE (labeled val): {rmse_final:.4f}   |  Gain: {rmse_pass1 - rmse_final:+.4f}")

final.save_model(FINAL_MODEL_OUT)


=== Pass 2: self‑training on full dataset ===
0:	learn: 0.2982571	test: 0.2963249	best: 0.2963249 (0)	total: 64.2ms	remaining: 1m 16s
200:	learn: 0.2641020	test: 0.2448732	best: 0.2448732 (200)	total: 11.9s	remaining: 59.1s
400:	learn: 0.2601285	test: 0.2421610	best: 0.2421426 (399)	total: 24.6s	remaining: 49s
600:	learn: 0.2573473	test: 0.2409663	best: 0.2409663 (600)	total: 37.5s	remaining: 37.4s
800:	learn: 0.2549590	test: 0.2398192	best: 0.2398192 (800)	total: 49.7s	remaining: 24.7s
1000:	learn: 0.2529475	test: 0.2390777	best: 0.2390777 (1000)	total: 1m 1s	remaining: 12.3s
1199:	learn: 0.2511576	test: 0.2384928	best: 0.2384871 (1194)	total: 1m 13s	remaining: 0us

bestTest = 0.2384870845
bestIteration = 1194

Shrink model to first 1195 iterations.
Final RMSE (labeled val): 0.2385   |  Gain: +0.0273


In [10]:
df.to_csv("restaurants_with_pseudo_scores.csv", index=False)
print("\nArtifacts written:")
print(f"  • {PASS1_MODEL_OUT}")
print(f"  • {FINAL_MODEL_OUT}")
print("  • restaurants_with_pseudo_scores.csv")


Artifacts written:
  • catboost_restaurant_score_pass1.cbm
  • catboost_restaurant_score_final.cbm
  • restaurants_with_pseudo_scores.csv
