# **1. Preprocess**

## **1.1. Install required libraries**

In [None]:
!pip install pandas numpy scikit-learn xgboost matplotlib



## **1.2. Import libraries and download data**

In [None]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
import kagglehub

# -------------------------
# 1. Download the latest dataset from Kaggle
# -------------------------
dataset_path = kagglehub.dataset_download("arianazmoudeh/airbnbopendata")
print("Downloaded dataset at:", dataset_path)

# Automatically find CSV files in the downloaded dataset directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the choice of csv_files[0] the same on every run and machine.
csv_files = sorted(f for f in os.listdir(dataset_path) if f.endswith(".csv"))
if not csv_files:
    raise FileNotFoundError("No CSV files found in the dataset folder.")

RAW_PATH = os.path.join(dataset_path, csv_files[0])
print("Using CSV file:", RAW_PATH)

# -------------------------
# 2. Output directory
# -------------------------
OUT_DIR = "/content/data"
os.makedirs(OUT_DIR, exist_ok=True)
print("Output directory:", OUT_DIR)

# -------------------------
# 3. Load the dataset
# -------------------------
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk. NOTE(review): the warning this cell printed on
# read_csv is presumably the mixed-dtype DtypeWarning - confirm it is gone.
df = pd.read_csv(RAW_PATH, low_memory=False)
print("Loaded dataset with shape:", df.shape)

Downloading from https://www.kaggle.com/api/v1/datasets/download/arianazmoudeh/airbnbopendata?dataset_version_number=1...


100%|██████████| 10.5M/10.5M [00:01<00:00, 7.00MB/s]

Extracting files...





Downloaded dataset at: /root/.cache/kagglehub/datasets/arianazmoudeh/airbnbopendata/versions/1
Using CSV file: /root/.cache/kagglehub/datasets/arianazmoudeh/airbnbopendata/versions/1/Airbnb_Open_Data.csv
Output directory: /content/data
Loaded dataset with shape: (102599, 26)


  df = pd.read_csv(RAW_PATH)


## **1.3. Preprocessing function**

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

def preprocess_airbnb(raw_path=RAW_PATH, out_dir=OUT_DIR, log_target=False):
    """Clean the raw Airbnb CSV, split it, and write model-ready files.

    Pipeline: normalise column names, keep a fixed subset of columns, parse the
    currency-formatted ``price`` / ``service_fee`` strings, drop rows without a
    positive price, split 80/10/10 into train/val/test (``random_state=42``),
    impute and standardise numeric columns, ordinal-encode categorical columns,
    then save the three splits and a ``metadata.json`` describing them.

    All statistics (medians, scaler, category vocabulary) are learned from the
    training split only, so nothing about val/test leaks into training.

    Args:
        raw_path: Path of the raw CSV file.
        out_dir: Directory the processed files are written to; created if missing.
        log_target: If True, add ``log_price = log(price)`` and record it as the
            target column in the metadata. Defaults to False (target is ``price``).

    Returns:
        None. Writes ``train_processed.csv``, ``val_processed.csv``,
        ``test_processed.csv`` and ``metadata.json`` into ``out_dir``.
    """
    # low_memory=False: infer dtypes from the whole file, not chunk by chunk.
    df = pd.read_csv(raw_path, low_memory=False)

    # Standardize column names: trim spaces, convert to lowercase, replace spaces with underscores
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

    # Keep only the columns we need (silently skipping any that are absent)
    cols_keep = [
        "price",
        "service_fee",
        "room_type",
        "neighbourhood_group",
        "neighbourhood",
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365",
        "review_rate_number",
        "lat",
        "long",
        "cancellation_policy",
        "host_identity_verified",
        "instant_bookable",
        "construction_year",
        "country",
        "country_code",
    ]
    cols_keep = [c for c in cols_keep if c in df.columns]
    df = df[cols_keep].copy()

    price_col = "price"

    # Clean 'price' and 'service_fee': strip "$" and thousands separators, then
    # parse; anything unparseable becomes NaN.
    # NOTE(review): service_fee stays in as a feature; if it is derived from
    # price it leaks the target - confirm this is intended.
    for col in (price_col, "service_fee"):
        if col in df.columns:
            cleaned = df[col].astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False)
            df[col] = pd.to_numeric(cleaned, errors="coerce")

    # Basic cleaning: drop rows where price is missing or non-positive
    df = df.dropna(subset=[price_col])
    df = df[df[price_col] > 0].copy()

    target_col = price_col
    if log_target:
        # Safe: every remaining price is strictly positive.
        df["log_price"] = np.log(df[price_col])
        target_col = "log_price"

    numeric_cols = [
        "service_fee",
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365",
        "review_rate_number",
        "lat",
        "long",
        "construction_year",
    ]
    numeric_cols = [c for c in numeric_cols if c in df.columns]

    categorical_cols = [
        "room_type",
        "neighbourhood_group",
        "neighbourhood",
        "cancellation_policy",
        "host_identity_verified",
        "instant_bookable",
        "country",
        "country_code",
    ]
    categorical_cols = [c for c in categorical_cols if c in df.columns]

    # Give missing categories an explicit label and always cast to str so the
    # encoder never sees a column of mixed Python types.
    for col in categorical_cols:
        df[col] = df[col].fillna("__MISSING__").astype(str)

    # Train/validation/test split (80/10/10). Copies are taken so the in-place
    # column assignments below act on independent frames.
    train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()
    splits = (train_df, val_df, test_df)

    if numeric_cols:
        # Median imputation using TRAIN medians only; a column with no
        # observed value in train falls back to 0.
        train_medians = train_df[numeric_cols].median().fillna(0)
        for part in splits:
            part[numeric_cols] = part[numeric_cols].fillna(train_medians)

        # Scale numeric features: fit on train, transform all
        scaler = StandardScaler()
        train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])
        val_df[numeric_cols] = scaler.transform(val_df[numeric_cols])
        test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

    # Ordinal encode categorical features; categories unseen in train come back as -1.
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

    if categorical_cols:
        encoder.fit(train_df[categorical_cols])
        for part in splits:
            codes = encoder.transform(part[categorical_cols])
            # Map -1 (unknown) to 0, and shift all known categories up by 1.
            part[categorical_cols] = np.where(codes == -1, 0, codes + 1).astype(int)

    # Save processed datasets
    os.makedirs(out_dir, exist_ok=True)
    train_path = os.path.join(out_dir, "train_processed.csv")
    val_path = os.path.join(out_dir, "val_processed.csv")
    test_path = os.path.join(out_dir, "test_processed.csv")

    train_df.to_csv(train_path, index=False)
    val_df.to_csv(val_path, index=False)
    test_df.to_csv(test_path, index=False)

    # Save metadata. Cardinality = known categories + 1 for the reserved
    # "unknown" code 0.
    metadata = {
        "target_col": target_col,
        "numeric_cols": numeric_cols,
        "categorical_cols": categorical_cols,
        "cat_cardinalities": {
            col: len(encoder.categories_[i]) + 1
            for i, col in enumerate(categorical_cols)
        },
    }

    meta_path = os.path.join(out_dir, "metadata.json")
    with open(meta_path, "w") as f:
        json.dump(metadata, f, indent=2)

    print("Preprocess done.")
    print(f"Saved: {train_path}")
    print(f"Saved: {val_path}")
    print(f"Saved: {test_path}")
    print(f"Saved: {meta_path}")

In [None]:
# Run the preprocessing pipeline with its default raw CSV path and output directory.
preprocess_airbnb()

  df = pd.read_csv(raw_path)


Preprocess done.
Saved: /content/data/train_processed.csv
Saved: /content/data/val_processed.csv
Saved: /content/data/test_processed.csv
Saved: /content/data/metadata.json


# **2. Traditional ML Baseline Model**

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

def load_processed_data(data_dir=OUT_DIR):
    """Read the processed splits and their metadata back from ``data_dir``.

    The feature matrix of each split is the metadata's numeric columns followed
    by its categorical columns, in that order.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, meta)`` where
        the X/y entries are NumPy arrays and ``meta`` is the parsed metadata dict.
    """
    def _in_dir(filename):
        return os.path.join(data_dir, filename)

    with open(_in_dir("metadata.json"), "r") as fh:
        meta = json.load(fh)

    label = meta["target_col"]
    features = meta["numeric_cols"] + meta["categorical_cols"]

    # Read all three files first, then slice out features and target.
    frames = [
        pd.read_csv(_in_dir(f"{split}_processed.csv"))
        for split in ("train", "val", "test")
    ]

    arrays = []
    for frame in frames:
        arrays.append(frame[features].values)
        arrays.append(frame[label].values)

    return (*arrays, meta)

In [None]:
# Load the processed train/val/test arrays and the metadata dict from the default data directory.
X_train, y_train, X_val, y_val, X_test, y_test, meta = load_processed_data()

def evaluate_model(name, model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training data and print its validation RMSE and MAE.

    ``name`` is only used as the label in the printed line. Returns None.
    """
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)

    # RMSE is taken as the square root of the MSE by hand, because the
    # 'squared' argument of mean_squared_error might not be supported.
    rmse = mean_squared_error(y_val, val_predictions) ** 0.5
    mae = mean_absolute_error(y_val, val_predictions)

    print(f"{name} -> val RMSE: {rmse:.3f}, MAE: {mae:.3f}")

def evaluate_baseline_full(name, model, X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit ``model`` on train, then print RMSE and MAE for validation and test.

    Both sets of metrics are computed before anything is printed. RMSE is the
    square root of the MSE, computed by hand instead of through the 'squared'
    argument of mean_squared_error. Returns None.
    """
    model.fit(X_train, y_train)

    def _rmse_and_mae(features, targets):
        predicted = model.predict(features)
        root_mse = mean_squared_error(targets, predicted) ** 0.5
        return root_mse, mean_absolute_error(targets, predicted)

    rmse_val, mae_val = _rmse_and_mae(X_val, y_val)
    rmse_test, mae_test = _rmse_and_mae(X_test, y_test)

    print(f"{name} -> VAL  RMSE: {rmse_val:.3f}, MAE: {mae_val:.3f}")
    print(f"{name} -> TEST RMSE: {rmse_test:.3f}, MAE: {mae_test:.3f}")

## **2.1. Random Forest**

In [None]:
# Random Forest baseline.
# The splits are reloaded from disk here so the cell does not depend on
# arrays left over from an earlier cell.
X_train, y_train, X_val, y_val, X_test, y_test, meta = load_processed_data()

# Small forest of depth-limited trees; random_state fixes the result and
# n_jobs=-1 is scikit-learn's setting for using all available processors.
rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=5,
    min_samples_split=50,
    min_samples_leaf=20,
    random_state=42,
    n_jobs=-1,
)
# Fits on train and prints VAL and TEST RMSE/MAE.
evaluate_baseline_full("RandomForest", rf,
                       X_train, y_train,
                       X_val, y_val,
                       X_test, y_test)

RandomForest -> VAL  RMSE: 19.075, MAE: 8.519
RandomForest -> TEST RMSE: 21.334, MAE: 8.777


## **2.2. XGBoost**

In [None]:
# XGBoost baseline. Reuses the X_*/y_* arrays loaded in the Random Forest cell,
# so that cell (or the load cell in section 2) must have run first.
# subsample / colsample_bytree of 0.5 mean each tree sees half the rows and
# half the columns; reg_lambda is the L2 regularisation weight.
xgb = XGBRegressor(
    n_estimators=50,
    max_depth=3,
    learning_rate=0.3,
    subsample=0.5,
    colsample_bytree=0.5,
    reg_lambda=5,
    random_state=42,
    n_jobs=-1,
)
# Fits on train and prints VAL and TEST RMSE/MAE.
evaluate_baseline_full("XGBoost", xgb,
                       X_train, y_train,
                       X_val, y_val,
                       X_test, y_test)

XGBoost -> VAL  RMSE: 18.061, MAE: 5.860
XGBoost -> TEST RMSE: 20.404, MAE: 6.061


# **3. MLP Deep Learning Baseline Model**

## **3.1. Import libraries and configure paths**

In [None]:
import os
import json
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Directory holding the processed CSVs and metadata.json; same literal path as
# OUT_DIR in the preprocessing section.
DATA_DIR = "/content/data"

META_PATH = os.path.join(DATA_DIR, "metadata.json")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Bare expression so the notebook displays which device was selected.
device

'cuda'

## **3.2. Prepare the Airbnb Open Dataset**

In [None]:
class AirbnbDataset(Dataset):
    """Map-style dataset over one processed split CSV.

    Column roles are read from the metadata JSON (``target_col``,
    ``numeric_cols``, ``categorical_cols``). Each item is a tuple
    ``(x_num, x_cat, y)``: float32 numeric features, int64 categorical codes
    and a float32 scalar target. A feature group with no columns is returned
    as an empty 1-D tensor.
    """

    def __init__(self, csv_path: str, meta_path: str):
        self.df = pd.read_csv(csv_path)

        with open(meta_path, "r") as fh:
            schema = json.load(fh)

        self.target_col = schema["target_col"]
        self.numeric_cols = schema["numeric_cols"]
        self.categorical_cols = schema["categorical_cols"]

        self.y = self.df[self.target_col].values.astype("float32")
        # None means "no columns of this kind"; __getitem__ handles that case.
        self.x_num = self._columns_as(self.numeric_cols, "float32")
        self.x_cat = self._columns_as(self.categorical_cols, "int64")

    def _columns_as(self, columns, dtype):
        """Return ``columns`` of the frame as a NumPy array of ``dtype``, or None if the list is empty."""
        if not columns:
            return None
        return self.df[columns].values.astype(dtype)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        target = torch.tensor(self.y[idx], dtype=torch.float32)

        if self.x_num is None:
            numeric = torch.empty(0, dtype=torch.float32)
        else:
            numeric = torch.tensor(self.x_num[idx], dtype=torch.float32)

        if self.x_cat is None:
            categorical = torch.empty(0, dtype=torch.long)
        else:
            categorical = torch.tensor(self.x_cat[idx], dtype=torch.long)

        return numeric, categorical, target

## **3.3. MLP Pricing Model**

In [None]:
class MLPPriceModel(nn.Module):
    """MLP regressor over numeric features plus one embedding per categorical column.

    The column lists and embedding table sizes are read from the metadata JSON
    (``numeric_cols``, ``categorical_cols``, ``cat_cardinalities``).

    Args:
        meta_path: Path to the metadata JSON file.
        embed_dim: Width of every categorical embedding.
        hidden_dim: Width of the first hidden layer; the second is ``hidden_dim // 2``.
        dropout: Dropout probability after the first hidden layer. Defaults to
            0.1, the value that used to be hard-coded.
    """

    def __init__(self, meta_path: str, embed_dim: int = 8, hidden_dim: int = 256, dropout: float = 0.1):
        super().__init__()

        with open(meta_path, "r") as f:
            meta = json.load(f)

        self.numeric_cols = meta["numeric_cols"]
        self.categorical_cols = meta["categorical_cols"]
        cat_cardinalities = meta["cat_cardinalities"]

        self.num_numeric = len(self.numeric_cols)

        # One embedding table per categorical column, in metadata order.
        self.embeds = nn.ModuleList()
        for col in self.categorical_cols:
            # cat_cardinalities[col] already includes the +1 for the unknown category
            num_categories = cat_cardinalities[col]
            self.embeds.append(nn.Embedding(num_categories, embed_dim))

        input_dim = self.num_numeric + len(self.categorical_cols) * embed_dim

        self.mlp = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 1),
        )

    def forward(self, x_num, x_cat):
        """Return one prediction per row, shape ``[B]``.

        Column ``i`` of ``x_cat`` is looked up in the ``i``-th embedding table;
        the embeddings are concatenated after ``x_num`` along dim 1.
        """
        embed_list = []
        for i, emb in enumerate(self.embeds):
            embed_list.append(emb(x_cat[:, i]))  # [B, embed_dim]

        if embed_list:
            x_embed = torch.cat(embed_list, dim=1)
            x = torch.cat([x_num, x_embed], dim=1)
        else:
            # No categorical columns: the MLP sees the numeric features only.
            x = x_num

        out = self.mlp(x).squeeze(1)  # [B]
        return out

## **3.4. Train and Test function**

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean training loss.

    Each batch is an ``(x_num, x_cat, y)`` triple. The returned value is the
    per-sample average: every batch loss is weighted by its batch size.
    """
    model.train()
    loss_sum, seen = 0.0, 0

    for batch in loader:
        feats_num, feats_cat, target = (t.to(device) for t in batch)

        optimizer.zero_grad()
        batch_loss = criterion(model(feats_num, feats_cat), target)
        batch_loss.backward()
        optimizer.step()

        count = target.size(0)
        loss_sum += batch_loss.item() * count
        seen += count

    return loss_sum / seen

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    """Return the per-sample mean of ``criterion`` over ``loader`` without updating the model.

    The model is switched to eval mode and gradients are disabled; each batch
    loss is weighted by its batch size before averaging.
    """
    model.eval()
    loss_sum, seen = 0.0, 0

    for batch in loader:
        feats_num, feats_cat, target = (t.to(device) for t in batch)

        batch_loss = criterion(model(feats_num, feats_cat), target)

        count = target.size(0)
        loss_sum += batch_loss.item() * count
        seen += count

    return loss_sum / seen

@torch.no_grad()
def eval_with_metrics(model, loader, device):
    """Predict over the whole ``loader`` and return ``(rmse, mae)``.

    Predictions and targets are gathered batch by batch as NumPy arrays and
    the metrics are computed once over the concatenated result.
    """
    model.eval()
    pred_chunks, target_chunks = [], []

    for batch in loader:
        feats_num, feats_cat, target = (t.to(device) for t in batch)

        batch_preds = model(feats_num, feats_cat)

        pred_chunks.append(batch_preds.detach().cpu().numpy())
        target_chunks.append(target.detach().cpu().numpy())

    predictions = np.concatenate(pred_chunks, axis=0)
    targets = np.concatenate(target_chunks, axis=0)

    rmse = np.sqrt(mean_squared_error(targets, predictions))
    mae = mean_absolute_error(targets, predictions)

    return rmse, mae

## **3.5. Load Data and Train Baseline MLP**

In [None]:
batch_size = 256
epochs = 20
lr = 1e-3
embed_dim = 8
hidden_dim = 256

# Seed PyTorch's global RNG before the loaders and model are created so that
# weight initialisation, dropout masks and batch shuffling repeat across runs.
# NOTE(review): some CUDA kernels can still be non-deterministic.
torch.manual_seed(42)

train_csv = os.path.join(DATA_DIR, "train_processed.csv")
val_csv   = os.path.join(DATA_DIR, "val_processed.csv")

train_ds = AirbnbDataset(train_csv, META_PATH)
val_ds   = AirbnbDataset(val_csv, META_PATH)

# Only the training loader shuffles; validation order is kept fixed.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

model = MLPPriceModel(META_PATH, embed_dim=embed_dim, hidden_dim=hidden_dim).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Checkpointing: keep the weights of the epoch with the lowest validation loss.
best_val = float("inf")
save_dir = "/content/experiments/outputs"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "best_mlp.pt")

for epoch in range(1, epochs + 1):
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model, val_loader, criterion, device)

    print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f}")

    if val_loss < best_val:
        best_val = val_loss
        torch.save(model.state_dict(), save_path)
        print(f"  -> New best saved to {save_path}")

Epoch 01 | train_loss=144077.6325 | val_loss=28877.7676
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 02 | train_loss=4869.8252 | val_loss=396.2497
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 03 | train_loss=608.8866 | val_loss=330.2882
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 04 | train_loss=554.5795 | val_loss=308.4163
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 05 | train_loss=523.1757 | val_loss=301.2410
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 06 | train_loss=513.9164 | val_loss=307.5439
Epoch 07 | train_loss=511.2929 | val_loss=309.4115
Epoch 08 | train_loss=504.6627 | val_loss=309.1200
Epoch 09 | train_loss=509.9840 | val_loss=301.4132
Epoch 10 | train_loss=507.1584 | val_loss=294.0808
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 11 | train_loss=506.1570 | val_loss=301.3789
Epoch 12 | train_loss=500.8074 | val_loss=296.604

## **3.6. Evaluate Baseline MLP**

In [None]:
# Build the test loader with the same batch size as training; no shuffling.
test_csv = os.path.join(DATA_DIR, "test_processed.csv")
test_ds = AirbnbDataset(test_csv, META_PATH)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

# Recreate the architecture and load the best checkpoint saved by the training
# loop (save_path), instead of reusing the last-epoch weights held in `model`.
best_model = MLPPriceModel(META_PATH, embed_dim=embed_dim, hidden_dim=hidden_dim).to(device)
best_model.load_state_dict(torch.load(save_path, map_location=device))

# Report RMSE/MAE on validation (reusing val_loader from the training cell) and on test.
val_rmse, val_mae = eval_with_metrics(best_model, val_loader, device)
print(f"Best MLP on VAL -> RMSE: {val_rmse:.3f}, MAE: {val_mae:.3f}")

test_rmse, test_mae = eval_with_metrics(best_model, test_loader, device)
print(f"Best MLP on TEST -> RMSE: {test_rmse:.3f}, MAE: {test_mae:.3f}")

Best MLP on VAL -> RMSE: 17.149, MAE: 3.530
Best MLP on TEST -> RMSE: 20.007, MAE: 3.665


# **4. MLP with Log-Transformed Price Target (log_price)**

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

def preprocess_airbnb(raw_path=RAW_PATH, out_dir=OUT_DIR, log_target=True):
    """Clean the raw Airbnb CSV, split it, and write model-ready files.

    Pipeline: normalise column names, keep a fixed subset of columns, parse the
    currency-formatted ``price`` / ``service_fee`` strings, drop rows without a
    positive price, optionally add ``log_price``, split 80/10/10 into
    train/val/test (``random_state=42``), impute and standardise numeric
    columns, ordinal-encode categorical columns, then save the three splits
    and a ``metadata.json`` describing them.

    All statistics (medians, scaler, category vocabulary) are learned from the
    training split only, so nothing about val/test leaks into training.

    Args:
        raw_path: Path of the raw CSV file.
        out_dir: Directory the processed files are written to; created if missing.
        log_target: If True (the default for this variant), add
            ``log_price = log(price)`` and record it as the target column in
            the metadata. If False the target stays ``price``.

    Returns:
        None. Writes ``train_processed.csv``, ``val_processed.csv``,
        ``test_processed.csv`` and ``metadata.json`` into ``out_dir``.
    """
    # low_memory=False: infer dtypes from the whole file, not chunk by chunk.
    df = pd.read_csv(raw_path, low_memory=False)

    # Standardize column names: trim spaces, convert to lowercase, replace spaces with underscores
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

    # Keep only the columns we need (silently skipping any that are absent)
    cols_keep = [
        "price",
        "service_fee",
        "room_type",
        "neighbourhood_group",
        "neighbourhood",
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365",
        "review_rate_number",
        "lat",
        "long",
        "cancellation_policy",
        "host_identity_verified",
        "instant_bookable",
        "construction_year",
        "country",
        "country_code",
    ]
    cols_keep = [c for c in cols_keep if c in df.columns]
    df = df[cols_keep].copy()

    price_col = "price"

    # Clean 'price' and 'service_fee': strip "$" and thousands separators, then
    # parse; anything unparseable becomes NaN.
    # NOTE(review): service_fee stays in as a feature; if it is derived from
    # price it leaks the target - confirm this is intended.
    for col in (price_col, "service_fee"):
        if col in df.columns:
            cleaned = df[col].astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False)
            df[col] = pd.to_numeric(cleaned, errors="coerce")

    # Basic cleaning: drop rows where price is missing or non-positive
    df = df.dropna(subset=[price_col])
    df = df[df[price_col] > 0].copy()

    target_col = price_col
    if log_target:
        # Safe: every remaining price is strictly positive.
        df["log_price"] = np.log(df[price_col])
        target_col = "log_price"

    numeric_cols = [
        "service_fee",
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365",
        "review_rate_number",
        "lat",
        "long",
        "construction_year",
    ]
    numeric_cols = [c for c in numeric_cols if c in df.columns]

    categorical_cols = [
        "room_type",
        "neighbourhood_group",
        "neighbourhood",
        "cancellation_policy",
        "host_identity_verified",
        "instant_bookable",
        "country",
        "country_code",
    ]
    categorical_cols = [c for c in categorical_cols if c in df.columns]

    # Give missing categories an explicit label and always cast to str so the
    # encoder never sees a column of mixed Python types.
    for col in categorical_cols:
        df[col] = df[col].fillna("__MISSING__").astype(str)

    # Train/validation/test split (80/10/10). Copies are taken so the in-place
    # column assignments below act on independent frames.
    train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()
    splits = (train_df, val_df, test_df)

    if numeric_cols:
        # Median imputation using TRAIN medians only; a column with no
        # observed value in train falls back to 0.
        train_medians = train_df[numeric_cols].median().fillna(0)
        for part in splits:
            part[numeric_cols] = part[numeric_cols].fillna(train_medians)

        # Scale numeric features: fit on train, transform all
        scaler = StandardScaler()
        train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])
        val_df[numeric_cols] = scaler.transform(val_df[numeric_cols])
        test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

    # Ordinal encode categorical features; categories unseen in train come back as -1.
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

    if categorical_cols:
        encoder.fit(train_df[categorical_cols])
        for part in splits:
            codes = encoder.transform(part[categorical_cols])
            # Map -1 (unknown) to 0, and shift all known categories up by 1.
            part[categorical_cols] = np.where(codes == -1, 0, codes + 1).astype(int)

    # Save processed datasets
    os.makedirs(out_dir, exist_ok=True)
    train_path = os.path.join(out_dir, "train_processed.csv")
    val_path = os.path.join(out_dir, "val_processed.csv")
    test_path = os.path.join(out_dir, "test_processed.csv")

    train_df.to_csv(train_path, index=False)
    val_df.to_csv(val_path, index=False)
    test_df.to_csv(test_path, index=False)

    # Save metadata. Cardinality = known categories + 1 for the reserved
    # "unknown" code 0.
    metadata = {
        "target_col": target_col,
        "numeric_cols": numeric_cols,
        "categorical_cols": categorical_cols,
        "cat_cardinalities": {
            col: len(encoder.categories_[i]) + 1
            for i, col in enumerate(categorical_cols)
        },
    }

    meta_path = os.path.join(out_dir, "metadata.json")
    with open(meta_path, "w") as f:
        json.dump(metadata, f, indent=2)

    print("Preprocess done.")
    print(f"Saved: {train_path}")
    print(f"Saved: {val_path}")
    print(f"Saved: {test_path}")
    print(f"Saved: {meta_path}")

In [None]:
# Run the log_price variant of preprocessing defined in the previous cell.
# It writes to the same file names in OUT_DIR as the section-1 run, replacing those files.
preprocess_airbnb()

  df = pd.read_csv(raw_path)


Preprocess done.
Saved: /content/data/train_processed.csv
Saved: /content/data/val_processed.csv
Saved: /content/data/test_processed.csv
Saved: /content/data/metadata.json


## **4.1. Import libraries and configure paths**

In [None]:
import os
import json
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Directory where preprocessing stored all output files
DATA_DIR = "/content/data"   # Must match OUT_DIR from the preprocessing section of this notebook

META_PATH = os.path.join(DATA_DIR, "metadata.json")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Bare expression so the notebook displays which device was selected.
device

'cpu'

## **4.2. Prepare the Airbnb Open Dataset**

In [None]:
class AirbnbDataset(Dataset):
    """Map-style dataset over one processed split CSV.

    Column roles are read from the metadata JSON (``target_col``,
    ``numeric_cols``, ``categorical_cols``). Each item is a tuple
    ``(x_num, x_cat, y)``: float32 numeric features, int64 categorical codes
    and a float32 scalar target. A feature group with no columns is returned
    as an empty 1-D tensor.
    """

    def __init__(self, csv_path: str, meta_path: str):
        self.df = pd.read_csv(csv_path)

        with open(meta_path, "r") as fh:
            schema = json.load(fh)

        self.target_col = schema["target_col"]
        self.numeric_cols = schema["numeric_cols"]
        self.categorical_cols = schema["categorical_cols"]

        self.y = self.df[self.target_col].values.astype("float32")
        # None means "no columns of this kind"; __getitem__ handles that case.
        self.x_num = self._columns_as(self.numeric_cols, "float32")
        self.x_cat = self._columns_as(self.categorical_cols, "int64")

    def _columns_as(self, columns, dtype):
        """Return ``columns`` of the frame as a NumPy array of ``dtype``, or None if the list is empty."""
        if not columns:
            return None
        return self.df[columns].values.astype(dtype)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        target = torch.tensor(self.y[idx], dtype=torch.float32)

        if self.x_num is None:
            numeric = torch.empty(0, dtype=torch.float32)
        else:
            numeric = torch.tensor(self.x_num[idx], dtype=torch.float32)

        if self.x_cat is None:
            categorical = torch.empty(0, dtype=torch.long)
        else:
            categorical = torch.tensor(self.x_cat[idx], dtype=torch.long)

        return numeric, categorical, target

## **4.3. MLP Pricing Model**

In [None]:
class MLPPriceModel(nn.Module):
    """MLP regressor over numeric features plus one embedding per categorical column.

    The column lists and embedding table sizes are read from the metadata JSON
    (``numeric_cols``, ``categorical_cols``, ``cat_cardinalities``).

    Args:
        meta_path: Path to the metadata JSON file.
        embed_dim: Width of every categorical embedding.
        hidden_dim: Width of the first hidden layer; the second is ``hidden_dim // 2``.
        dropout: Dropout probability after the first hidden layer. Defaults to
            0.1, the value that used to be hard-coded.
    """

    def __init__(self, meta_path: str, embed_dim: int = 8, hidden_dim: int = 256, dropout: float = 0.1):
        super().__init__()

        with open(meta_path, "r") as f:
            meta = json.load(f)

        self.numeric_cols = meta["numeric_cols"]
        self.categorical_cols = meta["categorical_cols"]
        cat_cardinalities = meta["cat_cardinalities"]

        self.num_numeric = len(self.numeric_cols)

        # One embedding table per categorical column, in metadata order.
        self.embeds = nn.ModuleList()
        for col in self.categorical_cols:
            # cat_cardinalities[col] already includes the +1 for the unknown category
            num_categories = cat_cardinalities[col]
            self.embeds.append(nn.Embedding(num_categories, embed_dim))

        input_dim = self.num_numeric + len(self.categorical_cols) * embed_dim

        self.mlp = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 1),
        )

    def forward(self, x_num, x_cat):
        """Return one prediction per row, shape ``[B]``.

        Column ``i`` of ``x_cat`` is looked up in the ``i``-th embedding table;
        the embeddings are concatenated after ``x_num`` along dim 1.
        """
        embed_list = []
        for i, emb in enumerate(self.embeds):
            embed_list.append(emb(x_cat[:, i]))  # [B, embed_dim]

        if embed_list:
            x_embed = torch.cat(embed_list, dim=1)
            x = torch.cat([x_num, x_embed], dim=1)
        else:
            # No categorical columns: the MLP sees the numeric features only.
            x = x_num

        out = self.mlp(x).squeeze(1)  # [B]
        return out

## **4.4. Train and Test function**

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean training loss.

    Each batch is an ``(x_num, x_cat, y)`` triple. The returned value is the
    per-sample average: every batch loss is weighted by its batch size.
    """
    model.train()
    loss_sum, seen = 0.0, 0

    for batch in loader:
        feats_num, feats_cat, target = (t.to(device) for t in batch)

        optimizer.zero_grad()
        batch_loss = criterion(model(feats_num, feats_cat), target)
        batch_loss.backward()
        optimizer.step()

        count = target.size(0)
        loss_sum += batch_loss.item() * count
        seen += count

    return loss_sum / seen

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    """Return the per-sample mean of ``criterion`` over ``loader`` without updating the model.

    The model is switched to eval mode and gradients are disabled; each batch
    loss is weighted by its batch size before averaging.
    """
    model.eval()
    loss_sum, seen = 0.0, 0

    for batch in loader:
        feats_num, feats_cat, target = (t.to(device) for t in batch)

        batch_loss = criterion(model(feats_num, feats_cat), target)

        count = target.size(0)
        loss_sum += batch_loss.item() * count
        seen += count

    return loss_sum / seen

@torch.no_grad()
def eval_with_metrics(model, loader, device):
    """Predict over the whole ``loader`` and return ``(rmse, mae)`` after exponentiating.

    Predictions and targets are gathered batch by batch, concatenated, and both
    passed through ``np.exp`` before the metrics are computed, so the errors
    are measured on the exponentiated values rather than the raw model outputs.
    """
    model.eval()
    pred_chunks, target_chunks = [], []

    for batch in loader:
        feats_num, feats_cat, target = (t.to(device) for t in batch)

        batch_preds = model(feats_num, feats_cat)

        pred_chunks.append(batch_preds.detach().cpu().numpy())
        target_chunks.append(target.detach().cpu().numpy())

    # Undo the log transform on both sides before scoring.
    predictions = np.exp(np.concatenate(pred_chunks, axis=0))
    targets = np.exp(np.concatenate(target_chunks, axis=0))

    rmse = np.sqrt(mean_squared_error(targets, predictions))
    mae = mean_absolute_error(targets, predictions)

    return rmse, mae

## **4.5. Load Data and Train Baseline MLP**

In [None]:
batch_size = 256
epochs = 20
lr = 1e-3
embed_dim = 8
hidden_dim = 256

# Seed PyTorch's global RNG before the loaders and model are created so that
# weight initialisation, dropout masks and batch shuffling repeat across runs.
# NOTE(review): some CUDA kernels can still be non-deterministic.
torch.manual_seed(42)

train_csv = os.path.join(DATA_DIR, "train_processed.csv")
val_csv   = os.path.join(DATA_DIR, "val_processed.csv")

# The target column comes from metadata.json; after the section-4
# preprocessing run that is log_price, so the losses below are in log space.
train_ds = AirbnbDataset(train_csv, META_PATH)
val_ds   = AirbnbDataset(val_csv, META_PATH)

# Only the training loader shuffles; validation order is kept fixed.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

model = MLPPriceModel(META_PATH, embed_dim=embed_dim, hidden_dim=hidden_dim).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Checkpointing: keep the weights of the epoch with the lowest validation loss.
# NOTE(review): this is the same checkpoint path as the section-3 baseline,
# so that file is overwritten - confirm this is intended.
best_val = float("inf")
save_dir = "/content/experiments/outputs"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "best_mlp.pt")

for epoch in range(1, epochs + 1):
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model, val_loader, criterion, device)

    print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f}")

    if val_loss < best_val:
        best_val = val_loss
        torch.save(model.state_dict(), save_path)
        print(f"  -> New best saved to {save_path}")

Epoch 01 | train_loss=1.1727 | val_loss=0.0811
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 02 | train_loss=0.0911 | val_loss=0.0201
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 03 | train_loss=0.0513 | val_loss=0.0132
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 04 | train_loss=0.0409 | val_loss=0.0059
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 05 | train_loss=0.0367 | val_loss=0.0079
Epoch 06 | train_loss=0.0326 | val_loss=0.0051
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 07 | train_loss=0.0306 | val_loss=0.0074
Epoch 08 | train_loss=0.0287 | val_loss=0.0057
Epoch 09 | train_loss=0.0266 | val_loss=0.0038
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 10 | train_loss=0.0256 | val_loss=0.0078
Epoch 11 | train_loss=0.0247 | val_loss=0.0047
Epoch 12 | train_loss=0.0247 | val_loss=0.0034
  -> New best saved to /content/experiments/outputs/

## **4.6. Evaluate Baseline MLP**

In [None]:
# Build the test loader with the same batch size as training; no shuffling.
test_csv = os.path.join(DATA_DIR, "test_processed.csv")
test_ds = AirbnbDataset(test_csv, META_PATH)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

# Recreate the architecture and load the best checkpoint saved by the training
# loop (save_path), instead of reusing the last-epoch weights held in `model`.
best_model = MLPPriceModel(META_PATH, embed_dim=embed_dim, hidden_dim=hidden_dim).to(device)
best_model.load_state_dict(torch.load(save_path, map_location=device))

# The section-4 eval_with_metrics exponentiates predictions and targets before
# scoring, so these RMSE/MAE values are not in log space.
val_rmse, val_mae = eval_with_metrics(best_model, val_loader, device)
print(f"Best MLP on VAL -> RMSE: {val_rmse:.3f}, MAE: {val_mae:.3f}")

test_rmse, test_mae = eval_with_metrics(best_model, test_loader, device)
print(f"Best MLP on TEST -> RMSE: {test_rmse:.3f}, MAE: {test_mae:.3f}")

Best MLP on VAL -> RMSE: 32.173, MAE: 20.038
Best MLP on TEST -> RMSE: 33.283, MAE: 20.094


# **5. Large MLP vs Small MLP**

In [None]:
class MLPPriceModel(nn.Module):
    """MLP regressor over numeric features plus learned categorical embeddings.

    The column layout is read from a metadata JSON file with the keys
    ``numeric_cols``, ``categorical_cols`` and ``cat_cardinalities``.

    Args:
        meta_path: Path to the metadata JSON file.
        embed_dim: Width of every categorical embedding vector.
        hidden_dim: Width of the first hidden layer; the second is half of it.
        dropout: Dropout probability applied after the first hidden layer.
    """

    def __init__(self, meta_path: str, embed_dim: int = 8, hidden_dim: int = 256, dropout: float = 0.1):
        super().__init__()

        with open(meta_path, "r") as meta_file:
            schema = json.load(meta_file)

        self.numeric_cols = schema["numeric_cols"]
        self.categorical_cols = schema["categorical_cols"]
        table_sizes = schema["cat_cardinalities"]

        self.num_numeric = len(self.numeric_cols)

        # One embedding table per categorical column. The cardinality from the
        # metadata is used directly as the table size (it is expected to already
        # include the extra slot for the unknown category).
        self.embeds = nn.ModuleList(
            [nn.Embedding(table_sizes[name], embed_dim) for name in self.categorical_cols]
        )

        # Input = raw numeric features followed by all embedding vectors.
        in_features = self.num_numeric + embed_dim * len(self.categorical_cols)
        half_hidden = hidden_dim // 2

        layers = [
            nn.Linear(in_features, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, half_hidden),
            nn.ReLU(),
            nn.Linear(half_hidden, 1),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x_num, x_cat):
        """Predict one value per row.

        Column ``i`` of ``x_cat`` is looked up in the ``i``-th embedding table;
        the results are concatenated after ``x_num`` along dim 1.
        Returns a 1-D tensor (the trailing size-1 dim is squeezed away).
        """
        # Without categorical columns the numeric features go straight into the MLP.
        if len(self.embeds) == 0:
            return self.mlp(x_num).squeeze(1)  # [B]

        looked_up = [table(x_cat[:, idx]) for idx, table in enumerate(self.embeds)]  # each [B, embed_dim]
        cat_features = torch.cat(looked_up, dim=1)
        features = torch.cat([x_num, cat_features], dim=1)
        return self.mlp(features).squeeze(1)  # [B]

## **5.1. Train Small MLP**

### Subtask:
Train the 'Small MLP' configuration (embed_dim=4, hidden_dim=128, dropout=0.05) and evaluate its performance.


**Reasoning**:
Train the Small MLP model with the specified hyperparameters, save the best model, and evaluate it on validation and test sets.



In [None]:
# --- Small MLP configuration ---
embed_dim_small, hidden_dim_small, dropout_small = 4, 128, 0.05
epochs, lr, batch_size = 20, 1e-3, 256

# Rebuild datasets and loaders for all three splits so this experiment starts
# from the processed files on disk rather than from loaders of earlier cells.
train_ds = AirbnbDataset(os.path.join(DATA_DIR, "train_processed.csv"), META_PATH)
val_ds = AirbnbDataset(os.path.join(DATA_DIR, "val_processed.csv"), META_PATH)
test_ds = AirbnbDataset(os.path.join(DATA_DIR, "test_processed.csv"), META_PATH)

# Only the training loader shuffles.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=batch_size)

# --- Model, loss, optimizer ---
model_small = MLPPriceModel(
    META_PATH,
    embed_dim=embed_dim_small,
    hidden_dim=hidden_dim_small,
    dropout=dropout_small,
)
model_small = model_small.to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_small.parameters(), lr=lr)

# --- Training with best-checkpoint tracking ---
best_val_small = float("inf")
save_path_small = os.path.join(save_dir, "small_mlp.pt")

print("Training Small MLP...")
for epoch in range(1, epochs + 1):
    train_loss = train_one_epoch(model_small, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model_small, val_loader, criterion, device)

    print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f}")

    # Checkpoint only on a strict improvement of the validation loss.
    if not (val_loss < best_val_small):
        continue
    best_val_small = val_loss
    torch.save(model_small.state_dict(), save_path_small)
    print(f"  -> New best saved to {save_path_small}")

# --- Evaluation of the best checkpoint ---
print("\nEvaluating Best Small MLP...")
model_small.load_state_dict(
    torch.load(save_path_small, map_location=device)
)

val_rmse, val_mae = eval_with_metrics(model_small, val_loader, device)
print(f"Small MLP on VAL  -> RMSE: {val_rmse:.3f}, MAE: {val_mae:.3f}")

test_rmse, test_mae = eval_with_metrics(model_small, test_loader, device)
print(f"Small MLP on TEST -> RMSE: {test_rmse:.3f}, MAE: {test_mae:.3f}")

Training Small MLP...
Epoch 01 | train_loss=195501.0435 | val_loss=47075.7379
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 02 | train_loss=15763.8485 | val_loss=808.2934
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 03 | train_loss=724.1164 | val_loss=383.7133
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 04 | train_loss=574.6233 | val_loss=337.0288
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 05 | train_loss=529.4603 | val_loss=312.9179
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 06 | train_loss=510.0393 | val_loss=301.3764
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 07 | train_loss=496.8661 | val_loss=297.0272
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 08 | train_loss=486.7819 | val_loss=292.6822
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 09 | train_loss=484.2217 | val_loss=2

## **5.2. Train Large MLP**

### Subtask:
Train the 'Large MLP' configuration (embed_dim=16, hidden_dim=512, dropout=0.2) and evaluate its performance.


**Reasoning**:
Train the Large MLP model with the specified hyperparameters, save the best model, and evaluate it on validation and test sets.



In [None]:
# --- Large MLP configuration ---
embed_dim_large, hidden_dim_large, dropout_large = 16, 512, 0.2
epochs, lr = 20, 1e-3

# --- Model, loss, optimizer ---
# NOTE: train_loader / val_loader / test_loader are not rebuilt here; this cell
# uses whatever loaders are currently defined in the kernel.
model_large = MLPPriceModel(
    META_PATH,
    embed_dim=embed_dim_large,
    hidden_dim=hidden_dim_large,
    dropout=dropout_large,
)
model_large = model_large.to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_large.parameters(), lr=lr)

# --- Training with best-checkpoint tracking ---
best_val_large = float("inf")
save_path_large = os.path.join(save_dir, "large_mlp.pt")

print("Training Large MLP...")
for epoch in range(1, epochs + 1):
    train_loss = train_one_epoch(model_large, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model_large, val_loader, criterion, device)

    print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f}")

    # Checkpoint only on a strict improvement of the validation loss.
    if not (val_loss < best_val_large):
        continue
    best_val_large = val_loss
    torch.save(model_large.state_dict(), save_path_large)
    print(f"  -> New best saved to {save_path_large}")

# --- Evaluation of the best checkpoint ---
print("\nEvaluating Best Large MLP...")
model_large.load_state_dict(
    torch.load(save_path_large, map_location=device)
)

val_rmse, val_mae = eval_with_metrics(model_large, val_loader, device)
print(f"Large MLP on VAL  -> RMSE: {val_rmse:.3f}, MAE: {val_mae:.3f}")

test_rmse, test_mae = eval_with_metrics(model_large, test_loader, device)
print(f"Large MLP on TEST -> RMSE: {test_rmse:.3f}, MAE: {test_mae:.3f}")

Training Large MLP...
Epoch 01 | train_loss=100110.5215 | val_loss=1359.9446
  -> New best saved to /content/experiments/outputs/large_mlp.pt
Epoch 02 | train_loss=805.2411 | val_loss=359.7810
  -> New best saved to /content/experiments/outputs/large_mlp.pt
Epoch 03 | train_loss=668.2381 | val_loss=384.9090
Epoch 04 | train_loss=639.8300 | val_loss=317.8292
  -> New best saved to /content/experiments/outputs/large_mlp.pt
Epoch 05 | train_loss=622.2850 | val_loss=360.2366
Epoch 06 | train_loss=604.9114 | val_loss=312.4567
  -> New best saved to /content/experiments/outputs/large_mlp.pt
Epoch 07 | train_loss=588.9663 | val_loss=332.2355
Epoch 08 | train_loss=584.6105 | val_loss=327.5446
Epoch 09 | train_loss=572.4352 | val_loss=334.8095
Epoch 10 | train_loss=578.5624 | val_loss=320.2399
Epoch 11 | train_loss=579.6523 | val_loss=374.6021
Epoch 12 | train_loss=566.7217 | val_loss=355.3954
Epoch 13 | train_loss=570.5925 | val_loss=490.4145
Epoch 14 | train_loss=560.1618 | val_loss=324.4490


# **6. TF-IDF**

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

def preprocess_airbnb(raw_path=RAW_PATH, out_dir=OUT_DIR):
    """Clean the raw Airbnb CSV, add TF-IDF text features, and write train/val/test splits.

    Pipeline:
      1. Normalise column names, keep the known columns, parse ``price`` and
         ``service_fee`` (stripping ``$`` and ``,``), and drop rows without a
         positive price.
      2. Impute numeric columns with their median and label missing
         categorical values ``"__MISSING__"``.
      3. Split the rows 80/10/10 into train/val/test (``random_state=42``).
      4. Fit TF-IDF (at most 5000 terms) and a 16-component truncated SVD on the
         *training* text only, then project every row. The components are
         added to each split as ``text_emb_<i>`` columns.
      5. Standardise the numeric and text-embedding columns and ordinal-encode
         the categorical columns, both fitted on the training split. Encoded
         categories are shifted by one so index 0 means "unseen in training".

    Args:
        raw_path: Path of the raw CSV file.
        out_dir: Directory that receives all output files (created if missing).

    Writes to ``out_dir``:
        ``train_processed.csv``, ``val_processed.csv``, ``test_processed.csv``,
        ``text_embeddings.npy`` (unscaled embeddings for all rows, in the row
        order of the cleaned frame) and ``metadata.json``. In the metadata,
        ``numeric_cols`` lists the original numeric columns followed by the
        text-embedding columns, so any consumer that reads ``numeric_cols``
        feeds the text features to the model; ``text_cols`` lists the
        embedding columns on their own.

    Returns:
        None.
    """
    # Imported locally so the function does not depend on an import made in
    # another cell. TruncatedSVD works directly on the sparse TF-IDF matrix,
    # avoiding a dense (rows x 5000) copy.
    from sklearn.decomposition import TruncatedSVD

    text_dim = 16
    os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(raw_path)
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

    cols_keep = [
        "price",
        "service_fee",
        "room_type",
        "neighbourhood_group",
        "neighbourhood",
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365",
        "review_rate_number",
        "lat",
        "long",
        "cancellation_policy",
        "host_identity_verified",
        "instant_bookable",
        "construction_year",
        "country",
        "country_code",
        "name",
        "description",
    ]
    cols_keep = [c for c in cols_keep if c in df.columns]
    df = df[cols_keep].copy()

    target_col = "price"

    # Money columns arrive as text such as "$1,234"; strip the symbols before parsing.
    for col in [target_col, 'service_fee']:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace('$', '', regex=False).str.replace(',', '', regex=False)
            df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df.dropna(subset=[target_col])
    # .copy() so the column assignments below operate on an independent frame.
    df = df[df[target_col] > 0].copy()

    numeric_cols = [
        "service_fee",
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365",
        "review_rate_number",
        "lat",
        "long",
        "construction_year",
    ]
    numeric_cols = [c for c in numeric_cols if c in df.columns]

    for col in numeric_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].median())
        # A column that is entirely missing has a NaN median; fall back to 0.
        df[col] = df[col].fillna(0)

    categorical_cols = [
        "room_type",
        "neighbourhood_group",
        "neighbourhood",
        "cancellation_policy",
        "host_identity_verified",
        "instant_bookable",
        "country",
        "country_code",
    ]
    categorical_cols = [c for c in categorical_cols if c in df.columns]

    # Always cast to str so the encoder sees one uniform type per column,
    # whether or not the column happened to contain missing values.
    for col in categorical_cols:
        df[col] = df[col].fillna("__MISSING__").astype(str)

    # ---- Split by row position (80/10/10) ----
    # Splitting positions instead of the frame gives the same partition for the
    # same random_state and lets the text matrix be indexed with the same rows.
    row_positions = np.arange(len(df))
    train_pos, temp_pos = train_test_split(row_positions, test_size=0.2, random_state=42)
    val_pos, test_pos = train_test_split(temp_pos, test_size=0.5, random_state=42)

    train_df = df.iloc[train_pos].copy()
    val_df = df.iloc[val_pos].copy()
    test_df = df.iloc[test_pos].copy()

    # ---- Text features ----
    text_source_cols = [
        c for c in ("description", "name", "neighbourhood", "neighbourhood_group")
        if c in df.columns
    ]

    text_feature_cols = []
    if text_source_cols:
        text_series = df[text_source_cols].fillna("").astype(str).agg(" ".join, axis=1)

        # Vocabulary/IDF weights and the SVD basis come from training rows only,
        # so no validation/test text leaks into the features.
        tfidf = TfidfVectorizer(max_features=5000)
        tfidf.fit(text_series.iloc[train_pos])
        tfidf_matrix = tfidf.transform(text_series)  # sparse, one row per listing

        svd = TruncatedSVD(n_components=text_dim, random_state=42)
        svd.fit(tfidf_matrix[train_pos])
        text_embeddings = svd.transform(tfidf_matrix)

        text_feature_cols = [f"text_emb_{i}" for i in range(text_embeddings.shape[1])]
        for split_df, split_pos in ((train_df, train_pos), (val_df, val_pos), (test_df, test_pos)):
            for j, col in enumerate(text_feature_cols):
                split_df[col] = text_embeddings[split_pos, j]
    else:
        # No text columns available: there is nothing to vectorise.
        text_embeddings = np.zeros((len(df), 0), dtype="float64")

    text_embed_path = os.path.join(out_dir, "text_embeddings.npy")
    np.save(text_embed_path, text_embeddings)

    # ---- Scale numeric + text features (statistics from train only) ----
    model_numeric_cols = numeric_cols + text_feature_cols
    scaler = StandardScaler()
    if model_numeric_cols:
        train_df[model_numeric_cols] = scaler.fit_transform(train_df[model_numeric_cols])
        val_df[model_numeric_cols] = scaler.transform(val_df[model_numeric_cols])
        test_df[model_numeric_cols] = scaler.transform(test_df[model_numeric_cols])

    # ---- Encode categoricals (categories from train only) ----
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

    if categorical_cols:
        train_cat_encoded = encoder.fit_transform(train_df[categorical_cols])
        val_cat_encoded = encoder.transform(val_df[categorical_cols])
        test_cat_encoded = encoder.transform(test_df[categorical_cols])

        # Shift known categories to 1..K and map unseen values (-1) to index 0.
        train_df[categorical_cols] = np.where(train_cat_encoded == -1, 0, train_cat_encoded + 1).astype(int)
        val_df[categorical_cols] = np.where(val_cat_encoded == -1, 0, val_cat_encoded + 1).astype(int)
        test_df[categorical_cols] = np.where(test_cat_encoded == -1, 0, test_cat_encoded + 1).astype(int)

    train_df.to_csv(os.path.join(out_dir, "train_processed.csv"), index=False)
    val_df.to_csv(os.path.join(out_dir, "val_processed.csv"), index=False)
    test_df.to_csv(os.path.join(out_dir, "test_processed.csv"), index=False)

    metadata = {
        "target_col": target_col,
        # Text embeddings are listed as numeric inputs so that consumers which
        # read "numeric_cols" actually feed them to the model.
        "numeric_cols": model_numeric_cols,
        "categorical_cols": categorical_cols,
        "text_cols": text_feature_cols,
        "text_dim": len(text_feature_cols),
        "cat_cardinalities": {},
    }

    # +1 accounts for the reserved "unknown" index 0.
    for i, col in enumerate(categorical_cols):
        metadata["cat_cardinalities"][col] = len(encoder.categories_[i]) + 1

    with open(os.path.join(out_dir, "metadata.json"), "w") as f:
        json.dump(metadata, f, indent=2)

    print("Preprocess done.")
    print("Saved:", out_dir)


In [None]:
# Run preprocessing with its default paths (RAW_PATH -> OUT_DIR); this rewrites
# the processed train/val/test CSVs and metadata.json used by the cells below.
preprocess_airbnb()

  df = pd.read_csv(raw_path)


Preprocess done.
Saved: /content/data


In [None]:
# --- Baseline hyperparameters ---
batch_size, epochs, lr = 256, 20, 1e-3
embed_dim, hidden_dim = 8, 256

# --- Data: reload the processed train/val splits from disk ---
train_csv = os.path.join(DATA_DIR, "train_processed.csv")
val_csv = os.path.join(DATA_DIR, "val_processed.csv")

train_ds = AirbnbDataset(train_csv, META_PATH)
val_ds = AirbnbDataset(val_csv, META_PATH)

# Only the training loader shuffles.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=batch_size)

# --- Model, loss, optimizer ---
model = MLPPriceModel(META_PATH, hidden_dim=hidden_dim, embed_dim=embed_dim)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# --- Checkpoint location ---
best_val = float("inf")
save_dir = "/content/experiments/outputs"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "best_mlp.pt")

# --- Training with best-checkpoint tracking ---
for epoch in range(1, epochs + 1):
    train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model, val_loader, criterion, device)

    print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f}")

    # Checkpoint only on a strict improvement of the validation loss.
    if not (val_loss < best_val):
        continue
    best_val = val_loss
    torch.save(model.state_dict(), save_path)
    print(f"  -> New best saved to {save_path}")

Epoch 01 | train_loss=131489.5060 | val_loss=22394.7534
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 02 | train_loss=3286.8720 | val_loss=412.0780
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 03 | train_loss=622.4379 | val_loss=322.4845
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 04 | train_loss=553.3055 | val_loss=312.8251
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 05 | train_loss=535.1397 | val_loss=298.0162
  -> New best saved to /content/experiments/outputs/best_mlp.pt
Epoch 06 | train_loss=530.9735 | val_loss=313.5283
Epoch 07 | train_loss=511.4739 | val_loss=319.2828
Epoch 08 | train_loss=517.2073 | val_loss=299.2469
Epoch 09 | train_loss=511.8141 | val_loss=304.5762
Epoch 10 | train_loss=515.3220 | val_loss=313.5430
Epoch 11 | train_loss=513.1058 | val_loss=314.2175
Epoch 12 | train_loss=514.6342 | val_loss=307.1279
Epoch 13 | train_loss=503.3811 | val_loss=297.7880
  -> New be

In [None]:
# Test split loader (evaluation order is fixed: no shuffling).
test_csv = os.path.join(DATA_DIR, "test_processed.csv")
test_ds = AirbnbDataset(test_csv, META_PATH)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=batch_size)

# Fresh model instance with the same hyperparameters, loaded with the best
# checkpoint written by the training cell.
best_model = MLPPriceModel(META_PATH, hidden_dim=hidden_dim, embed_dim=embed_dim)
best_model = best_model.to(device)
best_model.load_state_dict(
    torch.load(save_path, map_location=device)
)

# Report validation metrics, then test metrics.
val_rmse, val_mae = eval_with_metrics(best_model, val_loader, device)
print(f"Best MLP on VAL -> RMSE: {val_rmse:.3f}, MAE: {val_mae:.3f}")

test_rmse, test_mae = eval_with_metrics(best_model, test_loader, device)
print(f"Best MLP on TEST -> RMSE: {test_rmse:.3f}, MAE: {test_mae:.3f}")

Best MLP on VAL -> RMSE: 17.021, MAE: 2.992
Best MLP on TEST -> RMSE: 19.855, MAE: 3.217


# **7. Deeper MLP with BatchNorm & Dropout**

### Subtask:
Define the DeepMLPPriceModel class, train it on the prepared data, and evaluate performance.


Define a new `DeepMLPPriceModel` class featuring a deeper architecture (3-4 layers), Batch Normalization layers, and increased Dropout (0.2-0.3). Train this model on the processed dataset located in `"/content/data"` and evaluate its RMSE and MAE on validation and test sets to verify architectural improvements.

**Reasoning**:
Define the DeepMLPPriceModel class with the specified architecture (BatchNorm, Dropout) and train it using the Airbnb dataset. Then evaluate it on validation and test sets without applying exponential transformation to the predictions.



In [None]:
import torch
import torch.nn as nn
import pandas as pd
import json
import os
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Define Model Class
class DeepMLPPriceModel(nn.Module):
    """Three-block MLP regressor with BatchNorm and Dropout after every hidden layer.

    Hidden widths are ``hidden_dim``, ``hidden_dim // 2`` and ``hidden_dim // 4``;
    a final linear layer maps to a single output. Column names and embedding
    table sizes are read from the metadata JSON at ``meta_path``.

    Args:
        meta_path: Path to the metadata JSON file.
        embed_dim: Width of every categorical embedding vector.
        hidden_dim: Width of the first hidden block.
        dropout: Dropout probability used in all three hidden blocks.
    """

    @staticmethod
    def _hidden_block(in_dim, out_dim, dropout):
        """Linear -> BatchNorm1d -> ReLU -> Dropout, in that order."""
        return nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

    def __init__(self, meta_path, embed_dim=8, hidden_dim=256, dropout=0.3):
        super().__init__()
        with open(meta_path, "r") as meta_file:
            schema = json.load(meta_file)

        self.numeric_cols = schema["numeric_cols"]
        self.categorical_cols = schema["categorical_cols"]
        table_sizes = schema["cat_cardinalities"]

        # One embedding table per categorical column, in column order.
        self.embeds = nn.ModuleList(
            [nn.Embedding(table_sizes[name], embed_dim) for name in self.categorical_cols]
        )

        in_features = len(self.numeric_cols) + embed_dim * len(self.categorical_cols)
        widths = (hidden_dim, hidden_dim // 2, hidden_dim // 4)

        # Attribute names are kept so saved state dicts remain loadable.
        self.block1 = self._hidden_block(in_features, widths[0], dropout)
        self.block2 = self._hidden_block(widths[0], widths[1], dropout)
        self.block3 = self._hidden_block(widths[1], widths[2], dropout)
        self.output = nn.Linear(widths[2], 1)

    def forward(self, x_num, x_cat):
        """Return one prediction per row as a 1-D tensor.

        Column ``i`` of ``x_cat`` indexes the ``i``-th embedding table; the
        embeddings are concatenated after ``x_num`` along dim 1.
        """
        looked_up = [table(x_cat[:, idx]) for idx, table in enumerate(self.embeds)]

        if looked_up:
            hidden = torch.cat([x_num, torch.cat(looked_up, dim=1)], dim=1)
        else:
            hidden = x_num

        for block in (self.block1, self.block2, self.block3):
            hidden = block(hidden)
        return self.output(hidden).squeeze(1)

# ---------------------------------------------------------------
# Configuration, data loaders and training components
# ---------------------------------------------------------------
DATA_DIR = "/content/data"
META_PATH = os.path.join(DATA_DIR, "metadata.json")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
batch_size, lr, epochs = 256, 1e-3, 20

train_ds = AirbnbDataset(os.path.join(DATA_DIR, "train_processed.csv"), META_PATH)
val_ds = AirbnbDataset(os.path.join(DATA_DIR, "val_processed.csv"), META_PATH)
test_ds = AirbnbDataset(os.path.join(DATA_DIR, "test_processed.csv"), META_PATH)

# Only the training loader shuffles.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=batch_size)

model = DeepMLPPriceModel(META_PATH)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.MSELoss()

# ---------------------------------------------------------------
# Training with best-checkpoint tracking
# ---------------------------------------------------------------
save_dir = "/content/experiments/outputs"
os.makedirs(save_dir, exist_ok=True)
best_path = os.path.join(save_dir, "best_deep_mlp.pt")
best_val_loss = float("inf")

print("Training Deep MLP with BatchNorm & Dropout...")

for epoch in range(1, epochs + 1):
    # --- optimisation pass over the training split ---
    model.train()
    total_loss, n = 0, 0
    for x_num, x_cat, y in train_loader:
        x_num = x_num.to(device)
        x_cat = x_cat.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        preds = model(x_num, x_cat)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        # Weight each batch-mean loss by its batch size so the epoch value is
        # a per-sample average even when the last batch is smaller.
        n += y.size(0)
        total_loss += loss.item() * y.size(0)
    train_loss = total_loss / n

    # --- validation pass (no gradients, eval-mode BatchNorm/Dropout) ---
    model.eval()
    val_loss_accum, n_val = 0, 0
    with torch.no_grad():
        for x_num, x_cat, y in val_loader:
            x_num = x_num.to(device)
            x_cat = x_cat.to(device)
            y = y.to(device)
            preds = model(x_num, x_cat)
            loss = criterion(preds, y)
            n_val += y.size(0)
            val_loss_accum += loss.item() * y.size(0)
    val_loss = val_loss_accum / n_val

    print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f}")

    # Checkpoint only on a strict improvement of the validation loss.
    if not (val_loss < best_val_loss):
        continue
    best_val_loss = val_loss
    torch.save(model.state_dict(), best_path)
    print(f"  -> New best saved to {best_path}")

# Evaluation
print("\nEvaluating Best Deep MLP...")
# map_location keeps the load working when the checkpoint was written on a GPU
# runtime and this cell runs on a CPU-only one (as the other load calls in this
# notebook already do).
model.load_state_dict(torch.load(best_path, map_location=device))
model.eval()

def get_metrics(loader):
    """Compute RMSE and MAE of the module-level ``model`` over ``loader``.

    Args:
        loader: Iterable yielding ``(x_num, x_cat, y)`` batches.

    Returns:
        Tuple ``(rmse, mae)`` computed over all batches of ``loader``.

    Note:
        Uses the global ``model`` and ``device`` and switches the model to
        eval mode as a side effect.
    """
    # Put the model in eval mode here rather than trusting the caller, so
    # Dropout is disabled and BatchNorm uses running statistics even if the
    # function is called again after further training.
    model.eval()

    all_preds = []
    all_targets = []
    with torch.no_grad():
        for x_num, x_cat, y in loader:
            x_num, x_cat, y = x_num.to(device), x_cat.to(device), y.to(device)
            preds = model(x_num, x_cat)
            all_preds.append(preds.cpu().numpy())
            all_targets.append(y.cpu().numpy())

    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)

    mse = mean_squared_error(all_targets, all_preds)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(all_targets, all_preds)
    return rmse, mae

val_rmse, val_mae = get_metrics(val_loader)
print(f"Deep MLP on VAL  -> RMSE: {val_rmse:.3f}, MAE: {val_mae:.3f}")

test_rmse, test_mae = get_metrics(test_loader)
print(f"Deep MLP on TEST -> RMSE: {test_rmse:.3f}, MAE: {test_mae:.3f}")

Training Deep MLP with BatchNorm & Dropout...
Epoch 01 | train_loss=484291.9940 | val_loss=457367.2498
  -> New best saved to /content/experiments/outputs/best_deep_mlp.pt
Epoch 02 | train_loss=417280.8405 | val_loss=369281.1189
  -> New best saved to /content/experiments/outputs/best_deep_mlp.pt
Epoch 03 | train_loss=316427.9462 | val_loss=266591.9071
  -> New best saved to /content/experiments/outputs/best_deep_mlp.pt
Epoch 04 | train_loss=209917.2974 | val_loss=154866.3736
  -> New best saved to /content/experiments/outputs/best_deep_mlp.pt
Epoch 05 | train_loss=121481.2638 | val_loss=83054.7934
  -> New best saved to /content/experiments/outputs/best_deep_mlp.pt
Epoch 06 | train_loss=62014.4670 | val_loss=35674.8625
  -> New best saved to /content/experiments/outputs/best_deep_mlp.pt
Epoch 07 | train_loss=29551.1230 | val_loss=15454.3059
  -> New best saved to /content/experiments/outputs/best_deep_mlp.pt
Epoch 08 | train_loss=15374.6996 | val_loss=8748.3724
  -> New best saved to 

# **8. SmoothL1 Loss Experiment**

### Subtask:
Retrain the baseline MLP architecture using SmoothL1Loss to improve robustness to outliers.


**Reasoning**:
I will perform the SmoothL1 Loss experiment by defining the baseline MLP model, initializing it with the SmoothL1Loss function, training it for 20 epochs, and evaluating its performance using RMSE and MAE metrics on the validation and test sets.



In [None]:
import torch
import torch.nn as nn
import json
import os
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Paths to the processed data and its metadata file.
DATA_DIR = "/content/data"
META_PATH = os.path.join(DATA_DIR, "metadata.json")

# Use the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Training hyperparameters.
batch_size, lr, epochs = 256, 1e-3, 20

# Print the target column recorded in the metadata file.
with open(META_PATH, "r") as f:
    meta = json.load(f)
print(f"Target column in metadata: {meta['target_col']}")

class AirbnbDataset(Dataset):
    """Dataset over a processed CSV, split into numeric, categorical and target arrays.

    The metadata JSON names the target column (``target_col``) and the feature
    columns (``numeric_cols``, ``categorical_cols``). Numeric features and the
    target are stored as float32 arrays, categorical features as int64.

    Args:
        csv_path: Path to the processed CSV file.
        meta_path: Path to the metadata JSON file.
    """

    def __init__(self, csv_path, meta_path):
        self.df = pd.read_csv(csv_path)
        with open(meta_path, "r") as meta_file:
            schema = json.load(meta_file)

        self.target_col = schema["target_col"]
        self.numeric_cols = schema["numeric_cols"]
        self.categorical_cols = schema["categorical_cols"]

        self.y = self.df[self.target_col].values.astype("float32")
        self.x_num = self._feature_matrix(self.numeric_cols, "float32")
        self.x_cat = self._feature_matrix(self.categorical_cols, "int64")

    def _feature_matrix(self, columns, dtype):
        """Return the given columns as a 2-D array of ``dtype``.

        An empty column list yields an array with zero columns and one row
        per data row.
        """
        if not columns:
            return np.zeros((len(self.df), 0), dtype=dtype)
        return self.df[columns].values.astype(dtype)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(x_num, x_cat, y)`` tensors for row ``idx``."""
        return tuple(
            torch.tensor(array[idx]) for array in (self.x_num, self.x_cat, self.y)
        )

# Baseline MLP Model Definition
class MLPPriceModel(nn.Module):
    """Baseline MLP regressor: numeric features plus categorical embeddings.

    Layers: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Linear(1), with
    hidden widths ``hidden_dim`` and ``hidden_dim // 2``. The feature layout
    and embedding table sizes are read from the metadata JSON at ``meta_path``.
    """

    def __init__(self, meta_path, embed_dim=8, hidden_dim=256, dropout=0.1):
        super().__init__()
        with open(meta_path, "r") as meta_file:
            schema = json.load(meta_file)
        self.numeric_cols = schema["numeric_cols"]
        self.categorical_cols = schema["categorical_cols"]
        table_sizes = schema["cat_cardinalities"]

        # One embedding table per categorical column, in column order.
        self.embeds = nn.ModuleList()
        for name in self.categorical_cols:
            self.embeds.append(nn.Embedding(table_sizes[name], embed_dim))

        n_numeric = len(self.numeric_cols)
        n_embedded = embed_dim * len(self.categorical_cols)
        half_hidden = hidden_dim // 2

        self.mlp = nn.Sequential(
            nn.Linear(n_numeric + n_embedded, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, half_hidden),
            nn.ReLU(),
            nn.Linear(half_hidden, 1),
        )

    def forward(self, x_num, x_cat):
        """Return one prediction per row as a 1-D tensor."""
        # Numeric features first, then one embedding per categorical column.
        pieces = [x_num]
        for idx, table in enumerate(self.embeds):
            pieces.append(table(x_cat[:, idx]))

        if len(pieces) == 1:
            # No categorical columns: feed the numeric features unchanged.
            x = x_num
        else:
            x = torch.cat(pieces, dim=1)
        return self.mlp(x).squeeze(1)

# ---------------------------------------------------------------
# Data: processed train / val / test splits
# ---------------------------------------------------------------
train_ds = AirbnbDataset(os.path.join(DATA_DIR, "train_processed.csv"), META_PATH)
val_ds = AirbnbDataset(os.path.join(DATA_DIR, "val_processed.csv"), META_PATH)
test_ds = AirbnbDataset(os.path.join(DATA_DIR, "test_processed.csv"), META_PATH)

# Only the training loader shuffles.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=batch_size)

# ---------------------------------------------------------------
# Baseline architecture trained with SmoothL1Loss instead of MSE
# ---------------------------------------------------------------
model = MLPPriceModel(META_PATH, embed_dim=8, hidden_dim=256, dropout=0.1)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.SmoothL1Loss()

# ---------------------------------------------------------------
# Training with best-checkpoint tracking
# ---------------------------------------------------------------
save_dir = "/content/experiments/outputs"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "best_smooth_mlp.pt")
best_val_loss = float("inf")

print("Training Baseline MLP with SmoothL1Loss...")

for epoch in range(1, epochs + 1):
    # --- optimisation pass over the training split ---
    model.train()
    total_loss, n = 0, 0
    for x_num, x_cat, y in train_loader:
        x_num = x_num.to(device)
        x_cat = x_cat.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        preds = model(x_num, x_cat)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        # Batch-mean loss weighted by batch size -> per-sample epoch average.
        n += y.size(0)
        total_loss += loss.item() * y.size(0)

    train_loss = total_loss / n

    # --- validation pass (no gradients, eval mode) ---
    model.eval()
    total_val_loss, n_val = 0, 0
    with torch.no_grad():
        for x_num, x_cat, y in val_loader:
            x_num = x_num.to(device)
            x_cat = x_cat.to(device)
            y = y.to(device)
            preds = model(x_num, x_cat)
            loss = criterion(preds, y)
            n_val += y.size(0)
            total_val_loss += loss.item() * y.size(0)

    val_loss = total_val_loss / n_val

    print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f}")

    # Checkpoint only on a strict improvement of the validation loss.
    if not (val_loss < best_val_loss):
        continue
    best_val_loss = val_loss
    torch.save(model.state_dict(), save_path)
    print(f"  -> New best saved to {save_path}")

# Metric helper shared by the validation and test evaluation below.
def eval_metrics(model, loader, device):
    """Return ``(rmse, mae)`` of ``model`` over every batch in ``loader``.

    The model is switched to eval mode and run without gradient tracking;
    predictions and targets are gathered on the CPU before the metrics are
    computed over the whole split at once.
    """
    model.eval()
    batch_preds = []
    batch_targets = []
    with torch.no_grad():
        for x_num, x_cat, y in loader:
            outputs = model(x_num.to(device), x_cat.to(device))
            batch_preds.append(outputs.cpu().numpy())
            batch_targets.append(y.cpu().numpy())

    predictions = np.concatenate(batch_preds)
    truth = np.concatenate(batch_targets)

    rmse = np.sqrt(mean_squared_error(truth, predictions))
    mae = mean_absolute_error(truth, predictions)
    return rmse, mae

# Restore the best checkpoint before scoring.
model.load_state_dict(
    torch.load(save_path, map_location=device)
)
print("\nEvaluating Best SmoothL1 Model...")

val_rmse, val_mae = eval_metrics(model, val_loader, device)
print(f"SmoothL1 MLP on VAL  -> RMSE: {val_rmse:.3f}, MAE: {val_mae:.3f}")

test_rmse, test_mae = eval_metrics(model, test_loader, device)
print(f"SmoothL1 MLP on TEST -> RMSE: {test_rmse:.3f}, MAE: {test_mae:.3f}")

Target column in metadata: price
Training Baseline MLP with SmoothL1Loss...
Epoch 01 | train_loss=276.0808 | val_loss=19.9877
  -> New best saved to /content/experiments/outputs/best_smooth_mlp.pt
Epoch 02 | train_loss=14.6179 | val_loss=3.5851
  -> New best saved to /content/experiments/outputs/best_smooth_mlp.pt
Epoch 03 | train_loss=12.2438 | val_loss=3.1626
  -> New best saved to /content/experiments/outputs/best_smooth_mlp.pt
Epoch 04 | train_loss=11.8735 | val_loss=3.4423
Epoch 05 | train_loss=11.6937 | val_loss=3.4223
Epoch 06 | train_loss=11.4883 | val_loss=3.5100
Epoch 07 | train_loss=11.5093 | val_loss=4.4283
Epoch 08 | train_loss=11.3698 | val_loss=2.8783
  -> New best saved to /content/experiments/outputs/best_smooth_mlp.pt
Epoch 09 | train_loss=11.2770 | val_loss=3.4705
Epoch 10 | train_loss=11.2170 | val_loss=3.9305
Epoch 11 | train_loss=11.1832 | val_loss=2.6696
  -> New best saved to /content/experiments/outputs/best_smooth_mlp.pt
Epoch 12 | train_loss=11.1950 | val_los

## Final Task

### Subtask:
Summarize and compare performance metrics of Baseline MLP, Deep MLP, and SmoothL1 MLP.


## Summary:

### Q&A

**How does the performance of the Deep MLP compare to the Baseline MLP trained with SmoothL1 Loss?**

The Baseline MLP trained with SmoothL1 Loss significantly outperformed the Deep MLP (trained with standard MSE Loss), particularly in terms of Mean Absolute Error (MAE).
*   **Deep MLP (MSE Loss):** Test RMSE: 22.311, Test MAE: 8.166.
*   **Baseline MLP (SmoothL1 Loss):** Test RMSE: 19.895, Test MAE: 3.325.

While the Deep MLP introduced architectural complexity (3 layers, BatchNorm, higher Dropout), the change in loss function to SmoothL1 provided a much larger gain in accuracy, reducing the MAE by more than half compared to the Deep MLP approach.

### Data Analysis Key Findings

*   **Deep MLP Architecture Performance:**
    *   A deeper model featuring 3 hidden blocks (256 -> 128 -> 64 units), Batch Normalization, and increased Dropout (0.3) was implemented.
    *   Training stabilized with a best validation loss achieved at **Epoch 14**.
    *   The model achieved a **Validation RMSE of 19.615** and **Test RMSE of 22.311**.
    *   The MAE remained relatively high at **8.166** on the test set.

*   **SmoothL1 Loss Experiment:**
    *   Retraining the baseline MLP architecture using `SmoothL1Loss` (which is less sensitive to outliers than MSE) converged quickly, reaching its best validation loss at **Epoch 11**.
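    *   This configuration clearly outperformed the Deep MLP: **Validation RMSE of 17.046** and **Test RMSE of 19.895**. It is, however, on par with the MSE-trained baseline MLP from Section 6 (Test RMSE 19.855, Test MAE 3.217) rather than better than it.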
    *   The **Test MAE dropped to 3.325**, indicating much better per-prediction accuracy compared to the Deep MLP's MAE of ~8.17.

### Insights or Next Steps

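*   **Loss Function Impact:** The MAE drops from ~8.17 (Deep MLP, MSE) to ~3.33 (baseline MLP, SmoothL1), which could suggest that the dataset contains significant outliers that skew MSE-based training. However, this comparison changes the architecture and the loss function at the same time, and the MSE-trained baseline MLP in Section 6 already reached a Test MAE of 3.217. The results therefore do not show that robust loss functions are more critical than architectural depth; a controlled comparison (same architecture, different loss) is needed to separate the two effects.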
*   **Combination Strategy:** The next logical step is to train the **Deep MLP architecture** using **SmoothL1 Loss**. This would test if the architectural improvements (BatchNorm/Depth) can yield even better results when not hindered by the sensitivity of standard MSE loss to outliers.


# **9. Combine**

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

def preprocess_airbnb(raw_path=RAW_PATH, out_dir=OUT_DIR, text_dim=16, max_text_features=5000):
    """Clean the raw Airbnb CSV, embed its text columns, and write train/val/test splits.

    Steps: normalise column names, keep the modelling columns, parse the money
    columns, drop rows without a positive price, impute missing values, split
    80/10/10 (random_state=42), build TF-IDF + PCA text embeddings, scale the
    numeric columns and ordinal-encode the categorical ones.

    The TF-IDF vocabulary, the PCA projection, the scaler and the encoder are all
    fitted on the training split only and then applied to val/test, so no
    information from the held-out rows leaks into the features.

    Args:
        raw_path: Path of the raw CSV file.
        out_dir: Directory that receives the outputs (created if missing).
        text_dim: Number of PCA components kept for the text embedding.
        max_text_features: Vocabulary cap passed to ``TfidfVectorizer``.

    Writes into ``out_dir``:
        train_processed.csv, val_processed.csv, test_processed.csv
        train_text_embeddings.npy, val_text_embeddings.npy, test_text_embeddings.npy
            (row-aligned with the matching CSV)
        text_embeddings.npy (all rows, in the order of the cleaned frame)
        metadata.json

    Returns:
        None.
    """
    os.makedirs(out_dir, exist_ok=True)

    df = pd.read_csv(raw_path)

    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

    cols_keep = [
        "price",
        "service_fee",
        "room_type",
        "neighbourhood_group",
        "neighbourhood",
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365",
        "review_rate_number",
        "lat",
        "long",
        "cancellation_policy",
        "host_identity_verified",
        "instant_bookable",
        "construction_year",
        "country",
        "country_code",
        "name",
        "description",
    ]
    cols_keep = [c for c in cols_keep if c in df.columns]
    df = df[cols_keep].copy()

    target_col = "price"

    # Money columns arrive as strings such as "$1,050"; strip the symbols and parse.
    for col in [target_col, 'service_fee']:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace('$', '', regex=False).str.replace(',', '', regex=False)
            df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df.dropna(subset=[target_col])
    # Explicit copy so the column assignments below act on an independent frame.
    df = df[df[target_col] > 0].copy()

    numeric_cols = [
        "service_fee",
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365",
        "review_rate_number",
        "lat",
        "long",
        "construction_year",
    ]
    numeric_cols = [c for c in numeric_cols if c in df.columns]

    # Median imputation; the trailing fillna(0) covers a column whose median is NaN.
    for col in numeric_cols:
        df[col] = df[col].fillna(df[col].median()).fillna(0)

    categorical_cols = [
        "room_type",
        "neighbourhood_group",
        "neighbourhood",
        "cancellation_policy",
        "host_identity_verified",
        "instant_bookable",
        "country",
        "country_code",
    ]
    categorical_cols = [c for c in categorical_cols if c in df.columns]

    for col in categorical_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna("__MISSING__").astype(str)

    # ----------------------------
    # TEXT FEATURE PIPELINE
    # ----------------------------
    # The corpus is built before the categorical columns are replaced by integer codes.
    text_cols = [
        c for c in ("description", "name", "neighbourhood", "neighbourhood_group")
        if c in df.columns
    ]
    if text_cols:
        corpus = df[text_cols].fillna("").astype(str).agg(" ".join, axis=1).to_numpy()
    else:
        corpus = None

    # Splitting row positions alongside the frame lets every embedding be traced
    # back to its row; the frame split itself is the same as splitting df alone.
    row_pos = np.arange(len(df))
    train_df, temp_df, train_pos, temp_pos = train_test_split(
        df, row_pos, test_size=0.2, random_state=42
    )
    val_df, test_df, val_pos, test_pos = train_test_split(
        temp_df, temp_pos, test_size=0.5, random_state=42
    )
    train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()
    split_pos = {"train": train_pos, "val": val_pos, "test": test_pos}

    if corpus is None:
        # No text columns available: emit all-zero embeddings so shapes stay stable
        # (TfidfVectorizer cannot be fitted on a corpus of empty documents).
        split_text = {name: np.zeros((len(pos), text_dim)) for name, pos in split_pos.items()}
    else:
        tfidf = TfidfVectorizer(max_features=max_text_features)
        pca = PCA(n_components=text_dim, random_state=42)
        # Fit on the training rows only, then project the held-out rows.
        split_text = {
            "train": pca.fit_transform(tfidf.fit_transform(corpus[train_pos]).toarray())
        }
        for name in ("val", "test"):
            split_text[name] = pca.transform(tfidf.transform(corpus[split_pos[name]]).toarray())

    # Per-split files are row-aligned with the CSVs written below; the combined
    # file keeps the original name and the row order of the cleaned frame.
    text_embeddings = np.zeros((len(df), text_dim))
    for name, pos in split_pos.items():
        text_embeddings[pos] = split_text[name]
        np.save(os.path.join(out_dir, f"{name}_text_embeddings.npy"), split_text[name])

    text_embed_path = os.path.join(out_dir, "text_embeddings.npy")
    np.save(text_embed_path, text_embeddings)

    scaler = StandardScaler()
    if numeric_cols:
        train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])
        val_df[numeric_cols] = scaler.transform(val_df[numeric_cols])
        test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])

    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

    if categorical_cols:
        train_cat_encoded = encoder.fit_transform(train_df[categorical_cols])
        val_cat_encoded = encoder.transform(val_df[categorical_cols])
        test_cat_encoded = encoder.transform(test_df[categorical_cols])

        # Map -1 (category unseen in training) to 0 and shift known categories to 1..K.
        train_df[categorical_cols] = np.where(train_cat_encoded == -1, 0, train_cat_encoded + 1).astype(int)
        val_df[categorical_cols] = np.where(val_cat_encoded == -1, 0, val_cat_encoded + 1).astype(int)
        test_df[categorical_cols] = np.where(test_cat_encoded == -1, 0, test_cat_encoded + 1).astype(int)

    train_df.to_csv(os.path.join(out_dir, "train_processed.csv"), index=False)
    val_df.to_csv(os.path.join(out_dir, "val_processed.csv"), index=False)
    test_df.to_csv(os.path.join(out_dir, "test_processed.csv"), index=False)

    metadata = {
        "target_col": target_col,
        "numeric_cols": numeric_cols,
        "categorical_cols": categorical_cols,
        "text_dim": text_dim,
        # +1 reserves code 0 for categories unseen during training.
        "cat_cardinalities": {
            col: len(encoder.categories_[i]) + 1 for i, col in enumerate(categorical_cols)
        },
    }

    with open(os.path.join(out_dir, "metadata.json"), "w") as f:
        json.dump(metadata, f, indent=2)

    print("Preprocess done.")
    print("Saved:", out_dir)

# Run the text-aware preprocessing defined above; this rewrites the split CSVs,
# metadata.json and text_embeddings.npy under OUT_DIR.
preprocess_airbnb()

  df = pd.read_csv(raw_path)


Preprocess done.
Saved: /content/data


In [None]:
class MLPPriceModel(nn.Module):
    """Price regressor: numeric features concatenated with per-column categorical embeddings.

    The column lists and embedding table sizes are read from the metadata JSON at
    ``meta_path`` (keys ``numeric_cols``, ``categorical_cols``, ``cat_cardinalities``).
    The concatenated vector goes through Linear -> ReLU -> Dropout -> Linear -> ReLU
    -> Linear and yields one value per row.
    """

    def __init__(self, meta_path: str, embed_dim: int = 8, hidden_dim: int = 256, dropout: float = 0.1):
        super().__init__()

        with open(meta_path, "r") as meta_file:
            schema = json.load(meta_file)

        self.numeric_cols = schema["numeric_cols"]
        self.categorical_cols = schema["categorical_cols"]
        table_sizes = schema["cat_cardinalities"]

        self.num_numeric = len(self.numeric_cols)

        # One embedding table per categorical column, in metadata order. The stored
        # cardinality is used as-is: it already includes the +1 for the unknown category.
        self.embeds = nn.ModuleList(
            [nn.Embedding(table_sizes[name], embed_dim) for name in self.categorical_cols]
        )

        half_hidden = hidden_dim // 2
        input_dim = self.num_numeric + embed_dim * len(self.categorical_cols)
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, half_hidden),
            nn.ReLU(),
            nn.Linear(half_hidden, 1),
        ]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x_num, x_cat):
        """Return one prediction per row, shape [B].

        Column ``i`` of ``x_cat`` is looked up in the ``i``-th embedding table; the
        results are appended to ``x_num`` along dim 1 before the MLP.
        """
        if len(self.embeds) == 0:
            # No categorical columns: the numeric block is the whole input.
            features = x_num
        else:
            looked_up = [table(x_cat[:, idx]) for idx, table in enumerate(self.embeds)]  # each [B, embed_dim]
            features = torch.cat([x_num, torch.cat(looked_up, dim=1)], dim=1)

        return self.mlp(features).squeeze(1)  # [B]

## **9.1. Train Small MLP**

### Subtask:
Train the 'Small MLP' configuration (embed_dim=4, hidden_dim=128, dropout=0.05) and evaluate its performance.


**Reasoning**:
Train the Small MLP model with the specified hyperparameters, save the best model, and evaluate it on validation and test sets.



In [None]:
# Hyperparameters for Small MLP: smaller embeddings / hidden layer and lighter dropout
# than the MLPPriceModel defaults (embed_dim=8, hidden_dim=256, dropout=0.1).
embed_dim_small = 4
hidden_dim_small = 128
dropout_small = 0.05
epochs = 100
lr = 1e-4
batch_size = 32

# Re-initialize Loaders (to ensure consistency)
# NOTE(review): these read whatever split CSVs and metadata currently sit at DATA_DIR /
# META_PATH (both defined in an earlier cell) — confirm they are the files written by
# the preprocessing run you intend to evaluate.
train_ds = AirbnbDataset(os.path.join(DATA_DIR, "train_processed.csv"), META_PATH)
val_ds   = AirbnbDataset(os.path.join(DATA_DIR, "val_processed.csv"), META_PATH)
test_ds  = AirbnbDataset(os.path.join(DATA_DIR, "test_processed.csv"), META_PATH)

# Only the training loader shuffles; val/test keep file order.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

# Initialize Model
model_small = MLPPriceModel(
    meta_path=META_PATH,
    embed_dim=embed_dim_small,
    hidden_dim=hidden_dim_small,
    dropout=dropout_small
).to(device)

optimizer = torch.optim.Adam(model_small.parameters(), lr=lr)
criterion = nn.MSELoss()

# Training Loop
# Checkpoint path and the lowest validation loss seen so far.
save_path_small = os.path.join(save_dir, "small_mlp.pt")
best_val_small = float("inf")

print("Training Small MLP...")
# NOTE(review): there is no early stopping — all `epochs` iterations always run.
for epoch in range(1, epochs + 1):
    train_loss = train_one_epoch(model_small, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model_small, val_loader, criterion, device)

    print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_loss={val_loss:.4f}")

    # Overwrite the checkpoint only when validation loss strictly improves.
    if val_loss < best_val_small:
        best_val_small = val_loss
        torch.save(model_small.state_dict(), save_path_small)
        print(f"  -> New best saved to {save_path_small}")

print("\nEvaluating Best Small MLP...")
# Restore the best checkpoint so the metrics below are not taken from the last epoch's weights.
model_small.load_state_dict(torch.load(save_path_small, map_location=device))

val_rmse, val_mae = eval_with_metrics(model_small, val_loader, device)
print(f"Small MLP on VAL  -> RMSE: {val_rmse:.3f}, MAE: {val_mae:.3f}")

test_rmse, test_mae = eval_with_metrics(model_small, test_loader, device)
print(f"Small MLP on TEST -> RMSE: {test_rmse:.3f}, MAE: {test_mae:.3f}")

Training Small MLP...
Epoch 01 | train_loss=183948.9628 | val_loss=66130.8807
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 02 | train_loss=33656.3332 | val_loss=4235.9072
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 03 | train_loss=1372.4933 | val_loss=528.4409
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 04 | train_loss=728.8375 | val_loss=390.4685
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 05 | train_loss=619.7787 | val_loss=335.8340
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 06 | train_loss=564.3281 | val_loss=313.9564
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 07 | train_loss=537.8092 | val_loss=303.4005
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 08 | train_loss=515.4329 | val_loss=297.2276
  -> New best saved to /content/experiments/outputs/small_mlp.pt
Epoch 09 | train_loss=507.9888 | val_loss

# **10. TabNet**



In [None]:
!pip install pytorch-tabnet


Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0


In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

def preprocess_airbnb(raw_path=RAW_PATH, out_dir=OUT_DIR):
    """Clean the raw Airbnb CSV and write scaled/encoded train, val and test splits.

    The raw file is reduced to the modelling columns, the money columns are parsed,
    rows without a positive price are dropped and missing values are imputed. The
    data is split 80/10/10 (random_state=42); the scaler and the ordinal encoder are
    fitted on the training split and applied to the other two.

    Args:
        raw_path: Path of the raw CSV file.
        out_dir: Directory receiving ``train_processed.csv``, ``val_processed.csv``,
            ``test_processed.csv`` and ``metadata.json``.

    Returns:
        None.
    """
    target_col = "price"

    df = pd.read_csv(raw_path)

    # Normalise headers: trimmed, lower-case, underscores instead of spaces.
    df.columns = [header.strip().lower().replace(" ", "_") for header in df.columns]

    # Columns used for modelling; any that the file lacks are skipped.
    wanted = [
        "price",
        "service_fee",
        "room_type",
        "neighbourhood_group",
        "neighbourhood",
        "minimum_nights",
        "number_of_reviews",
        "reviews_per_month",
        "calculated_host_listings_count",
        "availability_365",
        "review_rate_number",
        "lat",
        "long",
        "cancellation_policy",
        "host_identity_verified",
        "instant_bookable",
        "construction_year",
        "country",
        "country_code",
    ]
    df = df[[name for name in wanted if name in df.columns]].copy()

    # Strip currency formatting from the money columns, then parse them as numbers.
    for money_col in (target_col, "service_fee"):
        if money_col not in df.columns:
            continue
        cleaned = df[money_col].astype(str)
        for symbol in ("$", ","):
            cleaned = cleaned.str.replace(symbol, "", regex=False)
        df[money_col] = pd.to_numeric(cleaned, errors="coerce")

    # Rows need a known, strictly positive price to be usable.
    df = df.dropna(subset=[target_col])
    has_positive_price = df[target_col] > 0
    df = df[has_positive_price]

    numeric_cols = [
        name
        for name in (
            "service_fee",
            "minimum_nights",
            "number_of_reviews",
            "reviews_per_month",
            "calculated_host_listings_count",
            "availability_365",
            "review_rate_number",
            "lat",
            "long",
            "construction_year",
        )
        if name in df.columns
    ]

    # Median imputation; the second fill covers a column whose median is itself NaN.
    for name in numeric_cols:
        if not df[name].isnull().any():
            continue
        df[name] = df[name].fillna(df[name].median()).fillna(0)

    categorical_cols = [
        name
        for name in (
            "room_type",
            "neighbourhood_group",
            "neighbourhood",
            "cancellation_policy",
            "host_identity_verified",
            "instant_bookable",
            "country",
            "country_code",
        )
        if name in df.columns
    ]

    # Missing categories get an explicit placeholder level before encoding.
    for name in categorical_cols:
        if df[name].isnull().any():
            df[name] = df[name].fillna('__MISSING__').astype(str)

    # 80% train, then the remaining 20% halved into validation and test.
    train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)
    val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)
    held_out = (val_df, test_df)

    # Standardise numeric features with statistics from the training split only.
    scaler = StandardScaler()
    if numeric_cols:
        train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])
        for split in held_out:
            split[numeric_cols] = scaler.transform(split[numeric_cols])

    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

    if categorical_cols:
        # Adding 1 sends the encoder's -1 (unseen category) to 0 and moves every
        # known category to 1..K in a single step.
        train_codes = encoder.fit_transform(train_df[categorical_cols])
        train_df[categorical_cols] = (train_codes + 1).astype(int)
        for split in held_out:
            split_codes = encoder.transform(split[categorical_cols])
            split[categorical_cols] = (split_codes + 1).astype(int)

    train_path = os.path.join(out_dir, "train_processed.csv")
    val_path = os.path.join(out_dir, "val_processed.csv")
    test_path = os.path.join(out_dir, "test_processed.csv")
    meta_path = os.path.join(out_dir, "metadata.json")

    for frame, destination in ((train_df, train_path), (val_df, val_path), (test_df, test_path)):
        frame.to_csv(destination, index=False)

    metadata = {
        "target_col": target_col,
        "numeric_cols": numeric_cols,
        "categorical_cols": categorical_cols,
        # One extra slot per column for the reserved "unknown" code 0.
        "cat_cardinalities": {
            name: len(encoder.categories_[idx]) + 1
            for idx, name in enumerate(categorical_cols)
        },
    }

    with open(meta_path, "w") as meta_file:
        json.dump(metadata, meta_file, indent=2)

    print("Preprocess done.")
    for written in (train_path, val_path, test_path, meta_path):
        print(f"Saved: {written}")

In [None]:
# Regenerate the split CSVs and metadata.json under OUT_DIR with the preprocess_airbnb
# defined in the previous cell (files of the same name from earlier runs are overwritten).
preprocess_airbnb()

  df = pd.read_csv(raw_path)


Preprocess done.
Saved: /content/data/train_processed.csv
Saved: /content/data/val_processed.csv
Saved: /content/data/test_processed.csv
Saved: /content/data/metadata.json


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

def load_processed_data(data_dir=OUT_DIR):
    """Load the processed splits from ``data_dir`` as NumPy feature/target arrays.

    Feature columns are the metadata's ``numeric_cols`` followed by its
    ``categorical_cols``; the target is the metadata's ``target_col``.

    Args:
        data_dir: Directory holding ``metadata.json`` and the three
            ``*_processed.csv`` files.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, meta)`` where
        ``meta`` is the parsed metadata dict.
    """
    with open(os.path.join(data_dir, "metadata.json"), "r") as meta_file:
        meta = json.load(meta_file)

    target_col = meta["target_col"]
    feature_cols = meta["numeric_cols"] + meta["categorical_cols"]

    arrays = []
    for split_name in ("train", "val", "test"):
        frame = pd.read_csv(os.path.join(data_dir, f"{split_name}_processed.csv"))
        # Features first, then the target, matching the documented return order.
        arrays.extend((frame[feature_cols].values, frame[target_col].values))

    return (*arrays, meta)

In [None]:
# Sanity check: as the last expression of the cell, this displays the returned tuple
# (feature/target arrays for each split plus the metadata dict).
# NOTE(review): the repr is large; inspecting each array's .shape would be easier to read.
load_processed_data(data_dir=OUT_DIR)

(array([[ 1.28327864, -0.18974395, -0.49331112, ...,  2.        ,
          1.        ,  1.        ],
        [ 0.72465276, -0.12769356,  0.00841602, ...,  1.        ,
          1.        ,  1.        ],
        [-0.10573705, -0.12769356, -0.41303478, ...,  2.        ,
          1.        ,  1.        ],
        ...,
        [-0.58887294, -0.18974395, -0.53344929, ...,  2.        ,
          1.        ,  1.        ],
        [ 0.43779083,  0.67896147, -0.19227484, ...,  1.        ,
          1.        ,  1.        ],
        [-0.25671702, -0.22076914, -0.43310387, ...,  2.        ,
          1.        ,  1.        ]]),
 array([1050.,  867.,  590., ...,  432.,  772.,  540.]),
 array([[ 1.49465059, -0.18974395, -0.53344929, ...,  2.        ,
          1.        ,  1.        ],
        [ 0.13583089, -0.15871875,  0.55028134, ...,  1.        ,
          1.        ,  1.        ],
        [-1.31357678, -0.18974395,  0.34959048, ...,  1.        ,
          1.        ,  1.        ],
        ..

In [None]:
def train_tabnet(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    max_epochs=400,
    patience=20,
    seed=42,
):
    """Fit a TabNetRegressor with validation-based early stopping and report RMSE/MAE.

    Args:
        X_train, X_val, X_test: 2-D feature arrays for each split.
        y_train, y_val, y_test: Targets for each split. 1-D inputs are reshaped to
            column vectors here, so callers may pass either ``(n,)`` or ``(n, k)``.
        max_epochs: Upper bound on training epochs.
        patience: Epochs without validation improvement before training stops.
        seed: Random seed passed to the model.

    Returns:
        The fitted ``TabNetRegressor``. Validation and test RMSE/MAE are printed,
        not returned.
    """
    # TabNetRegressor.fit rejects 1-D targets, so promote them to shape (n, 1);
    # arrays that are already 2-D pass through unchanged.
    y_train, y_val, y_test = (
        y.reshape(-1, 1) if y.ndim == 1 else y
        for y in (np.asarray(y_train), np.asarray(y_val), np.asarray(y_test))
    )

    tabnet = TabNetRegressor(
        n_d=16,
        n_a=16,
        n_steps=5,
        gamma=1.5,
        lambda_sparse=1e-4,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=1e-3),
        mask_type='entmax',
        seed=seed
    )

    tabnet.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric=['rmse'],
        max_epochs=max_epochs,
        patience=patience,
        batch_size=8192,
        virtual_batch_size=256,
        num_workers=0,
        drop_last=False
    )

    # ----------- Evaluation -------------
    pred_val = tabnet.predict(X_val)
    pred_test = tabnet.predict(X_test)

    # RMSE is the square root of the mean squared error.
    rmse_val = mean_squared_error(y_val, pred_val) ** 0.5
    mae_val = mean_absolute_error(y_val, pred_val)

    rmse_test = mean_squared_error(y_test, pred_test) ** 0.5
    mae_test = mean_absolute_error(y_test, pred_test)

    print(f"TabNet -> VAL RMSE: {rmse_val:.3f}, MAE: {mae_val:.3f}")
    print(f"TabNet -> TEST RMSE: {rmse_test:.3f}, MAE: {mae_test:.3f}")

    return tabnet


In [None]:
# Load the processed splits written by the preprocessing step.
X_train, y_train, X_val, y_val, X_test, y_test, meta = load_processed_data()

# Turn each target vector into a single-column 2-D array before fitting.
y_train, y_val, y_test = (
    targets.reshape(-1, 1) for targets in (y_train, y_val, y_test)
)

tabnet_model = train_tabnet(X_train, y_train, X_val, y_val, X_test, y_test)




epoch 0  | loss: 501745.34362| val_0_rmse: 706.71718|  0:00:02s
epoch 1  | loss: 500999.21397| val_0_rmse: 705.69761|  0:00:04s
epoch 2  | loss: 500248.8425| val_0_rmse: 705.28109|  0:00:06s
epoch 3  | loss: 499535.77327| val_0_rmse: 704.49901|  0:00:08s
epoch 4  | loss: 498839.67627| val_0_rmse: 704.42462|  0:00:10s
epoch 5  | loss: 498190.80584| val_0_rmse: 704.12135|  0:00:13s
epoch 6  | loss: 497536.61943| val_0_rmse: 703.72471|  0:00:15s
epoch 7  | loss: 496812.12182| val_0_rmse: 703.5244|  0:00:17s
epoch 8  | loss: 496068.94681| val_0_rmse: 702.96108|  0:00:19s
epoch 9  | loss: 495350.19775| val_0_rmse: 702.91201|  0:00:21s
epoch 10 | loss: 494511.31647| val_0_rmse: 702.31932|  0:00:24s
epoch 11 | loss: 493711.88155| val_0_rmse: 701.73068|  0:00:26s
epoch 12 | loss: 492752.4071| val_0_rmse: 700.94928|  0:00:28s
epoch 13 | loss: 491719.74154| val_0_rmse: 700.24784|  0:00:30s
epoch 14 | loss: 490560.99944| val_0_rmse: 699.51478|  0:00:32s
epoch 15 | loss: 489248.86537| val_0_rmse: 



TabNet -> VAL RMSE: 17.260, MAE: 3.348
TabNet -> TEST RMSE: 20.055, MAE: 3.576
