# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from datetime import datetime
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")

# Load Data
train_x = pd.read_csv("airbnb_train_x.csv")
train_y = pd.read_csv("airbnb_train_y.csv")
test_x = pd.read_csv("airbnb_test_x.csv")

# Put features and labels side by side, recode the target from 'NO'/'YES' to
# 0/1, and discard rows whose target is missing (values other than 'NO'/'YES'
# become NaN in the mapping and are discarded as well).
train = pd.concat([train_x, train_y], axis=1)
train = train.assign(
    high_booking_rate=train['high_booking_rate'].map({'NO': 0, 'YES': 1})
).dropna(subset=['high_booking_rate'])

# Fixed reference date read by feature_engineering() for the age features.
today = pd.to_datetime("2025-04-12")

# Parse the date columns in both frames; unparseable entries become NaT.
for df in (train, test_x):
    for col in ('host_since', 'first_review'):
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Drop unnecessary columns
drop_cols = [
    'name', 'summary', 'space', 'description', 'experiences_offered',
    'neighborhood_overview', 'notes', 'transit', 'access', 'interaction',
    'house_rules', 'host_location', 'host_about', 'host_response_time',
    'host_acceptance_rate', 'host_neighbourhood', 'host_verifications',
    'street', 'neighborhood', 'neighborhood_group', 'city', 'state', 'market',
    'smart_location', 'country_code', 'country', 'bed_type', 'license',
    'jurisdiction_names', 'host_name',
]
# errors='ignore' skips any listed column that a frame does not have.
for df in (train, test_x):
    df.drop(columns=drop_cols, errors='ignore', inplace=True)

# Pre-cluster test set
# NOTE(review): the cross-validation loop later in this file reassigns
# `location_cluster` on every per-fold copy of the test frame, so the labels
# computed here look unused by that loop -- confirm before removing.
kmeans = KMeans(n_clusters=10, random_state=42).fit(test_x[['latitude', 'longitude']])
test_x['location_cluster'] = kmeans.labels_

# Categorical columns that get an explicit 'Missing' level instead of NaN.
cat_features = ['room_type', 'property_type', 'cancellation_policy']
for col in cat_features:
    test_x[col] = test_x[col].fillna('Missing')

# Pristine copy of the prepared test frame; each CV fold works on its own copy.
test_x_orig = test_x.copy()

# Feature engineering
def feature_engineering(df, price_median, reference_date=None, center=(34.0522, -118.2437)):
    """Add engineered listing features to ``df`` in place and return it.

    Args:
        df: Listing DataFrame. Must contain ``host_since`` and ``first_review``
            as datetime columns plus the numeric columns read below
            (``price``, ``accommodates``, ``security_deposit``,
            ``availability_30``, ``availability_365``, ``cleaning_fee``,
            ``guests_included``, ``bedrooms``, ``bathrooms``, ``latitude``,
            ``longitude``, ``minimum_nights``).
        price_median: Threshold for the ``is_expensive`` flag. The caller
            decides which data it is computed from.
        reference_date: Date the age features are measured against. When
            omitted, the module-level ``today`` is used.
        center: ``(latitude, longitude)`` used for ``dist_to_center``.
            The default is presumably downtown Los Angeles -- TODO confirm.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    if reference_date is None:
        reference_date = today
    center_lat, center_lon = center

    # --- Age features (days before the reference date) ---
    # The key previously carried a trailing space ('host_account_age ').
    df['host_account_age'] = (reference_date - df['host_since']).dt.days
    # Listings without a first review get age 0; the flag below keeps them
    # distinguishable from listings first reviewed on the reference date.
    df['listing_age_days'] = (reference_date - df['first_review']).dt.days.fillna(0)
    df['no_reviews_flag'] = df['first_review'].isna().astype(int)

    # --- Price and fee features (the +1 terms avoid division by zero) ---
    df['price_per_guest'] = df['price'] / (df['accommodates'] + 1)
    # Despite its name this is deposit divided by (price + 1).
    df['price_to_deposit'] = df['security_deposit'] / (df['price'] + 1)
    df['log_price'] = np.log1p(df['price'])
    df['is_expensive'] = (df['price'] > price_median).astype(int)
    # NaN compares False, so a missing fee/deposit yields 0 here; the
    # *_missing flags below record the missingness separately.
    df['has_cleaning_fee'] = (df['cleaning_fee'] > 0).astype(int)
    df['has_security_deposit'] = (df['security_deposit'] > 0).astype(int)
    df['security_deposit_missing'] = df['security_deposit'].isna().astype(int)
    df['cleaning_fee_missing'] = df['cleaning_fee'].isna().astype(int)

    # --- Availability features ---
    df['availability_rate_30'] = df['availability_30'] / 30
    df['availability_rate_365'] = df['availability_365'] / 365
    df['booking_pressure'] = (df['availability_365'] == 0).astype(int)

    # --- Interactions and capacity ratios ---
    df['price_x_avail'] = df['price'] * df['availability_365']
    df['log_price_x_guests'] = df['log_price'] * df['guests_included']
    df['rooms_per_guest'] = (df['bedrooms'] + 1) / (df['guests_included'] + 1)
    df['bathrooms_per_guest'] = (df['bathrooms'] + 1) / (df['guests_included'] + 1)
    df['price_per_bedroom'] = df['price'] / (df['bedrooms'] + 1)

    # --- Location features ---
    df['lat_long_product'] = df['latitude'] * df['longitude']
    df['lat_squared'] = df['latitude'] ** 2
    df['long_squared'] = df['longitude'] ** 2
    # Plain Euclidean distance on raw degrees, not a geodesic distance.
    df['dist_to_center'] = np.sqrt(
        (df['latitude'] - center_lat) ** 2 + (df['longitude'] - center_lon) ** 2
    )

    # --- Binned features ---
    # NOTE: quantile edges are computed from the frame passed in, so the same
    # bin id can cover different price ranges in different frames.
    df['price_bin'] = pd.qcut(df['price'], 6, labels=False, duplicates='drop')
    # Values outside [0, 365] fall outside every bin and become NaN.
    df['min_night_bin'] = pd.cut(
        df['minimum_nights'],
        bins=[0, 2, 7, 30, 100, 365],
        labels=False,
        include_lowest=True,
    )
    return df

# Cross-validation loop
# For each stratified fold: derive every data-dependent statistic from the
# fold's training rows only, build aligned design matrices for train / val /
# test, fit XGBoost, score the held-out rows, and add this fold's share of the
# averaged test prediction.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_preds = np.zeros(test_x.shape[0])
auc_scores = []

for train_idx, val_idx in skf.split(train, train['high_booking_rate']):
    fold_train = train.iloc[train_idx].copy()
    fold_val = train.iloc[val_idx].copy()
    test_x_fold = test_x_orig.copy()

    # Statistics learned from this fold's training rows only.
    price_median = fold_train['price'].median()
    global_rate = fold_train['high_booking_rate'].mean()
    zipcode_mean = fold_train.groupby('zipcode')['high_booking_rate'].mean()

    # Target-encode zipcode. Unseen or missing zipcodes fall back to the
    # global positive rate in all three frames (training rows with a missing
    # zipcode were previously left NaN while val/test rows were filled).
    # NOTE(review): training rows are encoded with a mean that includes their
    # own label; consider out-of-fold or smoothed encoding if this overfits.
    for df in [fold_train, fold_val, test_x_fold]:
        df['zipcode_encoded'] = df['zipcode'].map(zipcode_mean).fillna(global_rate)

    fold_train = feature_engineering(fold_train, price_median)
    fold_val = feature_engineering(fold_val, price_median)
    test_x_fold = feature_engineering(test_x_fold, price_median)

    # feature_engineering() bins price by the quantiles of whichever frame it
    # receives, so the same bin id would cover different price ranges in
    # train, val and test. Re-bin all three with the training edges; the
    # outer edges are opened up so out-of-range prices still land in a bin.
    _, price_edges = pd.qcut(fold_train['price'], 6, retbins=True, duplicates='drop')
    price_edges[0], price_edges[-1] = -np.inf, np.inf
    for df in [fold_train, fold_val, test_x_fold]:
        df['price_bin'] = pd.cut(df['price'], bins=price_edges, labels=False)

    # Raw date / free-text columns are not fed to the model.
    for df in [fold_train, fold_val, test_x_fold]:
        df.drop(columns=['host_since', 'first_review', 'amenities', 'features'], errors='ignore', inplace=True)

    # Location clusters are fitted on the training rows and applied elsewhere.
    kmeans = KMeans(n_clusters=10, random_state=42)
    fold_train['location_cluster'] = kmeans.fit_predict(fold_train[['latitude', 'longitude']])
    fold_val['location_cluster'] = kmeans.predict(fold_val[['latitude', 'longitude']])
    test_x_fold['location_cluster'] = kmeans.predict(test_x_fold[['latitude', 'longitude']])

    for df in [fold_train, fold_val, test_x_fold]:
        for col in cat_features:
            df[col] = df[col].fillna("Missing")

    # Everything except the labels and the raw zipcode is a feature. This
    # list already contains cat_features and 'location_cluster', so they are
    # not appended again (doing so selected and encoded them twice).
    features = [col for col in fold_train.columns if col not in ['high_booking_rate', 'perfect_rating_score', 'zipcode']]

    # The training frame defines the dummy columns, including which level of
    # each categorical is the dropped baseline. Val/test are encoded in full
    # and then aligned to those columns: calling drop_first on them separately
    # can drop a different level when the training baseline is absent from
    # the smaller frame, which silently mis-encodes rows after the reindex.
    X_train = pd.get_dummies(fold_train[features], drop_first=True)
    X_val = pd.get_dummies(fold_val[features])
    X_test = pd.get_dummies(test_x_fold[features])

    # Guard against dummy names that collide with another column name.
    X_train = X_train.loc[:, ~X_train.columns.duplicated()]
    X_val = X_val.loc[:, ~X_val.columns.duplicated()]
    X_test = X_test.loc[:, ~X_test.columns.duplicated()]
    X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # The labels are float after map + dropna; hand the classifier integers.
    y_train = fold_train['high_booking_rate'].astype(int)
    y_val = fold_val['high_booking_rate'].astype(int)

    # Impute every frame with the training medians, computed once.
    train_median = X_train.median()
    X_train.fillna(train_median, inplace=True)
    X_val.fillna(train_median, inplace=True)
    X_test.fillna(train_median, inplace=True)

    model = XGBClassifier(
        n_estimators=1000,
        max_depth=5,
        learning_rate=0.01,
        subsample=0.85,
        colsample_bytree=0.85,
        min_child_weight=10,
        gamma=0.3,
        reg_alpha=1.0,
        reg_lambda=2.0,
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train, y_train)
    val_probs = model.predict_proba(X_val)[:, 1]
    auc_scores.append(roc_auc_score(y_val, val_probs))
    # Each fold contributes an equal share of the final test prediction.
    xgb_preds += model.predict_proba(X_test)[:, 1] / skf.n_splits

pd.DataFrame({'x': xgb_preds}).to_csv("high_booking_rate_group12.csv", index=False, header=True)
print(f"XGBoost Generalized CV AUC: {np.mean(auc_scores):.4f}")