In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
AirBnB_SVM_Prediction.py
Automatically generated by Colab.
Original file is located at:
    https://colab.research.google.com/drive/1CXmTvgLB4kmVOIMT8qbH5uR9YWOD0ZYy

AirBnB Dataset Prediction

This script tests several regression models (SVM, XGBoost, LightGBM, RandomForest) using advanced feature engineering to predict Airbnb listing prices.
Engineered features include:
  - Amenity Count (amenity_count)
  - Broad Amenity Groups: wifi, kitchen, pool, air conditioning, security
  - Average Review Score (avg_review)
  - Reviews Ratio (reviews_ratio = reviews_per_month / (number_of_reviews+1))
  - Location Clusters (via KMeans on latitude/longitude; one-hot encoded as loc_0 ... loc_9)
  - Polynomial expansion on key numeric features (including engineered ones)

Dataset source: https://insideairbnb.com/get-the-data/ (Credits to the author)

Note:
 - This version is adapted for the Japan dataset.
 - Ensure that "listings.csv.gz" is in the current working directory.
"""

import os
import gzip
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import scipy.stats as stats

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder, MinMaxScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.cluster import KMeans

# For Bayesian tuning (pip install scikit-optimize)
%pip install -q scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real

# For XGBoost and LightGBM (pip install xgboost lightgbm)
%pip install -q xgboost
from xgboost import XGBRegressor
try:
    import lightgbm as lgb
except ModuleNotFoundError:
    print("Warning: lightgbm module not found. To use LightGBM, please install it via 'pip install lightgbm'.")
    lgb = None

from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge


def _matches_amenity_group(amenities, group_lower):
    """Return 1 if any entry of *amenities* is in *group_lower*, else 0.

    Args:
        amenities: Parsed amenity list for one listing. Anything that is not
            a list is treated as "no amenities".
        group_lower: Set of lower-cased amenity names forming one group.

    Returns:
        1 when at least one amenity (stripped and lower-cased) is in the
        group, otherwise 0.
    """
    if not isinstance(amenities, list):
        return 0
    return int(any(isinstance(item, str) and item.strip().lower() in group_lower
                   for item in amenities))


def _plot_actual_vs_predicted(actual, predicted, title):
    """Show a scatter plot of predicted against actual prices."""
    plt.figure(figsize=(8,5))
    sns.scatterplot(x=actual, y=predicted, alpha=0.6)
    plt.xlabel("Actual Prices")
    plt.ylabel("Predicted Prices")
    plt.title(title)
    plt.show()


def main():
    """Run the full Airbnb price-prediction workflow.

    Steps: extract ``listings.csv`` from ``listings.csv.gz`` (if not already
    extracted), clean the data, engineer features (amenity count and groups,
    average review score, reviews ratio, KMeans location clusters, degree-2
    polynomial terms), then train and evaluate a linear SVR, a Bayesian-tuned
    SVR, XGBoost, LightGBM (when installed), a RandomForest, a weighted blend
    and two stacking ensembles. R² scores are computed on the price scale and
    printed as a summary table; scatter plots are shown for selected models.

    Reads ``listings.csv.gz`` / ``listings.csv`` from the current working
    directory. Takes no arguments and returns nothing.
    """
    ##############################
    # Original Workflow
    ##############################
    print("\n=== Original Workflow ===")
    # 1. Data Extraction
    cwd = os.getcwd()
    print("Current working directory:", cwd)
    compressed_name = 'listings.csv.gz'
    file_name = 'listings.csv'
    if not os.path.exists(file_name):
        with gzip.open(compressed_name, 'rb') as f_in:
            with open(file_name, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    print("Listings file ready:", file_name)

    # 2. Load and Inspect Data
    df = pd.read_csv(file_name)
    print("Original dataset shape:", df.shape)
    print("First 3 rows:")
    print(df.head(3))
    df.info(verbose=True)

    # 3. Data Cleaning
    drop_columns = [
        'id', 'listing_url', 'scrape_id', 'last_scraped', 'source',
        'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
        'host_thumbnail_url', 'host_picture_url', 'host_verifications', 'host_neighbourhood',
        'host_listings_count', 'host_total_listings_count', 'host_has_profile_pic',
        'calendar_updated', 'calendar_last_scraped', 'first_review', 'last_review',
        'neighbourhood', 'neighborhood_overview', 'neighbourhood_group_cleansed', 'license', 'picture_url', 'host_id',
        'maximum_nights_avg_ntm', 'has_availability',
        'number_of_reviews_ltm', 'number_of_reviews_l30d',
        'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes',
        'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms',
        'name', 'description'
    ]
    # errors='ignore' keeps this step consistent with the guarded drops used
    # later in the function: a column missing from the file is simply skipped.
    df_cleaned = df.drop(columns=drop_columns, errors='ignore').dropna()
    df_cleaned['price'] = df_cleaned['price'].str.replace('$', '', regex=False)\
                                             .str.replace(',', '', regex=False)\
                                             .astype(float)

    # 4. Feature Engineering – Amenities
    df_cleaned['amenities'] = df_cleaned['amenities'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df_cleaned['amenity_count'] = df_cleaned['amenities'].apply(lambda x: len(x) if isinstance(x, list) else 0)
    wifi = {"Wifi", "Wi-Fi", "Ethernet connection", "無線lan", "ワイヤレスインターネット"}
    kitchen = {"Kitchen", "Microwave", "Refrigerator", "Stove", "Oven", "BBQ grill", "Coffee maker", "Dining table"}
    pool = {"Pool", "Swimming pool", "Hot tub"}
    air_cond = {"Air conditioning", "Heating", "Ceiling fan", "Indoor Fireplace"}
    security = {"Smoke alarm", "Fire extinguisher", "Carbon monoxide alarm"}
    groups = {"has_wifi": wifi, "has_kitchen": kitchen, "has_pool": pool, "has_air_conditioning": air_cond, "has_security": security}
    for feat, group in groups.items():
        # Lower-case the group once per feature instead of once per amenity,
        # and test the amenities column directly rather than whole rows.
        group_lower = {name.lower() for name in group}
        df_cleaned[feat] = df_cleaned['amenities'].apply(_matches_amenity_group, args=(group_lower,))
    df_cleaned = df_cleaned.drop(columns=['amenities'])

    # 5. Process Property & Physical Attributes
    if 'property_type' in df_cleaned.columns:
        df_cleaned = df_cleaned.drop(columns=['property_type'])
    if 'room_type' in df_cleaned.columns:
        df_cleaned = pd.get_dummies(df_cleaned, columns=['room_type'], prefix='room_type')
    df_cleaned = df_cleaned[df_cleaned['bedrooms'] <= 10]
    df_cleaned = df_cleaned[df_cleaned['beds'] <= 10]

    # 6. Price Analysis and Outlier Removal
    # NOTE(review): the 95th-percentile cut-off (and the KMeans fit in step 9)
    # is computed on the full dataset, i.e. before the train/test split.
    price_thresh = np.percentile(df_cleaned["price"], 95)
    # Explicit copy so the column assignments below operate on an independent
    # frame rather than on the result of a boolean filter.
    df_cleaned = df_cleaned[df_cleaned["price"] <= price_thresh].copy()

    # 7. Process Review Attributes
    review_feats = ['review_scores_rating','review_scores_accuracy','review_scores_cleanliness',
                    'review_scores_checkin','review_scores_communication','review_scores_location','review_scores_value']
    df_cleaned['avg_review'] = df_cleaned[review_feats].mean(axis=1)
    df_cleaned = df_cleaned.drop(columns=review_feats)

    # 8. Process Booking & Host Attributes
    flag_map = {"t": True, "f": False}
    df_cleaned["instant_bookable"] = df_cleaned["instant_bookable"].map(flag_map)
    df_cleaned["reviews_ratio"] = df_cleaned["reviews_per_month"] / (df_cleaned["number_of_reviews"] + 1)
    host_map = {"within an hour": 3, "within a few hours": 2, "within a day": 1, "a few days or more": 0}
    df_cleaned['host_response_time'] = df_cleaned['host_response_time'].map(host_map)
    df_cleaned['host_response_rate'] = df_cleaned['host_response_rate'].str.rstrip("%").astype(float) / 100
    df_cleaned['host_acceptance_rate'] = df_cleaned['host_acceptance_rate'].str.rstrip("%").astype(float) / 100
    df_cleaned['host_is_superhost'] = df_cleaned['host_is_superhost'].map(flag_map)
    df_cleaned['host_identity_verified'] = df_cleaned['host_identity_verified'].map(flag_map)

    # 9. Process Location Attributes (Clustering)
    loc_df = df_cleaned[['latitude', 'longitude']]
    # n_init is set explicitly so the clustering does not depend on the
    # library's version-specific default (and the related warning is avoided).
    kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
    df_cleaned['location_cluster'] = kmeans.fit_predict(loc_df)
    df_cleaned = pd.get_dummies(df_cleaned, columns=['location_cluster'], prefix='loc')

    # 10. Final Preparation for Modeling
    drop_extra = ['minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
                  'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights']
    df_cleaned = df_cleaned.drop(columns=[col for col in drop_extra if col in df_cleaned.columns])
    print("Final columns for modeling:")
    print(df_cleaned.columns.tolist())

    # Models are trained on log1p(price); scores are reported on the price
    # scale via revert_target below.
    use_log_target = True
    if use_log_target:
        y = np.log1p(df_cleaned['price'])
    else:
        y = df_cleaned['price']
    X = df_cleaned.drop(columns=['price'])
    X = pd.get_dummies(X, drop_first=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 11. Polynomial Feature Expansion
    numeric_cols = ['bathrooms', 'bedrooms', 'beds', 'accommodates', 'amenity_count', 'avg_review',
                    'number_of_reviews', 'reviews_per_month', 'reviews_ratio']
    available_num = [col for col in numeric_cols if col in X_train.columns]
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_train_poly = poly.fit_transform(X_train[available_num])
    X_test_poly = poly.transform(X_test[available_num])
    poly_names = poly.get_feature_names_out(available_num)
    X_train_poly_df = pd.DataFrame(X_train_poly, columns=poly_names, index=X_train.index)
    X_test_poly_df = pd.DataFrame(X_test_poly, columns=poly_names, index=X_test.index)
    # The expanded columns replace the raw numeric columns they were built from.
    X_train_rest = X_train.drop(columns=available_num)
    X_test_rest = X_test.drop(columns=available_num)
    X_train_final = pd.concat([X_train_rest, X_train_poly_df], axis=1)
    X_test_final = pd.concat([X_test_rest, X_test_poly_df], axis=1)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_final)
    X_test_scaled = scaler.transform(X_test_final)

    def revert_target(y_val):
        """Map model-space values back to prices (inverse of log1p if used)."""
        return np.expm1(y_val) if use_log_target else y_val

    actual_prices = revert_target(y_test)

    # 12. Model Training and Evaluation
    # Baseline SVM (Linear)
    svm_baseline = SVR(kernel='linear', C=1.0)
    svm_baseline.fit(X_train_scaled, y_train)
    y_pred_svm = svm_baseline.predict(X_test_scaled)
    r2_svm = r2_score(actual_prices, revert_target(y_pred_svm))
    print("Baseline SVM R²:", r2_svm)
    _plot_actual_vs_predicted(actual_prices, revert_target(y_pred_svm),
                              "Baseline SVM: Actual vs. Predicted")

    # Bayesian Tuned SVM
    bayes_params = {'C': Real(0.1, 20.0, prior='log-uniform'),
                    'epsilon': Real(0.01, 2.0, prior='log-uniform')}
    svr = SVR(kernel='linear')
    bayes_search = BayesSearchCV(svr, bayes_params, n_iter=5, cv=2, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
    bayes_search.fit(X_train_scaled, y_train)
    best_svr = bayes_search.best_estimator_
    y_pred_svr_tuned = best_svr.predict(X_test_scaled)
    r2_svr_tuned = r2_score(actual_prices, revert_target(y_pred_svr_tuned))
    print("Bayesian Tuned SVM R²:", r2_svr_tuned)
    _plot_actual_vs_predicted(actual_prices, revert_target(y_pred_svr_tuned),
                              "Bayesian Tuned SVM: Actual vs. Predicted")

    # XGBoost
    xgb_grid = {'n_estimators': [100, 300],
                'max_depth': [3, 5, 7],
                'learning_rate': [0.01, 0.1],
                'subsample': [0.7, 1.0],
                'colsample_bytree': [0.7, 1.0],
                'min_child_weight': [1, 3]}
    xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')
    xgb_search = RandomizedSearchCV(xgb_model, xgb_grid, n_iter=6, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
    xgb_search.fit(X_train_scaled, y_train)
    best_xgb = xgb_search.best_estimator_
    y_pred_xgb = best_xgb.predict(X_test_scaled)
    r2_xgb = r2_score(actual_prices, revert_target(y_pred_xgb))
    print("Tuned XGBoost R²:", r2_xgb)

    # LightGBM
    if lgb is not None:
        lgb_grid = {'n_estimators': [100, 300],
                    'max_depth': [3, 5, 7],
                    'learning_rate': [0.01, 0.1],
                    'num_leaves': [15, 31],
                    'subsample': [0.7, 1.0]}
        lgb_model = lgb.LGBMRegressor(random_state=42)
        lgb_search = RandomizedSearchCV(lgb_model, lgb_grid, n_iter=6, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
        lgb_search.fit(X_train_scaled, y_train)
        best_lgb = lgb_search.best_estimator_
        y_pred_lgb = best_lgb.predict(X_test_scaled)
        r2_lgb = r2_score(actual_prices, revert_target(y_pred_lgb))
        print("Tuned LightGBM R²:", r2_lgb)
    else:
        print("Skipping LightGBM training since the module is not installed.")
        # NaN rather than 0.0: no model was trained, so there is no score.
        r2_lgb = float("nan")

    # RandomForest
    rf_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
    rf_model.fit(X_train_scaled, y_train)
    y_pred_rf = rf_model.predict(X_test_scaled)
    r2_rf = r2_score(actual_prices, revert_target(y_pred_rf))
    print("RandomForest R²:", r2_rf)

    # Blended Model: weighted blend of tuned SVM, XGBoost, LightGBM, and RF.
    # np.average normalises by the sum of the weights, so when LightGBM is
    # unavailable the remaining weights are rescaled instead of the blend
    # being shrunk by a missing 0.3 share.
    blend_parts = [(0.1, y_pred_svr_tuned), (0.3, y_pred_xgb)]
    if lgb is not None:
        blend_parts.append((0.3, y_pred_lgb))
    blend_parts.append((0.3, y_pred_rf))
    y_pred_blend = np.average(np.vstack([pred for _, pred in blend_parts]),
                              axis=0,
                              weights=[weight for weight, _ in blend_parts])
    r2_blend = r2_score(actual_prices, revert_target(y_pred_blend))
    print("Blended Model R²:", r2_blend)

    # Stacking Ensemble using XGBoost, LightGBM, and RF with Ridge meta-learner
    estimators = [('xgb', best_xgb)]
    if lgb is not None:
        estimators.append(('lgb', best_lgb))
    estimators.append(('rf', rf_model))
    stack_reg = StackingRegressor(estimators=estimators, final_estimator=Ridge(), cv=3, n_jobs=-1)
    stack_reg.fit(X_train_scaled, y_train)
    y_pred_stack = stack_reg.predict(X_test_scaled)
    r2_stack = r2_score(actual_prices, revert_target(y_pred_stack))
    print("Stacking Ensemble R²:", r2_stack)
    _plot_actual_vs_predicted(actual_prices, revert_target(y_pred_stack),
                              "Stacking Ensemble: Actual vs. Predicted")

    # Advanced Stacking Ensemble with tuned SVM, XGBoost, LightGBM, and RF
    tuned_svr_params = best_svr.get_params()
    adv_estimators = [
        ('svm', SVR(kernel='linear', C=tuned_svr_params['C'], epsilon=tuned_svr_params['epsilon'])),
        ('xgb', best_xgb)
    ]
    if lgb is not None:
        adv_estimators.append(('lgb', best_lgb))
    adv_estimators.append(('rf', RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42)))
    adv_stack = StackingRegressor(estimators=adv_estimators, final_estimator=Ridge(), cv=2, n_jobs=-1)
    adv_stack.fit(X_train_scaled, y_train)
    y_pred_adv_stack = adv_stack.predict(X_test_scaled)
    r2_adv_stack = r2_score(actual_prices, revert_target(y_pred_adv_stack))
    print("Advanced Stacking Ensemble R²:", r2_adv_stack)
    _plot_actual_vs_predicted(actual_prices, revert_target(y_pred_adv_stack),
                              "Advanced Stacking Ensemble: Actual vs. Predicted")

    # 13. Summary of Results
    print("\n--- Analysis and Findings ---")
    print("| Model                                        |   R² Score |")
    print("|----------------------------------------------|------------|")
    print("| Baseline SVM (Linear + Poly)                 | {:.4f}     |".format(r2_svm))
    print("| Bayesian Tuned SVM (Linear + Poly)           | {:.4f}     |".format(r2_svr_tuned))
    print("| Tuned XGBoost (Poly)                         | {:.4f}     |".format(r2_xgb))
    print("| Tuned LightGBM (Poly)                        | {:.4f}     |".format(r2_lgb))
    print("| RandomForest                                 | {:.4f}     |".format(r2_rf))
    print("| Blended Model (SVM+XGB+LGB+RF)                | {:.4f}     |".format(r2_blend))
    print("| Stacking Ensemble (XGB+LGB+RF+Ridge)           | {:.4f}     |".format(r2_stack))
    print("| Advanced Stacking Ensemble (SVM+XGB+LGB+RF+Ridge)| {:.4f}     |".format(r2_adv_stack))

    
if __name__ == '__main__':
    # Script entry point: main() runs the whole pipeline (data preparation,
    # model training, evaluation, and the printed results summary).
    main()



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip



=== Original Workflow ===
Current working directory: c:\Users\micha\OneDrive\Documents\sit\Y2_T2\Machine_Learning\Project\MLBB
Listings file ready: listings.csv
Original dataset shape: (21058, 75)
First 3 rows:
       id                          listing_url       scrape_id last_scraped  \
0  197677  https://www.airbnb.com/rooms/197677  20241230011552   2024-12-30   
1  776070  https://www.airbnb.com/rooms/776070  20241230011552   2024-12-30   
2  905944  https://www.airbnb.com/rooms/905944  20241230011552   2024-12-30   

        source                                               name  \
0  city scrape                          Oshiage Holiday Apartment   
1  city scrape                             Kero-kero house room 1   
2  city scrape  4F Spacious Apartment in Shinjuku / Shibuya Tokyo   

                                         description  \
0                                                NaN   
1  We have been in airbnb since 2011 and it has g...   
2  NEWLY RENOVATED propert

  super()._check_params_vs_input(X, default_n_init=10)


Final columns for modeling:
['host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_identity_verified', 'neighbourhood_cleansed', 'latitude', 'longitude', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'price', 'minimum_nights_avg_ntm', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'instant_bookable', 'reviews_per_month', 'amenity_count', 'has_wifi', 'has_kitchen', 'has_pool', 'has_air_conditioning', 'has_security', 'room_type_Entire home/apt', 'room_type_Hotel room', 'room_type_Private room', 'room_type_Shared room', 'avg_review', 'reviews_ratio', 'loc_0', 'loc_1', 'loc_2', 'loc_3', 'loc_4', 'loc_5', 'loc_6', 'loc_7', 'loc_8', 'loc_9']
