# NYC Airbnb Project Modelling

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [16]:
df = pd.read_csv('AB_NYC_cleaned.csv')
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,distance_to_times_square,days_since_last_review
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365,12.337915,264.0
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,0.508366,50.0
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,0.0,1,365,6.75725,999.0
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0,5.701504,233.0
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,1.449706,18.0


In [17]:
df.shape

(46444, 18)

In [18]:
df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')
df['days_since_last_review'] = (
    pd.to_datetime('today') - df['last_review']
).dt.days
df['days_since_last_review'] = df['days_since_last_review'].fillna(df['days_since_last_review'].median())
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

# Create target variable (log-transformed)
df = df[df['price'] > 0]  # remove free listings
df['price_log'] = np.log1p(df['price'])

In [19]:
categorical_cols = ['neighbourhood_group', 'neighbourhood', 'room_type']
numerical_cols = [
    'latitude', 'longitude', 'minimum_nights', 'number_of_reviews',
    'reviews_per_month', 'calculated_host_listings_count',
    'availability_365', 'distance_to_times_square', 'days_since_last_review'
]

features = categorical_cols + numerical_cols
X = df[features]
y = df['price_log']

In [20]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [22]:
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

In [None]:
for name, model in models.items():
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    pipe.fit(X_train, y_train)

    # Predict log price and actual price
    y_pred_log = pipe.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    y_test_actual = np.expm1(y_test)

    print(f"\n--- {name} ---")
    print("MAE:", mean_absolute_error(y_test_actual, y_pred))
    print("RMSE:", mean_squared_error(y_test_actual, y_pred))
    print("R² (log space):", r2_score(y_test, y_pred_log))


--- Random Forest ---
MAE: 38.150416477735334
RMSE: 3689.295623816427
R² (log space): 0.6505576429694038

--- XGBoost ---
MAE: 38.33919122280942
RMSE: 3720.861460635992
R² (log space): 0.6511264029980932
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002673 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2068
[LightGBM] [Info] Number of data points in the train set: 37155, number of used features: 142
[LightGBM] [Info] Start training from score 4.693238

--- LightGBM ---
MAE: 38.39263687012282
RMSE: 3761.496256017771
R² (log space): 0.6508841890349698

--- CatBoost ---
MAE: 37.84861532150389
RMSE: 3661.6295075169596
R² (log space): 0.6600546583033083


In [25]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

catboost_model = CatBoostRegressor(verbose=0, random_state=42)

catboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', catboost_model)
])

param_dist = {
    'regressor__iterations': [200, 300, 500, 700],
    'regressor__depth': [4, 6, 8, 10],
    'regressor__learning_rate': [0.01, 0.03, 0.05, 0.1],
    'regressor__l2_leaf_reg': [1, 3, 5, 7, 9]
}

# Define RMSE as the scoring metric (negated because lower is better)
rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Perform RandomizedSearchCV
catboost_search = RandomizedSearchCV(
    estimator=catboost_pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring=rmse_scorer,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit search on training data
catboost_search.fit(X_train, y_train)

# Get best model
best_catboost_model = catboost_search.best_estimator_

# Predict and evaluate
y_pred_log = best_catboost_model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test_true = np.expm1(y_test)

print("✅ Best Parameters:", catboost_search.best_params_)
print("📉 MAE:", mean_absolute_error(y_test_true, y_pred))
print("📉 RMSE:", mean_squared_error(y_test_true, y_pred, squared=False))
print("📈 R² (log scale):", r2_score(y_test, y_pred_log))


Fitting 3 folds for each of 20 candidates, totalling 60 fits
✅ Best Parameters: {'regressor__learning_rate': 0.05, 'regressor__l2_leaf_reg': 5, 'regressor__iterations': 700, 'regressor__depth': 8}
📉 MAE: 37.88621117175843
📉 RMSE: 60.54588897354361
📈 R² (log scale): 0.6594699253418179


