In [None]:
#!pip install kagglehub

In [None]:
import kagglehub

# Download the latest version of the NYC Airbnb dataset from Kaggle.
# `path` holds whatever kagglehub returns (printed below as the path to the
# dataset files).
KAGGLE_DATASET = "dgomonov/new-york-city-airbnb-open-data"
path = kagglehub.dataset_download(KAGGLE_DATASET)

print("Path to dataset files:", path)

In [None]:
!pip install textblob

In [None]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from textblob import TextBlob

In [None]:
# Load data
# Read the CSV from the directory returned by kagglehub (`path`, set in the
# download cell) instead of a hard-coded, backslash-separated relative path,
# which only resolves on Windows. If the file is not in the download
# directory, fall back to the local "airbnb-newyork" folder used previously.
DATA_FILE = "AB_NYC_2019.csv"
csv_path = Path(path) / DATA_FILE
if not csv_path.is_file():
    csv_path = Path("airbnb-newyork") / DATA_FILE

df = pd.read_csv(csv_path, low_memory=False)
print("Loaded:", df.shape)
print("Columns:", df.columns.tolist())


In [None]:
# Column dtypes and non-null counts, followed by summary statistics for every
# column (transposed so each original column becomes one row).
df.info()
df.describe(include='all').transpose()

In [None]:
# Peek at the first five rows of the raw data
df.head(n=5)

## 3. Basic Cleaning
- Remove listings with missing price or zero price
- Remove extreme prices (> $1000) for stability


In [None]:
# Keep only listings priced in (0, 1000]: rows with a missing or zero price
# fail the first comparison, extreme prices fail the second. The index is
# renumbered from 0 afterwards.
price = df['price']
valid_price = (price > 0) & (price <= 1000)
df = df[valid_price].reset_index(drop=True)

print("After filtering:", df.shape)


## 4. Text Feature Engineering
We will use the `name` column as text input:
- Length of name
- Word count
- Presence of certain keywords
- Sentiment score
- TF-IDF vectorization


In [None]:
import re
from textblob import TextBlob

In [None]:
def safe_text(x):
    """Return ``x`` as a string, mapping missing values (per ``pd.isna``) to ''."""
    if pd.isna(x):
        return ''
    return str(x)

In [None]:
# Clean listing name (missing -> ''), plus its character and word counts.
name_text = df['name'].map(safe_text)
df['name_text'] = name_text
df['name_len'] = name_text.str.len()
df['name_words'] = name_text.str.split().str.len()

In [None]:
# One 0/1 indicator column per keyword: 1 when the lower-cased listing name
# contains the keyword anywhere (substring match).
keywords = ['cozy', 'luxury', 'spacious', 'modern', 'private', 'view']
name_lower = df['name_text'].str.lower()  # lower-case once, reuse per keyword
for kw in keywords:
    has_kw = name_lower.str.contains(kw).fillna(False)
    df[f'kw_{kw}'] = has_kw.astype(int)

In [None]:
def _name_polarity(text):
    """Return TextBlob's sentiment polarity for ``text``; 0 for an empty string."""
    if not text:
        return 0
    return TextBlob(text).sentiment.polarity


df['name_sentiment'] = df['name_text'].map(_name_polarity)


In [None]:
# Show the engineered name features for the first few listings
preview_cols = ['name_text', 'name_len', 'name_words', 'name_sentiment']
df[preview_cols].head()

## 5. Feature Selection
We'll keep:
- Numeric: latitude, longitude, minimum_nights, number_of_reviews, reviews_per_month, availability_365
- Categorical: neighbourhood_group, room_type
- Text: name

In [None]:
# Columns fed to the model, grouped by the preprocessing they receive.
num_cols = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'availability_365',
]
cat_cols = [
    'neighbourhood_group',
    'room_type',
]
text_col = 'name_text'

## 6. Train/Test Split
We use `log1p(price)` (i.e. log(1 + price)) as the target to reduce skew; predictions are converted back to dollars with `expm1`.

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Feature matrix: numeric + categorical + text columns, in that order.
feature_cols = num_cols + cat_cols + [text_col]
X = df[feature_cols]

# Target: log1p(price); predictions are mapped back to dollars with expm1.
y = np.log1p(df['price'])

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## 7. Preprocessing Pipelines
Separate numeric, categorical, and text transformations.

The main goals of preprocessing are:

- Handle missing values
- Transform features into model-friendly formats
- Ensure consistent scaling and encoding
- Reduce dimensionality when dealing with high-dimensional text

### Understanding the Preprocessing Stages

Before we train any model, we must prepare our dataset so that it is:
- Free of missing values
- All features are in numeric form
- Scaled appropriately (for models that need it)
- Reduced in dimensionality when working with high-dimensional data (like text)

We build **three pipelines**:
1. **Numeric Pipeline** — Median imputation and scaling
2. **Categorical Pipeline** — Constant imputation and One-Hot Encoding
3. **Text Pipeline** — TF-IDF vectorization + Truncated SVD

Finally, we combine them with a `ColumnTransformer`, which applies each pipeline to its own subset of columns and concatenates the outputs into a single feature matrix.

The goal:
- **Numeric** → handle missing, scale for algorithms like Ridge  
- **Categorical** → convert text labels into binary dummy columns  
- **Text** → convert long text into structured numeric form while keeping the most useful components  


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
def _make_text_pipeline(max_features, n_components):
    """Build TF-IDF (English stop words removed) followed by Truncated SVD."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=max_features, stop_words='english')),
        ('svd', TruncatedSVD(n_components=n_components, random_state=42)),
    ])


def _make_preprocessor(text_pipeline):
    """Route numeric, categorical and text columns to their own pipeline."""
    return ColumnTransformer([
        ('num', numeric_pipeline, num_cols),
        ('cat', categorical_pipeline, cat_cols),
        ('text', text_pipeline, text_col),
    ])


# Numeric columns: median imputation, then standardisation.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the label 'missing', then one-hot encode
# (categories unseen during fit are ignored at transform time).
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Text column: a larger vocabulary / more components for Ridge than for the forest.
text_pipeline_ridge = _make_text_pipeline(max_features=2000, n_components=50)
text_pipeline_rf = _make_text_pipeline(max_features=1000, n_components=20)

preprocessor_ridge = _make_preprocessor(text_pipeline_ridge)
preprocessor_rf = _make_preprocessor(text_pipeline_rf)

## 8. Ridge Regression Model

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Ridge regression on top of the Ridge-specific preprocessor.
pipe_ridge = Pipeline([
    ('pre', preprocessor_ridge),
    ('model', Ridge(alpha=1.0))
])

pipe_ridge.fit(X_train, y_train)

# The model is trained on log1p(price); map predictions and targets back to
# dollars so the metrics are reported on the original price scale.
y_pred_ridge = np.expm1(pipe_ridge.predict(X_test))
y_test_price = np.expm1(y_test)

print("Ridge Regression Performance:")
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print("RMSE:", np.sqrt(mean_squared_error(y_test_price, y_pred_ridge)))
print("MAE:", mean_absolute_error(y_test_price, y_pred_ridge))
print("R²:", r2_score(y_test_price, y_pred_ridge))


## 9. Random Forest Regression Model


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest on top of the forest-specific preprocessor.
pipe_rf = Pipeline([
    ('pre', preprocessor_rf),
    ('model', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))
])

pipe_rf.fit(X_train, y_train)

# The model is trained on log1p(price); map predictions and targets back to
# dollars so the metrics are reported on the original price scale.
y_pred_rf = np.expm1(pipe_rf.predict(X_test))
y_test_price = np.expm1(y_test)

print("Random Forest Performance:")
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print("RMSE:", np.sqrt(mean_squared_error(y_test_price, y_pred_rf)))
print("MAE:", mean_absolute_error(y_test_price, y_pred_rf))
print("R²:", r2_score(y_test_price, y_pred_rf))


## 10. Model Comparison

In [None]:
# Side-by-side comparison of both models on the original price scale.
y_test_price = np.expm1(y_test)
predictions = {
    'Ridge Regression': y_pred_ridge,
    'Random Forest': y_pred_rf,
}

results = pd.DataFrame({
    'Model': list(predictions),
    # sqrt(MSE) instead of `squared=False`, which was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    'RMSE': [np.sqrt(mean_squared_error(y_test_price, pred)) for pred in predictions.values()],
    'MAE': [mean_absolute_error(y_test_price, pred) for pred in predictions.values()],
    'R²': [r2_score(y_test_price, pred) for pred in predictions.values()],
})
results


## 12. Hyperparameter Tuning
We will tune:
- Ridge Regression: `alpha`
- Random Forest: `n_estimators`, `max_depth`


In [None]:
from sklearn.model_selection import GridSearchCV

def _run_grid_search(preprocessor, model, param_grid):
    """Fit a 3-fold grid search over a ('pre', 'model') pipeline on the training split.

    Candidates are scored with negative RMSE on the training target
    (log1p(price)), so the reported best RMSE is on that log scale.
    """
    search = GridSearchCV(
        Pipeline([('pre', preprocessor), ('model', model)]),
        param_grid,
        scoring='neg_root_mean_squared_error',
        cv=3,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search


# Ridge Regression Grid Search
ridge_param_grid = {'model__alpha': [0.1, 1.0, 10.0, 50.0, 100.0]}
ridge_gs = _run_grid_search(preprocessor_ridge, Ridge(), ridge_param_grid)
print("Best Ridge Params:", ridge_gs.best_params_)
print("Best Ridge RMSE:", -ridge_gs.best_score_)

# Random Forest Grid Search (small grid for speed)
rf_param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
}
rf_gs = _run_grid_search(
    preprocessor_rf,
    RandomForestRegressor(random_state=42, n_jobs=-1),
    rf_param_grid,
)
print("Best RF Params:", rf_gs.best_params_)
print("Best RF RMSE:", -rf_gs.best_score_)


## 13. Model Explainability (Random Forest)
We'll use:
- **Permutation Importance** for global feature importance
- **SHAP values** for local interpretability

In [None]:
from sklearn.inspection import permutation_importance

import numpy as np
import matplotlib.pyplot as plt

# Final Random Forest pipeline, refit by the grid search with its best params
best_rf = rf_gs.best_estimator_
fitted_pre = best_rf['pre']
fitted_model = best_rf['model']

# Feature names after preprocessing, in ColumnTransformer order (num, cat, text).
# The number of SVD columns is read from the fitted step rather than hard-coded.
cat_feature_names = fitted_pre.named_transformers_['cat']['ohe'].get_feature_names_out(cat_cols)
n_svd = fitted_pre.named_transformers_['text']['svd'].components_.shape[0]
feature_names = num_cols + list(cat_feature_names) + [f"svd_{i}" for i in range(n_svd)]

# Permute the *transformed* features and score the fitted model directly.
# Running permutation_importance on the whole pipeline with raw X_test yields
# one importance per raw column, which does not line up with `feature_names`
# and would mislabel the bars.
X_test_pre = fitted_pre.transform(X_test)
if hasattr(X_test_pre, "toarray"):  # ColumnTransformer may return a sparse matrix
    X_test_pre = X_test_pre.toarray()
assert X_test_pre.shape[1] == len(feature_names), "feature names do not match transformed columns"

result = permutation_importance(
    fitted_model, X_test_pre, y_test, n_repeats=5, random_state=42, n_jobs=-1
)

# argsort is ascending, so the last 15 entries are the most important features
sorted_idx = result.importances_mean.argsort()[-15:]
plt.barh(np.array(feature_names)[sorted_idx], result.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
plt.title("Top 15 Features - Random Forest")
plt.show()


### 14. SHAP (Local Interpretability)
SHAP explains individual predictions by showing how each feature pushes the prediction higher or lower.



#### Understanding SHAP and the Necessity of Model Explainability

**SHAP (SHapley Additive exPlanations)** assigns each feature a value showing how much it contributed to a prediction.

Why this matters:
- **Transparency**: We see *why* the model made a certain prediction.
- **Trust**: Stakeholders are more confident in the model.
- **Debugging**: We can catch when the model is using unintended signals.
- **Compliance**: In regulated industries, explainability is not optional.

SHAP is based on Shapley values from cooperative game theory, where each feature is a "player" in predicting the outcome.


In [None]:
!pip install shap

In [None]:
# This can take a lot of time!!!

import shap

# Split the fitted pipeline into its two steps: SHAP explains the forest
# itself, so it needs the already-preprocessed feature matrix.
rf_model = best_rf['model']
fitted_preprocessor = best_rf['pre']

# Use TreeExplainer for Random Forest
explainer = shap.TreeExplainer(rf_model)
X_test_transformed = fitted_preprocessor.transform(X_test)
shap_values = explainer.shap_values(X_test_transformed)

# Summary plot (requires feature names from preprocessing)
shap.summary_plot(
    shap_values,
    features=X_test_transformed,
    feature_names=feature_names,
)


SHAP on the full test set can be slow because:

- Random Forests with hundreds of trees require many calculations per sample.
- SHAP's exact method for trees can be heavy when the dataset is large.

You can speed this up by:

- Sampling fewer rows from the test set.
- Using permutation importance instead, or passing `approximate=True` to `TreeExplainer.shap_values` for a rougher (but much faster) estimate.

In [None]:
import shap
import numpy as np

# Sample at most 100 rows for speed. A seeded generator makes the sample (and
# therefore the plot) reproducible; capping at len(X_test) avoids the
# ValueError raised when asking for more rows than exist without replacement.
rng = np.random.default_rng(42)
sample_size = min(100, len(X_test))
sample_idx = rng.choice(len(X_test), size=sample_size, replace=False)
X_test_sample = X_test.iloc[sample_idx]

# Use TreeExplainer with the sample
explainer = shap.TreeExplainer(best_rf['model'])
X_test_sample_transformed = best_rf['pre'].transform(X_test_sample)

# Compute SHAP values for only the sample
shap_values_sample = explainer.shap_values(X_test_sample_transformed)

# Summary plot for the sample
shap.summary_plot(shap_values_sample,
                  features=X_test_sample_transformed,
                  feature_names=feature_names)


## 15. Final Notes
- Linear models + text features can work surprisingly well, especially with TF-IDF + dimensionality reduction.
- Random Forests often outperform linear models on tabular + text-derived features but are harder to interpret.
- Always balance performance with explainability — stakeholders may need to understand *why* a model predicts certain prices.
- Hyperparameter tuning can make a big difference, but be mindful of overfitting and computation time.


## 16. Making a Sample Prediction

Let's create a fake Airbnb listing and use our trained Random Forest to predict its price.


In [None]:
# Sample new listing
sample_data = pd.DataFrame([{
    'name': 'Cozy apartment near Central Park',
    'host_id': 999999,
    'host_name': 'John Doe',
    'neighbourhood_group': 'Manhattan',
    'neighbourhood': 'Upper West Side',
    'latitude': 40.785091,
    'longitude': -73.968285,
    'room_type': 'Entire home/apt',
    'minimum_nights': 3,
    'number_of_reviews': 15,
    'last_review': '2023-05-10',
    'reviews_per_month': 1.2,
    'calculated_host_listings_count': 1,
    'availability_365': 180
}])

# The pipeline's text branch reads `name_text`, which was derived from `name`
# outside the pipeline during training. Build it here the same way (missing
# -> '', otherwise str); otherwise the listing name never reaches the model.
sample_data['name_text'] = sample_data['name'].fillna('').astype(str)

# Keep exactly the training columns, in training order
sample_data = sample_data.reindex(columns=X.columns)

# Predict. The model was trained on log1p(price), so invert with expm1 to get dollars.
predicted_price = np.expm1(best_rf.predict(sample_data)[0])
print(f"Predicted Price: ${predicted_price:.2f}")


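If you've built your model inside a pipeline — like:

```python
best_rf = Pipeline([
    ('pre', preprocessor_rf),
    ('model', RandomForestRegressor(...))
])
```

— then you do NOT need to manually preprocess inputs during prediction: the pipeline applies the fitted imputers, encoders, and text transformers for you. Two caveats remain: features engineered *outside* the pipeline (here, `name_text`) must still be created for new data, and because the model was trained on `log1p(price)`, its output must be converted back to dollars with `np.expm1`.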