# Notebook 3 ‚Äî Enrichment and Machine Learning

This notebook performs feature enrichment and applies a machine learning model
to predict Airbnb annual revenue.


In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)


## Load Prepared Data

We load the cleaned dataset produced in Notebook 1.


In [33]:
df = pd.read_csv("/content/airbnb_prepared.csv")
print(df.shape)
df.head()


(145825, 36)


Unnamed: 0,listing_title,property_type,listing_type,amenities,country,state,city,city_file_clean,bedrooms,bathrooms,max_guests,minimum_stay,number_of_reviews,overall_rating,airbnb_value_rating,airbnb_location_rating,airbnb_cleanliness_rating,airbnb_accuracy_rating,airbnb_communication_rating,cleaning_fee_usd,extra_people_fee_usd,average_daily_rate_usd,occupancy_rate_ltm,annual_revenue_ltm_usd,count_available_days_ltm,count_blocked_days_ltm,count_reservation_days_ltm,number_of_bookings_ltm,airbnb_superhost,instant_bookable,pets_allowed,latitude,longitude,created_date,last_scraped_date,amenities_count
0,Waterfront Cozy Escape,Private room in rental unit,private_room,"['Free parking on premises', 'Wifi', 'TV', 'Ha...",CA,Ontario,Toronto,Toronto,2.0,1,4,3,79,4.8,10.0,10.0,10.0,10.0,10.0,80.0,7.98889,100,87,21568,224,141,162,26,True,0.0,False,43.587961,-79.53622,2023-07-31,2024-01-10,29
1,Live by the Lake Ontario-Entire Apartment,Entire condo,entire_home,"['Free parking on premises', 'Wifi', 'Kitchen'...",CA,Ontario,Toronto,Toronto,2.0,1,4,28,19,4.8,10.0,10.0,10.0,10.0,10.0,45.0,0.0,102,71,2407,32,333,19,2,False,0.0,False,43.58916,-79.53253,2023-07-31,2024-01-10,31
2,Home away from home.,Private room in rental unit,private_room,"['Free parking on premises', 'Elevator', 'Wifi...",CA,Ontario,Toronto,Toronto,1.0,1,2,28,5,5.0,9.0,10.0,10.0,10.0,10.0,7.0,0.0,74,51,6550,173,192,73,3,False,1.0,False,43.58832,-79.53094,2023-07-31,2024-01-10,56
3,"‚ù§Ô∏èBeautiful HOUSE!, near everything! LAKE+WIFI...",Entire home,entire_home,"['Free parking on premises', 'Wifi', 'Kitchen'...",CA,Ontario,Toronto,Toronto,1.0,1,2,28,50,4.9,10.0,10.0,10.0,10.0,10.0,118.0,0.0,65,51,3176,78,287,33,5,True,0.0,False,43.59,-79.52848,2023-07-31,2024-01-10,46
4,üíïBEAUTIFUL BSMT STUDIO By LAKE! + wifi PRKN & ...,Entire home,entire_home,"['Free parking on premises', 'Breakfast', 'Wif...",CA,Ontario,Toronto,Toronto,1.0,1,2,28,46,5.0,10.0,10.0,10.0,10.0,10.0,118.0,15.7333,84,68,17958,271,94,153,21,True,0.0,False,43.59156,-79.52895,2023-07-31,2024-01-10,51


## Target Preparation

Annual revenue is highly skewed, so a log transformation is applied.


In [34]:
target = "annual_revenue_ltm_usd"

df = df[df[target].notna()]
df = df[df[target] > 0]

df["log_revenue"] = np.log1p(df[target])
df["log_revenue"].describe()


Unnamed: 0,log_revenue
count,145825.0
mean,8.894193
std,1.573878
min,1.098612
25%,7.978311
50%,9.121618
75%,10.053415
max,14.070336


## Feature Enrichment

Additional features are derived to capture city-level context and efficiency.


In [35]:
df["city_median_revenue"] = df.groupby("city_file_clean")[target].transform("median")

df["revenue_per_booking"] = np.where(
    df["number_of_bookings_ltm"] > 0,
    df[target] / df["number_of_bookings_ltm"],
    np.nan
)

df["revenue_per_available_day"] = np.where(
    df["count_available_days_ltm"] > 0,
    df[target] / df["count_available_days_ltm"],
    np.nan
)

df["relative_revenue_vs_city_median"] = df[target] / df["city_median_revenue"]

df[[
    "city_median_revenue",
    "revenue_per_booking",
    "revenue_per_available_day",
    "relative_revenue_vs_city_median"
]].describe()


Unnamed: 0,city_median_revenue,revenue_per_booking,revenue_per_available_day,relative_revenue_vs_city_median
count,145825.0,121131.0,142346.0,145825.0
mean,9516.22946,2146.855683,143.121008,1.965266
std,2915.711135,3598.117545,233.244619,3.045827
min,5976.5,0.820513,0.090909,0.000221
25%,9029.5,534.538685,54.508257,0.323039
50%,9029.5,1116.771429,98.0,1.0
75%,9293.0,2365.615789,169.854267,2.422947
max,17271.5,213048.0,32255.875,201.788395


## Feature Selection

A compact set of numeric, categorical, and binary features is selected.


In [36]:
numeric_features = [
    "bedrooms","bathrooms","max_guests","minimum_stay","number_of_reviews",
    "overall_rating","amenities_count",
    "average_daily_rate_usd","occupancy_rate_ltm",
    "count_available_days_ltm","count_blocked_days_ltm",
    "count_reservation_days_ltm","number_of_bookings_ltm",
    "city_median_revenue","revenue_per_booking",
    "revenue_per_available_day","relative_revenue_vs_city_median"
]

categorical_features = ["listing_type","property_type","city_file_clean"]

binary_features = ["airbnb_superhost", "pets_allowed"]

numeric_features = [c for c in numeric_features if c in df.columns]
categorical_features = [c for c in categorical_features if c in df.columns]
binary_features = [c for c in binary_features if c in df.columns]


## Train / Test Split


In [37]:
X = df[numeric_features + categorical_features + binary_features]
y = df["log_revenue"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


## Preprocessing


In [38]:
for c in binary_features:
    X_train[c] = X_train[c].map({True: 1, False: 0})
    X_test[c] = X_test[c].map({True: 1, False: 0})

preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), numeric_features),
    ("bin", SimpleImputer(strategy="most_frequent"), binary_features),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ]), categorical_features)
])


## Linear Regression Model


In [39]:
lin_model = Pipeline([
    ("prep", preprocessor),
    ("model", LinearRegression())
])

lin_model.fit(X_train, y_train)
y_pred = lin_model.predict(X_test)

rmse_lin = np.sqrt(mean_squared_error(y_test, y_pred))
r2_lin = r2_score(y_test, y_pred)

print("Linear Regression (log target)")
print("RMSE:", rmse_lin)
print("R^2 :", r2_lin)


Linear Regression (log target)
RMSE: 0.762497294049584
R^2 : 0.765246531417061


## Limitations and Future Work

Only a linear regression model was included due to computational constraints.
Future work could explore more complex models with feature scaling or
greater computational resources.
