# Modeling and Comparisons

## 1. Importing Packages and Loading Data

### Packages

In [4]:
import pandas as pd
import numpy as np
from plotnine import *
import seaborn as sns

### Import and Preview Data

In [6]:
# clean df
df_clean = pd.read_csv("/Users/matteo/Documents/PersonalProjects/airbnb-pricing-optimization/data/listings_clean.csv")
df_clean.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,picture_url,host_id,host_url,host_name,...,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,log_price
0,958,https://www.airbnb.com/rooms/958,20250901181253,2025-09-01,city scrape,"Bright, Modern Garden Unit - 1BR/1BTH",https://a0.muscache.com/pictures/be1bf5ac-a955...,1169,https://www.airbnb.com/users/show/1169,Holly,...,4.98,4.78,STR-0006854,f,1,1,0,0,2.53,5.056246
1,5858,https://www.airbnb.com/rooms/5858,20250901181253,2025-09-01,city scrape,Creative Sanctuary,https://a0.muscache.com/pictures/hosting/Hosti...,8904,https://www.airbnb.com/users/show/8904,Philip Jonathon,...,4.77,4.68,,f,1,1,0,0,0.53,5.521461
2,8014,https://www.airbnb.com/rooms/8014,20250901181253,2025-09-01,city scrape,female HOST quiet fast internet market parking,https://a0.muscache.com/pictures/2cc1fc3d-0ae0...,22402,https://www.airbnb.com/users/show/22402,Jia,...,4.59,4.66,STR-0000974,f,3,0,3,0,0.57,4.204693
3,8142,https://www.airbnb.com/rooms/8142,20250901181253,2025-09-01,city scrape,*FriendlyRoom Apt. Style -UCSF/USF - San Franc...,https://a0.muscache.com/pictures/hosting/Hosti...,21994,https://www.airbnb.com/users/show/21994,Aaron,...,4.7,4.7,,f,20,0,20,0,0.07,4.025352
4,8339,https://www.airbnb.com/rooms/8339,20250901181253,2025-09-01,city scrape,Historic Alamo Square Victorian,https://a0.muscache.com/pictures/miso/Hosting-...,24215,https://www.airbnb.com/users/show/24215,Rosmarie,...,4.94,4.75,STR-0000264,f,1,1,0,0,0.13,6.267201


In [7]:
# clean + engineered features

## 2. Baseline Linear Regression

This model will be trained on the clean data (without engineered features) to understand what our baseline statistics look like.

### Importing Packages

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

### Assigning Variable Groups and Creating Train/Test Split

In [13]:
quantitative_vars = [
    # Property characteristics
    "accommodates",
    "bathrooms",
    "bedrooms", 
    "beds",
    
    # Location
    "latitude",
    "longitude",
    
    # Host metrics
    "host_response_rate",
    "host_acceptance_rate",
    "host_listings_count",
    "host_total_listings_count",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
    
    # Reviews
    "number_of_reviews",
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "number_of_reviews_ly",
    "reviews_per_month",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    
    # Availability
    "availability_30",
    "availability_60",
    "availability_90",
    "availability_365",
    "availability_eoy",
    "minimum_nights",
    "maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    
    # Other
    "estimated_occupancy_l365d",
    "estimated_revenue_l365d"
]

categorical_vars = [
    # Core categories
    #"neighbourhood_cleansed", --> Removed to reduce overfitting, increased Rsq substantially
    #"property_type",          --> Removed because too many categories, OHE could not handle
    "room_type",
    
    # Host attributes
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    
    # Listing attributes
    "instant_bookable",
    "has_availability"#,
    
    # License
    #"license"                 --> Removed because too many categories, OHE could not handle
]

ignore = [
    "id",
    "listing_url",
    "scrape_id",
    "last_scraped",
    "source",
    "name",
    "picture_url",
    "host_id",
    "host_url",
    "host_name",
    "host_since",
    "host_thumbnail_url",
    "host_picture_url",
    "host_verifications",
    "calendar_last_scraped",
    "amenities",  # Need to parse separately
    "log_price",  # This is your target
    "price",  # Original target
    "property_type",
    "license",
    "neighbourhood_cleansed"
]

target = "log_price"

X = df_clean.drop(ignore, axis = 1)
y = df_clean[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Preprocessing

In [15]:
preproc = make_column_transformer(
    (StandardScaler(), quantitative_vars),
    (OneHotEncoder(drop = "first"), categorical_vars)
)

### Defining and Fitting

In [17]:
Linear_Pipeline = make_pipeline(
    preproc,
    LinearRegression()
)

Linear_Pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('columntransformer', ...), ('linearregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('standardscaler', ...), ('onehotencoder', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Predicting

In [19]:
y_pred = Linear_Pipeline.predict(X_test)
y_pred_train = Linear_Pipeline.predict(X_train)

### Results

In [21]:
print("MSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("R Squared (Test): ", r2_score(y_test, y_pred))
print("R Squared (Train): ", r2_score(y_train, y_pred_train))

MSE:  0.558995991781547
MAE:  0.35900569235888624
R Squared (Test):  0.5425580037257507
R Squared (Train):  0.6492858994795466


### Observations and Conclusions

Trained the linear model multiple times for various results. Made the following observations:
- Variables "property_type" and "license" both have too many categories and caused the model to throw errors.
- Removing "neighbourhood_cleansed" reduced overfitting and increased R squared substantially.
- There is a large amount of multicollinearity between variables like "accommodates" and "beds" which move similarily.

## 3. Linear Regression (With Engineered Features)