In [1]:
# Data manipulation and numerical operations
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# # Visualization libraries
# import matplotlib.pyplot as plt
# import seaborn as sns

# Preprocessing and scaling
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn import preprocessing  # Import preprocessing separately

# Machine Learning - Train/test split
from sklearn.model_selection import train_test_split

# Pre-processing tools
from sklearn.preprocessing import StandardScaler #, OneHotEncoder, OrdinalEncoder
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer
# from sklearn.decomposition import PCA

# Importing linear regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, HuberRegressor, RANSACRegressor

# Importing non-linear regression models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.svm import SVR

# Classification models (if needed)
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import tree  # For Decision Trees

# Metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

# # Suppress warnings
# import warnings
# warnings.filterwarnings('ignore')

In [2]:
INPUT_PATH = "airbnb_2022_2024_clean_encoded.csv"

In [3]:
# Define the path to the CSV file
df = pd.read_csv(INPUT_PATH, index_col="index")
print(df.shape)
df.head()

(25881, 18)


Unnamed: 0_level_0,host_identity_verified,instant_bookable,price,service_fee,minimum_nights,review_rate_number,boro_bronx,boro_brooklyn,boro_manhattan,boro_queens,boro_staten,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,room_type_entire,room_type_hotelr,room_type_privater,room_type_sharedr
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,True,False,142.0,28.0,30.0,4.0,False,False,True,False,False,False,True,False,True,False,False,False
5,True,True,1020.0,204.0,1.0,3.0,False,True,False,False,False,False,False,True,False,False,True,False
6,True,False,646.0,129.0,30.0,1.0,False,False,True,False,False,False,False,True,False,False,True,False
8,True,False,909.0,182.0,5.0,4.0,False,False,True,False,False,False,True,False,True,False,False,False
12,True,True,523.0,105.0,7.0,5.0,False,False,True,False,False,True,False,False,True,False,False,False


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25881 entries, 0 to 26256
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   host_identity_verified        25881 non-null  bool   
 1   instant_bookable              25881 non-null  bool   
 2   price                         25881 non-null  float64
 3   service_fee                   25881 non-null  float64
 4   minimum_nights                25881 non-null  float64
 5   review_rate_number            25881 non-null  float64
 6   boro_bronx                    25881 non-null  bool   
 7   boro_brooklyn                 25881 non-null  bool   
 8   boro_manhattan                25881 non-null  bool   
 9   boro_queens                   25881 non-null  bool   
 10  boro_staten                   25881 non-null  bool   
 11  cancellation_policy_flexible  25881 non-null  bool   
 12  cancellation_policy_moderate  25881 non-null  bool   
 13  cancel

In [5]:
# Assuming df is your DataFrame and 'price' is the target variable
X = df.drop(columns=['price'])  
y = df['price']  

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test) 
del X_train, X_test

In [7]:
# List of models to test
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "BayesianRidge": BayesianRidge(),
    "HuberRegressor": HuberRegressor(),
    "RANSACRegressor": RANSACRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "SupportVectorRegressor": SVR(),
}

# Dictionary to hold model performance
mse_results = {}
trained_models = {}

# Loop through each model
for model_name, model in models.items():
    print(f"Evaluating {model_name}...") 
    # print(model_name)
    # Fit the model
    model.fit(X_train_scaled, y_train)
    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Calculate mean squared error
    mse = mean_squared_error(y_test, y_pred)

    # Average the mean squared error for this model
    mse_results[model_name] = mse
    trained_models[model_name] = model

# Display the results
results_df = pd.DataFrame(mse_results.items(), columns=('model_name', 'mse')).sort_values(by='mse')
results_df

Evaluating LinearRegression...
Evaluating Ridge...
Evaluating Lasso...
Evaluating ElasticNet...
Evaluating BayesianRidge...
Evaluating HuberRegressor...
Evaluating RANSACRegressor...
Evaluating DecisionTreeRegressor...
Evaluating RandomForestRegressor...
Evaluating GradientBoostingRegressor...
Evaluating AdaBoostRegressor...
Evaluating KNeighborsRegressor...
Evaluating SupportVectorRegressor...


Unnamed: 0,model_name,mse
6,RANSACRegressor,2.012585
4,BayesianRidge,2.012806
0,LinearRegression,2.013122
1,Ridge,2.014129
5,HuberRegressor,2.014449
8,RandomForestRegressor,2.13875
2,Lasso,3.09209
7,DecisionTreeRegressor,3.589504
9,GradientBoostingRegressor,5.734523
10,AdaBoostRegressor,1696.759266


In [8]:
# Model Selection
best_model_name = results_df.iloc[0]['model_name']
print(f"The best model based on MSE is: {best_model_name}")

best_model = trained_models[best_model_name]
best_y_pred = best_model.predict(X_test_scaled)

The best model based on MSE is: RANSACRegressor


In [9]:
print(r2_score(y_train, best_model.predict(X_train_scaled)))
r2_score(y_test, best_y_pred)

0.9999814902237643


0.9999817991954413

## Unsupervised Learning

In [10]:
unsup_scaler = StandardScaler()
unsup_scaled = unsup_scaler.fit_transform(df)
print(unsup_scaled.shape)
# print(unsup_scaled)
pd.DataFrame(unsup_scaled, columns=df.columns).head()

(25881, 18)


Unnamed: 0,host_identity_verified,instant_bookable,price,service_fee,minimum_nights,review_rate_number,boro_bronx,boro_brooklyn,boro_manhattan,boro_queens,boro_staten,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,room_type_entire,room_type_hotelr,room_type_privater,room_type_sharedr
0,0.997029,-0.9978,-1.456691,-1.462645,0.615386,0.51286,-0.181456,-0.776809,1.127501,-0.406747,-0.098161,-0.709013,1.410166,-0.703176,0.926895,-0.039833,-0.880399,-0.156405
1,0.997029,1.002205,1.195323,1.195293,-0.275294,-0.299362,-0.181456,1.287318,-0.886917,-0.406747,-0.098161,-0.709013,-0.709136,1.422119,-1.078871,-0.039833,1.135848,-0.156405
2,0.997029,-0.9978,0.06565,0.062649,0.615386,-1.923806,-0.181456,-0.776809,1.127501,-0.406747,-0.098161,-0.709013,-0.709136,1.422119,-1.078871,-0.039833,1.135848,-0.156405
3,0.997029,-0.9978,0.860046,0.863051,-0.152442,0.51286,-0.181456,-0.776809,1.127501,-0.406747,-0.098161,-0.709013,1.410166,-0.703176,0.926895,-0.039833,-0.880399,-0.156405
4,0.997029,1.002205,-0.305874,-0.299797,-0.091016,1.325082,-0.181456,-0.776809,1.127501,-0.406747,-0.098161,1.410411,-0.709136,-0.703176,0.926895,-0.039833,-0.880399,-0.156405


**Final Evaluation**

In [11]:
# # Retrain the model with the best parameters found in grid search
# best_model = grid_search.best_estimator_

# # Make predictions on a test set (you may want to keep a separate test set)
# y_test_pred = best_model.predict(X_test)

# # Calculate metrics
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# accuracy = accuracy_score(y_test, y_test_pred)
# precision = precision_score(y_test, y_test_pred, average='weighted')
# recall = recall_score(y_test, y_test_pred, average='weighted')
# f1 = f1_score(y_test, y_test_pred, average='weighted')

# # Display final evaluation metrics
# print(f"Final Evaluation Metrics:")
# print(f"Accuracy: {accuracy:.4f}")
# print(f"Precision: {precision:.4f}")
# print(f"Recall: {recall:.4f}")
# print(f"F1-Score: {f1:.4f}")
