In [8]:
# Data manipulation and numerical operations
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# # Visualization libraries
# import matplotlib.pyplot as plt
# import seaborn as sns

# Preprocessing and scaling
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn import preprocessing  # Import preprocessing separately

# Machine Learning - Train/test split
from sklearn.model_selection import train_test_split

# Pre-processing tools
from sklearn.preprocessing import StandardScaler #, OneHotEncoder, OrdinalEncoder
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer
# from sklearn.decomposition import PCA

# Importing linear regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, HuberRegressor, RANSACRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Importing non-linear regression models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.svm import SVR

# Classification models (if needed)
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import tree  # For Decision Trees

# Metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

# # Suppress warnings
# import warnings
# warnings.filterwarnings('ignore')

In [9]:
INPUT_PATH = "airbnb_2022_2024_clean_encoded.csv"

In [10]:
# Define the path to the CSV file
df = pd.read_csv(INPUT_PATH, index_col="index")
print(df.shape)
df.head()

(25881, 18)


Unnamed: 0_level_0,host_identity_verified,instant_bookable,price,service_fee,minimum_nights,review_rate_number,boro_bronx,boro_brooklyn,boro_manhattan,boro_queens,boro_staten,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,room_type_entire,room_type_hotelr,room_type_privater,room_type_sharedr
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,True,False,142.0,28.0,30.0,4.0,False,False,True,False,False,False,True,False,True,False,False,False
5,True,True,1020.0,204.0,1.0,3.0,False,True,False,False,False,False,False,True,False,False,True,False
6,True,False,646.0,129.0,30.0,1.0,False,False,True,False,False,False,False,True,False,False,True,False
8,True,False,909.0,182.0,5.0,4.0,False,False,True,False,False,False,True,False,True,False,False,False
12,True,True,523.0,105.0,7.0,5.0,False,False,True,False,False,True,False,False,True,False,False,False


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25881 entries, 0 to 26256
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   host_identity_verified        25881 non-null  bool   
 1   instant_bookable              25881 non-null  bool   
 2   price                         25881 non-null  float64
 3   service_fee                   25881 non-null  float64
 4   minimum_nights                25881 non-null  float64
 5   review_rate_number            25881 non-null  float64
 6   boro_bronx                    25881 non-null  bool   
 7   boro_brooklyn                 25881 non-null  bool   
 8   boro_manhattan                25881 non-null  bool   
 9   boro_queens                   25881 non-null  bool   
 10  boro_staten                   25881 non-null  bool   
 11  cancellation_policy_flexible  25881 non-null  bool   
 12  cancellation_policy_moderate  25881 non-null  bool   
 13  cancel

In [12]:
# Defining the target variable
X = df.drop(columns=['price'])  
y = df['price']  

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test) 
del X_train, X_test

In [14]:
# List of models to test
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "BayesianRidge": BayesianRidge(),
    "HuberRegressor": HuberRegressor(),
    "RANSACRegressor": RANSACRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "SupportVectorRegressor": SVR(),
}

# Dictionary to hold model performance
mse_results = {}
trained_models = {}
mae_results = {}
rmse_results = {}
r2_results = {}

# Loop through each model
for model_name, model in models.items():
    print(f"Evaluating {model_name}...") 
   
    # Fit the model
    model.fit(X_train_scaled, y_train)
    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Calculate mean squared error (MSE), Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and R-squared (R²),
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Results in Dictionaries
    mse_results[model_name] = mse
    trained_models[model_name] = model
    mae_results[model_name] = mae
    rmse_results[model_name] = rmse
    r2_results[model_name] = r2
    
    # Dictionary of the results
    results_dict = {}

    # Populate the dictionary with model results
    for model_name in mse_results.keys():
        results_dict[model_name] = {
            'mse': mse_results[model_name],
            'model_name': trained_models[model_name],
            'mae': mae_results[model_name],
            'rmse': rmse_results[model_name],
            'r2': r2_results[model_name]
        }
              
# Display the results
results_df = pd.DataFrame(results_dict.items(), columns=('model_name', 'mse', 'mae', 'rmse', 'r2')).sort_values(by='mse')
results_df

Evaluating LinearRegression...
Evaluating Ridge...
Evaluating Lasso...
Evaluating ElasticNet...
Evaluating BayesianRidge...
Evaluating HuberRegressor...
Evaluating RANSACRegressor...
Evaluating DecisionTreeRegressor...
Evaluating RandomForestRegressor...
Evaluating GradientBoostingRegressor...
Evaluating AdaBoostRegressor...
Evaluating KNeighborsRegressor...
Evaluating SupportVectorRegressor...


ValueError: 5 columns passed, passed data had 2 columns

In [None]:
# Model Selection
best_model_name = results_df.iloc[0]['model_name']
print(f"The best model based on MSE is: {best_model_name}")
print(f"The best model based on MAE is: {best_model_name}")
print(f"The best model based on RMSE is: {best_model_name}")
print(f"The best model based on R2 is: {best_model_name}")

# Retrieve the best model
best_model = trained_models[best_model_name]
best_y_pred = best_model.predict(X_test_scaled)

In [None]:
# # Evaluate the best model on training and test sets
# print("Train R2 score: ", r2_score(y_train, best_model.predict(X_train_scaled)))
# print("Test R2 score: ", r2_score(y_test, best_model.predict(X_test_scaled)))

print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")


In [None]:
# Inference function to make predictions on new data
def make_inference(new_data):
    """
    Make predictions using the best model.
    new_data should be a DataFrame with the same structure as X.
    """
    # Preprocess the new data (scaling)
    new_data_scaled = scaler.transform(new_data)
    
    # Make predictions using the best model
    predictions = best_model.predict(new_data_scaled)
    
    return predictions

# Example: Making predictions on new data (replace 'X_test' with actual new data)
new_data_predictions = make_inference(results_df)  # X_test is used for demonstration, replace with actual new data
print("Predictions for new data:", new_data_predictions[:5])

# Now you can delete X_train and X_test if you no longer need them
# del X_train, X_test

## Unsupervised Learning

In [None]:
# unsup_scaler = StandardScaler()
# unsup_scaled = unsup_scaler.fit_transform(df)

# # Checking the shape of the scaled data
# print(unsup_scaled.shape)
# # print(unsup_scaled)

# # Creating a DataFrame from the scaled data and showing the first few rows
# pd.DataFrame(unsup_scaled, columns=df.columns).head()

In [None]:
# # Check the mean and standard deviation of the scaled data
# print("Mean of scaled data (should be close to 0):\n", unsup_scaled.mean())
# print("Standard deviation of scaled data (should be close to 1):\n", unsup_scaled.std())

# # # If you want to save the scaled data to a CSV file
# # unsup_scaled.to_csv('scaled_data.csv', index=False)


**Final Evaluation**