### **Applying advanced analytics to predict price and guest satisfaction using different sets of models such as linear regression, decision trees, and random forest. Evaluate the performance of each model using appropriate metrics (R-squared, Mean Absolute Error, etc.) and identify the most significant predictors for price and guest satisfaction.**

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from IPython.display import display, HTML

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/Outlier - Flamingo coding multiple Deviation/Athens_airbnb_predictive_analysis/amsterdam_weekdays.csv')

# One-hot encoding for categorical features
data_encoded = pd.get_dummies(data, columns=['room_type'], drop_first=True)

In [44]:
data_encoded.head()

Unnamed: 0,realSum,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,...,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,city,weekday,room_type_Private room,room_type_Shared room
0,194.033698,False,True,2,False,1,0,10,93,1,...,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,Amsterdam,1,True,False
1,344.245776,False,True,4,False,0,0,8,85,1,...,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,Amsterdam,1,True,False
2,264.101422,False,True,2,False,0,1,9,87,1,...,75.275877,3.985908,95.386955,6.6467,4.97512,52.36103,Amsterdam,1,True,False
3,433.529398,False,True,4,False,0,1,9,90,2,...,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,Amsterdam,1,True,False
4,485.552926,False,True,2,True,0,0,10,98,1,...,552.830324,29.272733,815.30574,56.811677,4.90051,52.37508,Amsterdam,1,True,False


In [45]:
# Update the feature list after one-hot encoding
features = ['room_shared', 'room_private', 'person_capacity',
            'host_is_superhost', 'multi', 'biz', 'cleanliness_rating',
            'guest_satisfaction_overall', 'dist', 'metro_dist', 'attr_index',
            'attr_index_norm', 'rest_index', 'rest_index_norm', 'lng', 'lat'] + \
            [col for col in data_encoded.columns if col.startswith('room_type_')]

In [46]:
# Splitting the data into training and testing sets
X = data_encoded[features]
y_price = data_encoded['realSum']
y_satisfaction = data_encoded['guest_satisfaction_overall']

X_train_price, X_test_price, y_train_price, y_test_price = train_test_split(X, y_price, test_size=0.2, random_state=42)
X_train_satisfaction, X_test_satisfaction, y_train_satisfaction, y_test_satisfaction = train_test_split(X, y_satisfaction, test_size=0.2, random_state=42)

In [47]:
# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42)
}

In [48]:
# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    return r2, mae

In [49]:
# Evaluate models for price prediction
results_price = {}
for name, model in models.items():
    r2, mae = evaluate_model(model, X_train_price, X_test_price, y_train_price, y_test_price)
    results_price[name] = {'R-squared': r2, 'Mean Absolute Error': mae}

In [50]:
# Evaluate models for guest satisfaction prediction
results_satisfaction = {}
for name, model in models.items():
    r2, mae = evaluate_model(model, X_train_satisfaction, X_test_satisfaction, y_train_satisfaction, y_test_satisfaction)
    results_satisfaction[name] = {'R-squared': r2, 'Mean Absolute Error': mae}

In [51]:
# Identify the most significant predictors using the Random Forest model
rf_model_price = RandomForestRegressor(random_state=42)
rf_model_price.fit(X_train_price, y_train_price)
importances_price = rf_model_price.feature_importances_

In [52]:
rf_model_satisfaction = RandomForestRegressor(random_state=42)
rf_model_satisfaction.fit(X_train_satisfaction, y_train_satisfaction)
importances_satisfaction = rf_model_satisfaction.feature_importances_

In [53]:
# Combine the results into dataframes for better visualization
results_price_df = pd.DataFrame(results_price).T
results_satisfaction_df = pd.DataFrame(results_satisfaction).T

In [54]:
importances_price_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances_price
}).sort_values(by='Importance', ascending=False)


In [55]:
importances_satisfaction_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances_satisfaction
}).sort_values(by='Importance', ascending=False)

In [56]:
# Display the results
def display_dataframe(name, dataframe):
    display(HTML(f"<h3>{name}</h3>"))
    display(dataframe)

In [57]:
display_dataframe("Model Evaluation Results for Satisfaction", results_satisfaction_df)

Unnamed: 0,R-squared,Mean Absolute Error
Linear Regression,1.0,4.790537e-14
Decision Tree,0.998201,0.02262443
Random Forest,0.999375,0.01384615


In [58]:
display_dataframe("Model Evaluation Results for Price", results_price_df)

Unnamed: 0,R-squared,Mean Absolute Error
Linear Regression,0.505546,156.367739
Decision Tree,0.377999,155.702637
Random Forest,0.528989,138.859397


In [59]:
display_dataframe("Feature Importances for Price Prediction", importances_price_df)

Unnamed: 0,Feature,Importance
2,person_capacity,0.297598
15,lat,0.150608
9,metro_dist,0.08922
8,dist,0.086045
7,guest_satisfaction_overall,0.060757
10,attr_index,0.056686
14,lng,0.048316
1,room_private,0.039772
11,attr_index_norm,0.039539
16,room_type_Private room,0.03441


In [60]:
display_dataframe("Feature Importances for Satisfaction Prediction", importances_satisfaction_df)

Unnamed: 0,Feature,Importance
7,guest_satisfaction_overall,0.974948
10,attr_index,0.005043
11,attr_index_norm,0.004326
14,lng,0.004224
12,rest_index,0.003456
8,dist,0.002364
13,rest_index_norm,0.002069
9,metro_dist,0.001389
15,lat,0.001269
6,cleanliness_rating,0.000732


In [61]:
(results_price_df, results_satisfaction_df, importances_price_df, importances_satisfaction_df)

(                   R-squared  Mean Absolute Error
 Linear Regression   0.505546           156.367739
 Decision Tree       0.377999           155.702637
 Random Forest       0.528989           138.859397,
                    R-squared  Mean Absolute Error
 Linear Regression   1.000000         4.790537e-14
 Decision Tree       0.998201         2.262443e-02
 Random Forest       0.999375         1.384615e-02,
                        Feature  Importance
 2              person_capacity    0.297598
 15                         lat    0.150608
 9                   metro_dist    0.089220
 8                         dist    0.086045
 7   guest_satisfaction_overall    0.060757
 10                  attr_index    0.056686
 14                         lng    0.048316
 1                 room_private    0.039772
 11             attr_index_norm    0.039539
 16      room_type_Private room    0.034410
 12                  rest_index    0.033886
 13             rest_index_norm    0.032019
 6           clean