### Chapter 14
**CH14B AirBnB**

using the airbnb dataset

version 1.0 2023-12-28

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mizani.formatters import percent_format
from plotnine import *
import regex as re
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings

warnings.filterwarnings("ignore")

### Get Data

In [None]:
# Current script and repository folder
current_path = os.getcwd()
repository_path = current_path.split('Ch14')[0]

In [None]:
# Add utils folder to sys path 
# Note: os.path.join() creates a string with the right syntax for defining a path for your operating sytem.
sys.path.append(os.path.join(repository_path, 'utils'))

In [None]:
# Define data folder
data_path = os.path.join(repository_path, 'data')

In [None]:
# Import the prewritten helper functions
# We are only using the color palette in this notebook
from py_helper_functions import *

In [None]:
# DATA IMPORT
data = pd.read_csv(os.path.join(data_path, 'airbnb_hackney_workfile.csv'))

In [None]:
data.head()

In [None]:
data.info()

### Feature Engineering

In [None]:
# Which columns contain missing values at this point?
to_filter = data.isnull().sum()
to_filter.loc[to_filter.gt(0)].index

#### Missing values

In [None]:
# 1. drop if no target (already did)
data.dropna(subset=['price'], inplace = True)

In [None]:
# 2. imput when few, not that important
data['n_bathrooms']=data['n_bathrooms'].fillna(np.nanmedian(data['n_bathrooms']))
data['n_beds']=data['n_beds'].fillna(data['n_accommodates'])
data['f_bathroom']=data['f_bathroom'].fillna(1)
data['f_minimum_nights']=data['f_minimum_nights'].fillna(1)
data['f_number_of_reviews']=data['f_number_of_reviews'].fillna(1)
data['ln_beds']=data['ln_beds'].fillna(0)

In [None]:
data['n_bathrooms'].describe()

In [None]:
# 3. drop columns when many missing not important
data=data.drop(["usd_cleaning_fee", "p_host_response_rate"],axis=1)

In [None]:
# where do we have missing variables now?
to_filter=data.isna().sum()
to_filter[to_filter>0].index

In [None]:
for var in ["flag_days_since","flag_review_scores_rating","flag_reviews_per_month"]:
    data[var]=[int(x) for x in data[var.replace('flag','n')].isna()]

In [None]:
data['flag_days_since'].count()

In [None]:
# 4. Replace missing variables re reviews with zero, when no review + add flags
data['n_days_since']=data['n_days_since'].fillna(np.nanmedian(data['n_days_since']))
data['n_review_scores_rating']=data['n_review_scores_rating'].fillna(np.nanmedian(data['n_review_scores_rating']))
data['n_reviews_per_month']=data['n_reviews_per_month'].fillna(np.nanmedian(data['n_reviews_per_month']))

In [None]:
data.flag_days_since.value_counts()

In [None]:
# Look at data
data.price.describe()

In [None]:
# where do we have missing variables now?
to_filter=data.isna().sum()
to_filter[to_filter>0]

In [None]:
###################################
# Business logic- define our prediction problem
###################################
# Decision
# Size, we need a normal apartment, 1-7persons
data=data.loc[data.n_accommodates < 8]

### EDA

In [None]:
#How is the average price changing in my district by `property_type`, `room_type` and the `bed_type`?
data.groupby(["f_property_type", "f_room_type"]).agg(mean_price=("price", np.mean))

In [None]:
data.groupby(["f_bed_type"]).agg(mean_price=("price", np.mean))

In [None]:
data.price.describe()

In [None]:
# How are extreme prices distributed?
data.price.quantile([0.75, 0.8, 0.9, 0.95, 0.99, 0.995])

In [None]:
# We pick USD 400, above which all observations are excluded in the charts below.
datau=data[data.price<400]

In [None]:
# Histogram of price for the listings kept in `datau` (price below 400)
(
    ggplot(datau, aes(x="price"))
    + geom_histogram(
        # y is each bin's count divided by the total count, i.e. the share of observations
        aes(y="stat(count)/sum(stat(count))"),
        binwidth=10,
        fill=color[0],
        color="white",
        alpha=0.8,
        boundary=0,
        closed="left",
    )
    + labs(x="Price (US dollars)", y="Percent")
    + scale_y_continuous(
        expand=(0.00, 0.00),
        limits=(0, 0.15),
        breaks=seq(0, 0.16, by=0.03),
        # shares are displayed as percentages
        labels=percent_format(),
    )
    + scale_x_continuous(expand=(0.00, 0.00), limits=(0, 400), breaks=seq(0, 401, 50))
    + theme_bw()
)

In [None]:
# Histogram of log price for the listings kept in `datau`
(
    ggplot(datau, aes(x="ln_price"))
    + geom_histogram(
        # y is each bin's count divided by the total count, i.e. the share of observations
        aes(y="stat(count)/sum(stat(count))"),
        binwidth=0.2,
        fill=color[0],
        color="white",
        alpha=0.8,
        boundary=0,
        closed="left",
    )
    # restrict the visible x range without setting limits on the x scale itself
    + coord_cartesian(xlim=(2.5, 6.5))
    + scale_y_continuous(
        expand=(0.00, 0.00),
        limits=(0, 0.15),
        breaks=seq(0, 0.16, by=0.05),
        labels=percent_format(),
    )
    + scale_x_continuous(expand=(0.00, 0.01), breaks=seq(2.4, 6.7, 0.6))
    + labs(x="ln(price, US dollars)", y="Percent")
    + theme_bw()
)

In [None]:
# Relative frequencies with matplotlib
# The chart differs from the ggplot one because matplotlib places the bin edges differently
from matplotlib.ticker import PercentFormatter
fig = plt.figure(figsize = (10,6))
ax = fig.add_subplot(111)
# Weight every observation by 1/n so that the bar heights are shares of observations
# (they sum to 1). density=True would plot a density whose heights depend on the bin width.
ln_price_shown = datau.ln_price.dropna()
ax.hist(
    ln_price_shown,
    bins = 16,
    weights = np.ones(len(ln_price_shown)) / len(ln_price_shown),
    rwidth = 0.9,
    color = 'steelblue',
)
ax.set_xlabel('ln price in USD')
ax.set_ylabel('Percent')
# the bar heights are shares between 0 and 1, so 1 corresponds to 100%
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals = 0))
ax.set_title('Relative frequency of logged Airbnb prices')
plt.show()

In [None]:
## Boxplot of price by room type
(
    ggplot(datau, aes(x="f_room_type", y="price"))
    # first layer: the boxplot statistics drawn as error bars (whisker caps)
    + stat_boxplot(
        aes(group="f_room_type"),
        geom="errorbar",
        width=0.3,
        color=(color[1], color[0], color[2]),
        size=0.5,
        na_rm=True,
    )
    # second layer: the boxes, drawn on top of the error bars
    + geom_boxplot(
        aes(group="f_room_type"),
        color=(color[1], color[0], color[2]),
        # fill=(color[1], color[0], color[2]),
        size=0.5,
        width=0.6,
        alpha=0.3,
        na_rm=True,
        # empty marker shape for the outliers
        outlier_shape="",
    )
    + scale_y_continuous(expand=(0.01, 0.01), limits=(0, 300), breaks=seq(0, 301, 100))
    + labs(x="Room type", y="Price (US dollars)")
    + theme_bw()
)

In [None]:
sns.boxplot(data = datau, x = 'f_room_type', y = 'price', orient = 'v')
plt.show()

In [None]:
datau.f_room_type.value_counts()

In [None]:
# Boxplot of price by number of guests accommodated, colored by property type
(
    ggplot(
        datau,
        aes(
            x="factor(n_accommodates)",
            y="price",
            # fill="factor(f_property_type)",
            color="factor(f_property_type)",
        ),
    )
    + geom_boxplot(alpha=0.8, na_rm=True, outlier_shape="", width=0.8, stat="boxplot")
    + stat_boxplot(geom="errorbar", width=0.8, size=0.3, na_rm=True)
    + scale_color_manual(name="", values=(color[1], color[0]))
    + scale_fill_manual(name="", values=(color[1], color[0]))
    # axis label spelling corrected ("Accommodates")
    + labs(x="Accommodates (Persons)", y="Price (US dollars)")
    + scale_y_continuous(expand=(0.01, 0.01), limits=(0, 400), breaks=seq(0, 401, 50))
    + theme_bw()
    + theme(legend_position=(0.3, 0.8))
)

### Modelling

**Basic variables**

In [None]:
basic_lev = (
    "n_accommodates",
    "n_beds",
    "f_property_type",
    "f_room_type",
    "n_days_since",
    "flag_days_since",
)
basic_add = ("f_bathroom", "f_cancellation_policy", "f_bed_type")
reviews = ("f_number_of_reviews", "n_review_scores_rating", "flag_review_scores_rating")
poly_lev = ("n_accommodates2", "n_days_since2", "n_days_since3")
# not use p_host_response_rate due to missing obs
amenities = list(data.filter(regex="^d_.*"))

In [None]:
amenities

**Interactions**


In [None]:
## Helper functions

def _price_stats_by_group(df, factor_var, dummy_var):
    """Mean price with a 95% confidence interval for every (factor_var, dummy_var) group.

    Returns a data frame whose columns are the two grouping variables followed by
    `Mean`, `se`, `Mean_l` and `Mean_u` (mean -/+ 1.96 standard errors).
    """
    # Aggregators are given by name: pandas deprecates passing np.mean / np.std to
    # groupby.agg, and "std" pins the sample standard deviation that groupby computes.
    stats = df.groupby([factor_var, dummy_var]).agg(
        Mean=("price", "mean"), sd=("price", "std"), size=("price", "size")
    )
    stats["se"] = stats["sd"] / stats["size"] ** (1 / 2)
    stats["Mean_l"] = stats["Mean"] - (1.96 * stats["se"])
    stats["Mean_u"] = stats["Mean"] + (1.96 * stats["se"])
    return stats.drop(["sd", "size"], axis=1).reset_index()


def _grouped_price_bars(stats, factor_var, dummy_var):
    """Dodged bars of `Mean` by factor_var, filled by dummy_var, with CI error bars."""
    return (
        ggplot(stats, aes(factor_var, "Mean", fill="factor(" + dummy_var + ")"))
        + geom_bar(stat="identity", position=position_dodge(width=0.9))
        + geom_errorbar(
            aes(ymin="Mean_l", ymax="Mean_u"),
            position=position_dodge(width=0.9),
            width=0.25,
        )
        + ylab("Mean Price")
        + theme_bw()
    )


def price_diff_by_variables(df, factor_var, dummy_var):
    """Bar chart of the mean price by `factor_var`, split by `dummy_var`, in grey tones.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a `price` column and the two grouping columns.
    factor_var : str
        Column shown on the x axis.
    dummy_var : str
        Column that defines the bar fill within each x category.

    Returns
    -------
    plotnine.ggplot
        Bars at the group means with error bars at mean -/+ 1.96 standard errors.
    """
    stats = _price_stats_by_group(df, factor_var, dummy_var)
    plot = (
        _grouped_price_bars(stats, factor_var, dummy_var)
        + theme(
            panel_grid_major=element_blank(),
            panel_grid_minor=element_blank(),
            panel_border=element_blank(),
            axis_line=element_line(),
        )
        + scale_fill_grey()
    )

    return plot


def price_diff_by_variables2(df, factor_var, dummy_var, factor_lab, dummy_lab):
    """Bar chart of the mean price by `factor_var`, split by `dummy_var`, with labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a `price` column and the two grouping columns.
    factor_var : str
        Column shown on the x axis.
    dummy_var : str
        Column that defines the bar fill within each x category; the manual
        scales below supply two colors.
    factor_lab : str
        Label of the x axis.
    dummy_lab : str
        Title of the legend.

    Returns
    -------
    plotnine.ggplot
        Bars at the group means with error bars at mean -/+ 1.96 standard errors.
    """
    stats = _price_stats_by_group(df, factor_var, dummy_var)
    plot = (
        _grouped_price_bars(stats, factor_var, dummy_var)
        + scale_color_manual(name=dummy_lab, values=(color[1], color[0]))
        + scale_fill_manual(name=dummy_lab, values=(color[1], color[0]))
        + xlab(factor_lab)
        + theme(
            panel_grid_major=element_blank(),
            panel_grid_minor=element_blank(),
            panel_border=element_blank(),
            axis_line=element_line(),
            legend_position="top",
            legend_box="vertical",
            legend_text=element_text(size=5),
            legend_title=element_text(size=5, face="bold"),
        )
    )

    return plot

In [None]:
price_diff_by_variables2(data,"f_room_type","d_familykidfriendly","Room type", "Family kid friendly")

In [None]:
price_diff_by_variables2(data, "f_room_type", "f_property_type", "Room type", "Property type")

Cancellation policy

In [None]:
price_diff_by_variables2(data, "f_cancellation_policy", "d_familykidfriendly", "Cancellation policy", "Family kid friendly")

In [None]:
price_diff_by_variables2(data, "f_cancellation_policy", "d_tv", "Cancellation policy", "TV")

Look up property type

In [None]:
price_diff_by_variables2(data, "f_property_type", "d_cats", "Property type", "Cats")

In [None]:
price_diff_by_variables2(data, "f_property_type", "d_dogs", "Property type", "Dogs")

**Dummies, interactions suggested by graphs**

In [None]:
X1 = ("f_room_type*f_property_type",  "f_room_type*d_familykidfriendly")
X2=("d_airconditioning*f_property_type", "d_cats*f_property_type", "d_dogs*f_property_type")
X3="(f_property_type + f_room_type + f_cancellation_policy + f_bed_type) * ("+ "+".join(amenities) +")"

In [None]:
modellev1="~ n_accommodates"
modellev2="~"+"+".join(basic_lev)
modellev3="~"+"+".join(basic_lev)+"+"+"+".join(basic_add)+"+"+"+".join(reviews)
modellev4="~"+"+".join(basic_lev)+"+"+"+".join(basic_add)+"+"+"+".join(reviews)+"+"+"+".join(poly_lev)
modellev5="~"+"+".join(basic_lev)+"+"+"+".join(basic_add)+"+"+"+".join(reviews)+"+"+"+".join(poly_lev)+"+"+"+".join(X1)
modellev6="~"+"+".join(basic_lev)+"+"+"+".join(basic_add)+"+"+"+".join(reviews)+"+"+"+".join(poly_lev)+"+"+"+".join(X1)+"+"+"+".join(X2)
modellev7="~"+"+".join(basic_lev)+"+"+"+".join(basic_add)+"+"+"+".join(reviews)+"+"+"+".join(poly_lev)+"+"+"+".join(X1)+"+"+"+".join(X2)+"+"+"+".join(amenities)
modellev8="~"+"+".join(basic_lev)+"+"+"+".join(basic_add)+"+"+"+".join(reviews)+"+"+"+".join(poly_lev)+"+"+"+".join(X1)+"+"+"+".join(X2)+"+"+"+".join(amenities)+"+"+X3

#### Regressions with cross-validation

**Split train & holdout for cross-validation**

In [None]:
smp_size = round(0.2 * data.shape[0])-1

In [None]:
smp_size

In [None]:
# Set the seed to make results reproducable
np.random.seed(20240115)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
data_work,data_holdout=train_test_split(data, test_size=smp_size)

In [None]:
data_work.shape

In [None]:
data_holdout.shape

In [None]:
n_folds=5

In [None]:
from sklearn.model_selection import KFold
from statsmodels.tools.eval_measures import mse,rmse
# Consecutive folds without shuffling (shuffle=False and random_state=None are the KFold defaults)
k = KFold(n_splits=n_folds)

In [None]:
def _price_rmse(frame, predicted):
    """RMSE between `frame["price"]` and `predicted`, skipping rows where either is missing."""
    pairs = pd.concat(
        [frame["price"], predicted], axis=1, keys=["price", "predicted"]
    )
    pairs = pairs[~pairs.isna().any(axis=1)]
    return rmse(pairs["price"], pairs["predicted"], axis=0)


def cv_reg(formula, data, kfold, testdata, robustse=None):
    """Estimate an OLS formula on every training fold and collect fit statistics.

    Parameters
    ----------
    formula : str
        statsmodels formula; RMSE is computed against the `price` column of `data`.
    data : pandas.DataFrame
        Observations that are split into folds.
    kfold : object with a `split(data)` method
        Fold generator yielding (train_index, test_index) positions, e.g. sklearn `KFold`.
    testdata : any
        Not used by the function; kept so that existing calls keep working.
    robustse : str, optional
        `cov_type` passed to `fit()`; when None the default covariance is used.

    Returns
    -------
    dict
        "regressions": fitted model per fold, "test_predict": predictions on each
        test fold, "r2": R-squared per fold, "rmse" / "rmse_test": RMSE on the
        training / test part of each fold, "nvars": `df_model` of the last fold's model.
    """
    regression_list = []
    predicts_on_test = []
    rsquared = []
    rmse_list = []
    rmse_list_test = []

    fit_kwargs = {} if robustse is None else {"cov_type": robustse}

    # Calculating OLS for each fold.
    # The folds come from the `kfold` argument, not from a notebook-level variable.
    for train_index, test_index in kfold.split(data):
        data_train, data_test = data.iloc[train_index, :], data.iloc[test_index, :]
        model = smf.ols(formula, data=data_train).fit(**fit_kwargs)
        # predict the test fold once and reuse it for the output and for the RMSE
        predicted_test = model.predict(data_test)

        regression_list.append(model)
        predicts_on_test.append(predicted_test)
        rsquared.append(model.rsquared)
        rmse_list.append(_price_rmse(data_train, model.predict(data_train)))
        rmse_list_test.append(_price_rmse(data_test, predicted_test))
    nvars = model.df_model

    return {
        "regressions": regression_list,
        "test_predict": predicts_on_test,
        "r2": rsquared,
        "rmse": rmse_list,
        "rmse_test": rmse_list_test,
        "nvars": nvars,
    }


def summarize_cv(cvlist, stat="rmse"):
    """Tabulate one per-fold statistic of several cross-validated models.

    Parameters
    ----------
    cvlist : list of dict
        One dict per model, each holding a list of per-fold values under `stat`.
    stat : str, default "rmse"
        Key of the statistic to tabulate.

    Returns
    -------
    pandas.DataFrame
        Columns "Model1", "Model2", ...; rows "Fold1", "Fold2", ... followed by
        an "Average" row with the column means.
    """
    result = pd.DataFrame(
        {"Model" + str(i + 1): cv_result[stat] for i, cv_result in enumerate(cvlist)}
    )
    # Label the folds from the tabulated statistic itself, so that the function
    # also works for dicts that do not carry an "rmse" entry.
    result["Resample"] = ["Fold" + str(i + 1) for i in range(len(result))]
    result = result.set_index("Resample")
    result = pd.concat([result, pd.DataFrame(result.mean(), columns=["Average"]).T])
    return result

In [None]:
cv_list = []
for i in [
    modellev1,
    modellev2,
    modellev3,
    modellev4,
    modellev5,
    modellev6,
    modellev7,
    modellev8,
]:
    cv_list += [cv_reg("price" + i, data, k, "HC0")]

In [None]:
summarize_cv(cv_list)


In [None]:
# RMSE training vs test graph

rmse_levels = {"nvars": [], "var": [], "value": []}
for i in range(0, 8):
    rmse_levels["nvars"].append(int(cv_list[i]["nvars"]))
    rmse_levels["var"].append("RMSE Training")
    rmse_levels["value"].append(pd.Series(cv_list[i]["rmse"]).mean())
for i in range(0, 8):
    rmse_levels["nvars"].append(int(cv_list[i]["nvars"]))
    rmse_levels["var"].append("RMSE Test")
    rmse_levels["value"].append(pd.Series(cv_list[i]["rmse_test"]).mean())
rmse_levels = pd.DataFrame.from_dict(rmse_levels)
rmse_levels["nvars2"] = rmse_levels["nvars"] + 1

In [None]:
rmse_levels

In [None]:
# Average training and test RMSE against the number of coefficients, one line per sample
(
    ggplot(rmse_levels, aes(x="factor(nvars2)", y="value", color="var", group="var"))
    + geom_line(size=1, show_legend=True, na_rm=True)
    + scale_color_manual(name="", values=(color[0], color[1]))
    # the y range is hard-coded
    + scale_y_continuous(name="RMSE", limits=(26, 46), breaks=seq(26, 46, 2))
    + scale_x_discrete(
        name="Number of coefficients",
        expand=(0.01, 0.01)
    )
    + theme_bw()
)

In [None]:
# The same chart with seaborn; the x values are strings, so the axis is categorical
plt.figure(figsize=(7,5))
ax = plt.gca()
sns.lineplot(
    x = rmse_levels['nvars2'].astype(str).tolist(),
    y = rmse_levels['value'],
    hue = rmse_levels['var'],
    palette = ['k', 'steelblue'],
    ax = ax,
)
ax.set_title('Train & Test RMSE')
ax.set_ylabel('RMSE')
ax.set_xlabel('# of coefficients')
ax.legend(fontsize = 8, title = None)
plt.show();

#### LASSO

**Doing LASSO the hard way (aka *'naive' grid search*)**

In [None]:
# vars_model_7: the four categorical listing characteristics interacted with an
# explicitly spelled-out list of amenity dummies.
# NOTE(review): it is not referenced again in the visible part of the notebook.
vars_model_7 = "(f_property_type + f_room_type + f_cancellation_policy + f_bed_type) * (d_24hourcheckin + d_airconditioning + d_breakfast + d_buzzerwirelessintercom + d_cabletv + d_carbonmonoxidedetector + d_cats + d_dogs + d_doorman + d_doormanentry + d_dryer + d_elevatorinbuilding + d_essentials + d_familykidfriendly + d_fireextinguisher + d_firstaidkit + d_freeparkingonpremises + d_freeparkingonstreet + d_gym + d_hairdryer + d_hangers + d_heating + d_hottub + d_indoorfireplace + d_internet + d_iron + d_keypad + d_kitchen + d_laptopfriendlyworkspace + d_lockonbedroomdoor + d_lockbox + d_otherpets + d_paidparkingoffpremises + d_petsallowed + d_petsliveonthisproperty + d_pool + d_privateentrance + d_privatelivingroom + d_safetycard + d_selfcheckin + d_shampoo + d_smartlock + d_smokedetector + d_smokingallowed + d_suitableforevents + d_tv + d_washer + d_washerdryer + d_wheelchairaccessible + d_wirelessinternet)"
# The LASSO below starts from the right-hand side of the largest model (model 8)
vars_model_8 = modellev8

In [None]:
from sklearn.linear_model import Lasso
import patsy
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler

# define model evaluation method
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)
# Build the outcome (y) and design matrix (X) for the model 8 formula from the full `data`
y, X = patsy.dmatrices("price" + vars_model_8, data)

In [None]:
y

In [None]:
X.shape

In [None]:
# Split first, then standardize with statistics learned from the training rows only,
# so that no information about the test rows leaks into the features used for fitting.
X_train, X_test, y_train, y_test = train_test_split(
    np.asarray(X), y, test_size=smp_size, random_state=10
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# the test rows are transformed with the training means and standard deviations
X_test = scaler.transform(X_test)

# Candidate penalty values: 0.05, 0.10, ..., 0.95
lambdas = [i/100 for i in range(5, 100,5)]

# R-squared on the training and test split for every candidate lambda
train_r_squared = np.zeros(len(lambdas))
test_r_squared = np.zeros(len(lambdas))

pred_num = X.shape[1]
coeff_a = np.zeros((len(lambdas), pred_num))

In [None]:
import datetime

In [None]:
from sklearn.model_selection import cross_val_score

# Fit one LASSO per candidate lambda on the training split and record the R-squared
# on both splits. The cross_val_score call that used to sit in this loop refitted the
# model five more times per lambda and its result was never read, so it is removed.
for ind, lam in enumerate(lambdas):
    print(f"Run: {str(ind).rjust(2)}, lambda: {lam:.2f}, start: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    reg = Lasso(alpha = lam, random_state= 20240115)
    reg.fit(X_train, y_train)

    train_r_squared[ind] = reg.score(X_train, y_train)
    test_r_squared[ind] = reg.score(X_test, y_test)

In [None]:
r_squared_data = (
    pd.DataFrame(
        {
            "$R^2$ Test set": test_r_squared,
            "$R^2$ Training set": train_r_squared,
            "lambda": lambdas,
        }
    )
    .melt(id_vars=["lambda"])
)

In [None]:
r_squared_data.info()

In [None]:
r_squared_data["variable"] = r_squared_data["variable"].astype("category").cat.reorder_categories(
    ["$R^2$ Training set", "$R^2$ Test set"]
)

In [None]:
r_squared_data.info()

#### Plot results using `ggplot` and `seaborn`

In [None]:
# R-squared against lambda, one panel per sample, panels side by side
(
    ggplot(r_squared_data, aes(x="lambda", y="value", color="variable"))
    + geom_point()
    + geom_line(size=1, show_legend=False, na_rm=True)
    + scale_color_manual(name="", values=(color[1], color[0]))
    + scale_y_continuous(name="$R^2$")
    # scales="free": every panel gets its own axis ranges
    + facet_wrap("variable", scales="free")
    + theme_bw()
    # NOTE(review): `subplots_adjust` may not be accepted by recent plotnine versions - confirm
    + theme(subplots_adjust={"wspace": 0.25},legend_title=element_blank())
)

In [None]:
# The same chart with the panels stacked in rows
(
    ggplot(r_squared_data, aes(x="lambda", y="value", color="variable"))
    + geom_point()
    + geom_line(size=1, show_legend=False, na_rm=True)
    + scale_color_manual(name="", values=(color[1], color[0]))
    + scale_y_continuous(name="$R^2$")
    + facet_grid("variable ~ .", scales="free")
    + theme_bw()
    # NOTE(review): `subplots_adjust` may not be accepted by recent plotnine versions - confirm
    + theme(subplots_adjust={"wspace": 0.25}, legend_title=element_blank())
)

In [None]:
# Training and test R-squared against lambda on a single seaborn chart
plt.figure(figsize=(7,5))
ax = plt.gca()
sns.lineplot(
    x = 'lambda',
    y = 'value',
    hue = 'variable',
    data = r_squared_data,
    palette = ['k', 'steelblue'],
    marker = 'o',
    ax = ax,
)
ax.set_title('Train & Test $R^2$')
ax.set_ylabel('$R^2$')
ax.set_xlabel('lambda')
# label every second lambda on the x axis
ax.set_xticks(lambdas[1::2])
ax.legend(fontsize = 8, title = None)
plt.show();

In [None]:
df_lam = pd.DataFrame(test_r_squared*100, columns=['R_squared'])
df_lam['lambda'] = (lambdas)
# returns the index of the row where column has maximum value.
df_lam.loc[df_lam['R_squared'].idxmax()]

In [None]:
reg_best = Lasso(alpha = df_lam.loc[df_lam['R_squared'].idxmax()]['lambda'])
reg_best.fit(X_train, y_train)

In [None]:
reg_best.coef_[reg_best.coef_>0]

In [None]:
reg_best.get_params()

In [None]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, reg_best.predict(X_test))

**LASSO using `GridSearch`**

- cross-validation strategies: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
- RepeatedKFold: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RepeatedKFold.html#sklearn.model_selection.RepeatedKFold
- scoring: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
- GridSearch: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [None]:
# Rebuild the outcome and the design matrix for the model 8 formula from the full `data`
# (this replaces the X and y created in the previous section)
y,  X = patsy.dmatrices('price'+vars_model_8, data)

In [None]:
from sklearn.model_selection import GridSearchCV

`GridSearchCV` not only searches for the best parameters, but also automatically fits a new model on the whole training dataset with the parameters that yielded the best cross-validation performance.  

In [None]:
# define model
model = Lasso()
# define model evaluation method
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)
# define grid
grid = dict()
grid["alpha"] = np.arange(0.05, 1, 0.05)
# define search
search = GridSearchCV(model, grid, scoring="neg_root_mean_squared_error", cv=cv)
# perform the search
results = search.fit(X, y)

In [None]:
print("RMSE: " + str(results.best_score_ * -1))

In [None]:
results.cv_results_['rank_test_score']

In [None]:
results.best_estimator_

### Diagnostics 

In [None]:
model3_level = smf.ols('price'+modellev3, data=data_work).fit(cov_type='HC0')
model7_level = smf.ols('price'+modellev7, data=data_work).fit(cov_type='HC0')

In [None]:
# look at holdout RMSE
model7_level_work_predictions = pd.concat(
    [data_work["price"], model7_level.predict(data_work)],
    axis=1,
    keys=["price", "predicted"],
)

In [None]:
model7_level_work_predictions.tail()

In [None]:
model7_level_work_predictions = model7_level_work_predictions[~model7_level_work_predictions.isna().any(axis=1)]
model7_level_work_rmse = rmse(model7_level_work_predictions["price"], model7_level_work_predictions["predicted"], axis=0)

In [None]:
model7_level_work_rmse

In [None]:
model7_level_holdout_predictions = pd.concat(
    [data_holdout["price"], model7_level.predict(data_holdout)],
    axis=1,
    keys=["price", "predicted"],
)
model7_level_holdout_predictions = model7_level_holdout_predictions[~model7_level_holdout_predictions.isna().any(axis=1)]
model7_level_holdout_rmse = rmse(model7_level_holdout_predictions["price"], model7_level_holdout_predictions["predicted"], axis=0)
print(
    f"RMSE work:{round(model7_level_work_rmse,2)}",
    "\t",
    f"RMSE holdout:{model7_level_holdout_rmse:.2f}",
)

#### Charting fitted vs actual

In [None]:
# Summary statistics of the holdout prices
Ylev = data_holdout['price']
meanY, sdY = Ylev.mean(), Ylev.std()
# mean -/+ 1.96 standard deviations (the names say SE, the spread used is the SD)
meanY_m2SE = meanY - 1.96 * sdY
meanY_p2SE = meanY + 1.96 * sdY
# 5th and 95th percentile
Y5p, Y95p = Ylev.quantile([.05, .95])

Note: the `statsmodels` API documentation is rather terse; this is all it says about the [summary_frame()](https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.PredictionResults.summary_frame.html#statsmodels.regression.linear_model.PredictionResults.summary_frame) method.

In [None]:
model7_level.get_prediction(data_holdout).summary_frame(alpha=0.05)

In [None]:
# Create data frame with the real and predicted values
data_diagnostic = model7_level.get_prediction(data_holdout).summary_frame(alpha=0.05)

df_ = pd.concat(
    [data_holdout["price"], model7_level.predict(data_holdout)],
    axis=1,
    keys=["price", "predicted"],
)

In [None]:
df_.isna().sum()

In [None]:
df_ = df_[~df_.isna().any(axis=1)].reset_index(drop=True)
data_diagnostic["Ylev"] = df_["price"]

`pd.DataFrame.any()`: Return whether any element is True, potentially over an axis. 
Returns False unless there is at least one element within a series or along a Dataframe axis that is True or equivalent (e.g. non-zero or non-empty).

In [None]:
# Actual price against predicted price on the holdout set
(
    ggplot(data=data_diagnostic)
    + geom_point(
        aes(y="Ylev", x="mean"),
        color=color[1],
        size=1,
        alpha=0.7,
        show_legend=False,
        na_rm=True,
    )
    # 45-degree line from (0, 0) to (350, 350): points on it are predicted exactly
    + geom_segment(aes(x=0, y=0, xend=350, yend=350), size=0.5, color=color[2])
    + coord_cartesian(xlim=(0, 350), ylim=(0, 350))
    + scale_x_continuous(
        expand=(0.01, 0.01), limits=(0, 350), breaks=seq(0, 350, by=50)
    )
    + scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 350), breaks=seq(0, 350, by=50)
    )
    + labs(y="Price (US dollars)", x="Predicted price  (US dollars)")
    + theme_bw()
)

In [None]:
# Points of the 45-degree reference line
x = np.linspace(0, 300, data_diagnostic.shape[0])

plt.figure(figsize = (6,6))
ax = plt.gca()
sns.scatterplot(
    x="mean", y="Ylev", data=data_diagnostic,
    s = 25, color = 'steelblue', ax = ax,
)
ax.plot(x, x, color = 'k')
ax.set(
    xlim=(0, 300),
    ylim=(0, 300),
    ylabel='actual',
    xlabel='predicted',
    title='Actual vs predicted in USD',
)
plt.show()

#### Redo predicted values at 80% PI

In [None]:
dt = model7_level.get_prediction(data_holdout).summary_frame(alpha=0.2)
data_extra = pd.concat(
    [
        data_holdout["price"],
        data_holdout["n_accommodates"],
        model7_level.predict(data_holdout),
    ],
    axis=1,
    keys=["price", "n_accommodates", "predicted"],
)
data_extra = data_extra[~data_extra.isna().any(axis=1)].reset_index(drop=True)
dt["n_accommodates"] = data_extra["n_accommodates"]
dt["Ylev"] = data_extra["price"]
dt["elev"] = dt["Ylev"] - dt["mean"]

In [None]:
data_diagnostic = model7_level.get_prediction(data_holdout).summary_frame(alpha=0.2)
df_ = pd.concat(
    [
        data_holdout["price"],
        data_holdout["n_accommodates"],
        model7_level.predict(data_holdout),
    ],
    axis=1,
    keys=["price", "n_accommodates", "predicted"],
)
df_ = df_[~df_.isna().any(axis=1)].reset_index(drop=True)
data_diagnostic["n_accommodates"] = df_["n_accommodates"]
data_diagnostic["Ylev"] = df_["price"]
data_diagnostic["elev"] = data_diagnostic["Ylev"] - data_diagnostic["mean"]

In [None]:
predictionlev_holdout_summary = data_diagnostic.groupby(by=["n_accommodates"]).mean().reset_index()

In [None]:
predictionlev_holdout_summary

In [None]:
# Average predicted price by number of guests, with the averaged interval bounds
# (obs_ci_* and mean_ci_* columns of the summary frame) drawn as error bars
(
    ggplot(predictionlev_holdout_summary, aes(x="n_accommodates"))
    + geom_bar(aes(y="mean"), stat="identity", fill=color[0], alpha=0.7)
    + geom_errorbar(aes(ymin="obs_ci_lower", ymax="obs_ci_upper"), width=0.2)
    + geom_errorbar(aes(ymin="mean_ci_lower", ymax="mean_ci_upper"), width=0.5)
    + scale_y_continuous(name="Predicted price (US dollars)")
    # axis label and title spelling corrected
    + scale_x_continuous(name="Accommodates (Persons)")
    + scale_color_manual(values=(color[1], color[1]))
    + labs(title = "Mean predictions and confidence intervals")
    + theme_bw()
    + theme(legend_title=element_blank(), legend_position="none")
)

In [None]:
# The same summary with points instead of bars
(
    ggplot(predictionlev_holdout_summary, aes(x="n_accommodates"))
    + geom_point(aes(y="mean"), fill='k',  size = 5)
    + geom_errorbar(aes(ymin="obs_ci_lower", ymax="obs_ci_upper"), width=0.2)
    + geom_errorbar(aes(ymin="mean_ci_lower", ymax="mean_ci_upper"), width=0.5)
    + scale_y_continuous(name="Predicted price (US dollars)")
    # axis label spelling corrected
    + scale_x_continuous(name="Accommodates (Persons)", breaks = [1,2,3,4,5,6, 7])
    + scale_color_manual(values=(color[1], color[1]))
    + labs(title = "Mean predictions and confidence intervals")
    + theme_bw()
    + theme(legend_title=element_blank(), legend_position="none")
)