In [None]:
%load_ext lab_black

In [None]:
import time
import copy

import pandas as pd
import numpy as np
import altair as alt

import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

import xgboost

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

from wordcloud import WordCloud
import matplotlib.pyplot as plt

alt.data_transformers.disable_max_rows()
alt.data_transformers.enable("json")

In [None]:
# Number of feature columns selected from the data frame when building X; the
# columns are addressed by name as f_0 ... f_{NUM_FEATURES - 1}.
NUM_FEATURES = 1280


def get_metrics(name, y_test, y_pred):
    """Score one set of predictions and return the results as a one-row frame.

    Parameters
    ----------
    name
        Label stored in the ``experiment_name`` column.
    y_test
        True target values.
    y_pred
        Predicted target values, scored against ``y_test``.

    Returns
    -------
    pandas.DataFrame
        A single row: ``experiment_name`` followed by one column per metric.
    """
    scorers = (
        ("r2_score", r2_score),
        ("explained_variance_score", explained_variance_score),
        ("max_error", max_error),
        ("mean_absolute_error", mean_absolute_error),
        ("mean_squared_error", mean_squared_error),
        ("median_absolute_error", median_absolute_error),
    )

    row = {"experiment_name": name}
    for column, scorer in scorers:
        # Each score is wrapped in a list so the resulting frame has one row.
        row[column] = [scorer(y_test, y_pred)]

    return pd.DataFrame(row)

## Load data

In [None]:
df_recipes = pd.read_csv("../../data/sparkrecipes_filtered.csv")
df_recipes

In [None]:
# NOTE(review): `df_embeddings` is not defined in any cell visible before this
# one, so this fails on a fresh kernel unless a loading cell exists elsewhere
# — confirm and add the missing load step.
# Shows the 20 recipe_id values that occur most often in the embeddings frame.
df_embeddings.recipe_id.value_counts().nlargest(20)

In [None]:
# Keep only the embedding rows whose recipe_id also appears in df_recipes.
# Boolean-mask selection keeps the original row labels, so df.index is not
# guaranteed to be a contiguous 0..n-1 range after this step.
df = df_embeddings[df_embeddings.recipe_id.isin(df_recipes.recipe_id)]

## Prepare data

In [None]:
X = df[[f"f_{i}" for i in range(NUM_FEATURES)]]
X

In [None]:
# The regression target is the natural log of total_calories, so model outputs
# have to be passed through np.exp to get back to calories.
# NOTE(review): np.log gives -inf for 0 and NaN for negative values — confirm
# the filtered data contains only positive total_calories.
y = np.log(df.total_calories)
y

In [None]:
y.describe()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

## Use XGBoost Regressor to predict total calories

In [None]:
# Gradient-boosted regressor that is fitted on the log-calorie target.
# `random_state` and `verbosity` replace the deprecated `seed` and `silent`
# arguments of the scikit-learn wrapper; verbosity=0 keeps training quiet, as
# silent=True did.
model = xgboost.XGBRegressor(
    colsample_bytree=0.4,
    gamma=1,
    learning_rate=0.01,
    max_depth=3,
    n_estimators=1000,
    objective="reg:squarederror",
    reg_alpha=0.3,
    subsample=0.8,
    n_jobs=-1,
    random_state=42,
    verbosity=0,
)

In [None]:
eval_set = [(X_train, y_train), (X_test, y_test)]

In [None]:
# Fit the model while recording RMSE and MAE for every entry of eval_set.
# NOTE(review): recent XGBoost releases expect eval_metric on the estimator
# (constructor / set_params) instead of fit() — confirm against the installed
# version before upgrading.
%time model.fit(X_train, y_train, eval_metric=["rmse", "mae"], eval_set=eval_set, verbose=False)

In [None]:
# Turn the recorded evaluation history into a frame and relabel its columns.
# The rename assumes the columns come back in eval_set order, i.e.
# (train, test) — TODO confirm against evals_result() for the installed version.
eval_results = pd.DataFrame(model.evals_result())
eval_results.columns = ["train", "test"]

In [None]:
def expand_results(eval_results):
    """Flatten a frame of per-cell sequences into one column per cell.

    ``eval_results`` holds a sequence of values in every cell. Each
    (column, index-label) cell becomes one output column named
    ``"<column>_<index label>"`` whose rows are the values of that sequence.
    Cells are visited column by column, and within a column in index order.

    Returns
    -------
    pandas.DataFrame
        The expanded sequences side by side.
    """
    cells = [
        (split, metric) for split in eval_results for metric in eval_results.index
    ]

    # One single-column frame per cell, concatenated horizontally.
    frames = [pd.DataFrame(eval_results[split][metric]) for split, metric in cells]
    expanded = pd.concat(frames, axis=1)

    expanded.columns = [f"{split}_{metric}" for split, metric in cells]
    return expanded

In [None]:
df_eval_results = expand_results(eval_results)

In [None]:
df_eval_results.shape

In [None]:
df_eval_results.plot.line(figsize=(15, 11), loglog=True)

In [None]:
y_pred = model.predict(X_test)

In [None]:
df_xgboost_results = get_metrics("xgboost regressor", y_test, y_pred)
df_xgboost_results

## Use Linear Regression

In [None]:
lin_model = LinearRegression()

In [None]:
lin_model.fit(X_train, y_train)

In [None]:
y_pred_lin = lin_model.predict(X_test)

In [None]:
pd.Series(y_pred_lin).hist(bins=150)

In [None]:
df_lin_results = get_metrics("linear regression", y_test, y_pred_lin)
df_lin_results

## Baseline

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin
from sklearn.utils.validation import check_is_fitted


class BaselineModel(BaseEstimator, RegressorMixin):
    """Trivial regressor that always predicts the mean of the training target.

    It ignores the features entirely and serves as the reference point the
    other models are compared against.
    """

    def __init__(self):
        pass

    def fit(self, _, y):
        """Store the mean of ``y``; the feature argument is ignored.

        Returns ``self`` so that calls can be chained.
        """
        self.mean_ = y.mean()
        return self

    def predict(self, X):
        """Return the stored training mean once for every row of ``X``.

        Raises ``sklearn.exceptions.NotFittedError`` when called before ``fit``.
        """
        # An empty attribute list makes check_is_fitted pass unconditionally
        # (all([]) is True), so name the attribute that fit() actually sets.
        check_is_fitted(self, "mean_")

        return np.full(X.shape[0], self.mean_)

In [None]:
baseline_model = BaselineModel()

In [None]:
baseline_model.fit(X_train, y_train)

In [None]:
y_pred_baseline = baseline_model.predict(X_test)

In [None]:
y_pred_baseline

In [None]:
df_baseline_results = get_metrics("baseline", y_test, y_pred_baseline)
df_baseline_results

## Compare to Baseline

In [None]:
# One column per experiment, one row per metric.
# Indexing by experiment_name before transposing keeps the metric values
# numeric; transposing with the string column still present turns every
# column into object dtype and needs a manual header/row fix-up afterwards.
df_results = (
    pd.concat([df_baseline_results, df_xgboost_results, df_lin_results])
    .set_index("experiment_name")
    .rename_axis(None)
    .T
)

In [None]:
df_results

In [None]:
df_results.plot.bar(log=True, figsize=(12, 7))

## Inspect predictions

In [None]:
ylim = (0, 1200)

In [None]:
# X_train.index holds row labels of df, not positions, and df is a filtered
# frame whose labels need not be 0..n-1 — so select by label with .loc.
df.loc[X_train.index].total_calories.plot.hist(bins=300, figsize=(16, 11), ylim=ylim)

In [None]:
# Select the test rows by label (X_test.index holds df's row labels, not
# positions). The rows come back in X_test order, so they line up with y_pred.
# Predictions are on the log scale, hence np.exp.
df_predictions = df.loc[X_test.index].assign(predicted_calories=np.exp(y_pred))
# Keep a fixed-seed sample of at most 5000 rows so re-runs inspect the same
# rows and a small test set does not make .sample() raise.
df_predictions = df_predictions.sample(
    n=min(5000, len(df_predictions)), random_state=42
)

In [None]:
df_predictions[["total_calories", "predicted_calories"]].plot.hist(
    bins=300, figsize=(16, 11), alpha=0.8, ylim=ylim
)

In [None]:
# Reference diagonal: total_calories plotted against itself marks where a
# perfect prediction would fall.
line = (
    alt.Chart(df_predictions).mark_line().encode(x="total_calories", y="total_calories")
)

# One red point per row of df_predictions: actual calories on x, predicted on y.
scatter = (
    alt.Chart(df_predictions)
    .mark_circle(color="red")
    .encode(x="total_calories", y="predicted_calories")
)

# Overlay both layers in a single square chart.
(line + scatter).properties(width=800, height=800)

In [None]:
NON_FEATURE_COLS = ["title", "total_calories", "servings", "predicted_calories"]

In [None]:
# Split the sampled predictions by predicted calories: above 250 goes to the
# high group, below 150 to the low group. Rows predicted between 150 and 250
# (inclusive) land in neither frame.
df_high_cal = df_predictions[df_predictions.predicted_calories > 250]
df_low_cal = df_predictions[df_predictions.predicted_calories < 150]

In [None]:
df_high_cal[NON_FEATURE_COLS].sample(10)

In [None]:
df_low_cal[NON_FEATURE_COLS].sample(10)

In [None]:
df_high_cal[NON_FEATURE_COLS].describe()

In [None]:
df_low_cal[NON_FEATURE_COLS].describe()

In [None]:
# Build one word cloud per group from the lower-cased titles.
# dropna() guards against missing titles, which would make str.join raise a
# TypeError on the NaN entry.
high_cal_wc = WordCloud().generate(" ".join(df_high_cal.title.dropna().str.lower()))
low_cal_wc = WordCloud().generate(" ".join(df_low_cal.title.dropna().str.lower()))

In [None]:
plt.figure(figsize=(15, 15))
plt.imshow(high_cal_wc, interpolation="nearest")

In [None]:
plt.figure(figsize=(15, 15))
plt.imshow(low_cal_wc, interpolation="nearest")