# AutoML approach
It's interesting to cross-check some of our findings from simple linear regression against more complex models. Iterating through the many possible models can be challenging, but the advent of low-code AutoML libraries like PyCaret and H2O makes it easier.

In [15]:
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Step 2: Initialize H2O

In [16]:
# Connect to a running local H2O instance, or start one if none is found.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_422"; OpenJDK Runtime Environment (build 1.8.0_422-8u422-b05-1~22.04-b05); OpenJDK 64-Bit Server VM (build 25.422-b05, mixed mode)
  Starting server from /home/noone/.cache/pypoetry/virtualenvs/dominos-senior-data-scientist-mPMyyBsO-py3.11/lib/python3.11/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpojbml9rb
  JVM stdout: /tmp/tmpojbml9rb/h2o_noone_started_from_python.out
  JVM stderr: /tmp/tmpojbml9rb/h2o_noone_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Australia/Brisbane
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.4
H2O_cluster_version_age:,27 days
H2O_cluster_name:,H2O_from_python_noone_fgjamv
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,26.64 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,32


# Step 3: Load Data

In [17]:
# Location of the raw price-elasticity data, relative to this notebook.
data_path = './../data/price_elasticity_data.csv'

# Read the CSV into a pandas DataFrame for preprocessing.
df = pd.read_csv(filepath_or_buffer=data_path)

# Step 4: Preprocess Data

In [18]:
# Rename columns for convenience. `rename` returns a new frame, so re-running
# this cell is harmless (already-renamed columns are simply not matched).
df = df.rename(columns={
    'Menu_Price': 'unit_price',
    'Pizza_Count': 'quantity',
    'Profit_Percentage': 'profit_percentage',
})

# Gross margin as a fraction of the selling price.
# NOTE(review): assumes Profit_Percentage is profit as a percentage of the
# menu price (a margin, not a markup on cost) -- confirm against the data source.
df['gross_margin'] = df['profit_percentage'] / 100

# Per-unit gross profit is the margin share of the price; cost is the remainder.
# (Previously these two were swapped: price * (1 - margin) is the unit cost,
# not the unit gross profit.)
df['gross_profit'] = df['unit_price'] * df['gross_margin']
df['cost'] = df['unit_price'] - df['gross_profit']

# Subset the dataframe for the 'Communita' product and take a copy so later
# column assignments do not touch `df`.
communita_df = df[df['Product'] == 'Communita'].copy()

# Remove outliers

In [19]:
# TODO: outlier removal is not implemented yet; this cell drops no rows.

# Feature Engineering

### create interaction features

In [20]:
# Non-linear and interaction terms derived from price and cost.
price_col = communita_df['unit_price']
cost_col = communita_df['cost']

communita_df['unit_price_squared'] = price_col ** 2
communita_df['unit_price_cost_interaction'] = price_col * cost_col

### normalize features

In [21]:
# Standardise price and cost to zero mean / unit (sample) standard deviation.
for raw_name in ('unit_price', 'cost'):
    raw_values = communita_df[raw_name]
    centred = raw_values - raw_values.mean()
    communita_df[f'{raw_name}_scaled'] = centred / raw_values.std()

In [22]:
# Upload the prepared pandas frame to the H2O cluster as an H2OFrame.
h2o_df = h2o.H2OFrame(python_obj=communita_df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [23]:
# The scaled features were created in pandas before the conversion, so they are
# already columns of the H2O frame. Assigning each column to itself was a no-op;
# verify their presence instead so a missing feature fails loudly here rather
# than inside AutoML.
missing_scaled = {'unit_price_scaled', 'cost_scaled'} - set(h2o_df.columns)
assert not missing_scaled, f'missing scaled features in H2O frame: {sorted(missing_scaled)}'

# Step 5: Run H2O AutoML

In [None]:
%%time
x = ['unit_price', 'cost', 'gross_margin', 'unit_price_squared', 'unit_price_cost_interaction', 'unit_price_scaled', 'cost_scaled']
y = 'quantity'

aml = H2OAutoML(
    max_runtime_secs=1200, 
    seed=42, 
    project_name="price_elasticity", 
    nfolds=0  # our dataset is too small for cross validation
)

aml.train(
    x=x, 
    y=y, 
    training_frame=h2o_df
)

# view the leaderboard
lb = aml.leaderboard
print(lb)

AutoML progress: |
21:08:02.19: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 60.0.

██████████████████████████████████████████████████████████

# Step 6: Calculate Predictions

In [None]:
# Score the training frame with the top model from the leaderboard.
best_model = aml.leader
preds = best_model.predict(h2o_df)

# `h2o.as_list` returns a frame with a fresh 0..n-1 index, whereas communita_df
# keeps the row labels it had in `df`. Assigning the Series directly would align
# on index and produce NaN / misplaced predictions, so assign positionally.
# A length mismatch raises ValueError instead of being silently absorbed.
communita_df['predicted_quantity'] = h2o.as_list(preds)['predict'].to_numpy()

# Step 7: Find Optimal Price to Maximize Gross Profit

In [None]:
# Predicted gross profit per row: unit margin (price minus cost) times predicted demand.
unit_margin = communita_df['unit_price'] - communita_df['cost']
communita_df['gross_profit_predictions'] = unit_margin * communita_df['predicted_quantity']

# find price that maximises GP
best_row = communita_df['gross_profit_predictions'].idxmax()
optimal_price = communita_df.loc[best_row, 'unit_price']
optimal_quantity = communita_df.loc[best_row, 'predicted_quantity']
max_gross_profit = communita_df['gross_profit_predictions'].max()

In [None]:
# Report the in-sample optimum found above.
summary_lines = [
    f'Optimal Price: {optimal_price}',
    f'Maximum Gross Profit: {max_gross_profit}',
    f'Optimal Quantity: {optimal_quantity}',
]
print('\n'.join(summary_lines))

# Plot the relationship between unit price and quantity

In [None]:
# Actual vs. predicted demand at each observed price.
fig, ax = plt.subplots(figsize=(10, 6))
prices = communita_df['unit_price']

ax.scatter(prices, communita_df['quantity'], alpha=0.5, label='Actual Quantity')
ax.scatter(prices, communita_df['predicted_quantity'], color='red', alpha=0.5, label='Predicted Quantity')

ax.set_title('Unit Price vs Quantity')
ax.set_xlabel('Unit Price')
ax.set_ylabel('Quantity')
ax.legend()
ax.grid(True)
plt.show()

# Plot the gross profit curve

In [None]:
# A line plot connects points in row order, so sort by price first; otherwise
# the "curve" zig-zags between unordered observations.
profit_curve = communita_df.sort_values('unit_price')

plt.figure(figsize=(10, 6))
plt.plot(
    profit_curve['unit_price'],
    profit_curve['gross_profit_predictions'],
    label='Gross Profit Curve'
)
# Mark the observed price with the highest predicted gross profit.
plt.scatter(
    optimal_price,
    max_gross_profit,
    color='red',
    label=f'Optimal Price: {optimal_price:.2f}'
)
plt.title('Gross Profit Curve')
plt.xlabel('Unit Price')
plt.ylabel('Gross Profit')
plt.legend()
plt.grid(True)
plt.show()

# Step 8: Extend Sample Space

In [None]:
# extend the range of unit prices beyond the original dataset
new_unit_prices = np.linspace(
    communita_df['unit_price'].min(),
	2 * communita_df['unit_price'].max(), 
    100
)

# create new H2O frame with extended unit prices
new_h2o_df = h2o_df.cbind(
	h2o.H2OFrame(pd.DataFrame(new_unit_prices, columns=['unit_price']))
)

# add interaction features
new_h2o_df['unit_price_squared'] = new_h2o_df['unit_price'] ** 2
new_h2o_df['unit_price_cost_interaction'] = new_h2o_df['unit_price'] * new_h2o_df['cost']

# add scaled features
new_h2o_df['unit_price_scaled'] = (new_h2o_df['unit_price'] - new_h2o_df[
	'unit_price'].mean()) / new_h2o_df['unit_price'].std()
new_h2o_df['cost_scaled'] = (new_h2o_df['cost'] - new_h2o_df['cost'].mean()) / new_h2o_df['cost'].std()

# predict with the best model
new_preds = best_model.predict(new_h2o_df)

# convert predictions to pandas dataframe
new_predicted_quantities = h2o.as_list(new_preds)['predict']

# calculate new gross profit predictions
new_gross_profits = (new_unit_prices - communita_df['cost'].mean()) * new_predicted_quantities

# find the optimal price in the extended range
optimal_price_extended = new_unit_prices[np.argmax(new_gross_profits)]
max_gross_profit_extended = max(new_gross_profits)
optimal_quantity_extended = new_predicted_quantities[np.argmax(new_gross_profits)]

print(f'Extended Optimal Price: {optimal_price_extended}')
print(f'Extended Maximum Gross Profit: {max_gross_profit_extended}')
print(f'Extended Optimal Quantity: {optimal_quantity_extended}')

# Plot the relationship between unit price and quantity with regression line

In [None]:
# Observed and predicted demand, overlaid with the model's predictions on the
# extended price grid.
fig, ax = plt.subplots(figsize=(10, 6))
observed_prices = communita_df['unit_price']

ax.scatter(observed_prices, communita_df['quantity'], alpha=0.5, label='Actual Quantity')
ax.scatter(
    observed_prices,
    communita_df['predicted_quantity'],
    color='red',
    alpha=0.5,
    label='Predicted Quantity',
)
ax.plot(new_unit_prices, new_predicted_quantities, color='blue', linewidth=2, label='Regression Line')

ax.set_title('Unit Price vs Quantity with Regression Line')
ax.set_xlabel('Unit Price')
ax.set_ylabel('Quantity')
ax.legend()
ax.grid(True)
plt.show()

# Plot the gross profit curve

In [None]:
# Sort the in-sample rows by price so the line plot traces a curve rather than
# zig-zagging through observations in file order.
in_sample_curve = communita_df.sort_values('unit_price')

plt.figure(figsize=(10, 6))
plt.plot(
    in_sample_curve['unit_price'],
    in_sample_curve['gross_profit_predictions'],
    label='Gross Profit Curve'
)
# The extended grid comes from np.linspace and is already in ascending order.
plt.plot(
    new_unit_prices,
    new_gross_profits,
    label='Extended Gross Profit Curve'
)
# In-sample optimum.
plt.scatter(
    optimal_price,
    max_gross_profit,
    color='red',
    label=f'Optimal Price: {optimal_price:.2f}'
)
# Optimum over the extended price grid.
plt.scatter(
    optimal_price_extended,
    max_gross_profit_extended,
    color='green',
    label=f'Extended Optimal Price: {optimal_price_extended:.2f}'
)
# Horizontal reference line at the extended maximum gross profit.
plt.axhline(
    y=max_gross_profit_extended,
    color='red', linestyle='--',
    label=f'Max Gross Profit: {max_gross_profit_extended:.2f}'
)
plt.title('Gross Profit Curve')
plt.xlabel('Unit Price')
plt.ylabel('Gross Profit')
plt.legend()
plt.grid(True)
plt.show()

# Shutdown H2O

In [None]:
# Stop the H2O cluster now that the analysis is finished.
cluster = h2o.cluster()
cluster.shutdown()