In [1]:
import evalml
from evalml import AutoMLSearch

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder

  def _pt_shuffle_rec(i, indexes, index_mask, partition_tree, M, pos):
  def delta_minimization_order(all_masks, max_swap_size=100, num_passes=2):
  def _reverse_window(order, start, length):
  def _reverse_window_score_gain(masks, order, start, length):
  def _mask_delta_score(m1, m2):
  def identity(x):
  def _identity_inverse(x):
  def logit(x):
  def _logit_inverse(x):
  def _build_fixed_single_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _build_fixed_multi_output(averaged_outs, last_outs, outputs, batch_positions, varying_rows, num_varying_rows, link, linearizing_weights):
  def _init_masks(cluster_matrix, M, indices_row_pos, indptr):
  def _rec_fill_masks(cluster_matrix, indices_row_pos, indptr, indices, M, ind):
  def _single_delta_mask(dind, masked_inputs, last_mask, data, x, noop_code):
  def _delta_masking(masks, x, curr_delta_inds, varying_rows_out,
  def _jit_build_partition_tree(xmin, xmax, ymi

### Dataset

In [2]:
# Load the raw pizza dataset; the path is relative to the notebook's working directory.
raw_data = pd.read_csv('../../data/pizza_v2.csv')
raw_data.head()

Unnamed: 0,company,price_rupiah,diameter,topping,variant,size,extra_sauce,extra_cheese,extra_mushrooms
0,A,"Rp235,000",22 inch,chicken,double_signature,jumbo,yes,yes,no
1,A,"Rp198,000",20 inch,papperoni,double_signature,jumbo,yes,yes,no
2,A,"Rp120,000",16 inch,mushrooms,double_signature,reguler,yes,yes,yes
3,A,"Rp155,000",14 inch,smoked_beef,double_signature,reguler,yes,no,yes
4,A,"Rp248,000",18 inch,mozzarella,double_signature,jumbo,yes,no,yes


In [3]:
raw_data.info() # every column reports 129 non-null entries, i.e. there are no NA/null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company          129 non-null    object
 1   price_rupiah     129 non-null    object
 2   diameter         129 non-null    object
 3   topping          129 non-null    object
 4   variant          129 non-null    object
 5   size             129 non-null    object
 6   extra_sauce      129 non-null    object
 7   extra_cheese     129 non-null    object
 8   extra_mushrooms  129 non-null    object
dtypes: object(9)
memory usage: 9.2+ KB


### Feature Engineering

In [4]:
# Name of the dependent (target) column; later used to split `data` into y and X.
dep_var_col = 'price_rupiah'

In [5]:
# Strip the unit and currency decorations so both measures can be parsed as floats.
diameter_inches = raw_data['diameter'].str.replace(' inch', '').astype(float)
price_digits = raw_data['price_rupiah'].str.replace('Rp', '').str.replace(',', '')

# Work on a copy so the raw frame stays available for reference.
data = raw_data.copy()
data['diameter'] = diameter_inches
data['price_rupiah'] = price_digits.astype(float)

data.head()

Unnamed: 0,company,price_rupiah,diameter,topping,variant,size,extra_sauce,extra_cheese,extra_mushrooms
0,A,235000.0,22.0,chicken,double_signature,jumbo,yes,yes,no
1,A,198000.0,20.0,papperoni,double_signature,jumbo,yes,yes,no
2,A,120000.0,16.0,mushrooms,double_signature,reguler,yes,yes,yes
3,A,155000.0,14.0,smoked_beef,double_signature,reguler,yes,no,yes
4,A,248000.0,18.0,mozzarella,double_signature,jumbo,yes,no,yes


### Data analysis

In [6]:
 = data['topping'].value_counts()
toppings_value_count

SyntaxError: invalid syntax (1571783160.py, line 1)

In [None]:
fig, axs = plt.subplots(ncols=2, nrows=3, figsize=(20, 14))

# Top two rows: price per company, split by one categorical feature per panel.
hue_columns = ['size', 'extra_mushrooms', 'extra_cheese', 'extra_sauce']
for panel, hue_column in zip(axs[:2].ravel(), hue_columns):
    sns.boxplot(data=data, x='company', y='price_rupiah', hue=hue_column, ax=panel)

# Bottom-left: share of each topping in the dataset.
toppings_value_count = data['topping'].value_counts()
axs[2][0].pie(toppings_value_count, labels=toppings_value_count.index, autopct='%1.1f%%')

# Bottom-right: price against diameter, coloured by company.
sns.scatterplot(data=data, x='diameter', y='price_rupiah', hue='company', ax=axs[2][1]);
plt.show()

In [None]:
encoder = OneHotEncoder()

def onehotEncode(data, column):
    """Return a new frame in which ``column`` is replaced by one-hot columns.

    Each distinct value of ``column`` becomes an indicator column named
    ``<column>_<value>`` holding float 0.0/1.0. The indicator columns are
    appended after the remaining columns, in sorted order of the values.
    The frame passed in is not modified, so the caller's input stays intact
    and the function can be re-applied to it safely.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with one indicator column per
        distinct value.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``data``.

    Notes
    -----
    Missing values do not get an indicator column of their own (pandas'
    default ``dummy_na=False``).
    """
    # get_dummies returns a fresh frame: it removes `column` and appends the
    # indicators. dtype=float keeps the values as 0.0/1.0 rather than booleans.
    return pd.get_dummies(data, columns=[column], dtype=float)
    
# One-hot encode every categorical feature, one column at a time.
categorical_columns = [
    'company',
    'topping',
    'variant',
    'size',
    'extra_sauce',
    'extra_cheese',
    'extra_mushrooms',
]
for categorical_column in categorical_columns:
    data = onehotEncode(data, categorical_column)

data.head()

In [None]:
# Separate the feature matrix from the target series.
X = data.drop(columns=dep_var_col)
y = data[dep_var_col]

In [None]:
print(y)
X.head()

In [None]:
# Hold out 15% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_evaluate, y_train, y_evaluate = evalml.preprocessing.split_data(
    X,
    y,
    problem_type='regression',
    test_size=0.15,
    random_seed=22,
)

# Report the shape of each partition.
for label, partition in (
    ('X_train', X_train),
    ('X_evaluate', X_evaluate),
    ('y_train', y_train),
    ('y_evaluate', y_evaluate),
):
    print(f'{label}: ', partition.shape)

In [None]:
y_train

### AutoML training

In [None]:
# Search configuration: regression on the training split only.
search_settings = dict(
    X_train=X_train,
    y_train=y_train,
    problem_type='regression',
    max_batches=3,  # default is 3
    verbose=True,
)
automl = AutoMLSearch(**search_settings)

# Run the search with the interactive progress plot turned off.
automl.search(interactive_plot=False)

In [None]:
automl.rankings

In [None]:
best_pipeline = automl.best_pipeline

In [None]:
best_pipeline.describe()

In [None]:
# Objectives reported whenever a pipeline is scored in this notebook.
model_objectives = [
    'medianae',
    'mse',
    'mae',
    'r2',
    'mean squared log error',
    'root mean squared log error',
    'root mean squared error',
]

# Scores of the best pipeline on the data it was trained on.
best_pipeline_score = best_pipeline.score(
    X_train,
    y_train,
    objectives=model_objectives,
)
best_pipeline_score

### Evaluate model

In [None]:
# Predict on the held-out evaluation split and keep the residuals (actual minus predicted).
y_pred = best_pipeline.predict(X_evaluate)
resid = y_evaluate - y_pred

In [None]:
best_pipeline.score(X_evaluate, y_evaluate, objectives=model_objectives)

Fared badly on the test set

In [None]:
sns.displot(resid)

### Try another pipeline

In [None]:
# NOTE(review): pipeline id 5 is presumably the XGBoost entry of this particular search;
# ids come from the search results, so confirm against `automl.rankings` after a re-run.
xgboost_pipeline = automl.get_pipeline(5)
# Fit the retrieved pipeline on the training split, then score it on that same split.
xgboost_pipeline.fit(X_train, y_train)
xgboost_pipeline.score(X_train, y_train, objectives=model_objectives)

In [None]:
# Predict with the alternative pipeline on the evaluation split and score it there.
y_pred_2 = xgboost_pipeline.predict(X_evaluate)
xgboost_pipeline.score(X_evaluate, y_evaluate, objectives=model_objectives)

In [None]:
# Evaluation-split residuals (actual minus predicted) of the alternative pipeline.
resid_2 = y_evaluate - y_pred_2
sns.displot(resid_2)

In [None]:
# Training-split residuals of the best pipeline, for comparison with the evaluation residuals.
resid_train = y_train - best_pipeline.predict(X_train)
sns.displot(resid_train)

### Analyzing the residual

In [None]:
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.graphics.gofplots import qqplot

In [None]:
# Ljung-Box test on the best pipeline's evaluation residuals for lags 1 through 10.
acorr_ljungbox(resid, lags=np.arange(1, 11))

In [None]:
# Ljung-Box test on the alternative pipeline's evaluation residuals for lags 1 through 10.
acorr_ljungbox(resid_2, lags=np.arange(1, 11))

In [None]:
fig, axs = plt.subplots(ncols=2, figsize=(20, 6))

# A 45-degree reference line is only meaningful when sample and theoretical
# quantiles share a scale. The residuals are in rupiah, so `fit=True` is passed
# to have qqplot fit the normal's loc/scale and plot standardised quantiles;
# without it the points cannot follow the line even for normal residuals.
qqplot(resid, fit=True, line='45', ax=axs[0]);
qqplot(resid_2, fit=True, line='45', ax=axs[1]);

# Label each panel with the summary of the pipeline whose residuals it shows.
axs[0].set_title(f'{best_pipeline.summary}')
axs[1].set_title(f'{xgboost_pipeline.summary}')

plt.show()

### Conclusion
While the metrics and results on the train and test sets look impressive, the residuals show non-normality in their distribution.

More data points and more features are therefore needed to obtain a model that generalizes better than these.