## Imports

In [1]:
import itertools
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor
from tensorflow.keras import Model, Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import MeanAbsoluteError

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors

## Read in Data

In [2]:
df1 = pd.read_csv("data/skin_care_cleaned.csv")
df1 = df1[["product_names", "product_category", "brand", "ingredient", "size", "price", "size_num", "size_unit", 
        'active_ingredient', 'inactive_ingredient', 'n_inactive_ingredient', 'n_active_ingredient']]

In [3]:
df1.active_ingredient = df1.active_ingredient.str.replace("(", "")
df1.active_ingredient = df1.active_ingredient.str.replace(")", "")
df1.active_ingredient = df1.active_ingredient.str.replace(",", "")
df1.active_ingredient = df1.active_ingredient.str.replace("%.", "%")
df1.active_ingredient = df1.active_ingredient.str.strip()
df1.active_ingredient = df1.active_ingredient.str.split("%")
df1 = df1.dropna(subset=["ingredient", "size", "size_unit"])

In [4]:
df2 = pd.read_csv("data/ingredients_skincare_dermstore.csv").drop(columns = ["Unnamed: 0", "url"])

In [5]:
# remove dollar signs from dataframe
df2["price"] = df2["price"].str.replace("$", "")

# turn active ingredients into list, input inactive ingredients
df2["active_ingredients"] = df2["active_ingredients"].str.split(",")
df2["inactive_ingredients"] = df2["inactive_ingredients"].fillna(df2["ingredients"])

# create and fill n_active_ingredient column
df2["n_active_ingredient"] = df2["active_ingredients"]
df2["n_active_ingredient"] = df2["n_active_ingredient"].fillna(0)
for index, row in df2.iterrows():
    if type(row["n_active_ingredient"]) == list:
        if len(row["n_active_ingredient"]) > 0:
            row["n_active_ingredient"] = len(row["n_active_ingredient"])
            
# create and fill n_inactive_ingredient column
df2["n_inactive_ingredient"] = df2["inactive_ingredients"].str.split(",")
for index, row in df2.iterrows():
    if type(row["n_inactive_ingredient"]) == list:
        if len(row["n_inactive_ingredient"]) > 0:
            row["n_inactive_ingredient"] = len(row["n_inactive_ingredient"])

In [6]:
# create product categories for items
df2["product_category"] = df2["product_name"]
for index, row in df2.iterrows():
    if "Serum" in row["product_category"]:
        row["product_category"] = "Serum"
    elif "Hydr" in row["product_category"]:
        row["product_category"] = "Serum"
    elif "Refi" in row["product_category"]:
        row["product_category"] = "Serum"
    elif "Bar" in row["product_category"]:
        row["product_category"] = "Serum"
    elif "Stem" in row["product_category"]:
        row["product_category"] = "Serum"
    elif "Neck" in row["product_category"]:
        row["product_category"] = "Serum"
    elif "neck" in row["product_category"]:
        row["product_category"] = "Serum"
    elif "Peptide" in row["product_category"]:
        row["product_category"] = "Serum"
    elif "Resurfac" in row["product_category"]:
        row["product_category"] = "Serum"
    elif "Collagen" in row["product_category"]:
        row["product_category"] = "Serum"
    elif "Sun" in row["product_category"]:
        row["product_category"] = "Sunscreen" 
    elif "SPF" in row["product_category"]:
        row["product_category"] = "Sunscreen" 
    elif "Clean" in row["product_category"]:
        row["product_category"] = "Cleansers"
    elif "Wash" in row["product_category"]:
        row["product_category"] = "Cleansers"
    elif "Wipes" in row["product_category"]:
        row["product_category"] = "Cleansers"
    elif "Pads" in row["product_category"]:
        row["product_category"] = "Cleansers" 
    elif "Makeup Remover" in row["product_category"]:
        row["product_category"] = "Cleansers"
    elif "Soap" in row["product_category"]:
        row["product_category"] = "Cleansers" 
    elif "Cream" in row["product_category"]:
        row["product_category"] = "Nighttime Moisturizer"
    elif "Creme" in row["product_category"]:
        row["product_category"] = "Nighttime Moisturizer"
    elif "Night" in row["product_category"]:
        row["product_category"] = "Nighttime Moisturizer"
    elif "night" in row["product_category"]:
        row["product_category"] = "Nighttime Moisturizer"
    elif "Nuit" in row["product_category"]:
        row["product_category"] = "Nighttime Moisturizer"
    elif "Gel" in row["product_category"]:
        row["product_category"] = "Nighttime Moisturizer"
    elif "Sleep" in row["product_category"]:
        row["product_category"] = "Nighttime Moisturizer"
    elif "Lotion" in row["product_category"]:
        row["product_category"] = "Daytime Moisturizer"
    elif "Moisturizer" in row["product_category"]:
        row["product_category"] = "Daytime Moisturizer"
    elif "Butter" in row["product_category"]:
        row["product_category"] = "Daytime Moisturizer"
    elif "Lip" in row["product_category"]:
        row["product_category"] = "Lip Scrub"
    elif "Mask" in row["product_category"]:
        row["product_category"] = "Face Mask"
    elif "Masque" in row["product_category"]:
        row["product_category"] = "Face Mask"
    elif "Acne" in row["product_category"]:
        row["product_category"] = "Acne & Blemish Treatment"
    elif "Blemish" in row["product_category"]:
        row["product_category"] = "Acne & Blemish Treatment"
    elif "Pore" in row["product_category"]:
        row["product_category"] = "Acne & Blemish Treatment"
    elif "Spot" in row["product_category"]:
        row["product_category"] = "Acne & Blemish Treatment"
    elif "Clinical" in row["product_category"]:
        row["product_category"] = "Acne & Blemish Treatment"
    elif "System" in row["product_category"]:
        row["product_category"] = "Acne & Blemish Treatment"
    elif "Treatment" in row["product_category"]:
        row["product_category"] = "Acne & Blemish Treatment"
    elif "Eye" in row["product_category"]:
        row["product_category"] = "Eye Cream & Treatment" 
    elif "Toner" in row["product_category"]:
        row["product_category"] = "Toner & Face Mist"
    elif "Mist" in row["product_category"]:
        row["product_category"] = "Toner & Face Mist"
    elif "mist" in row["product_category"]:
        row["product_category"] = "Toner & Face Mist" 
    elif "Scrub" in row["product_category"]:
        row["product_category"] = "Exfoliants"
    elif "Exfoli" in row["product_category"]:
        row["product_category"] = "Exfoliants" 
    elif "Polish" in row["product_category"]:
        row["product_category"] = "Exfoliants"
    elif "Glow" in row["product_category"]:
        row["product_category"] = "Exfoliants"
    elif "Peel" in row["product_category"]:
        row["product_category"] = "Exfoliants"
    elif "Vitamin C" in row["product_category"]:
        row["product_category"] = "Vitamin C"
    elif "Vita C" in row["product_category"]:
        row["product_category"] = "Vitamin C"
    elif "Refin" in row["product_category"]:
        row["product_category"] = "Vitamin C"
    elif "Bright" in row["product_category"]:
        row["product_category"] = "Skin Lightener"
    elif "-C" in row["product_category"]:
        row["product_category"] = "Vitamin C"
    elif "Vit C" in row["product_category"]:
        row["product_category"] = "Vitamin C"
    elif "- C" in row["product_category"]:
        row["product_category"] = "Vitamin C"
    elif "Oil" in row["product_category"]:
        row["product_category"] = "Facial Oil" 
    elif "Retinol" in row["product_category"]:
        row["product_category"] = "Retinol" 
    elif "Matt" in row["product_category"]:
        row["product_category"] = "Oil Control Products" 
        
product_types = ["Nighttime Moisturizer", "Cleansers", "Serum", "Exfoliants", "Eye Cream & Treatment", "Daytime Moisturizer",
                "Sunscreen", "Toner & Face Mist", "Face Mask", "Acne & Blemish Treatment", "Lip Balm", "Retinol",
                "Skin Lightener", "Oil Control Products", "Face Oil", "Vitamin C", "Lip Scrub"]
df2 = df2.loc[df2["product_category"].isin(product_types)]

In [7]:
# make and fill size_num column
pattern = r'(\d+(\.\d+)?)\s*(fl\.?\s*oz\.?|g\.?|ml\.?)'
extracted_values = df2["product_name"].str.extract(pattern, expand=True).fillna("")
df2["size_num"] = extracted_values.iloc[:, 0]


# make and fill size_unit column
df2["size_unit"] = df2["product_name"]
for index, row in df2.iterrows():
    if "g." in row["size_unit"]:
        row["size_unit"] = "grams"
    elif "fl. oz" in row["size_unit"]:
        row["size_unit"] = "fluid ounces"
    elif "ml" in row["size_unit"]:
        row["size_unit"] = "milliliters"
    else:
        row["size_unit"] = "piece/other"

# make and fill a size column
df2["size"] = df2["size_num"] + " " + df2["size_unit"]
df2["size"] = df2["size"].str.replace("grams", "g.")
df2["size"] = df2["size"].str.replace("milliliters", "ml")
df2["size"] = df2["size"].str.replace("fluid ounces", "fl. oz.")

In [8]:
df2 = df2[["product_name", "product_category", "brand", "ingredients", "size", "price", "size_num", "size_unit", 
          "active_ingredients", "inactive_ingredients", "n_inactive_ingredient", "n_active_ingredient"]]
df2.columns = ['product_names', 'product_category', 'brand', 'ingredient', 'size', 'price', 'size_num', 'size_unit', 
               'active_ingredient', 'inactive_ingredient', 'n_inactive_ingredient', 'n_active_ingredient']

In [9]:
df2["active_ingredient"] = df2["active_ingredient"].astype(str)
df2["active_ingredient"] = df2["active_ingredient"].str.replace("'", "")
df2["active_ingredient"] = df2["active_ingredient"].str.replace("[", "")
df2["active_ingredient"] = df2["active_ingredient"].str.replace("]", "")
df2["active_ingredient"] = df2["active_ingredient"].str.replace("<strong>Active: </strong>", "")
df2["active_ingredient"] = df2["active_ingredient"].str.replace("<strong>Active Ingredients:</strong>", "")
df2["active_ingredient"] = df2["active_ingredient"].str.replace("Active Ingredients: ", "")
df2["active_ingredient"] = df2["active_ingredient"].str.replace("\n", "")
df2["active_ingredient"] = df2["active_ingredient"].str.replace("%", "")
df2["active_ingredient"] = df2["active_ingredient"].str.replace(")", "")
df2["active_ingredient"] = df2["active_ingredient"].str.replace("(", "")

In [10]:
df = pd.concat([df1, df2]).reset_index().drop(columns = "index")
df

Unnamed: 0,product_names,product_category,brand,ingredient,size,price,size_num,size_unit,active_ingredient,inactive_ingredient,n_inactive_ingredient,n_active_ingredient
0,Clear Complexion Spot Treatment,Acne & Blemish Treatment,Merle Norman,"Active: Salicylic Acid (2%), Other: Alcohol D...",0.50 fl. oz.,20.0,15.0,ml,"[Salicylic Acid 2, ]","Alcohol Denat., Hamamelis Virginiana (Witch ...",10,1
1,Acne Solutions Emergency Gel Lotion,Acne & Blemish Treatment,Clinique,"Active: Benzoyl Peroxide (5%), Other: Water P...",0.50 fl. oz.,17.0,15.0,ml,"[Benzoyl Peroxide 5, ]","Water Purified, C12-15 Alkyl Benzoate, Glyc...",23,1
2,RESIST Daily Pore-Refining Solution 2% BHA,Acne & Blemish Treatment,Paula's Choice Skincare,"Water (Aqua), Dipropylene Glycol, Salicylic ...",3.00 fl. oz.,33.0,89.0,ml,,"Water (Aqua), Dipropylene Glycol, Salicylic ...",26,0
3,Naturals Acne Spot Treatment,Acne & Blemish Treatment,Neutrogena,Active Ingredient: Salicylic Acid 1%. Inactive...,0.75 fl. oz.,8.49,22.0,ml,"[Salicylic Acid 1, ]","Water, Hydrated Silica, Glycerin, Dicapryl...",18,1
4,Normaderm Daily Anti-Acne Hydrating Lotion,Acne & Blemish Treatment,Vichy,"Active: Salicylic Acid 1%. Inactive: Water, G...",1.70 fl. oz.,23.5,50.0,ml,"[Salicylic Acid 1, ]","Water, Glycerin, Alcohol Denat., Phenyl Tr...",24,1
...,...,...,...,...,...,...,...,...,...,...,...,...
7443,VERSO Verso Lip Serum (15 ml.),Serum,VERSO,"Theobroma Cacao Seed Butter, Bis-Diglyceryl Po...",15 ml,65.00,15,milliliters,,"Theobroma Cacao Seed Butter, Bis-Diglyceryl Po...",34,0
7444,Avene Cleanance HYDRA Soothing Cream (1.3 fl. ...,Cleansers,Avene,"Avene Thermal Spring Water, Carthamus Tinctori...",1.3 fl. oz.,25.00,1.3,fluid ounces,,"Avene Thermal Spring Water, Carthamus Tinctori...",25,0
7445,GlyDerm Gly Mist (8 fl. oz.),Toner & Face Mist,GlyDerm,"Purified Water, Propylene Glycol (and) Diazoli...",8 fl. oz.,23.00,8,fluid ounces,,"Purified Water, Propylene Glycol (and) Diazoli...",5,0
7446,Dr. Hauschka Revitalizing Mask (1 oz.),Face Mask,Dr. Hauschka,"Water/Aqua, Pyrus Cydonia (Quince) Seed Extrac...",piece/other,35.00,,piece/other,,"Water/Aqua, Pyrus Cydonia (Quince) Seed Extrac...",26,0


We'll be dropping size columns from our dataset as size of product is often determined by the type of product (i.e. lotions and sunscreens come in larger bottles than serums). To reduce collinearity, we'll just use product category.

## Prepare Data

In [17]:
# create count lists for extracts, peptides, and oils
df["count_extract"] = list(map(lambda x: x.count("extract"), df["inactive_ingredient"]))
df["count_peptide"] = list(map(lambda x: x.count("peptide"), df["inactive_ingredient"]))
df["count_oil"] = list(map(lambda x: x.count("oil"), df["inactive_ingredient"]))

# turn inactive ingredient column into lists
df["inactive_ingredient"] = df["inactive_ingredient"].str.replace(".", "")
df["inactive_ingredient"] = df["inactive_ingredient"].str.replace(" ", "")
df["inactive_ingredient"] = df["inactive_ingredient"].str.split(",")

In [18]:
# get ingredient counts
ingredients_df = pd.get_dummies(pd.DataFrame(df["inactive_ingredient"].tolist()).stack()).sum(level=0)
ingredients_df.shape

(7448, 19476)

In [19]:
# determine most important ingredients
ing_list = []
for column in ingredients_df.columns:
    if ingredients_df[column].sum() > 100 and ingredients_df[column].sum() < 800:
        ing_list.append(column)

ing_list = [ele.strip() for ele in ing_list]

# get rid of empty string
ing_list = ing_list[1:]

In [20]:
# create dataframe from columns in ing_list
ingredients_df = ingredients_df.rename(columns=lambda x: x.strip())
clean_ing = ingredients_df[ing_list]

# concat dataframes
df_ing = pd.concat([df, clean_ing], axis = 1)
df_ing.shape

(7448, 307)

In [21]:
# remove duplicate columns
df_ing = df_ing.groupby(axis=1, level=0).sum()
col_list = ["Aqua", "Aqua(Water)", "Aqua/Water", "Aqua/Water/Eau", "Water(Aqua)", "Water/Aqua/Eau"]
df_ing["Water"] = df_ing[col_list].sum(axis=1)
df_ing = df_ing.drop(columns = col_list)

In [22]:
index_list = []
for index, row in df_ing.iterrows():
    if "-product" in str(row["price"]):
        index_list.append(index)
    elif "-qubit" in str(row["price"]):
        index_list.append(index)
        
df_ing = df_ing.drop(index = index_list)

In [23]:
df_ing["price"] = df_ing["price"].astype(float)
df_ing["price"] = df_ing["price"].round(2)
df_ing["size_num"] = df_ing["size_num"].replace(r'^\s*$', np.nan, regex=True)
df_ing["size_num"] = df_ing["size_num"].astype(float)

## Preprocess Data

In [18]:
#num_cols
num_cols = ["size_num", "n_active_ingredient", "n_inactive_ingredient", "count_extract", "count_oil", "count_peptide",
           "Water"]
ingredients = df_ing.columns.to_list()[:-16]
for ing in ingredients:
    num_cols.append(ing)
    
#ord_cols
ord_cols = ["product_category", "brand", "size_unit"]

In [19]:
ordinal_transformer = Pipeline(steps = [
    ('ordimputer', SimpleImputer(strategy = 'most_frequent')),
    ('target_enc', TargetEncoder()),
    ("scaler", StandardScaler())])

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ("ord", ordinal_transformer, ord_cols),
        ("num", numeric_transformer, num_cols),
    ]
)

## Train-Test Split

In [20]:
y = df_ing["price"]
X = df_ing.drop(columns = ["price", "active_ingredient", "inactive_ingredient", "ingredient", "product_names", "size"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 42)

In [21]:
# determine baseline error
rmse_train = np.sqrt(mean_squared_error(y_train, np.ones([y_train.shape[0],])* y_train.mean()))
rmse_test = np.sqrt(mean_squared_error(y_test, np.ones([y_test.shape[0],])* y_train.mean()))

mae_train = mean_absolute_error(y_train, np.ones([y_train.shape[0],])* y_train.median())
mae_test = mean_absolute_error(y_test, np.ones([y_test.shape[0],])* y_train.median())

print("RMSE")
print(f"train: {rmse_train:.3f}") 
print(f"test: {rmse_test:.3f}")
print("\n")
print("MAE")
print(f"train: {mae_train:.3f}") 
print(f"test: {mae_test:.3f}")

# Modeling
## Model 1 - Simple Regressor

In [22]:
X_train_ct = preprocessor.fit_transform(X_train, y_train)
boost = XGBRegressor()
boost.fit(X_train_ct, y_train)
pred_train_boost = boost.predict(X_train_ct)

X_test_ct = preprocessor.transform(X_test)
pred_test_boost = boost.predict(X_test_ct)

In [23]:
rmse_train = np.sqrt(mean_squared_error(y_train, pred_train_boost))
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test_boost))

mae_train = mean_absolute_error(y_train, pred_train_boost)
mae_test = mean_absolute_error(y_test, pred_test_boost)

ev_train = explained_variance_score(y_train, pred_train_boost)
ev_test = explained_variance_score(y_test, pred_test_boost)

print("number of features:",len(X_train.columns))
print("rmse: (train) %.3f, (test) %.3f"%(rmse_train, rmse_test))
print("mae: (train) %.3f, (test) %.3f"%(mae_train, mae_test))
print("explained variance: (train cv) %.3f, (test) %.3f"%(ev_train, ev_test))

In [24]:
from sklearn.model_selection import GridSearchCV

# parameter grid
param_grid = {
    "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.1],
    "n_estimators": [100, 250, 500, 750, 1000],
    "max_depth": [3, 5, 7, 9, 11]
}


# instantiate and fit
boost = XGBRegressor()
grid_search = GridSearchCV(boost, param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train_ct, y_train)

# return best parameters and the best score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# get predictions off best model
best_boost = grid_search.best_estimator_
pred_test_boost = best_boost.predict(X_test_ct)

## Model 2 - Neural Network, 3 Hidden Layers

In [None]:
X_train_ct = preprocessor.fit_transform(X_train, y_train)
X_test_ct = preprocessor.transform(X_test)

learning_rate = 0.01

model2 = Sequential([
    Dense(520, kernel_initializer='normal', activation='relu'),
    Dropout(0.2),
    Dense(480, kernel_initializer='normal', activation='relu'),
    Dropout(0.2),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(1, kernel_initializer='normal', activation='linear')
  ])

mae_k = MeanAbsoluteError()
model2.compile(loss=mae_k, optimizer=Adam(learning_rate=learning_rate), metrics=[mae_k])

model_2 = model2.fit(X_train_ct, y_train, epochs=100, batch_size=64, validation_split=0.2)

In [None]:
y_train_pred = model2.predict(X_train_ct)
y_test_pred = model2.predict(X_test_ct)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

ev_train = explained_variance_score(y_train, y_train_pred)
ev_test = explained_variance_score(y_test, y_test_pred)

print("number of features:",len(X_train.columns))
print("rmse: (train) %.3f, (test) %.3f"%(rmse_train, rmse_test))
print("mae: (train) %.3f, (test) %.3f"%(mae_train, mae_test))
print("explained variance: (train cv) %.3f, (test) %.3f"%(ev_train, ev_test))

In [None]:
X_train_ct = preprocessor.fit_transform(X_train, y_train)
X_test_ct = preprocessor.transform(X_test)

learning_rate = 0.01

model2 = Sequential([
    Dense(160, kernel_initializer='normal', activation='relu'),
    Dropout(0.2),
    Dense(480, kernel_initializer='normal', activation='relu'),
    Dropout(0.2),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(1, kernel_initializer='normal', activation='linear')
  ])

mae_k = MeanAbsoluteError()
model2.compile(loss=mae_k, optimizer=Adam(learning_rate=learning_rate), metrics=[mae_k])

model_2 = model2.fit(X_train_ct, y_train, epochs=100, batch_size=64, validation_split=0.2)

In [None]:
y_train_pred = model2.predict(X_train_ct)
y_test_pred = model2.predict(X_test_ct)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

ev_train = explained_variance_score(y_train, y_train_pred)
ev_test = explained_variance_score(y_test, y_test_pred)

print("number of features:",len(X_train.columns))
print("rmse: (train) %.3f, (test) %.3f"%(rmse_train, rmse_test))
print("mae: (train) %.3f, (test) %.3f"%(mae_train, mae_test))
print("explained variance: (train cv) %.3f, (test) %.3f"%(ev_train, ev_test))

## Model 3 - Neural Network, 5 Hidden Layers¶

In [None]:
learning_rate = 0.01

model3 = Sequential([
    Dense(160, kernel_initializer='normal', activation='relu'),
    Dense(160, kernel_initializer='normal', activation='relu'),
    Dropout(0.2),
    Dense(480, kernel_initializer='normal', activation='relu'),
    Dense(480, kernel_initializer='normal', activation='relu'),
    Dropout(0.2),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(1, kernel_initializer='normal', activation='linear')
  ])

model3.compile(loss=mae_k, optimizer=Adam(learning_rate=learning_rate), metrics=[mae_k])
model_3 = model3.fit(X_train_ct, y_train, epochs=100, batch_size=64, validation_split=0.2)

In [None]:
y_train_pred = model3.predict(X_train_ct)
y_test_pred = model3.predict(X_test_ct)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

ev_train = explained_variance_score(y_train, y_train_pred)
ev_test = explained_variance_score(y_test, y_test_pred)

print("number of features:",len(X_train.columns))
print("rmse: (train) %.3f, (test) %.3f"%(rmse_train, rmse_test))
print("mae: (train) %.3f, (test) %.3f"%(mae_train, mae_test))
print("explained variance: (train cv) %.3f, (test) %.3f"%(ev_train, ev_test))

## Model 4 - Neural Network, 10 Hidden Layers¶

In [None]:
learning_rate = 0.01

model4 = Sequential([
    Dense(160, kernel_initializer='normal', activation='relu'),
    Dense(160, kernel_initializer='normal', activation='relu'),
    Dropout(0.2),
    Dense(480, kernel_initializer='normal', activation='relu'),
    Dense(480, kernel_initializer='normal', activation='relu'),
    Dropout(0.2),
    Dense(520, kernel_initializer='normal', activation='relu'),
    Dense(520, kernel_initializer='normal', activation='relu'),
    Dense(760, kernel_initializer='normal', activation='relu'),
    Dense(760, kernel_initializer='normal', activation='relu'),
    Dense(760, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(1, kernel_initializer='normal', activation='linear')
  ])

model4.compile(loss=mae_k, optimizer=Adam(learning_rate=learning_rate), metrics=[mae_k])

model_4 = model4.fit(X_train_ct, y_train, epochs=100, batch_size=64, validation_split=0.2)

In [None]:
y_train_pred = model4.predict(X_train_ct)
y_test_pred = model4.predict(X_test_ct)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

ev_train = explained_variance_score(y_train, y_train_pred)
ev_test = explained_variance_score(y_test, y_test_pred)

print("number of features:",len(X_train.columns))
print("rmse: (train) %.3f, (test) %.3f"%(rmse_train, rmse_test))
print("mae: (train) %.3f, (test) %.3f"%(mae_train, mae_test))
print("explained variance: (train cv) %.3f, (test) %.3f"%(ev_train, ev_test))

## Feature Engineering

In [None]:
features_df = pd.DataFrame()
features_df["feature"] = X_train.columns
features_df["importance"] = boost.feature_importances_
features_df = features_df.sort_values("importance", ascending=False)
plt.figure(figsize=(5,12))
sns.barplot(x = "importance", y = "feature", data=features_df.iloc[:15])
plt.show()

## Predictions

In [None]:
plt.figure(figsize=(7,7))
plt.scatter(y_test, pred_test_boost, marker='.')
plt.plot([0,300],[0,300],color='r')
plt.axis('equal')
plt.xlim(0,300)
plt.ylim(0,300)
plt.xlabel('true price')
plt.ylabel('predicted price')
plt.show()

In [None]:
df_prices = df_ing.loc[y_test.index.to_list()]
df_prices = df_prices[["product_names", "product_category", "brand", "ingredient"]]
df_prices["real_price"] = y_test
df_prices["predicted_price"] = model1_test_pred
df_prices["difference"] = (df_prices["real_price"] - df_prices["predicted_price"]).round(2)
df_prices

In [None]:
overpriced = df_prices.loc[df_prices["difference"] > 20]
overpriced.brand.value_counts().head(10)

In [None]:
underpriced = df_prices.loc[df_prices["difference"] < 0]
underpriced.brand.value_counts().head(10)

# Dupe Machine

In [None]:
# remove non-essential columns
features = df_ing.copy()
features = features.drop(columns = ["product_names", "size", "size_num", "size_unit", "brand", "active_ingredient", 
                        "inactive_ingredient", "ingredient"])
features = pd.get_dummies(features)

# fit KNN model
model = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
model.fit(features)
dist, idlist = model.kneighbors(features)

In [None]:
# define function
def SuperDuper(product_name):
    product_list = []
    product_id = df_ing[df_ing["product_names"] == product_name].index
    product_id = product_id[0]
    product_price = df_ing["price"].iloc[product_id] 
    for newid in idlist[product_id]:
        name = df_ing.loc[newid].product_names
        brand = df_ing.loc[newid].brand
        price = df_ing.loc[newid].price
        if name == product_name:
            pass
        elif price < product_price:
            product_list.append(f"{name} from {brand}, ${price: .2f}")
        
    return product_list   

In [None]:
SuperDuper("Absolue Premium Bx Advanced Replenishing Cream Cleanser")

In [None]:
features

In [27]:
products_data = df_ing[["brand", "product_names", "price"]]

In [28]:
products_data.to_csv(r"data/products_data.csv")

In [None]:
products_data.to_pickle("./products_data.pkl") 

In [None]:
features.to_pickle("./features.pkl")