In [1]:
import gc
import itertools
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error,explained_variance_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [2]:
# Load the cleaned skincare product table and show its (rows, columns) size.
DATA_PATH = "data/skin_care_cleaned.csv"
df = pd.read_csv(DATA_PATH)
df.shape

(5105, 86)

In [3]:
# clean inactive ingredient list
# The column holds the text of a Python list literal. Remove the list
# punctuation ([ ] . ') as plain characters, lower-case the text and split it
# into a real list of ingredient names.
# regex=False is explicit because "[" and "." are regex metacharacters and the
# default value of `regex` differs between pandas versions.
ingredient_text = df["inactive_ingredient_list"]
for literal_char in ("[", "]", ".", "'"):
    ingredient_text = ingredient_text.str.replace(literal_char, "", regex=False)
df["inactive_ingredient_list"] = ingredient_text.str.lower().str.split(", ")


# create count lists for extracts, peptides, and oils
# list.count(keyword) only counts elements that are exactly equal to the
# keyword, so a name such as "algae extract" was never counted. Count the
# ingredient names that contain the keyword instead.
for keyword in ("extract", "peptide", "oil"):
    df[f"count_{keyword}"] = [
        sum(keyword in ingredient for ingredient in ingredients)
        for ingredients in df["inactive_ingredient_list"]
    ]

In [4]:
# get ingredient counts
# Expand each ingredient list into one row per (product row, ingredient),
# one-hot encode the ingredient names, then add the indicator rows back up per
# product row. groupby(level=0).sum() replaces DataFrame.sum(level=0), which
# was deprecated in pandas 1.3 and removed in pandas 2.0.
ingredient_rows = pd.DataFrame(df["inactive_ingredient_list"].tolist()).stack()
ingredients_df = pd.get_dummies(ingredient_rows).groupby(level=0).sum()
ingredients_df.shape

(5105, 12456)

In [5]:
# determine most important ingredients
# Keep ingredients whose total count across all rows lies strictly between the
# two thresholds below.
MIN_INGREDIENT_COUNT = 100
MAX_INGREDIENT_COUNT = 500

# One vectorised column sum instead of summing every column twice in a loop.
ingredient_totals = ingredients_df.sum()
in_range = (ingredient_totals > MIN_INGREDIENT_COUNT) & (ingredient_totals < MAX_INGREDIENT_COUNT)
stripped_names = [name.strip() for name in ingredient_totals[in_range].index]

# get rid of empty string
# Filter blanks by value rather than slicing off position 0: the slice drops a
# real ingredient when no blank name was selected and leaves blanks behind when
# more than one whitespace-only name was. dict.fromkeys keeps the first
# occurrence of each name so names that only differed by surrounding
# whitespace are listed once.
ing_list = list(dict.fromkeys(name for name in stripped_names if name))

In [6]:
# create dataframe from columns in ing_list
# Strip the column labels so they match the stripped names stored in ing_list.
ingredients_df = ingredients_df.rename(columns=lambda x: x.strip())

# Stripping can make two labels identical (they differed only by surrounding
# whitespace). Selecting such a label returns every matching column, so add
# the colliding columns together and keep exactly one column per ingredient,
# in ing_list order.
wanted_ingredients = list(dict.fromkeys(ing_list))
selected = ingredients_df.loc[:, ingredients_df.columns.isin(wanted_ingredients)]
clean_ing = selected.T.groupby(level=0).sum().T[wanted_ingredients]

# concat dataframes
df_ing = pd.concat([df, clean_ing], axis = 1)

# The ingredient columns are everything appended after df's own columns; use
# df's width rather than a hard-coded offset.
df_ing.columns[df.shape[1]:].to_list()

['2-hexanediol',
 'acetyl glucosamine',
 'acetyl hexapeptide-8',
 'acrylates copolymer',
 'acrylates/c10-30 alkyl acrylate crosspolymer',
 'adenosine',
 'alcohol',
 'alcohol denat',
 'algae extract',
 'aloe barbadensis leaf extract',
 'aloe barbadensis leaf juice',
 'alpha-isomethyl ionone',
 'aluminum starch octenylsuccinate',
 'aminomethyl propanol',
 'ammonium acryloyldimethyltaurate/vp copolymer',
 'aqua (water)',
 'arginine',
 'ascorbic acid',
 'ascorbyl glucoside',
 'ascorbyl palmitate',
 'avena sativa (oat) kernel extract',
 'beeswax',
 'behenyl alcohol',
 'benzoic acid',
 'benzyl alcohol',
 'benzyl benzoate',
 'benzyl salicylate',
 'betaine',
 'biosaccharide gum-1',
 'bisabolol',
 'blue 1',
 'butylparaben',
 'butylphenyl methylpropional',
 'butyrospermum parkii (shea butter)',
 'butyrospermum parkii (shea) butter',
 'c12-15 alkyl benzoate',
 'c13-14 isoparaffin',
 'caffeine',
 'calendula officinalis flower extract',
 'camellia sinensis leaf extract',
 'ceteareth-20',
 'cetearyl

In [7]:
# sum together duplicate columns
def merge_ingredient_columns(frame, source_cols, target_col):
    """Fold several spellings of one ingredient into a single column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing every column named in ``source_cols``.
    source_cols : list of str
        Columns whose values are added together row-wise.
    target_col : str
        Column that receives the row-wise sum. It may be one of
        ``source_cols`` (overwritten in place) or a new name (appended).

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``target_col`` holds the sum and every other
        column from ``source_cols`` has been removed.

    Raises
    ------
    KeyError
        If a column in ``source_cols`` is not present in ``frame``.
    """
    merged = frame[source_cols].sum(axis=1)
    redundant = [col for col in source_cols if col != target_col]
    result = frame.drop(columns=redundant)
    result[target_col] = merged
    return result


col_list = ['butyrospermum parkii (shea butter)', 'butyrospermum parkii (shea) butter',]
df_ing = merge_ingredient_columns(df_ing, col_list, "butyrospermum parkii (shea butter)")

# Both source spellings are removed once they have been summed into "water";
# previously only the first was dropped, leaving 'water/aqua/eau' in the table
# next to a column that already included it.
# NOTE(review): 'aqua (water)' also appears among the ingredient columns and
# looks like another spelling of the same ingredient - confirm whether it
# should be merged here too.
col_list = ['water (aqua)', 'water/aqua/eau',]
df_ing = merge_ingredient_columns(df_ing, col_list, "water")

In [8]:
# Numeric features: ingredient counts and rating summaries ...
num_cols = ["n_inactive_ingredient", "n_active_ingredient", "active_mean_rating", "inactive_mean_rating",
           "inactive_mean_rating_w1", "inactive_mean_rating_w2"]
# ... plus the ingredient columns. df_ing was built as concat([df, clean_ing]),
# so those are the columns positioned after df's own; derive the offset from
# df instead of hard-coding it so a change in the CSV schema cannot silently
# shift the slice.
num_cols += df_ing.columns[df.shape[1]:].to_list()

# Columns routed to the ordinal branch of the preprocessing pipeline.
ord_cols = ["product_category", "brand", "size_unit"]

In [10]:
# Separate predictors from the price target, then hold out 30% of the rows
# as a test set with a fixed seed so the split is reproducible.
X = df_ing.drop(columns=["price"])
y = df_ing["price"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
# Number of rows in each product category, most frequent first.
df_ing.loc[:, "product_category"].value_counts()

Nighttime Moisturizer       1068
Cleansers                    935
Serum                        530
Exfoliants                   502
Eye Cream & Treatment        448
Daytime Moisturizer          329
Sunscreen                    282
Face Mask                    258
Toner & Face Mist            254
Acne & Blemish Treatment     183
Lip Balm                     145
Retinol                       49
Skin Lightener                35
Oil Control Products          29
Face Oil                      24
Vitamin C                     24
Lip Scrub                     10
Name: product_category, dtype: int64

In [None]:
# Map each product category to an integer rank.
# NOTE(review): the original cell was left unfinished (a dict entry with no
# value, which is a SyntaxError). Ranking by frequency (0 = most frequent)
# follows the value_counts() shown in the previous cell, but the intended
# ordering is an assumption - confirm before relying on it.
category_counts = df_ing["product_category"].value_counts()
product_ranks = {category: rank for rank, category in enumerate(category_counts.index)}

In [None]:
# Ordinal branch: fill missing values with the most frequent value, encode
# each category as an integer, then standardise.
# The encoder learns its categories from the data at fit time; the previous
# `categories=[membership_order]` referred to a name that is not defined in
# this notebook and supplied one category list for the three columns in
# ord_cols. Categories not seen during fit are encoded as -1 instead of
# raising at transform time.
ordinal_transformer = Pipeline(steps = [
    ('ordimputer', SimpleImputer(strategy = 'most_frequent')),
    ('ordenc', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)),
    ('ordnorm', StandardScaler())])

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored (all-zero row) at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))])

# NOTE(review): no one-hot column list is declared before this cell (only
# num_cols and ord_cols are), so the categorical branch starts with an empty
# selection, which ColumnTransformer skips. List nominal columns here if any
# should be one-hot encoded.
cat_cols = []

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("ord", ordinal_transformer, ord_cols),
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

In [66]:
# Column that the model predicts.
target = 'price'

# Raw categorical columns and the general (non-ingredient) feature names.
enc_feats = ['product_category', 'brand', 'size_unit']
gen_features = ['product_category_encode', 'brand_encode', 'size_num', 'size_unit_encode']

# Ingredient-related features, assembled in four steps: the fixed summary
# columns, every column whose name contains 'count', every column whose name
# contains '_nmf_', and finally every column from position 89 onwards.
all_columns = df_ing.columns.values
ingredient_features = [
    'n_inactive_ingredient',
    'n_active_ingredient',
    'active_mean_rating',
    'inactive_mean_rating',
    'inactive_mean_rating_w1',
    'inactive_mean_rating_w2',
]
ingredient_features.extend(col for col in all_columns if 'count' in col)
ingredient_features.extend(col for col in all_columns if '_nmf_' in col)
ingredient_features.extend(df_ing.columns[89:].to_list())

In [None]:
# NOTE(review): this cell rebinds num_cols and ord_cols, which were assigned
# earlier in the notebook, and none of the column names below are used
# anywhere else in the visible cells - it looks like a leftover template;
# confirm before running.

# Columns treated as numeric.
num_cols = [
    "age",
    "days_since_last_login",
    "avg_time_spent",
    "avg_transaction_value",
    "avg_frequency_login_days",
    "points_in_wallet",
    "age_of_account_in_days",
]

# Columns treated as unordered categories.
cat_cols = [
    "gender",
    "region_category",
    "joined_through_referral",
    "preferred_offer_types",
    "medium_of_operation",
    "internet_option",
    "used_special_discount",
    "offer_application_preference",
    "past_complaint",
    "complaint_status",
    "feedback",
]

# Columns treated as ordered categories.
ord_cols = [
    "membership_category",
]