# Import package and create helper variables

In [1]:
import os
import pandas as pd
import numpy as np
import re
import dill
from scipy.sparse import csr_matrix
import random
from pandas import Series
import sklearn.cluster
import Levenshtein
from collections import OrderedDict, Counter

from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

In [129]:
# Category names and ids under skincare-face
cat_name = ['Treatments (Face)', 'Toners', 'Cleansers'
            , 'Moisturizers', 'Masks', 'Eye Makeup Remover', 'Neck/Decollete Cream', 'Scrubs', 'Treatments (Eye)', ]
cat_id = ['705', '702', '701', '707', '703', '708', '709', '704', '706']
cat_dict = dict(zip(cat_id, cat_name))

select_cols = ['pro_ids', 'ratings', 'repurchases', 'pkg_quals', 'prices'
       ,'ingredients', 'brand_ids', 'users', 'reviews', 'lipies', 'age', 'eyes'
       ,'skin_type', 'skin_tone', 'skin_undertone', 'hair_color', 'hair_type'
       ,'hair_texture']

colab_filt_cols = ['pro_ids', 'users', 'lipies']

content_cols = ['names', 'pro_ids', 'ratings', 'repurchases', 'pkg_quals', 'prices', 'ingredients', 'brands']

user_features = ['age', 'eyes', 'skin_type', 'skin_tone', 'skin_undertone'
    , 'hair_color', 'hair_type', 'hair_texture']

# Load clean dataset from previous steps

In [None]:
# load clean dataset
df = dill.load(open('data/df_clean.pkd', 'rb'))
df_cf = df[colab_filt_cols]

In [None]:
# create unique id for users
users = df_cf['users'].unique()

len(users)
# 98840

# create random sample of 98840, from range(98840) without replacement,
random.seed(42)
id = Series(random.sample(range(len(users)), len(users)))
len(id.unique())

# join unique users and their unique ids
df_users = pd.DataFrame({'users': users, 'uid': id})

# join uid to main df by 'users' column
df_uid = pd.merge(df, df_users, on='users')

# join uid to main df_cf by 'users' column
df_cf_uid = pd.merge(df_cf, df_users, on='users')

In [None]:
# create new df of unique product contents
df_content = df[content_cols]
# drop duplicates
df_content_uniq = df_content.drop_duplicates()
# using .loc will turn pro_ids into index
# df_content_uniq = df_content.loc[~df_content.pro_ids.duplicated(), :]
df_content_uniq.shape
# (5298, 8)
# table of product id and names
prod_df = df_content_uniq.copy()
# dill.dump(prod_df, open('data/prod_df.pkd', 'wb'))

In [4]:
prod_df = dill.load(open('data/prod_df.pkd', 'rb'))
df_content_uniq = prod_df.copy()

df_content_uniq['pro_ids'] = df_content_uniq['pro_ids'].astype('int64')
df_content_uniq = df_content_uniq.set_index('pro_ids')

# Ingredient texts preprocessing

In [94]:
def rm_delimiters(ingr_row):
    """
    split ingredients column into a list of string instead of strings
    :param ingr_row: a string row (of dataframe) of ingredients
    :return: list of (str) ingredients
    """
    delim = [',', 'and', '\.+', ':', '/', '\s-+\s', '\*+', '\[\]', '\a', '\n', '\t', '\s\s+', 'â¢', ';', '-+', '_+'
        , 'active\singredients', 'also\scontains', 'rapid\sactivation\sgel', 'step\s1', 'step\s2', 'step\s3'
        , 'active\singredient', 'inactive\singredients', 'inactive\singredient', 'other\singredients'
        , 'ingredients', 'ingredient', 'previous\singredients', 'others', 'other']

    return re.split('|'.join(delim), ingr_row.lower())


def water_parse(ingr_list):
    """
    find and keep ingredients with word 'water-binding'; the rest if it has the word 'water', just keep
    'water'
    :param ingr_list: list of string ingredients
    :return: ingredients 'water' and 'water-binding'
    """
    water = ['water' if ('water' in ingr and 'water-binding' not in ingr) else ingr for ingr in ingr_list]
    return water

def oil_parse(ingr_list):
    """
    if ingredient has the word 'oil', just keep 'oil'; if ingredient is 'mineral oi', keep 'mineral oi'
    :param ingr_list: list of string ingredients
    :return: ingredients 'oil'
    """
    oil = ['oil' if (' oil' in ingr and 'mineral oil' not in ingr) else ingr for ingr in ingr_list]
    return oil


def extract_parse(ingr_list):
    """
    if ingredient has the word 'extract', just keep 'extract'
    :param ingr_list: list of string ingredients
    :return: ingredients 'extract'
    """
    extract = ['extract' if ' extract' in ingr else ingr for ingr in ingr_list]
    return extract


In [113]:
# split ingredients column into a list of string instead of strings
# turn lower case, strip leading and trailing spaces, remove empty lists
df_content_uniq['ingredients_n'] = df_content_uniq['ingredients'].apply(lambda x: rm_delimiters(x))\
                    .apply(lambda x: list(filter(None, [i.lower().strip() for i in x])))\
                    .apply(lambda x: water_parse(x))\
                    .apply(lambda x: oil_parse(x))\
                    .apply(lambda x: extract_parse(x))\
                    .apply(lambda x: [word for word in x if len(word)>3])

# list of ingredients of each product
words = list(df_content_uniq.ingredients_n)

In [114]:
len(words)

5298

In [115]:
df_content_uniq.loc[614,'ingredients']

'Water, Cyclopentasiloxane, Dimethicone, Polysilicone-11, Acetyl Glucosamine, Sodium Lactobionate, Morus Nigra (Mulberry) Root Extract, Yeast Extract, Serenoa Serrulata (Saw Palmetto) Fruit Extract, Triticum Vulgare (Wheat) Germ Extract, Vitis Vinifera (Grape) Fruit Extract, Scutellaria Baicalensis Extract, Castanea Sativa (Chestnut) Seed Extract, Camellia Sinensis (Green Tea) Leaf Extract, Hordeum Vulgare (Barley), Lavandula Angustifolia (Lavender), Amorphophallus Konjac Root Powder, Caffeine, Laminaria Saccharina Extract, Tocopheryl Acetate, Salvia Sclarea (Clary) Extract, Sodium Hyaluronate, Coriandrum Sativum (Coriander), Citrus Grandis (Grapefruit California), Cholesterol, Glycerin, Ethylhexylglycerin, Squalane, Polyethylene, Isopentyldiol, Phenyl Trimethicone, Isohexadecane, Polysorbate 20, Pantethine, Methyldihydrojasmonate, Acrylamide/Sodium Acryloyldimethyltaurate Copolymer, Ammonium Acryloyldimethyltaurate/Vp Copolymer, Polysorbate 80, Peg-8, Ethyl 2,2-Dimethylhydrocinnamal, 

In [116]:
words[3]

['water',
 'cyclopentasiloxane',
 'dimethicone',
 'polysilicone',
 'acetyl glucosamine',
 'sodium lactobionate',
 'extract',
 'extract',
 'extract',
 'extract',
 'extract',
 'extract',
 'extract',
 'extract',
 'hordeum vulgare (barley)',
 'ula angustifolia (lavender)',
 'amorphophallus konjac root powder',
 'caffeine',
 'extract',
 'tocopheryl acetate',
 'extract',
 'sodium hyaluronate',
 'cori',
 'rum sativum (cori',
 'citrus gr',
 'is (grapefruit california)',
 'cholesterol',
 'glycerin',
 'ethylhexylglycerin',
 'squalane',
 'polyethylene',
 'isopentyldiol',
 'phenyl trimethicone',
 'isohexadecane',
 'polysorbate 20',
 'pantethine',
 'methyldihydrojasmonate',
 'acrylamide',
 'sodium acryloyldimethyltaurate copolymer',
 'ammonium acryloyldimethyltaurate',
 'vp copolymer',
 'polysorbate 80',
 'ethyl 2',
 'dimethylhydrocinnamal',
 'phospholipids',
 'palmitoyl oligopeptide',
 'butylene glycol',
 'magnesium ascorbyl phosphate',
 'glyceryl polymethacrylate',
 'sodium glycyrrhetinate',
 'no

In [120]:
words_flat = [item for word in words for item in word]
words_flat_uniq = list(OrderedDict.fromkeys(words_flat))
print('There are {} unique ingredients.'.format(len(words_flat_uniq)))
words_arr = np.asarray(words_flat_uniq) #So that indexing with a list will work

There are 9435 unique ingredients.


In [118]:
# array of all unique ingredients
words_arr

array(['polyquaternium', 'silica', 'water', ...,
       'chondrus crispus powder', 'phenoxyethanol i71',
       'extrait de luvure'], dtype='<U432')

In [None]:
# create distance matrix with Levenshtein distance
lev_similarity = -1*np.array([[Levenshtein.distance(w1,w2) for w1 in words_arr] for w2 in words_arr])

# Use AP clustering
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.99)
affprop.fit(lev_similarity)

# the cluster label for each word
affprop_labels = affprop.labels_

# the index of the word that other words will cluster around
affprop_clusters = affprop.cluster_centers_indices_

Levenshtein distance (edit distance) is the minimum number of edits to turn one word into another.

In [70]:
print("distance between 'retinol' and 'retinal':", Levenshtein.distance('retinol', 'retinal'))
print("distance between 'retinol' and 'titanium dioxide':", Levenshtein.distance('retinol', 'titanium dioxide'))

distance between 'retinol' and 'retinal': 1
distance between 'retinol' and 'titanium dioxide': 13


Affinity Propagation is a clustering method that can be faster than KNN and doesnt require setting number of clusters beforehand. It it can be fed with a similarity matrix so it works well with Levenshtein distance.

In [104]:
# dill.dump(affprop_labels, open('data/affprop_labels.pkd', 'wb'))
# dill.dump(affprop_clusters, open('data/affprop_clusters.pkd', 'wb'))

affprop_labels = dill.load(open('data/affprop_labels.pkd', 'rb'))
affprop_clusters = dill.load(open('data/affprop_clusters.pkd', 'rb'))

# there are 261 clusters for ingredients
print('There are {} clusters for ingredients'.format(len(affprop_clusters)))

There are 261 clusters for ingredients


In [110]:
# create dictionary of unique ingredients and their cluster number/index
ingred_labels = dict(zip(words_flat_uniq, affprop_labels))

for cluster_id in np.unique(affprop_labels[5]):
    exemplar = words_arr[affprop_clusters[cluster_id]]
    cluster = np.unique(words_arr[np.nonzero(affprop_labels == cluster_id)])
    cluster_str = ", ".join(cluster)
    print('Exemplar word:', exemplar, '\n')
    print('Other words in this cluster:', cluster_str)

Exemplar word: titanium dioxide 

Other words in this cluster: "titanium dioxide", 0 zinc oxide, 01 (antioxidant, 10% zinc oxide, 15% titanium dioxide, 20 itaconate copolymer, 3% zinc oxide, 45 alkyl dimethicone, 5 benzoyl peroxide, 5% benzoyl peroxide, 6% zinc oxide 3, 77492)titanium oxide, agar agar decoction, aluminium oxide, aluminum oxide, amino acid complex, anthemis nobius (chamomile), astaxanthin (bioastin®), behenoxy dimethicone, behentrimonium chloride, bensalkonium chloride, benzalkonium chloride, benzethonium chloride, benzlkonium chloride, benzoyl perioxide, benzoyl peroxide, beta glucan (antioxidant), bisabolol (antioxidant, bismuth oxychloride, calcium chloride, centrimonium bromide, cetrimonium bromide, cetrimonium chloride, cetyl triethylmonium dimethicone, cetyl triethylmonium dimethicone peg, chitosan succinamide, chromium oxide, chromium oxide greens, ci 77491 (iron oxide red), ci 77491 (iron oxides), ci 77492 (iron oxide), ci 77891 (titanium dioxide), ci 77947 (zin

# Creating features for content filtering

In [121]:
### onehot encoding for ingredient groups
one_hot_transformer = OneHotEncoder(sparse=False)

# map ingredients into groups and make into dictionary
df_content_uniq['words_mapped'] = df_content_uniq['ingredients_n'].apply(lambda x: [ingred_labels[ingredient] for ingredient in x])\
                                                                .apply(lambda x: np.unique(x))\
                                                                .apply(lambda x: Counter(x))
# vectorize ingredient groups
dict_vec = DictVectorizer(sparse=False)
ingred_ohe = dict_vec.fit_transform(df_content_uniq['words_mapped'])


### add other information about products
ingred_features = np.hstack([ingred_ohe, df_content_uniq[['ratings', 'repurchases', 'pkg_quals', 'prices']]])

### fit knn to identify products that are similar
ingred_nn = NearestNeighbors(n_neighbors=11).fit(ingred_features)

# use dill to save nearest neighbor model
# dill.dump(ingred_nn, open('data/nn_ingred.pkd', 'wb'))
ingred_nn = dill.load(open('data/nn_ingred.pkd', 'rb'))


# ohe matrix to find nn products
cbf_matrix = pd.DataFrame(ingred_features).set_index(df_content_uniq['names'])
# dill.dump(cbf_matrix, open('data/cbf_matrix.pkd', 'wb'))
cbf_matrix = dill.load(open('data/cbf_matrix.pkd', 'rb'))

In [122]:
# example
# find 10 products similar to the test product
# indices are indices of the 10 most similar products, not the actual product id
ingred_dists, ingred_indices = ingred_nn.kneighbors(ingred_features[0].reshape(1, -1))

# list of 10 products similar to test product, excluding the first one, which is the test product
prod_list = prod_df.iloc[ingred_indices[0]]
print(prod_list)

                                                  names pro_ids  ratings  \
0                   BiorÃ© Pore Cleansing Strips - Nose    1824      3.1   
1693            BiorÃ© Ultra Deep Cleansing Pore Strips   29424      3.7   
294        Trader Joe's Spa Face Wash with Tea Tree Oil  144660      3.5   
0     Desert Essence Thoroughly Clean Face Wash with...   35843      3.6   
42                            Mario Badescu aloe lotion   24573      3.6   
251                         Karin Herzog oxygen face 2%     772      3.2   
370                     Mario Badescu Honey moisturizer    7380      3.5   
2841                      Neutrogena Alcohol-Free Toner   22869      3.6   
155                  The Original Skin Store--Acne Stop   83201      3.7   
2166   Neutrogena Clear Pore Oil-Eliminating Astringent   12339      3.5   
469               Avon Anew Vitamin C Brightening Serum    4908      3.6   

      repurchases  pkg_quals  prices  \
0            0.56        3.4       2   
1693   

# Collaborative filtering using ALS in Spark

ALS (Alternating Least Square) is a matrix factorization technique to decompose an original matrix into two matrices so that when multiplying those two matrices together, they approximate the original matrix. By doing this, we would be able to get a score for an item that has not actually been scored.

![Matrix factorization](images/mf.png)

In [None]:
# ALS in Spark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from datetime import datetime
from lxml import etree
import bz2
import os


# get current location for this main.py file, then add path to SE (StackExchange) files
def localpath(path):
    return 'file://' + str(os.path.abspath(os.path.curdir)) + '/' + path

# initialize pyspark and sql
sc = SparkContext("local[*]", "demo")
print(sc.version)

sqlContext = SQLContext(sc)

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

tot_df = spark.read.csv("df_cf.csv",header=True, inferSchema=True)

# Create ALS model
als_cv = ALS(userCol="uid", itemCol="pro_ids", ratingCol="lipies", nonnegative = True, implicitPrefs = False)
# Confirm that a model called "als" was created
type(als_cv)


# Add hyperparameters and their respective values to param_grid
# rank is the number of latent features
# regparam: regularization lambda
# coldstartstrategy: for train/test when all user's ratings are in the test set
# so if train set doesnt have rating for that user, dont calculate the rmse
param_grid = ParamGridBuilder() \
           .addGrid(als_cv.rank, [10, 20, 50]) \
           .addGrid(als_cv.regParam, [.05, .1, .15]) \
           .build()

# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="lipies", predictionCol="prediction")
print("Num models to be tested: ", len(param_grid))

# Build cross validation using CrossValidaator
cv = CrossValidator(estimator=als_cv, estimatorParamMaps=param_grid, evaluator=evaluator,numFolds=5)

# Confirm cv was built
print(cv)

#Fit cross validator to the 'train' dataset
model = cv.fit(tot_df)

#Extract best model from the cv model above
best_model = model.bestModel

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs_pd = userRecs.toPandas()
# need to convert to string for split below to work
userRecs_pd['new_rec'] = userRecs_pd['recommendations'].apply(lambda x: [str(pair[0]) for pair in x])
# create dataframe with rec columns
userRecs_pd[['rec1', 'rec2', 'rec3', 'rec4', 'rec5', 'rec6', 'rec7', 'rec8', 'rec9', 'rec10']] = pd.DataFrame(userRecs_pd.new_rec.values.tolist(), index= userRecs_pd.index)
userRecs_pd.drop(['recommendations', 'new_rec'], axis=1, inplace=True)
# dill.dump(userRecs_pd, open('userRecs_pd.pkd', 'wb'))


In [126]:
# load recommendation table by ALS
df_als = dill.load(open('data/userRecs_pd.pkd', 'rb'))
df_als.index = df_als['uid']

# Address cold-start problem by finding similar existing users

Users using the Holy Grail app will not have their historical product ratings known. So a solution to this is to find existing users with similar characteristics and their recommended products.

In [None]:
### find users similar to current user by using their characteristics

df_features = df_uid[user_features]
# have to have uid in the df so when dropping dont drop users with same chars
df_features['uid'] = df_uid['uid']
df_features.index = df_uid['uid']

# dataframe of unique users and their chars
df_features = df_features.drop_duplicates()

one_hot_transformer = OneHotEncoder(sparse=False)

# one hot encoding of characteristics
user_features_ohe = one_hot_transformer.fit_transform(df_features[user_features])

# get feature names for each onehot encoder
feat_names = one_hot_transformer.get_feature_names()
for i, feat in enumerate(feat_names):
    print(i, feat)

# fit knn to identify users who are similar
nn = NearestNeighbors(n_neighbors=10).fit(user_features_ohe)

In [145]:
# dill.dump(df_features['uid'], open('data/df_features.pkd', 'wb'))
df_features = dill.load(open('data/df_features.pkd', 'rb'))
print('There are {} unique users.'.format(len(df_features)))

# use dill to save nearest neighbor model
# dill.dump(nn, open('data/nn_users.pkd', 'wb'))
nn = dill.load(open('data/nn_users.pkd', 'rb'))

There are 111820 unique users.


In [152]:
# example
# 19-24; blue eyes; acne-prone; dark skin tone; neutral undertone; brown; curly hair; 
u_test = np.array([0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0])
# 19-24; black eyes; acne-prone; dark skin tone; neutral undertone; brown; curly hair; 
# u_test = np.array([0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0])

# find 10 users similar to the test user
# indices are indices of the 10 most similar users, not the actual uid
# dists, indices = nn.kneighbors(user_features_ohe[0].reshape(1, -1))
dists, indices = nn.kneighbors(u_test.reshape(1, -1))

# list of users similar to our test user
user_list = df_features.iloc[indices[0]]

In [153]:
user_list

Unnamed: 0_level_0,age,eyes,skin_type,skin_tone,skin_undertone,hair_color,hair_type,hair_texture,uid
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
43840,19-24,Hazel,Acne-prone,Fair-Medium,Neutral,Brown,Curly,Medium,43840
95738,19-24,Blue,Acne-prone,Fair,Cool,Brown,Curly,Medium,95738
49247,19-24,Blue,Acne-prone,Fair-Medium,Neutral,Blond,Curly,Medium,49247
3315,19-24,Brown,Acne-prone,Tan,Neutral,Brown,Curly,Medium,3315
70282,19-24,Blue,Acne-prone,Fair-Medium,Cool,Brown,Curly,Medium,70282
81016,19-24,Blue,Acne-prone,Fair,Neutral,Brown,Wavy,Medium,81016
15022,19-24,Blue,Sensitive,Fair,Neutral,Brown,Curly,Medium,15022
38650,19-24,Blue,Acne-prone,Fair,Neutral,Brunette,Curly,Medium,38650
35760,19-24,Blue,Acne-prone,Fair,Cool,Brown,Curly,Medium,35760
15753,19-24,Blue,Acne-prone,Fair-Medium,Neutral,Brown,Straight,Medium,15753


In [155]:
# merge list of similar user with list of users' recommendations
# rec is a list of product ids
rec = pd.merge(user_list, df_als, left_on=user_list.index, right_on=df_als.index, how='left')[['uid_x', 'rec1']]

prod_rec = pd.merge(rec, prod_df, left_on=rec.rec1, right_on=prod_df.pro_ids, how='left')

prod_rec_dedup = prod_rec.drop(['key_0', 'uid_x', 'rec1'], axis=1).drop_duplicates()

In [156]:
prod_rec_dedup

Unnamed: 0,names,pro_ids,ratings,repurchases,pkg_quals,prices,ingredients,brands
0,Ducray Keracnyl Creme Regulatrice,50891,4.4,0.9,4.3,3,"WATER (AQUA), TRIETHYLHEXANOIN, GLYCOLIC ACID,...",Ducray
1,CeraVe Facial Moisturizing Lotion - AM SPF 30,133879,3.2,0.47,4.0,3,"Active Ingredients:Homosalate 10, Meradimate 5...",CeraVe
2,Sulwhasoo Concentrated Ginseng Renewing Eye Cr...,173077,5.0,1.0,2.0,4,,Sulwhasoo
3,Erno Laszlo Ocu-pHel Emollient Eye Cream,62982,4.2,0.73,4.1,4,"Aqua [Water], Cyclopentasiloxane, Cococaprylat...",Erno Laszlo
4,Coconut Hut Hawaiian Icing Whipped Sugar Scrub,68071,4.8,1.0,4.2,2,,Unlisted Brand
5,Polysporin,72274,4.5,0.9,4.0,2,Active Ingredients: Each grams contains: Bacit...,Unlisted Brand
6,ELEMIS Skin Buff,102843,4.6,0.86,4.3,4,,ELEMIS
7,Garden of Wisdom Manuka Honey,106411,4.7,0.9,3.6,3,Manuka Honey,Garden of Wisdom
9,Peter Thomas Roth Peter Thomas Roth FirmX Peel...,163055,4.1,0.72,4.1,3,"Water, Polyethylene Glycol, Cellulose, Butylen...",Peter Thomas Roth
