# LUSH: Recommendation system

#### Libraries & dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from faker import Faker
import random
from sklearn.metrics.pairwise import cosine_similarity 

## i. Creating the table 

In [2]:
# Faker instance; the generation loop below uses it to draw random purchase dates.
fake = Faker()
# Define categories and items
# NOTE(review): `categories` is not referenced by any other cell visible in this
# notebook, and its values differ from the "category" values used in `items`.
categories = ["Bath Bomb", "Skin Care", "Body Wash", "Hair Care", "Facial Care"]
# Product catalogue: one dict per product with the keys category, brand, sub-brand,
# product and product_id. Only product_ids 1-3 also carry an "ingredients" string.
# Those strings use backslash line continuations *inside* the literal, so the leading
# spaces of each continued line are part of the stored value.
# Product names are not unique ("lord of misrule" and "sleepy" appear under several
# sub-brands); product_id is the unique key.
items = [
    {"category": "Bath and Shower","brand": "Bath","sub-brand": "Bath Bombs","product": "lord of misrule","product_id": 1,
     "ingredients":"Sodium Bicarbonate, Citric Acid, Corn Starch, Black Pepper Oil, Dark Sumatran Patchouli Oil, \
     Vanilla Absolute, Water (Aqua), Titanium Dioxide, Dipropylene Glycol, Sodium Coco Sulfate, Propylene Glycol, \
     Synthetic Fluorphlogopite, Tin Oxide, Cocamidopropyl Betaine, *Limonene, *Linalool, Parfum/Fragrance"},
    {"category": "Bath and Shower","brand": "Bath","sub-brand": "Bath Bombs","product": "bat art","product_id": 2,
    "ingredients":"Sodium Bicarbonate, Citric Acid, Alcohol Denat., Parfum/Fragrance, Sicilian Lemon Oil, \
    Green Mandarin Oil, Rosemary Oil, Sage Oil, Water (Aqua), Titanium Dioxide, Corn Starch, Talc, \
    Sodium Coco Sulfate, Dipropylene Glycol, Synthetic Fluorphlogopite, Tin Oxide, Propylene Glycol, \
    Alpha-Isomethyl Ionone, *Limonene"},
    {"category": "Bath and Shower","brand": "Bath","sub-brand": "Bath Bombs","product": "screamo","product_id": 3,
    "ingredients":"Sodium Bicarbonate, Citric Acid, Benzoin Resinoid, Cananga Odorata Flower Oil, \
    Almond Essential Oil, Cream of Tartar, Water (Aqua), Rice Starch, Corn Starch, Titanium Dioxide, Talc, \
    Synthetic Fluorphlogopite, Sodium Coco Sulfate, Dipropylene Glycol, Cocamidopropyl Betaine, Tin Oxide, \
    Benzyl Alcohol, Coumarin, Hexyl Cinnamal, Parfum/Fragrance"},
    {"category": "Bath and Shower","brand": "Bath","sub-brand": "Bath Bombs","product": "magic potion","product_id": 4},
    {"category": "Bath and Shower","brand": "Bath","sub-brand": "Bath Bombs","product": "barbie bath","product_id": 5},
    {"category": "Bath and Shower","brand": "Bath","sub-brand": "Bubble Bars","product": "lord of misrule","product_id": 6},
    {"category": "Bath and Shower","brand": "Bath","sub-brand": "Bubble Bars","product": "bonehead","product_id": 7},
    {"category": "Bath and Shower","brand": "Bath","sub-brand": "Bubble Bars","product": "alien","product_id": 8},
    {"category": "Bath and Shower","brand": "Bath","sub-brand": "Bubble Bars","product": "barbie bubble","product_id": 9},
    {"category": "Bath and Shower","brand": "Bath","sub-brand": "Bubble Bars","product": "sleepy","product_id": 10},
    {"category": "Bath and Shower","brand": "Shower","sub-brand": "Shower Gel","product": "lord of misrule","product_id": 11},
    {"category": "Bath and Shower","brand": "Shower","sub-brand": "Shower Gel","product": "snow fairy","product_id": 12},
    {"category": "Bath and Shower","brand": "Shower","sub-brand": "Shower Gel","product": "grass","product_id": 13},
    {"category": "Bath and Shower","brand": "Shower","sub-brand": "Shower Gel","product": "sleepy","product_id": 14},
    {"category": "Bath and Shower","brand": "Shower","sub-brand": "Shower Gel","product": "happy hippy","product_id": 15},
    {"category": "Skincare","brand": "Face","sub-brand": "Moisturizer","product": "peace","product_id": 16},
    {"category": "Skincare","brand": "Face","sub-brand": "Moisturizer","product": "celestial","product_id": 17},
    {"category": "Skincare","brand": "Face","sub-brand": "Moisturizer","product": "gorgeous","product_id": 18},
    {"category": "Skincare","brand": "Face","sub-brand": "Moisturizer","product": "imperialis","product_id": 19},
    {"category": "Skincare","brand": "Face","sub-brand": "Moisturizer","product": "skin drink","product_id": 20},
]
# Generate synthetic purchase data for N_CUSTOMERS customers.
# Both random sources are seeded so that "Restart Kernel -> Run All" draws the same
# customers, baskets and ratings on every run.
SEED = 42
N_CUSTOMERS = 10000
random.seed(SEED)
fake.seed_instance(SEED)

data = []
k = 1  # running transaction id, unique across all customers
for i in range(N_CUSTOMERS):
    person_id = i + 1
    # One pronoun is drawn per customer; None is one of the possible draws.
    pronoun = random.choice(["He/Him", "She/Her", "They/Them", None])
    # Each customer makes between 1 and 20 purchases (both ends inclusive).
    for j in range(random.randint(1, 20)):
        item = random.choice(items)
        transaction_id = k
        k = k + 1
        product_id = item['product_id']
        product = item['product']
        sub_brand = item['sub-brand']
        brand = item['brand']
        category = item['category']
        # The date window is relative to the day the cell runs, so dates can still
        # shift between runs made on different days even with a fixed seed.
        purchase_date = fake.date_between(start_date='-2y', end_date='today').strftime('%Y-%m-%d')
        rating = random.randint(0, 5)  # integer rating, 0..5 inclusive
        data.append([person_id, pronoun, transaction_id, product_id, product, sub_brand, brand, category, purchase_date, rating])

# Create a pandas DataFrame: one row per transaction
df = pd.DataFrame(data, columns=['customer_id', 'pronoun', 'transaction_id', 'product_id', 'product', 'sub_brand', 'brand', 'category', 'date', 'rating'])
# Save the data frame to a CSV file
df.to_csv('bath_cosmetics_data.csv', index=False)
# Display the first few rows of the DataFrame
df.head()

In [3]:
# (rows, columns) of the generated transactions DataFrame
df.shape

(104968, 10)

## 1) A User Independent System
We'll start with building recommendations for the case where we have no knowledge about a user. In this case, we probably would find the most popular items, and recommend them to our user.

The simplest system would just sort the items based on their average score and present a list in that order:

In [4]:
# Per-product summary: mean rating, number of ratings and number of transactions.
# Named aggregation produces the flat column names (rating_mean, rating_count,
# transaction_id_count) directly, so no MultiIndex flattening step is needed.
product_df = (
    df.groupby(['product_id', 'product', 'category', 'brand', 'sub_brand'])
    .agg(
        rating_mean=('rating', 'mean'),
        rating_count=('rating', 'count'),
        transaction_id_count=('transaction_id', 'count'),
    )
    .reset_index()
)

In [5]:
# Minimum number of ratings a product needs before it is considered for ranking.
threshold = 500
product_with_many_reviews_df = product_df[product_df['rating_count'].ge(threshold)]

In [6]:
# Rank the qualifying products by mean rating, best first, and show the top five.
top_rated = product_with_many_reviews_df.sort_values('rating_mean', ascending=False)
top_rated.head()

Unnamed: 0,product_id,product,category,brand,sub_brand,rating_mean,rating_count,transaction_id_count
7,8,alien,Bath and Shower,Bath,Bubble Bars,2.526894,5094,5094
8,9,barbie bubble,Bath and Shower,Bath,Bubble Bars,2.520448,5184,5184
11,12,snow fairy,Bath and Shower,Shower,Shower Gel,2.515765,5138,5138
2,3,screamo,Bath and Shower,Bath,Bath Bombs,2.513438,5358,5358
10,11,lord of misrule,Bath and Shower,Shower,Shower Gel,2.508857,5363,5363


In [7]:
# Rank the qualifying products by number of transactions, most sold first, and show the top five.
top_seller = product_with_many_reviews_df.sort_values('transaction_id_count', ascending=False)
top_seller.head()

Unnamed: 0,product_id,product,category,brand,sub_brand,rating_mean,rating_count,transaction_id_count
17,18,gorgeous,Skincare,Face,Moisturizer,2.479696,5393,5393
10,11,lord of misrule,Bath and Shower,Shower,Shower Gel,2.508857,5363,5363
2,3,screamo,Bath and Shower,Bath,Bath Bombs,2.513438,5358,5358
16,17,celestial,Skincare,Face,Moisturizer,2.498782,5335,5335
12,13,grass,Bath and Shower,Shower,Shower Gel,2.496236,5314,5314


## 2) Similar Users
What makes two users similar? The simplest way to measure whether users $a$ and $b$ are similar is to look at the items both of them reviewed. For example, consider the following two users:

In [8]:
# One row per (customer, product) pair; repeat purchases of the same product are
# collapsed to the mean of their ratings, stored in the flat column `rating_mean`.
usersimilar_df = (
    df.groupby(['customer_id', 'product_id'])
    .agg(rating_mean=('rating', 'mean'))
    .reset_index()
)

In [9]:
# Show the (customer_id, product_id, rating_mean) table
usersimilar_df

Unnamed: 0,customer_id,product_id,rating_mean
0,1,6,0.0
1,1,7,2.5
2,1,8,3.0
3,1,15,0.0
4,1,18,4.5
...,...,...,...
78149,10000,13,5.0
78150,10000,14,2.0
78151,10000,15,3.0
78152,10000,19,3.0


In [10]:
customer_id = usersimilar_df['customer_id'].unique()
product_id = usersimilar_df['product_id'].unique()

num_customer = len(customer_id)
num_product = len(product_id)

# The matrix below is addressed as R[id - 1], so ids must be exactly 1..N with no
# gaps. Fail loudly if that does not hold (an id of 0 would otherwise wrap around
# to the last row/column without any error).
assert customer_id.min() == 1 and customer_id.max() == num_customer, "customer ids must be contiguous 1..N"
assert product_id.min() == 1 and product_id.max() == num_product, "product ids must be contiguous 1..N"

# np.nan means the customer has not rated the product
R = np.full((num_customer, num_product), np.nan)

# Build the customer-product matrix in one vectorized assignment. Each
# (customer_id, product_id) pair occurs once in usersimilar_df, so no cell is
# written twice.
row_positions = usersimilar_df['customer_id'].to_numpy() - 1
column_positions = usersimilar_df['product_id'].to_numpy() - 1
R[row_positions, column_positions] = usersimilar_df['rating_mean'].to_numpy()

# Label rows and columns with the 1-based customer and product ids
R_df = pd.DataFrame(data=R, index=range(1, num_customer+1), columns=range(1, num_product+1))

In [11]:
# Keep the first row of each distinct (customer_id, pronoun) pair, then one-hot
# encode 'pronoun' into pronoun_* indicator columns. get_dummies is called without
# dummy_na, so a missing pronoun gets no indicator column of its own.
unique_users_df = pd.get_dummies(
    df.drop_duplicates(subset=['customer_id', 'pronoun']),
    columns=['pronoun'],
)

In [12]:
# customer_id plus every pronoun_* indicator column
columns_to_keep = ['customer_id']
columns_to_keep += [col for col in unique_users_df.columns if col.startswith('pronoun_')]

# Keep only those columns and index the result by customer_id
dummies_df = unique_users_df[columns_to_keep].set_index('customer_id')

In [13]:
# Place the pronoun indicator columns next to the rating columns, aligning rows on
# the index (customer ids in both frames)
merged_df = pd.concat([R_df, dummies_df], axis='columns')

In [16]:
# Show the combined ratings + pronoun-indicator table
merged_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,14,15,16,17,18,19,20,pronoun_He/Him,pronoun_She/Her,pronoun_They/Them
1,,,,,,0.0,2.5,3.0,,,...,,0.0,,,4.5,,,0,0,1
2,,,,0.0,0.000000,0.0,1.0,,,,...,,,5.0,,2.0,,,0,0,0
3,,,,,2.000000,3.0,,,,,...,2.0,3.5,3.0,,,,5.0,0,0,0
4,3.0,,,0.0,1.333333,2.5,5.0,5.0,,3.0,...,5.0,5.0,1.0,2.0,5.0,,,0,0,1
5,,,,5.0,,,,,1.0,,...,,1.0,,,2.0,,,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,,,3.0,,,,,,,,...,,,,,,,,1,0,0
9997,,,4.0,0.0,,2.0,,,,0.0,...,,0.0,,0.0,4.5,,,0,0,0
9998,,2.0,,1.0,,,2.0,,,2.0,...,,,,,5.0,,,0,0,0
9999,,,,,4.000000,,,,,,...,,,,2.0,,,,0,0,0


In [19]:
def find_customer_similarity(customer_1, customer_2, R_df):
    """Cosine similarity between two customers over the columns both have values for.

    Parameters
    ----------
    customer_1, customer_2
        Index labels of the two rows of ``R_df`` to compare.
    R_df : pandas.DataFrame
        One row per customer; NaN marks a column the customer has no value for.

    Returns
    -------
    similarity : float
        Cosine similarity of the two rows restricted to the columns that are
        non-NaN in both. ``0.0`` when the rows share no such column.
    number_of_features_rated_together : int
        Number of columns that are non-NaN in both rows.
    """
    row_of_customer1 = R_df.loc[customer_1, :]
    row_of_customer2 = R_df.loc[customer_2, :]

    # Mask of the columns both customers have a value for
    feature_both_customers_rated = row_of_customer1.notna() & row_of_customer2.notna()

    # Sum the booleans to get the count
    number_of_features_rated_together = feature_both_customers_rated.sum()

    # cosine_similarity rejects arrays with zero features, so treat "nothing in
    # common" as zero similarity instead of raising.
    if number_of_features_rated_together == 0:
        return 0.0, number_of_features_rated_together

    # Values of both customers on the shared columns, as 1 x n float arrays
    ratings_of_customer1 = row_of_customer1[feature_both_customers_rated].to_numpy(dtype=float).reshape(1, -1)
    ratings_of_customer2 = row_of_customer2[feature_both_customers_rated].to_numpy(dtype=float).reshape(1, -1)

    # Finally, calculate the similarity between them
    similarity = cosine_similarity(ratings_of_customer1, ratings_of_customer2)[0][0]

    return similarity, number_of_features_rated_together

In [21]:
current_user = 16 # we will only do this to user 16 for demonstration purposes
current_feature = 2 # product id (column label of R_df) whose rating is predicted
similarities_to_user_16 = []
ratings_given_to_feature_2 = []

# Neighbours: customers who rated `current_feature`. The target user is left out so
# that a rating they may already have given cannot feed their own prediction.
rated_current_feature = R_df.loc[:, current_feature].notna()
R_df2 = R_df[rated_current_feature & (R_df.index != current_user)].copy()

for other_user in R_df2.index:

    # Similarity is computed on merged_df, i.e. over the rating columns and the
    # pronoun indicator columns together.
    similarity, number_of_features_rated_together = find_customer_similarity(current_user, other_user, merged_df)
    similarities_to_user_16.append(similarity)
    ratings_given_to_feature_2.append(R_df.loc[other_user, current_feature])

# Finally, let's turn these into numpy arrays so life is easier
similarities_to_user_16 = np.array(similarities_to_user_16)
ratings_given_to_feature_2 = np.array(ratings_given_to_feature_2)

In [22]:
# Similarity-weighted average of the neighbours' ratings. When the similarities sum
# to zero there is nothing to weight by, so the prediction is NaN.
total_similarity = np.sum(similarities_to_user_16)
if total_similarity != 0:
    predicted_rating = np.dot(ratings_given_to_feature_2, similarities_to_user_16) / total_similarity
else:
    predicted_rating = np.nan

# Build the message from the variables so it cannot drift from what was computed
print(f'Predicted rating for feature {current_feature} by user {current_user} is {round(predicted_rating, 2)}')

Predicted rating for feature 2 by user 16 is 2.5


In [23]:
current_customer = 1 # we will only do this to customer 1 for demonstration purposes
customer_predicted_rating_per_feature=[]

# The similarity between `current_customer` and another customer does not depend on
# which product is being predicted, so each pair is computed once and reused.
similarity_by_customer = {}

for feature in range(num_product):
    current_feature = feature  # zero-based column position; the product id is feature + 1
    similarities_to_customer = []

    # Neighbours: customers who rated this product, excluding the target customer so
    # their own rating cannot feed their own prediction.
    rated_feature = merged_df.iloc[:, feature].notna()
    R_df2 = merged_df[rated_feature & (merged_df.index != current_customer)]

    for other_customer in R_df2.index:

        similarity = similarity_by_customer.get(other_customer)
        if similarity is None:
            similarity, number_of_features_rated_together = find_customer_similarity(current_customer, other_customer, merged_df)
            similarity_by_customer[other_customer] = similarity
        similarities_to_customer.append(similarity)

    # Finally, let's turn these into numpy arrays so life is easier
    similarities_to_customer = np.array(similarities_to_customer)
    # Neighbours' ratings for this product, in the same row order as the similarities
    ratings_given_to_feature = R_df2.iloc[:, feature].to_numpy(dtype=float)

    # Similarity-weighted average; NaN when there is nothing to weight by
    total_similarity = np.sum(similarities_to_customer)
    if total_similarity != 0:
        predicted_rating = np.dot(ratings_given_to_feature, similarities_to_customer)/total_similarity
    else:
        predicted_rating = np.nan
    customer_predicted_rating_per_feature.append(predicted_rating)

In [None]:
# One row per product id with the user-based predicted rating for the demo customer
customer_product_prediction_df = pd.DataFrame(
    {
        'product_id': range(1, num_product+1),
        'customer_predicted_rating_per_feature': customer_predicted_rating_per_feature,
    }
)

In [None]:
# Show the per-product predictions
customer_product_prediction_df

In [None]:
# Sort by 'customer_predicted_rating_per_feature' in descending order and show the
# five products with the highest predicted rating.
# (The frame and column are the ones created above; the previous
# `user_product_prediction_df` / 'user_predicted_rating_per_feature' names are not
# defined anywhere and raised a NameError.)
sorted_df = customer_product_prediction_df.sort_values(by='customer_predicted_rating_per_feature', ascending = False)
sorted_df.head(5)

## 3) Similar Items

What makes two items similar? As with measuring whether users are similar, there are many ways of measuring item similarity. For our simple method to find the similarity of two items, $x$ and $y$, we look at every user who reviewed both items. For example, let us consider the items in columns 15 and 18.

In [None]:
def find_item_similarity(item_1, item_2, R_df):
    """Cosine similarity between two items over the customers who rated both.

    Parameters
    ----------
    item_1, item_2
        Column labels of the two columns of ``R_df`` to compare.
    R_df : pandas.DataFrame
        One column per item; NaN marks a row (customer) with no rating for it.

    Returns
    -------
    similarity : float
        Cosine similarity of the two columns restricted to the rows that are
        non-NaN in both. ``0.0`` when no row has both.
    number_of_users : int
        Number of rows that are non-NaN in both columns.
    """
    # Mask of the customers who rated both items
    users_who_reviewed_both_items = R_df.loc[:, item_1].notna() & R_df.loc[:, item_2].notna()

    # Find how many customers rated both items
    number_of_users = users_who_reviewed_both_items.sum()

    # cosine_similarity rejects arrays with zero features, so treat "no common
    # raters" as zero similarity instead of raising.
    if number_of_users == 0:
        return 0.0, number_of_users

    # Ratings of both items from the customers who rated both, as 1 x n float arrays
    ratings_of_item1 = R_df.loc[users_who_reviewed_both_items, item_1].to_numpy(dtype=float).reshape(1, -1)
    ratings_of_item2 = R_df.loc[users_who_reviewed_both_items, item_2].to_numpy(dtype=float).reshape(1, -1)

    # Finally, calculate the similarity between them
    similarity = cosine_similarity(ratings_of_item1, ratings_of_item2)[0][0]

    return similarity, number_of_users

In [None]:
customer= 1 # we will only do this to customer 1 for demonstration purposes
item_predicted_rating=[]

# Columns of the items this customer rated. This depends only on the customer, so it
# is built once rather than on every pass of the loop. The row is looked up by label
# (.loc), the same way the ratings are read further down.
R_df3 = R_df.loc[:, R_df.loc[customer, :].notna()].copy()

for item in range(num_product):
    current_item = item  # zero-based position; the product id is current_item + 1

    similarities_to_item = []
    ratings_given_by_customer = []

    for other_item in R_df3.columns:

        # An item is trivially identical to itself; skip it so a rating the customer
        # already gave this item cannot feed its own prediction.
        if other_item == current_item+1:
            continue

        similarity, number_of_customer_rated = find_item_similarity(current_item+1, other_item, R_df)
        similarities_to_item.append(similarity)
        ratings_given_by_customer.append(R_df.loc[customer, other_item])

    # Finally, let's turn these into numpy arrays so life is easier
    similarities_to_item = np.array(similarities_to_item)
    ratings_given_by_customer = np.array(ratings_given_by_customer)

    # Similarity-weighted average; NaN when there is nothing to weight by
    total_similarity = np.sum(similarities_to_item)
    if total_similarity != 0:
        predicted_rating = np.dot(ratings_given_by_customer, similarities_to_item)/total_similarity
    else:
        predicted_rating = np.nan
    item_predicted_rating.append(predicted_rating)

In [None]:
# One row per product id with the item-based predicted rating for the demo customer
item_customer_prediction_df = pd.DataFrame(
    {
        'product_id': range(1, num_product+1),
        'item_predicted_rating': item_predicted_rating,
    }
)

In [None]:
# Show the per-product item-based predictions
item_customer_prediction_df

In [None]:
# Order the products by 'item_predicted_rating', highest first, and show the top five
sorted2_df = item_customer_prediction_df.sort_values('item_predicted_rating', ascending=False)
sorted2_df.head()

## i) Appendix in case we need this in the future

In [None]:
# NOTE(review): `find_item_similarity` is also defined in section 3; running this
# cell rebinds the name, so this definition is the one in effect afterwards.
def find_item_similarity(item_1, item_2, R_df):
    """Cosine similarity between two items over the customers who rated both.

    Parameters
    ----------
    item_1, item_2
        Column labels of the two columns of ``R_df`` to compare.
    R_df : pandas.DataFrame
        One column per item; NaN marks a row (customer) with no rating for it.

    Returns
    -------
    similarity : float
        Cosine similarity of the two columns restricted to the rows that are
        non-NaN in both. ``0.0`` when no row has both.
    number_of_users : int
        Number of rows that are non-NaN in both columns.
    """
    # Mask of the customers who rated both items
    users_who_reviewed_both_items = R_df.loc[:, item_1].notna() & R_df.loc[:, item_2].notna()

    # Find how many customers rated both items
    number_of_users = users_who_reviewed_both_items.sum()

    # cosine_similarity rejects arrays with zero features, so treat "no common
    # raters" as zero similarity instead of raising.
    if number_of_users == 0:
        return 0.0, number_of_users

    # Ratings of both items from the customers who rated both, as 1 x n float arrays
    ratings_of_item1 = R_df.loc[users_who_reviewed_both_items, item_1].to_numpy(dtype=float).reshape(1, -1)
    ratings_of_item2 = R_df.loc[users_who_reviewed_both_items, item_2].to_numpy(dtype=float).reshape(1, -1)

    # Finally, calculate the similarity between them
    similarity = cosine_similarity(ratings_of_item1, ratings_of_item2)[0][0]

    return similarity, number_of_users

In [None]:
current_item = 1 # product id whose rating is predicted for every customer
item_predicted_rating_per_customer=[]

# The similarity between `current_item` and another item does not depend on the
# customer, so compute it once per item instead of once per (customer, item) pair.
# `current_item` itself is left out: it is trivially identical to itself, and a
# rating the customer already gave it should not feed its own prediction.
similarity_to_current_item = {
    other_item: find_item_similarity(current_item, other_item, R_df)[0]
    for other_item in R_df.columns
    if other_item != current_item
}

for customer in range(num_customer):
    current_customer = customer  # zero-based row position; the customer id is customer + 1

    # Items rated by this customer (other than `current_item`)
    customer_ratings = R_df.iloc[customer, :]
    rated_items = [
        other_item
        for other_item in customer_ratings.index[customer_ratings.notna()]
        if other_item != current_item
    ]

    similarities_to_item = np.array([similarity_to_current_item[other_item] for other_item in rated_items])
    ratings_given_by_customer = customer_ratings.loc[rated_items].to_numpy(dtype=float)

    # Similarity-weighted average; NaN when there is nothing to weight by
    total_similarity = np.sum(similarities_to_item)
    if total_similarity != 0:
        predicted_rating = np.dot(ratings_given_by_customer, similarities_to_item)/total_similarity
    else:
        predicted_rating = np.nan
    item_predicted_rating_per_customer.append(predicted_rating)

In [None]:
# One row per customer id with the item-based predicted rating for `current_item`.
# NOTE(review): this rebinds `item_customer_prediction_df`, the same name the
# section 3 cell uses for its per-product frame.
item_customer_prediction_df = pd.DataFrame(
    {
        'customer_id': range(1, num_customer + 1),
        'item_predicted_rating_per_customer': item_predicted_rating_per_customer,
    }
)

In [None]:
# Show the per-customer predictions
item_customer_prediction_df