# Analysis for Insight Project

## Setup

In [17]:
# import modules
import pandas as pd
import numpy as np
import rootpath
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import sklearn.metrics as metrics
from sklearn.utils import resample

from sklearn.metrics.pairwise import cosine_similarity as cos_sim

#import qgrid
import matplotlib.pyplot as plt
import seaborn as sns

### Locate files

In [4]:
# Set paths for project

# Project root, as detected by rootpath
rpath = rootpath.detect()

# Data directory (appended to the project root) and name of the input file
directory_name, file_name = '/data/clean/', 'data_tsvd_full.csv'

### Compile data

In [5]:
# Build the full path to the input file, then load it using the first CSV
# column as the row index
data_path = ''.join([rpath, directory_name, file_name])
df_tsvd_full = pd.read_csv(data_path, index_col=0)

# Analysis: Cosine similarity

In [6]:
# Preview the first rows of the loaded table
df_tsvd_full.head()

Unnamed: 0,product,use_category,brand,brand_generic,size,price,link,0,1,2,...,390,391,392,393,394,395,396,397,398,399
0,''Buffet'',treatments and serums,The Ordinary,non_generic,1.0,14.8,https://www.ulta.com/buffet?productId=pimprod2...,0.119214,-0.00380444,-0.04712197,...,0.004975841,-0.003680744,0.0002543059,-0.01989861,-0.0007760412,0.004739525,0.003849221,-0.008177239,-0.003971372,-0.002401545
1,''Buffet'' + Copper Peptides 1%,treatments and serums,The Ordinary,non_generic,1.0,28.9,https://www.ulta.com/buffet-copper-peptides-1?...,0.1128972,-0.005423432,-0.04304833,...,0.007069513,-0.001534115,0.001733428,-0.02460251,-0.006212722,0.004548193,0.01157866,-0.01369132,-0.009901973,-0.003717259
2,+Retinol Vita C Power Serum Firming + Brighten...,treatments and serums,Kate Somerville,non_generic,1.0,98.0,https://www.ulta.com/retinol-vita-c-power-seru...,0.1763208,-0.09768409,-0.04131444,...,-0.001618833,0.0001823084,-0.005280194,0.01789686,-0.02325558,-0.005912855,-0.009878723,-0.002759905,0.01271354,-0.009616596
3,+Retinol Vitamin C Moisturizer,moisturizer,Kate Somerville,non_generic,1.7,90.0,https://www.ulta.com/retinol-vitamin-c-moistur...,0.180027,-0.100916,0.09997509,...,-0.003876801,-0.0282702,-0.00468605,-0.002194605,-0.01696343,0.01074598,-0.006572581,0.0001421381,0.003262727,-0.006652446
4,100% Plant-Derived Squalane,treatments and serums,The Ordinary,non_generic,1.0,7.9,https://www.ulta.com/100-plant-derived-squalan...,3.152109e-17,2.409649e-14,-2.723232e-14,...,2.049383e-07,-2.436632e-07,-2.774561e-07,-2.640047e-07,-2.18212e-07,2.153553e-07,1.834869e-07,1.445371e-07,-1.898183e-07,-2.561501e-07


In [151]:
# Separate features from product information

# Columns that describe the product rather than hold feature values
prod_info = ['use_category', 'brand', 'brand_generic', 'size', 'price', 'link']

# Drop the product information columns and index the remaining
# columns by product name
df_tsvd_full_features = (
    df_tsvd_full
    .drop(columns=prod_info)
    .set_index('product')
)

## Validation: Similarity recommendations for brand vs. generic versions

1. Get list of brand_generic products
2. Get list of brands
3. For each brand-name product, compute its cosine similarity to every other product
    Check whether the top result (excluding the product itself) has the same brand_generic label
    Record yes or no

In [11]:
def filter_string(list_of_strings, string_to_remove):
    """
    Purpose: filters a string(s) from a list of strings
    Returns: a list without the indicated string(s)
    list_of_strings: iterable of strings to filter
    string_to_remove: a single string, or a list/tuple/set of strings,
        to drop (exact matches only)
    """
    # A bare string is wrapped in a list so that `in` below performs an
    # exact match; against a raw string it would be a substring test
    # (and would raise TypeError for non-string items such as NaN)
    if isinstance(string_to_remove, str):
        string_to_remove = [string_to_remove]
    new_list = [string for string in list_of_strings
                if string not in string_to_remove]
    return new_list

In [22]:
# Extract brand_generic product labels
# (every distinct brand_generic value except 'non_generic')

# The value to remove is passed as a list, as filter_string's docstring
# requires; a bare string would be matched as a substring instead
brand_gen_labels = filter_string(
    df_tsvd_full['brand_generic'].unique(), ['non_generic']
)

In [201]:
# List of brand name product brands
# This does not include their generic versions

brand_names = [
    'Aveeno',
    'Banana boat',
    'Cetaphil',
    'Clean and Clear',
    'St. Ives',
]

In [154]:
# Extract df of brand_generic products for comparison to similarity output

df_brand_gen = (
    df_tsvd_full
    # Keep only the product info columns that are present in the table
    .loc[:, df_tsvd_full.columns.intersection(prod_info)]
    # Append the product name as the last column
    .assign(product=df_tsvd_full['product'])
    # Keep only rows whose brand_generic is one of the brand_generic labels
    .query('brand_generic == @brand_gen_labels')
)

Unnamed: 0,use_category,brand,brand_generic,size,price,link,product
74,cleanser,Solimo,Cetaphil_cleanser_generic,blank,blank,https://www.amazon.com/Amazon-Brand-Solimo-Fac...,"Amazon Brand - Solimo Daily Facial Cleanser, N..."
75,moisturizer,Solimo,Cetaphil_cream_generic,blank,blank,https://www.amazon.com/Amazon-Brand-Moisturizi...,Amazon Brand - Solimo Ultra Moisturizing Skin ...
104,cleanser,Aveeno,Aveeno_generic,blank,blank,https://www.cvs.com/shop/aveeno-positively-rad...,Aveeno Positively Radiant Brightening & Exfoli...
107,moisturizer,Banana boat,BB_generic,blank,blank,https://www.amazon.com/Banana-Boat-Sunscreen-P...,"Banana Boat Ultra Sport Sunscreen Lotion, Broa..."
109,cleanser,Beauty 360,Aveeno_generic,blank,blank,https://www.cvs.com/shop/beauty-360-illuminati...,Beauty 360 Illuminating Facial Scrub
141,cleanser,CVS,CC_generic,blank,blank,https://www.cvs.com/shop/cvs-oil-controlling-a...,CVS Oil Controlling Astringent Sensitive Skin
157,cleanser,Cetaphil,Cetaphil_cleanser_generic,blank,blank,https://www.amazon.com/Cetaphil-Daily-Facial-C...,Cetaphil Daily Facial Cleanser
158,moisturizer,Cetaphil,Cetaphil_cream_generic,blank,blank,https://www.amazon.com/Cetaphil-Moisturizing-C...,Cetaphil Fragrance Free Moisturizing Cream
173,cleanser,Clean and Clear,CC_generic,blank,blank,https://www.cvs.com/shop/clean-clear-essential...,Clean & Clear Essentials Deep Cleaning Toner S...
507,moisturizer,Mountain falls,BB_generic,blank,blank,https://www.amazon.com/Mountain-Falls-Sunscree...,"Mountain Falls Active Sport Sunscreen Lotion, ..."


Next steps:
- Automate check for correct brand generic as top recommendation
- Remove self from top recommendations and then return top 5
- Do this for all brand_generics
- Automate with functions and display results

In [224]:
# Extract product names for brand name products only
# These are the input products for checking recommendation
#   accuracy for specific products

# Rows that carry a brand_generic label AND belong to a brand name brand
brand_name_prod = df_tsvd_full.loc[
    df_tsvd_full['brand_generic'].isin(brand_gen_labels)
    & df_tsvd_full['brand'].isin(brand_names),
    'product',
]

In [335]:
# Check recommendation accuracy for list of brand name products
#
# For every brand name product: rank all products by cosine similarity to it,
# drop the input product itself, and record whether the best remaining match
# carries the same brand_generic label as the input product.
# NOTE: this cell calls add_prod_info(), which is defined in a later cell of
# this notebook; that cell has to be run first.

# Parallel result lists: exactly one entry is appended per input product
correct_rec = []
input_prod = []
top_output_prod = []
top_cos_sim = []

for product in brand_name_prod:
    # Calculate Cosine Similarities
    res_cosine = cos_sim(df_tsvd_full_features.loc[[product], :],
                         df_tsvd_full_features)
    # Convert to df
    res_cosine = (
        pd.DataFrame(res_cosine.transpose())
          .rename(columns={0: 'cosine_sim'})
    )
    # Add product info to cosine sim results
    df_cos_sim = add_prod_info(new_df=res_cosine, original_df=df_tsvd_full,
                               col_names=prod_info, prod_names_col='product')
    # Round to 4 digits
    df_cos_sim['cosine_sim'] = df_cos_sim['cosine_sim'].round(4)
    # Extract top 6 most similar results
    df_cos_sim_topres = df_cos_sim.nlargest(6, 'cosine_sim')
    # Filter df without input product (should always be with cosine sim of 1)
    df_cos_sim_topres = (
        df_cos_sim_topres[~df_cos_sim_topres['product'].isin([product])]
    )
    # Get brand generic label for input product
    input_prod_label = (
        df_brand_gen.query('product in @product')['brand_generic']
    )
    # Best remaining match, computed once and reused for label, name and score
    top_res = df_cos_sim_topres.nlargest(1, 'cosine_sim')
    # Get brand_generic label for top result
    top_res_prod_label = top_res['brand_generic']

    if top_res.empty:
        # No candidate left after removing the input product: record explicit
        # placeholders so the result lists stay aligned
        is_match = False
        best_product = None
        best_similarity = float('nan')
    else:
        # Compare against a scalar label so the check yields one plain bool
        # regardless of how many rows the input label lookup returned
        best_label = top_res_prod_label.iloc[0]
        is_match = bool((input_prod_label.values == best_label).any())
        best_product = top_res['product'].iloc[0]
        best_similarity = float(top_res['cosine_sim'].iloc[0])

    # Save the match flag, input product, comparison product, similarity
    correct_rec.append(is_match)
    input_prod.append(product)
    top_output_prod.append(best_product)
    top_cos_sim.append(best_similarity)

# Combine the lists into one results df
df_valid_res = pd.DataFrame({
    'input_prod': input_prod,
    'top_output_prod': top_output_prod,
    'cos_sim': top_cos_sim,
    'correct_rec': correct_rec,
})

In [336]:
# Display the validation results table (one row per input product)
df_valid_res

Unnamed: 0,input_prod,top_output_prod,cos_sim,correct_rec
0,Aveeno Positively Radiant Brightening & Exfoli...,Beauty 360 Illuminating Facial Scrub,0.8016,True
1,"Banana Boat Ultra Sport Sunscreen Lotion, Broa...","Mountain Falls Active Sport Sunscreen Lotion, ...",0.7545,True
2,Cetaphil Daily Facial Cleanser,"Amazon Brand - Solimo Daily Facial Cleanser, N...",0.9745,True
3,Cetaphil Fragrance Free Moisturizing Cream,Amazon Brand - Solimo Ultra Moisturizing Skin ...,0.6605,True
4,Clean & Clear Essentials Deep Cleaning Toner S...,No:Rinse Intensive Pore Minimizing Toner,0.4136,False
5,"St. Ives Fresh Skin Face Scrub, Apricot",Mountain Falls Invigorating Apricot Scrub Faci...,0.8393,True


In [322]:
# Calculate percentage of correct recommendations

# Number of input products whose top recommendation was flagged as correct
prop_correct = int(df_valid_res['correct_rec'].sum())
# Total number of evaluated products, taken from the results table instead
# of being hard-coded
n_evaluated = len(df_valid_res)
# Guard against an empty results table to avoid dividing by zero
per_correct = (
    round(prop_correct / n_evaluated * 100, 2) if n_evaluated else 0.0
)
print(f"Correct recommendations out of {n_evaluated} labeled products: "
      f"{prop_correct}/{n_evaluated} or {per_correct}%")

Correct recommendations out of 6 labeled products: 5/6 or 83.33%


In [169]:
# Get example product to test
# (the first entry of brand_name_prod, kept as a one-element array)

ex_prod = brand_name_prod.head(1).values

In [170]:
# Show the example product name selected for the step-by-step walk-through
print(ex_prod)

['Amazon Brand - Solimo Daily Facial Cleanser, Normal to Oily Skin']


In [159]:
# Calculate Cosine Similarities
# (example product's feature row against the feature rows of all products)

res_cosine = cos_sim(X=df_tsvd_full_features.loc[ex_prod, :],
                     Y=df_tsvd_full_features)

In [160]:
# Convert to df
# (transpose so each product is a row; name the similarity column)

res_cosine = (
    pd.DataFrame(res_cosine.T)
      .rename(columns={0: 'cosine_sim'})
)

In [161]:
def reorder_first_cols(df, col_order):
    '''
    Move the columns named in col_order to the front of the dataframe.

    df: dataframe whose columns are to be rearranged
    col_order: list of column names, in the order they should appear at
        the beginning of the dataframe
    Returns: the reindexed dataframe; columns not named in col_order follow
        in the same order as before
    '''
    # Collect the columns that were not explicitly placed at the front
    trailing_cols = []
    for col in df.columns:
        if col not in col_order:
            trailing_cols.append(col)

    # Requested columns first, everything else afterwards
    return df.reindex(columns=col_order + trailing_cols)

In [162]:
def add_prod_info(new_df, original_df, col_names, prod_names_col):
    """
    Purpose: Add product information to a cosine similarity results dataframe
    Returns: a new dataframe with the product name, cosine similarity and
        product information columns first; new_df itself is left unmodified
    new_df: df with cosine similarity results
    original_df: df with product information
    col_names: List of strings of column names to be added to the results df
    prod_names_col: Name of column with product names, string
    """
    # Work on a copy so the caller's dataframe does not gain a 'product'
    # column as a side effect
    result = new_df.copy()
    # Add product names (assigned by index alignment with original_df)
    result['product'] = original_df[prod_names_col]
    # Copy columns to new df
    col_copy = original_df[col_names].copy()
    # Copy product names to col names df
    col_copy['product'] = original_df[prod_names_col]
    # Join on product name; a name that occurs more than once on both sides
    # yields one output row per matching pair
    result = pd.merge(result, col_copy, how='inner', on='product')
    # Reorder df columns
    cols_order = ['product','cosine_sim', 'use_category','brand',
                  'brand_generic','size','price','link']
    result = reorder_first_cols(result, cols_order)
    return result

In [163]:
# Add product info to cosine sim results

df_cos_sim = add_prod_info(res_cosine, df_tsvd_full, prod_info, 'product')

In [164]:
# Round to 4 digits

df_cos_sim['cosine_sim'] = df_cos_sim['cosine_sim'].round(decimals=4)

In [177]:
# Extract top 6 most similar results
# (six, so that five remain once the input product itself is removed)

df_cos_sim_topres = df_cos_sim.nlargest(n=6, columns='cosine_sim')

In [192]:
# Filter df without input product (should always be with cosine sim of 1)

df_cos_sim_topres = df_cos_sim_topres.loc[
    ~df_cos_sim_topres['product'].isin(ex_prod)
]

In [173]:
# Check if top result matches by
# 1 remove top result
# 2 compare brand_generic ID to itself

# Look up the brand_generic label of the example product
df_brand_gen.query('product in @ex_prod').loc[:, 'brand_generic']

74    Cetaphil_cleanser_generic
Name: brand_generic, dtype: object

In [198]:
# Get brand generic label for input product

input_prod_label = (
    df_brand_gen
    .query('product in @ex_prod')
    .loc[:, 'brand_generic']
)

In [199]:
# Get brand_generic label for top result

top_res_prod_label = (
    df_cos_sim_topres
    .nlargest(n=1, columns='cosine_sim')
    .loc[:, 'brand_generic']
)

In [None]:
# Compare brand_generic label of input product with that of the top result
# The underlying values are compared (as in the validation loop): the two
# Series come from different dataframes and generally carry different index
# labels, and pandas refuses to compare Series that are not identically
# labeled

input_prod_label.values == top_res_prod_label.values

In [None]:
def get_prod_rec_acc():
    """
    Purpose: Check performance of recommendations for list of products.
    Returns: None (placeholder: the function body is not implemented yet)
    """
    return None

In [None]:
# Scratch cell: similarity lookup for one hard-coded product
# NOTE(review): `cosine_similarity`, `features` and `df` are not defined in
# the visible part of this notebook (the import cell aliases sklearn's
# cosine_similarity as `cos_sim`) -- confirm where they come from before
# running this cell.
# Calculate cosine similarity for feature 1
res_cosine = cosine_similarity(features.loc['A+ High-Dose Retinoid Serum',:].to_frame().transpose(), features) #[0:1] .loc["Essential-C Cleanser",:]
# Flatten the result to one dimension and wrap it in a single-column df
res_cosine = res_cosine.reshape(-1)
res_cosine = pd.DataFrame(res_cosine)
# Copy the product information columns and attach the similarity values
res_sim=df[['product','brand','product_type','price','size','ratings', 'active','vit_a',
            'total_reviews','link','price_oz']].copy()
res_sim['similarity']=res_cosine[[0]]
# Round similarity metric
#res_sim['similarity']=round(res_sim['similarity'],2)
# Maybe don't round so you don't have to deal with ties?
#indexNames = res_sim[res_sim['product']=='Essential-C Cleanser'].index
#res_sim.drop(indexNames, inplace=True)
# Sort from top similarity metrics and ignoring self, so starting at 1, not zero
# (takes the 5 largest similarities and keeps positions 1-4 of them)
test = res_sim.nlargest(5, 'similarity')[1:5]
#res_sim.head()
# Select top match
test[:10]
#Good Genes All-In-One Lactic Acid Treatment
#A+ High-Dose Retinoid Serum
# Generic vs similar validation
# Cetaphil Daily Facial Cleanser
# Cetaphil Fragrance Free Moisturizing Cream : 0.959282
# Banana Boat Ultra Sport Sunscreen Lotion, Broad Spectum SPF 30 : 0.999971
# St. Ives Fresh Skin Face Scrub, Apricot : 0.999748
# Clean & Clear Essentials Deep Cleaning Toner Sensitive Skin : 0.999994
# Aveeno Positively Radiant Brightening & Exfoliating Face Scrub : 0.990599

#test[test['vit_a']==1]