# Analysis for Insight Project

## Setup

In [17]:
# import modules
import pandas as pd
import numpy as np
import rootpath
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import sklearn.metrics as metrics
from sklearn.utils import resample

from sklearn.metrics.pairwise import cosine_similarity as cos_sim

#import qgrid
import matplotlib.pyplot as plt
import seaborn as sns

### Locate files

In [4]:
# Path configuration for the project

# Name of the CSV file that is loaded below
file_name = 'data_tsvd_full.csv'

# Directory fragment that is appended to the project root
directory_name = '/data/clean/'

# Project root, as detected by the rootpath package
rpath = rootpath.detect()

### Compile data

In [5]:
# The file location is the plain concatenation of root, directory and file name;
# the first CSV column becomes the row index
df_tsvd_full = pd.read_csv(
    filepath_or_buffer=rpath + directory_name + file_name,
    index_col=0,
)

# Analysis: Cosine similarity

In [6]:
# Preview the first rows of the loaded table
df_tsvd_full.head()

Unnamed: 0,product,use_category,brand,brand_generic,size,price,link,0,1,2,...,390,391,392,393,394,395,396,397,398,399
0,''Buffet'',treatments and serums,The Ordinary,non_generic,1.0,14.8,https://www.ulta.com/buffet?productId=pimprod2...,0.119214,-0.00380444,-0.04712197,...,0.004975841,-0.003680744,0.0002543059,-0.01989861,-0.0007760412,0.004739525,0.003849221,-0.008177239,-0.003971372,-0.002401545
1,''Buffet'' + Copper Peptides 1%,treatments and serums,The Ordinary,non_generic,1.0,28.9,https://www.ulta.com/buffet-copper-peptides-1?...,0.1128972,-0.005423432,-0.04304833,...,0.007069513,-0.001534115,0.001733428,-0.02460251,-0.006212722,0.004548193,0.01157866,-0.01369132,-0.009901973,-0.003717259
2,+Retinol Vita C Power Serum Firming + Brighten...,treatments and serums,Kate Somerville,non_generic,1.0,98.0,https://www.ulta.com/retinol-vita-c-power-seru...,0.1763208,-0.09768409,-0.04131444,...,-0.001618833,0.0001823084,-0.005280194,0.01789686,-0.02325558,-0.005912855,-0.009878723,-0.002759905,0.01271354,-0.009616596
3,+Retinol Vitamin C Moisturizer,moisturizer,Kate Somerville,non_generic,1.7,90.0,https://www.ulta.com/retinol-vitamin-c-moistur...,0.180027,-0.100916,0.09997509,...,-0.003876801,-0.0282702,-0.00468605,-0.002194605,-0.01696343,0.01074598,-0.006572581,0.0001421381,0.003262727,-0.006652446
4,100% Plant-Derived Squalane,treatments and serums,The Ordinary,non_generic,1.0,7.9,https://www.ulta.com/100-plant-derived-squalan...,3.152109e-17,2.409649e-14,-2.723232e-14,...,2.049383e-07,-2.436632e-07,-2.774561e-07,-2.640047e-07,-2.18212e-07,2.153553e-07,1.834869e-07,1.445371e-07,-1.898183e-07,-2.561501e-07


In [16]:
# Split the feature columns from the product information columns

# Columns that describe a product rather than being features
prod_info = [
    'use_category',
    'brand',
    'brand_generic',
    'size',
    'price',
    'link',
]

# Everything left after removing the product information, indexed by product name
df_tsvd_full_features = (
    df_tsvd_full
    .drop(columns=prod_info)
    .set_index('product')
)

## Validation: Similarity recommendations for brand vs. generic versions

1. Get list of brand_generic products
2. Get list of brands
3. For each product in brands, compute its cosine similarity to all products,
   then check whether the top result's brand_generic matches (yes or no)

In [11]:
def filter_string(list_of_strings, string_to_remove):
    """
    Purpose: filters a string(s) from a list of strings
    Returns: a list without the indicated string(s), in the original order
    list_of_strings: iterable of items to filter
    string_to_remove: a single string, or a list (or other container) of
        strings, naming the items to drop. A single string is treated as
        ONE item to remove and is compared by equality, never by substring.
    """
    # A bare string would make `in` a substring test (so 'generic' would be
    # dropped when removing 'non_generic') and would raise for non-string
    # items; wrap it so membership is checked against whole values.
    if isinstance(string_to_remove, str):
        string_to_remove = [string_to_remove]
    new_list = [string for string in list_of_strings
                if string not in string_to_remove]
    return new_list

In [22]:
# Extract brand vs. generic products

# The label to drop is passed inside a list so that it is compared as a whole
# value; a bare string would be matched by substring instead.
brand_gen_labels = (
    filter_string(df_tsvd_full['brand_generic'].unique(), ['non_generic'])
)

In [None]:
# List of brand names
# NOTE(review): presumably these are meant to match values in the 'brand'
# column — confirm exact casing and punctuation against the data.

brand_names = ['Aveeno','Banana boat','Cetaphil','Clean and Clear',
               'St. Ives']

In [None]:
# Create new column of brand + brand_generic for unique value for each brand
# The two values are concatenated directly (no separator), and the column is
# added to df_tsvd_full itself.

df_tsvd_full['item_id'] = df_tsvd_full['brand']+df_tsvd_full['brand_generic']

In [None]:
# List of products 

In [33]:
# Product names of the rows whose brand_generic label is one of brand_gen_labels

test = df_tsvd_full.loc[
    df_tsvd_full['brand_generic'].isin(brand_gen_labels),
    'product',
]
test.head(1)

74    Amazon Brand - Solimo Daily Facial Cleanser, N...
Name: product, dtype: object

In [38]:
# Take the first selected product name as a one-element array
test2 = test[:1].values
test2

array(['Amazon Brand - Solimo Daily Facial Cleanser, Normal to Oily Skin'],
      dtype=object)

In [70]:
# Dimensions (rows, columns) of the feature table
df_tsvd_full_features.shape

(969, 400)

In [80]:
# Preview the feature table, which is indexed by product name
df_tsvd_full_features.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
''Buffet'',0.119214,-0.00380444,-0.04712197,-0.008680362,-0.1013724,-0.06034554,-0.01653528,-0.09325051,0.006747849,-0.04098656,...,0.004975841,-0.003680744,0.0002543059,-0.01989861,-0.0007760412,0.004739525,0.003849221,-0.008177239,-0.003971372,-0.002401545
''Buffet'' + Copper Peptides 1%,0.1128972,-0.005423432,-0.04304833,-0.004935596,-0.0955279,-0.06084031,-0.01841047,-0.088872,0.00499468,-0.03516285,...,0.007069513,-0.001534115,0.001733428,-0.02460251,-0.006212722,0.004548193,0.01157866,-0.01369132,-0.009901973,-0.003717259
+Retinol Vita C Power Serum Firming + Brightening Treatment,0.1763208,-0.09768409,-0.04131444,-0.01654339,-0.02253272,-0.002184843,-0.06650978,0.0591389,-0.02265567,0.01287037,...,-0.001618833,0.0001823084,-0.005280194,0.01789686,-0.02325558,-0.005912855,-0.009878723,-0.002759905,0.01271354,-0.009616596
+Retinol Vitamin C Moisturizer,0.180027,-0.100916,0.09997509,0.04622951,-0.04057518,0.03429551,-0.001973233,0.03803657,0.03623264,0.00971905,...,-0.003876801,-0.0282702,-0.00468605,-0.002194605,-0.01696343,0.01074598,-0.006572581,0.0001421381,0.003262727,-0.006652446
100% Plant-Derived Squalane,3.152109e-17,2.409649e-14,-2.723232e-14,-9.592659e-14,-1.451955e-12,-2.846308e-12,1.90672e-12,-3.670501e-12,-2.95417e-12,1.390982e-12,...,2.049383e-07,-2.436632e-07,-2.774561e-07,-2.640047e-07,-2.18212e-07,2.153553e-07,1.834869e-07,1.445371e-07,-1.898183e-07,-2.561501e-07


In [81]:
# Look up the feature row(s) for the product name(s) held in test2
df_tsvd_full_features.loc[test2,:]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Amazon Brand - Solimo Daily Facial Cleanser, Normal to Oily Skin",0.142109,0.096347,0.01388,-0.101135,0.008288,0.039969,-0.006787,-0.04459,0.035305,0.021486,...,0.00767,0.018247,-0.001402,-0.00121,-0.012219,0.007814,0.003712,-0.003739,-0.006965,-0.00305


1. Transpose results
2. Add product info
3. Sort by similarity score

In [123]:
# Cosine similarity between the selected product row(s) and every product row

res_cosine = cos_sim(
    X=df_tsvd_full_features.loc[test2, :],
    Y=df_tsvd_full_features,
)

In [124]:
# Wrap the transposed similarity array in a DataFrame and label
# column 0 as 'cosine_sim'

res_cosine = pd.DataFrame(res_cosine.T).rename(columns={0: 'cosine_sim'})

In [125]:
# Preview the similarity scores (product information is added in a later cell)

res_cosine.head()

Unnamed: 0,cosine_sim
0,0.05623523
1,0.05012048
2,0.0223973
3,0.0424913
4,4.159175e-08


In [126]:
def reorder_first_cols(df, col_order):
    '''
    Move the columns named in col_order (a list of column names) to the
    beginning of the dataframe, in the order given. All remaining columns
    follow in the same relative order as before.

    Returns the reordered dataframe; the input dataframe is not reassigned.
    '''
    # Columns that were not requested keep their current sequence
    trailing_cols = []
    for name in df.columns:
        if name not in col_order:
            trailing_cols.append(name)

    # Requested columns first, everything else afterwards
    return df.reindex(columns=col_order + trailing_cols)

In [130]:
def add_prod_info(new_df, original_df, col_names, prod_names_col):
    """
    Purpose: Add product information to a dataframe of cosine similarity results
    Returns: a new dataframe whose first columns are 'product', 'cosine_sim'
        and then the columns listed in col_names; new_df itself is not modified
    new_df: df with cosine similarity results. Product names are attached by
        index alignment, so its index must line up with original_df's index
    original_df: df with product information
    col_names: List of strings of column names to be copied from original_df
    prod_names_col: Name of column with product names, string
    """
    # Work on a copy so the caller's dataframe does not gain a 'product' column
    scored = new_df.copy()
    # Add product names (aligned on the row index)
    scored['product'] = original_df[prod_names_col]
    # Copy the requested product information columns
    col_copy = original_df[col_names].copy()
    # Carry the product names along as the merge key
    col_copy['product'] = original_df[prod_names_col]
    # Join on product name.
    # NOTE: an inner merge repeats rows when the key is not unique, so this
    # assumes product names are unique in original_df.
    merged = pd.merge(scored, col_copy, how='inner', on='product')
    # Put the identifying columns first, followed by the requested information
    # columns. Deriving the order from col_names (rather than a fixed list)
    # avoids reindex creating empty columns for names that were not requested.
    cols_order = ['product', 'cosine_sim']
    cols_order += [col for col in col_names if col not in cols_order]
    return reorder_first_cols(merged, cols_order)

In [131]:
# Add product info to cosine sim results
# (prod_info is the list of product information columns defined earlier)

df_cos_sim = add_prod_info(new_df=res_cosine, original_df=df_tsvd_full,
                           col_names=prod_info, prod_names_col='product')

In [None]:
# 

In [132]:
# Preview the similarity results with product information attached
df_cos_sim.head()

Unnamed: 0,product,cosine_sim,use_category,brand,brand_generic,size,price,link
0,''Buffet'',0.05623523,treatments and serums,The Ordinary,non_generic,1.0,14.8,https://www.ulta.com/buffet?productId=pimprod2...
1,''Buffet'' + Copper Peptides 1%,0.05012048,treatments and serums,The Ordinary,non_generic,1.0,28.9,https://www.ulta.com/buffet-copper-peptides-1?...
2,+Retinol Vita C Power Serum Firming + Brighten...,0.0223973,treatments and serums,Kate Somerville,non_generic,1.0,98.0,https://www.ulta.com/retinol-vita-c-power-seru...
3,+Retinol Vitamin C Moisturizer,0.0424913,moisturizer,Kate Somerville,non_generic,1.7,90.0,https://www.ulta.com/retinol-vitamin-c-moistur...
4,100% Plant-Derived Squalane,4.159175e-08,treatments and serums,The Ordinary,non_generic,1.0,7.9,https://www.ulta.com/100-plant-derived-squalan...


In [110]:
# NOTE(review): this reassigns res_cosine to its own transpose, so re-running
# the cell flips the orientation again. The saved output below shows a single
# column labelled 0, which does not match the 'cosine_sim' column created
# earlier in the notebook — presumably left over from an earlier run; confirm
# whether this cell is still needed.
res_cosine = res_cosine.transpose()
res_cosine.tail()

Unnamed: 0,0
964,0.345554
965,0.425355
966,0.933994
967,0.974542
968,1.0


In [107]:
# Inspect the column labels of res_cosine
res_cosine.columns

RangeIndex(start=0, stop=969, step=1)

In [111]:
# Five rows with the largest values in the column labelled 0
res_cosine.nlargest(5,0)

Unnamed: 0,0
968,1.0
967,0.974542
966,0.933994
965,0.425355
964,0.345554


In [None]:
# NOTE(review): this cell has no execution count and uses names that are not
# defined in the visible notebook: `cosine_similarity` (the import above is
# aliased as `cos_sim`), `features` and `df`. It also reassigns `res_cosine`
# and `test`, which earlier cells use. Presumably an earlier draft — confirm
# before running.
# Calculate cosine similarity between one named product and all rows of `features`
res_cosine = cosine_similarity(features.loc['A+ High-Dose Retinoid Serum',:].to_frame().transpose(), features) #[0:1] .loc["Essential-C Cleanser",:]
# Flatten the result to one dimension, then wrap it in a DataFrame (scores in column 0)
res_cosine = res_cosine.reshape(-1)
res_cosine = pd.DataFrame(res_cosine)
# Copy the product information columns and attach the similarity scores
res_sim=df[['product','brand','product_type','price','size','ratings', 'active','vit_a',
            'total_reviews','link','price_oz']].copy()
res_sim['similarity']=res_cosine[[0]]
# Round similarity metric
#res_sim['similarity']=round(res_sim['similarity'],2)
# Maybe don't round so you don't have to deal with ties?
#indexNames = res_sim[res_sim['product']=='Essential-C Cleanser'].index
#res_sim.drop(indexNames, inplace=True)
# Sort from top similarity metrics and ignoring self, so starting at 1, not zero
test = res_sim.nlargest(5, 'similarity')[1:5]
#res_sim.head()
# Select top match
test[:10]
#Good Genes All-In-One Lactic Acid Treatment
#A+ High-Dose Retinoid Serum
# Generic vs similar validation
# Cetaphil Daily Facial Cleanser
# Cetaphil Fragrance Free Moisturizing Cream : 0.959282
# Banana Boat Ultra Sport Sunscreen Lotion, Broad Spectum SPF 30 : 0.999971
# St. Ives Fresh Skin Face Scrub, Apricot : 0.999748
# Clean & Clear Essentials Deep Cleaning Toner Sensitive Skin : 0.999994
# Aveeno Positively Radiant Brightening & Exfoliating Face Scrub : 0.990599

#test[test['vit_a']==1]