# Analysis for Insight Project

## Setup

In [1]:
# import modules
import pandas as pd
import numpy as np
import rootpath
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import sklearn.metrics as metrics
from sklearn.utils import resample

from sklearn.metrics.pairwise import cosine_similarity as cos_sim

#import qgrid
import matplotlib.pyplot as plt
import seaborn as sns

### Locate files

In [2]:
# Project paths: data file name and its directory (relative to the project
# root), plus the project root itself as detected by rootpath

directory_name, file_name = '/data/clean/', 'data_tsvd_full.csv'

rpath = rootpath.detect()

### Compile data

In [3]:
# Load the TSVD table; the first CSV column becomes the row index
tsvd_csv_path = rpath + directory_name + file_name
df_tsvd_full = pd.read_csv(tsvd_csv_path, index_col=0)

# Analysis: Cosine similarity

## Validation
### Recommendations for brand vs. generic versions

In [4]:
def get_features(df, prod_info):
    """
    Purpose: isolate the feature columns of the tsvd dataframe
    Returns: df without the prod_info columns, indexed by 'product'
    df: df of tsvd and product info (must contain a 'product' column)
    prod_info: list of strings of column names to discard
    """
    # Discard the product-information columns first ...
    features_only = df.drop(columns=prod_info)
    # ... then let the product name label each row
    features_only = features_only.set_index('product')

    return features_only

In [5]:
# Split the table: columns that describe the product vs. feature columns

prod_info = [
    'use_category',
    'brand',
    'brand_generic',
    'size',
    'price',
    'link',
]

df_tsvd_full_features = get_features(df=df_tsvd_full, prod_info=prod_info)

In [6]:
def filter_string(list_of_strings, string_to_remove):
    """
    Purpose: filters a string(s) from a list of strings
    Returns: a list without the indicated string(s)
    list_of_strings: iterable of values to filter
    string_to_remove: list of strings to drop; a single bare string is
        also accepted and is treated as a one-element list
    """
    # A bare string would turn `not in` into a substring test (removing
    #   'non_generic' would also drop 'generic') and would raise TypeError
    #   for non-string items, so wrap it to force exact matching.
    if isinstance(string_to_remove, str):
        string_to_remove = [string_to_remove]
    return [string for string in list_of_strings
            if string not in string_to_remove]

In [7]:
# Extract brand_generic product labels

# The label to drop is passed as a list: filter_string documents that
# string_to_remove must be a list, and a bare string would be matched by
# substring instead of by exact label.
brand_gen_labels = filter_string(
    df_tsvd_full['brand_generic'].unique(), ['non_generic']
)

In [8]:
# Brands of the brand-name products
# (their generic versions are not listed here)

brand_names = [
    'Aveeno',
    'Banana boat',
    'Cetaphil',
    'Clean and Clear',
    'St. Ives',
]

In [9]:
# Build the product-info table for brand_generic products, used for
# comparison to the similarity output

# Product info columns that are present in the data
info_cols = df_tsvd_full.columns.intersection(prod_info)
df_brand_gen = df_tsvd_full[info_cols].copy()

# Product names go alongside the info columns
df_brand_gen['product'] = df_tsvd_full['product']

# Keep only rows whose brand_generic value is one of the extracted labels
is_brand_gen = df_brand_gen['brand_generic'].isin(brand_gen_labels)
df_brand_gen = df_brand_gen[is_brand_gen]

In [10]:
# Product names of the brand name products only
# These are the input products for checking recommendation
#   accuracy for specific products

has_brand_gen_label = df_tsvd_full['brand_generic'].isin(brand_gen_labels)
is_brand_name = df_tsvd_full['brand'].isin(brand_names)
brand_name_prod = (
    df_tsvd_full.loc[has_brand_gen_label & is_brand_name, 'product']
)

In [11]:
def reorder_first_cols(df, col_order):
    '''
    Move the columns named in col_order (a list of column names) to the
    beginning of the dataframe, in the order given.
    All other columns follow in the same relative order as before.
    '''
    # Columns that were not explicitly placed keep their current sequence
    trailing_cols = []
    for col in df.columns:
        if col not in col_order:
            trailing_cols.append(col)

    # Reindexing returns a new dataframe laid out in the requested order
    return df.reindex(columns=col_order + trailing_cols)

In [12]:
def add_prod_info(new_df, original_df, col_names, prod_names_col):
    """
    Purpose: Add product information to cosine similarity results
    Returns: new dataframe with 'product', 'cosine_sim' and the col_names
        columns first, followed by any remaining columns
    new_df: df with cosine similarity results (left unmodified)
    original_df: df before tsvd with product information
    col_names: List of strings of column names to be added into new_df
    prod_names_col: Name of column with product names, string
    """
    # Work on a copy so the caller's dataframe does not silently gain a
    #   'product' column
    new_df = new_df.copy()
    # Add product names (pandas aligns this assignment on the index, so
    #   new_df and original_df are expected to share row labels)
    new_df['product'] = original_df[prod_names_col]
    # Copy columns to new df
    col_copy = original_df[col_names].copy()
    # Copy product names to col names df
    col_copy['product'] = original_df[prod_names_col]
    # Join with similarity results on product
    new_df = pd.merge(new_df, col_copy, how='inner', on='product')
    # Reorder df columns; the order is derived from col_names so that no
    #   column missing from the data is created by the reindex
    cols_order = ['product', 'cosine_sim']
    cols_order += [col for col in col_names if col not in cols_order]
    new_df = reorder_first_cols(new_df, cols_order)
    return new_df

In [13]:
def get_prod_rec_acc(prod_list, prod_df, feat_df):
    """
    Purpose: Check performance of recommendations for list of products.
    Returns: df with one row per input product and the columns input_prod,
        top_output_prod, cos_sim and correct_rec (True when the most
        similar other product has the same brand_generic label as the
        input product)
    prod_list: list of product names for calculating cosine similarities
    prod_df: dataframe with product info and a 'product' column, rows in
        the same order as feat_df
    feat_df: dataframe of features only with products as indices
    Note: reads the module-level `prod_info` (info columns to attach) and
        `df_brand_gen` (lookup of the input product's brand_generic label)
    """
    # One record per input product, so the output columns can never drift
    #   out of step with each other
    records = []

    # Check recommendation accuracy for each product in prod_list
    for product in prod_list:
        # Calculate Cosine Similarities of this product against all rows
        res_cosine = cos_sim(feat_df.loc[[product], :], feat_df)
        # Convert to df
        res_cosine = (
            pd.DataFrame(res_cosine.transpose())
              .rename(columns={0: 'cosine_sim'})
        )
        # Add product info to cosine sim results
        df_cos_sim = add_prod_info(new_df=res_cosine,
                                   original_df=prod_df,
                                   col_names=prod_info,
                                   prod_names_col='product')
        # Round to 4 digits
        df_cos_sim['cosine_sim'] = df_cos_sim['cosine_sim'].round(4)
        # Extract top 6 most similar results
        df_cos_sim_topres = df_cos_sim.nlargest(6, 'cosine_sim')
        # Filter df without input product
        #   (should always be with cosine sim of 1)
        df_cos_sim_topres = (
            df_cos_sim_topres[~df_cos_sim_topres['product'].isin([product])]
        )
        # Single most similar remaining product (computed once, reused)
        top_match = df_cos_sim_topres.nlargest(1, 'cosine_sim')
        # Get brand generic label for input product
        input_prod_label = (
            df_brand_gen.query('product in @product')['brand_generic']
        )

        # Pull scalars out of the top row; fall back to placeholders when
        #   nothing is left after removing the input product
        if top_match.empty:
            top_name, top_sim, top_label = None, np.nan, None
        else:
            top_name = top_match['product'].iloc[0]
            top_sim = top_match['cosine_sim'].iloc[0]
            top_label = top_match['brand_generic'].iloc[0]

        # Compare scalar labels instead of truth-testing an array
        #   comparison, which breaks when the lookup returns zero or
        #   several rows (the first label is used in the latter case)
        is_correct = bool(
            len(input_prod_label) > 0
            and top_label is not None
            and input_prod_label.iloc[0] == top_label
        )

        # Save the input product, comparison product, similarity, result
        records.append((product, top_name, top_sim, is_correct))

    # Turn the records into a df
    df_valid_res = pd.DataFrame(
        records,
        columns=['input_prod', 'top_output_prod', 'cos_sim', 'correct_rec'],
    )

    return df_valid_res

In [14]:
# Score the top recommendation for each brand name product

df_valid_res = get_prod_rec_acc(
    prod_list=brand_name_prod,
    prod_df=df_tsvd_full,
    feat_df=df_tsvd_full_features,
)

In [23]:
def get_overall_rec_acc(df, print_res=True):
    """
    Purpose: print proportion and percent correct results from cos_sim results
    Returns: one-row df with num_correct (number of True values in
        correct_rec) and perc_correct (that number as a percentage of the
        rows in df, rounded to 2 digits; NaN when df has no rows)
    df: df of cosine sim results for a list of products; needs a boolean
        'correct_rec' column
    print_res: when True, also print the summary
    """
    # Use the actual number of evaluated products as the denominator
    #   instead of assuming there are always 6
    num_products = len(df)
    num_correct = int(df['correct_rec'].sum())
    # Calculate percentage of correct recommendations
    if num_products:
        per_correct = round((num_correct / num_products) * 100, 2)
    else:
        per_correct = float('nan')
    df_acc_res = pd.DataFrame({'num_correct': [num_correct],
                               'perc_correct': [per_correct]})
    if print_res:
        # Adjacent f-strings avoid the run of spaces that a backslash
        #   continuation inside the literal puts into the message
        print(f"Correct recommendations out of {num_products} labeled "
              f"products: {num_correct}/{num_products} or {per_correct}%")

    return df_acc_res

In [42]:
# Summarise recommendation accuracy for the brand name products

df_cos_sim_val_res = get_overall_rec_acc(df=df_valid_res, print_res=True)
df_cos_sim_val_res

Correct recommendations out of 6 labeled products:         5/6 or 83.33%


Unnamed: 0,num_correct,perc_correct
0,5,83.33


### Recommendations for brand vs. generic versions - randomization result

In [31]:
def get_prod_rec_acc_rand(prod_list, df_tsvd, randiter):
    """
    Purpose: Gets recommendation accuracy results from randomization with
        randiter reshuffling. Recommended minimum is 1000
    Returns: df of randomization results with the columns 'iteration' and
        'perc_correct', one row per reshuffle
    prod_list: list of product names for calculating cosine similarities
    df_tsvd: dataframe of tsvd results (not modified)
    randiter: Number of randomization iterations to run
    """
    # Initialize empty lists for storing results
    iteration = []
    prop_correct = []

    # create list of non-feature columns for the get_features function
    prod_info = ['use_category', 'brand', 'brand_generic',
                 'size', 'price', 'link']

    # Shuffle a private copy so the caller's dataframe keeps its original
    #   product names for any later analysis
    df_shuffled = df_tsvd.copy()

    # Perform randomization and calculate correct recommendations each time
    #   (exactly randiter reshuffles)
    for i in range(randiter):
        # Randomize the product names for tsvd results df
        df_shuffled['product'] = (
            np.random.permutation(df_shuffled['product'].values)
        )
        # get df of randomized features
        df_tsvd_feat = get_features(df_shuffled, prod_info)
        # Get accuracy of recommendations
        df_valid_res = get_prod_rec_acc(prod_list,
                                        prod_df=df_shuffled,
                                        feat_df=df_tsvd_feat)
        # Check recommendation accuracy for list of brand name products
        df_valid_acc = get_overall_rec_acc(df_valid_res, print_res=False)
        # Append results to lists
        iteration.append(i)
        prop_correct.extend(df_valid_acc['perc_correct'].tolist())

    # Concatenate results into df
    df_overall_acc_res = pd.DataFrame(list(zip(iteration, prop_correct)),
                                      columns=['iteration','perc_correct'])

    return df_overall_acc_res

In [35]:
# Run the randomization for the brand name products

n_reshuffles = 1000
df_rand_res = get_prod_rec_acc_rand(brand_name_prod, df_tsvd_full,
                                    n_reshuffles)

In [41]:
# Write the randomization output next to the other data files

filename = 'data_brand_gen_rand.csv'
rand_res_path = rpath + directory_name + filename
df_rand_res.to_csv(rand_res_path)

In [81]:
def get_acc_p_val(original_df, rand_df):
    """
    Purpose: Print the p-value for the results observed compared to random
        chance expectation
    Returns: percentage (0-100) of randomization runs whose accuracy was
        equal to or greater than the original accuracy
    original_df: df of results from original
        accuracy assessment of recs (first row's 'perc_correct' is used)
    rand_df: df of results from randomization, one row per run, with a
        'perc_correct' column
    """
    # Get original accuracy
    perc_corr_original = original_df['perc_correct'].iloc[0]

    # Get the proportion of randomization results that were equal to
    #   or greater than the original accuracy. The mean of the boolean
    #   mask counts the qualifying runs (rather than summing their
    #   accuracies) and divides by the actual number of runs.
    at_least_as_good = rand_df['perc_correct'] >= perc_corr_original
    rand_p_val = float(at_least_as_good.mean() * 100)

    text = f"The probability of observing the original recommendation " \
           f"accuracy of {perc_corr_original}% if the results were due " \
           f"to random chance is {rand_p_val}%"

    print(text)

    return rand_p_val

In [83]:
# Compare the observed accuracy against the randomization results
rand_p_val = get_acc_p_val(original_df=df_cos_sim_val_res,
                           rand_df=df_rand_res)

The probability of observing the original recommendation accuracy of 83.33% if the results were due to random chance is 0.0%


In [90]:
# calculate the average accuracy from the randomization results

rand_mean_acc = round(df_rand_res['perc_correct'].mean(),2)
rand_med_acc = round(df_rand_res['perc_correct'].median(),2)

# Report the number of runs actually present in the results, and end the
# first piece of each message with a space so the implicitly concatenated
# f-strings do not run together ("randomizationsis")
n_rand_runs = len(df_rand_res)

print(f"The average recommendation accuracy for the {n_rand_runs} "
      f"randomizations is: {rand_mean_acc}%")
print(f"The median recommendation accuracy for the {n_rand_runs} "
      f"randomizations is: {rand_med_acc}%")

The average recommendation accuracy for the 1000 randomizationsis: 0.28%
The median recommendation accuracy for the 1000 randomizationsis: 0.0%


In [88]:
# Display the rounded mean accuracy of the randomization results
rand_mean_acc

0.28

In [None]:
# Randomly shuffle product names
# NOTE(review): unexecuted scratch cell. `df_rand`, `df`, `random` and
# `df_tmp_boot` are not defined at module level anywhere in the visible
# notebook, so this cell presumably raises NameError if run as-is --
# confirm whether it can be removed.

df_rand['product'] = random.permutation(df['product'].values)



# Resample the full table with replacement, keeping the same number of rows
df_tsvd_full_rand = resample(df_tsvd_full, replace=True,
                             n_samples=len(df_tsvd_full))
df_tmp_boot

In [169]:
# Use the first brand name product as the example input

ex_prod = brand_name_prod.head(1).values

In [159]:
# Cosine similarity of the example product against every product

ex_prod_features = df_tsvd_full_features.loc[ex_prod, :]
res_cosine = cos_sim(ex_prod_features, df_tsvd_full_features)

In [160]:
# Turn the similarity array into a one-column df (one row per product)

res_cosine = pd.DataFrame(res_cosine.T)
res_cosine = res_cosine.rename(columns={0: 'cosine_sim'})

In [163]:
# Attach product info to the cosine sim results

df_cos_sim = add_prod_info(
    new_df=res_cosine,
    original_df=df_tsvd_full,
    col_names=prod_info,
    prod_names_col='product',
)

In [164]:
# Keep 4 decimal places of the similarity score

rounded_sim = df_cos_sim['cosine_sim'].round(decimals=4)
df_cos_sim['cosine_sim'] = rounded_sim

In [177]:
# Take the 6 rows with the highest cosine similarity

df_cos_sim_topres = df_cos_sim.nlargest(n=6, columns='cosine_sim')

In [192]:
# Drop the input product itself (should always be with cosine sim of 1)

is_input_prod = df_cos_sim_topres['product'].isin(ex_prod)
df_cos_sim_topres = df_cos_sim_topres[~is_input_prod]

In [173]:
# Plan for checking whether the top result matches:
# 1 remove top result
# 2 compare brand_generic ID to itself

ex_prod

# brand_generic label recorded for the example product
df_brand_gen.loc[df_brand_gen['product'].isin(ex_prod), 'brand_generic']

74    Cetaphil_cleanser_generic
Name: brand_generic, dtype: object

In [198]:
# Look up the brand generic label of the input product

is_ex_prod = df_brand_gen['product'].isin(ex_prod)
input_prod_label = df_brand_gen.loc[is_ex_prod, 'brand_generic']

In [199]:
# brand_generic label of the single most similar remaining product

top_res_row = df_cos_sim_topres.nlargest(n=1, columns='cosine_sim')
top_res_prod_label = top_res_row['brand_generic']

In [None]:
# Compare brand_generic label of input product with that of the top result
# The two Series carry different row labels, and pandas refuses to compare
# differently-labeled Series, so compare the underlying values instead
# (the same comparison get_prod_rec_acc makes)

input_prod_label.values == top_res_prod_label.values

In [None]:
# NOTE(review): unexecuted scratch cell. `cosine_similarity`, `features`
# and `df` are not defined at module level in the visible notebook (the
# sklearn function is imported under the alias `cos_sim`), so this cell
# presumably predates the current pipeline -- confirm before running.
# Calculate cosine similarity for feature 1 
res_cosine = cosine_similarity(features.loc['A+ High-Dose Retinoid Serum',:].to_frame().transpose(), features) #[0:1] .loc["Essential-C Cleanser",:]
# Flatten the result to 1-D and wrap it in a single-column df (column 0)
res_cosine = res_cosine.reshape(-1)
res_cosine = pd.DataFrame(res_cosine)
# Copy the product information columns and attach the similarity scores
res_sim=df[['product','brand','product_type','price','size','ratings', 'active','vit_a',
            'total_reviews','link','price_oz']].copy()
res_sim['similarity']=res_cosine[[0]]
# Round similarity metric
#res_sim['similarity']=round(res_sim['similarity'],2)
# Maybe don't round so you don't have to deal with ties?
#indexNames = res_sim[res_sim['product']=='Essential-C Cleanser'].index
#res_sim.drop(indexNames, inplace=True)
# Sort from top similarity metrics and ignoring self, so starting at 1, not zero
test = res_sim.nlargest(5, 'similarity')[1:5]
#res_sim.head()
# Select top match
# (test holds at most 4 rows after the slice above, so [:10] shows them all)
test[:10]
#Good Genes All-In-One Lactic Acid Treatment
#A+ High-Dose Retinoid Serum
# Generic vs similar validation
# Cetaphil Daily Facial Cleanser
# Cetaphil Fragrance Free Moisturizing Cream : 0.959282
# Banana Boat Ultra Sport Sunscreen Lotion, Broad Spectum SPF 30 : 0.999971
# St. Ives Fresh Skin Face Scrub, Apricot : 0.999748
# Clean & Clear Essentials Deep Cleaning Toner Sensitive Skin : 0.999994
# Aveeno Positively Radiant Brightening & Exfoliating Face Scrub : 0.990599

#test[test['vit_a']==1]