In [None]:
# Install the third-party packages this notebook imports into the kernel's environment.
# NOTE(review): versions are unpinned, and joblib (imported below) is not installed
# explicitly — presumably it arrives as a scikit-learn dependency; confirm.
%pip install pandas
%pip install scikit-learn

/bin/bash: pip: command not found
/bin/bash: pip: command not found


In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import joblib

In [26]:
# Load the product catalogue from a CSV file in the notebook's working directory
product_df = pd.read_csv(filepath_or_buffer='complete_skincare_dataset.csv')

In [None]:
def get_skin_type_description(row):
    """Build a space-separated skin-type phrase from a row's flag columns.

    Parameters
    ----------
    row : mapping-like
        Must support ``row[key]`` for ``'for_dry_skin'``, ``'for_oily_skin'``
        and ``'for_combination_skin'``.

    Returns
    -------
    str
        The labels ``'dry skin'``, ``'oily skin'`` and ``'combination skin'``
        (in that order) for every flag equal to 1, joined by single spaces;
        an empty string when no flag equals 1.
    """
    # (flag column, label) pairs; order fixes the order of the output phrase
    flag_labels = (
        ('for_dry_skin', 'dry skin'),
        ('for_oily_skin', 'oily skin'),
        ('for_combination_skin', 'combination skin'),
    )
    return ' '.join(label for flag, label in flag_labels if row[flag] == 1)

# Attach a free-text skin-type label to every product row
product_df['skin_text'] = product_df.apply(get_skin_type_description, axis=1)

# Build the text corpus: ingredients, highlights, category and skin label joined
# by single spaces; missing values in the first three columns become empty strings
product_df['combined_text'] = product_df['ingredients_cleaned'].fillna('').str.cat(
    [
        product_df['highlights_cleaned'].fillna(''),
        product_df['product_category'].fillna(''),
        product_df['skin_text'],
    ],
    sep=' ',
)

In [51]:
# Fit a TF-IDF model (English stop words removed) on the combined product text
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(raw_documents=product_df['combined_text'])

# Persist the fitted vectorizer and the resulting matrix next to the notebook
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')
joblib.dump(value=tfidf_matrix, filename='tfidf_matrix.pkl')

['tfidf_matrix.pkl']

In [38]:
def recommend_similar(product_index, tfidf_matrix, product_df, top_n=10):
    """Return the ``top_n`` products most similar to the product at ``product_index``.

    Note: a later definition of ``recommend_similar`` in this file replaces
    this one once its cell is executed.

    Parameters
    ----------
    product_index : int
        Positional row index of the query product (same position in
        ``tfidf_matrix`` and ``product_df``).
    tfidf_matrix : sparse or dense matrix
        Row-wise feature vectors; row ``i`` is used for ``product_df.iloc[i]``.
    product_df : pandas.DataFrame
        Product table; must contain ``product_name``, ``brand_name``,
        ``price_usd``, ``rating`` and ``product_category``.
    top_n : int, default 10
        Maximum number of recommendations; values <= 0 give an empty result.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows ordered by decreasing cosine similarity, never
        including the query product itself.
    """
    cosine_sim = cosine_similarity(tfidf_matrix[product_index], tfidf_matrix).flatten()

    # Normalise a negative (from-the-end) position so it can be matched against argsort output
    query_pos = product_index if product_index >= 0 else product_index + cosine_sim.shape[0]

    # Stable descending order: ties are resolved by row position, deterministically
    ranked = (-cosine_sim).argsort(kind='stable')

    # Drop the query row explicitly: with tied scores (duplicate text) or an
    # all-zero query vector it is not guaranteed to be ranked first
    similar_indices = ranked[ranked != query_pos][:max(top_n, 0)]

    return product_df.iloc[similar_indices][['product_name', 'brand_name', 'price_usd', 'rating', 'product_category']]

In [53]:
def recommend_similar(product_index, tfidf_matrix, product_df, top_n=10):
    """Return the ``top_n`` products most similar to the product at ``product_index``.

    Parameters
    ----------
    product_index : int
        Positional row index of the query product (same position in
        ``tfidf_matrix`` and ``product_df``).
    tfidf_matrix : sparse or dense matrix
        Row-wise feature vectors; row ``i`` is used for ``product_df.iloc[i]``.
    product_df : pandas.DataFrame
        Product table; must contain ``product_name``, ``brand_name`` and
        ``product_category``.
    top_n : int, default 10
        Maximum number of recommendations; values <= 0 give an empty result.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows with columns ``product_name``, ``brand_name``,
        ``product_category`` and ``similarity_score``, ordered by decreasing
        similarity and never including the query product itself.
    """
    # Compute cosine similarity between the query row and every row
    cosine_sim = cosine_similarity(tfidf_matrix[product_index], tfidf_matrix).flatten()

    # Normalise a negative (from-the-end) position so it can be matched against argsort output
    query_pos = product_index if product_index >= 0 else product_index + cosine_sim.shape[0]

    # Stable descending order: ties are resolved by row position, deterministically
    ranked = (-cosine_sim).argsort(kind='stable')

    # Drop the query row explicitly: with tied scores (duplicate text) or an
    # all-zero query vector it is not guaranteed to be ranked first
    similar_indices = ranked[ranked != query_pos][:max(top_n, 0)]

    # Copy so that adding the score column does not touch product_df
    similar_products = product_df.iloc[similar_indices].copy()

    # Add a similarity_score column to the results
    similar_products['similarity_score'] = cosine_sim[similar_indices]

    # Return selected columns + similarity score
    return similar_products[['product_name', 'brand_name', 'product_category', 'similarity_score']]


In [54]:
# Positional index of the query product; change this to try another product
example_index = 30
recommendations = recommend_similar(example_index, tfidf_matrix, product_df)

# Show the query product's name, then its recommendation table
print(f"Recommendations for: {product_df.iloc[example_index]['product_name']}")
print(recommendations)

Recommendations for: Liquid Gold Exfoliating Treatment with Glycolic acid
                                           product_name          brand_name  \
34    Mini Liquid Gold Exfoliating Treatment with Gl...             Alpha-H   
1156   Ultra Pure High-Potency 9.8% Glycolic Acid Serum  Kiehl's Since 1851   
1031              Bye Bye Pores 10% Glycolic Acid Serum        IT Cosmetics   
677                    Faceshot Vitamin Ampoules Refill             FaceGym   
1445                            Glow2OH Dark Spot Toner        OLEHENRIKSEN   
2126                    Glycolic Acid Exfoliating Toner      The INKEY List   
1667       SuperFruit Exfoliating Tonic 8% AHA Solution    ROSE Ingleton MD   
673   Faceshot Electric Microneedling Device + Vitam...             FaceGym   
1472                       Mini Glow2OH Dark Spot Toner        OLEHENRIKSEN   
1594  Even Smoother Glycolic Retinol Resurfacing Pee...   Peter Thomas Roth   

       product_category  similarity_score  
34          

In [10]:
# Compare the query product's category with the categories of its recommendations
target = product_df.iloc[example_index]
recommended = recommend_similar(example_index, tfidf_matrix, product_df)

print(f"Original product category: {target['product_category']}")
print(f"Recommended categories:\n {recommended['product_category'].value_counts()}")

Original product category: Treatments
Recommended categories:
 product_category
Treatments           3
Moisturizers         2
Masks                2
Cleansers            1
Sunscreen            1
Value & Gift Sets    1
Name: count, dtype: int64


In [None]:
# Write the fitted vectorizer and TF-IDF matrix to disk (same file names as the
# save performed right after fitting, so those files are overwritten)
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')
joblib.dump(value=tfidf_matrix, filename='tfidf_matrix.pkl')