In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Load the dataset
# IMPORTANT: Assume 'skindataall.csv' is uploaded manually to Google Colab
try:
    df = pd.read_csv('/content/skindataall.csv')
except FileNotFoundError:
    print("Error: 'skindataall.csv' not found. Please upload the dataset to your Colab environment.")
    # Abort with a non-zero status instead of calling exit(): exit() is a
    # helper injected by the `site` module for interactive sessions and
    # reports success (status 0) even though loading failed. Nothing below
    # can run without `df`, so stop here explicitly.
    raise SystemExit(1) from None

# Map the raw CSV headers onto the snake_case column names used by the rest
# of the script.
df = df.rename(
    columns={
        "Product": "name",
        "Brand": "brand",
        "Skin_Type": "skin_type",
        "Category": "category",
        "Ingredients_Cleaned": "ingredients_cleaned",
        "Product_Url": "url",
        "Good_Stuff": "good_stuff",
        "Rating_Stars": "rating",
    }
)

# Normalise 'skin_type': missing values become '', everything is lower-cased,
# and rows left with an empty skin type are labelled 'all'.
skin_type_lower = df['skin_type'].fillna('').astype(str).str.lower()
df['skin_type'] = skin_type_lower.mask(skin_type_lower == '', 'all')

# Normalise 'category' the same way (missing values stay as empty strings).
category_filled = df['category'].fillna('')
df['category'] = category_filled.astype(str).str.lower()

# Ensure 'ingredients_cleaned' is a string. If it's a list-like string, convert it to a space-separated string.
# This handles cases where ingredients might be stored as string representations of lists.
def clean_ingredients(ingredients):
    """Flatten an ingredient list string into space-separated text.

    Square brackets and quote characters (single and double) are removed,
    the remainder is split on commas, each piece is stripped of surrounding
    whitespace, and the non-empty pieces are joined with single spaces.

    Args:
        ingredients: Raw ingredient text, e.g. "['water', 'glycerin']" or
            "Niacinamide, Zinc". Any non-string value (such as None or a
            NaN from a missing cell) is treated as having no ingredients.

    Returns:
        str: The cleaned, space-separated ingredients; '' for non-string
        input or when no non-empty item remains.
    """
    if not isinstance(ingredients, str):
        return ''
    # Strip list-literal punctuation. Double quotes are removed as well as
    # single quotes, so a list written as ["a", "b"] is cleaned the same way
    # as ['a', 'b'].
    without_punctuation = re.sub(r'[\[\]\'"]', '', ingredients)
    items = (piece.strip() for piece in without_punctuation.split(','))
    return ' '.join(item for item in items if item)

# Flatten every ingredient entry into plain space-separated text.
df['ingredients_cleaned'] = df['ingredients_cleaned'].map(clean_ingredients)


# Build one text document per product by joining skin_type, category and
# ingredients_cleaned with single spaces; this is the input to TF-IDF.
df['combined_features'] = df['skin_type'].str.cat(
    [df['category'], df['ingredients_cleaned']],
    sep=' ',
)

# TF-IDF vectorizer with English stop words removed. It turns each combined
# text into a numeric vector weighted by how distinctive each term is within
# the product corpus.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the products and vectorize them in one step:
# one row per product, one column per vocabulary term.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])

def recommend_products(skin_type, category, ingredients, top_n=5):
    """
    Recommends cosmetic products based on user inputs.

    The three inputs are combined into one query string, vectorized with the
    module-level ``tfidf_vectorizer`` and compared against ``tfidf_matrix``
    by cosine similarity. Skin type and category only contribute terms to
    the query; they are not applied as hard filters, so results may come
    from other categories or skin types.

    Args:
        skin_type (str): User's skin type (e.g., "Oily", "Dry", "All").
            None is treated as an empty string.
        category (str): Preferred product category (e.g., "Moisturizer", "Cleanser").
            None is treated as an empty string.
        ingredients (str): Comma-separated string of preferred ingredients (e.g., "Niacinamide, Zinc").
        top_n (int): Maximum number of recommendations to return. Values
            less than or equal to zero yield an empty result.

    Returns:
        pandas.DataFrame: Up to top_n rows of ``df`` ordered by descending
                          similarity, with columns: name, brand, skin_type,
                          category, rating, url. Rows with zero similarity
                          are dropped, so the result may be shorter than
                          top_n or empty.
    """
    output_columns = ['name', 'brand', 'skin_type', 'category', 'rating', 'url']

    # Guard non-positive requests explicitly: scores[-0:] would select every
    # product and a negative top_n would slice from the front of the ranking.
    if top_n <= 0:
        return df.iloc[[]][output_columns]

    # Preprocess user input to match dataset format
    user_skin_type = (skin_type or '').lower()
    user_category = (category or '').lower()
    user_ingredients = clean_ingredients(ingredients)

    # Create a combined feature string for the user input, laid out like the
    # 'combined_features' column the vectorizer was fitted on.
    user_combined_features = user_skin_type + ' ' + user_category + ' ' + user_ingredients

    # Transform (not fit) so the query lives in the same feature space as the
    # product matrix.
    user_tfidf_vector = tfidf_vectorizer.transform([user_combined_features])

    # One similarity score per product row.
    cosine_similarities = cosine_similarity(user_tfidf_vector, tfidf_matrix).flatten()

    # argsort is ascending, so take the last top_n positions and reverse them
    # to get the best matches first.
    product_indices = cosine_similarities.argsort()[-top_n:][::-1]

    # Drop products that share no term with the query (similarity of 0).
    relevant_indices = [idx for idx in product_indices if cosine_similarities[idx] > 0]

    # Positional lookup: the indices are row positions in tfidf_matrix / df.
    recommended_products = df.iloc[relevant_indices]

    # Select and return the required columns
    return recommended_products[output_columns]

# Example usage (as requested by the user)
def _show_recommendations(results):
    """Print the recommendation table, or a fallback notice when it is empty."""
    if results.empty:
        print("No recommendations found for the given criteria.")
        return
    print("Recommended Products:")
    print(results)


print("--- Example Usage ---")
recommendations = recommend_products("Oily", "Moisturizer", "Niacinamide, Zinc")
_show_recommendations(recommendations)

print("\n--- Another Example Usage (Dry Skin, Cleanser, Hyaluronic Acid) ---")
recommendations_dry = recommend_products("Dry", "Cleanser", "Hyaluronic Acid")
_show_recommendations(recommendations_dry)

print("\n--- Another Example Usage (All Skin, Serum, Vitamin C) ---")
recommendations_all = recommend_products("All", "Serum", "Vitamin C")
_show_recommendations(recommendations_all)


--- Example Usage ---
Recommended Products:
                                                   name         brand  \
2675                     Vintage Single Extract Essence  AMOREPACIFIC   
2682                     Vintage Single Extract Essence  AMOREPACIFIC   
6313  Essential Power Skin Toner for Combination to ...       LANEIGE   
6307  Essential Power Skin Toner for Combination to ...       LANEIGE   
6302  Essential Power Skin Toner for Combination to ...       LANEIGE   

     skin_type     category  rating  \
2675      oily  moisturizer       5   
2682      oily  moisturizer       5   
6313      oily     cleanser       5   
6307      oily     cleanser       5   
6302      oily     cleanser       2   

                                                    url  
2675  https://www.sephora.com/product/vintage-single...  
2682  https://www.sephora.com/product/vintage-single...  
6313  https://www.sephora.com/product/essential-powe...  
6307  https://www.sephora.com/product/essential-po