In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

In [2]:
# Load dataset
# Reads the cosmetics catalogue into the notebook-wide frame `df`; the path is
# relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='input/cosmetics.csv')

In [3]:
# Prepare the Data
# TfidfVectorizer raises on NaN documents, so a product with no ingredient
# list is treated as an empty document instead of aborting the whole cell.
df['features'] = df['Ingredients'].fillna('')

# Initialize and Transform TF-IDF
# `tfidf` and `tfidf_matrix` are fit on every row because the recommender
# searches the full catalogue; row i of the matrix corresponds to df.iloc[i].
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['features'])

# Encode Categorical Features
# The integer codes are kept OUT of `df`: overwriting df['Label'] / df['Brand']
# would make the recommender's equality filters on those columns impossible to
# satisfy with the original names, and would print codes instead of names.
label_codes = df['Label'].astype('category').cat.codes
brand_codes = df['Brand'].astype('category').cat.codes

# Combine TF-IDF and Additional Features
numerical_features = ['Price', 'Rank']
categorical_features = ['Brand']

# Model-only copy of the tabular columns, with Brand replaced by its code.
model_inputs = df[numerical_features + categorical_features].copy()
model_inputs['Brand'] = brand_codes

# Preprocess additional features
# Only the numeric columns are standardised; the Brand code passes through
# unchanged as the last column.
scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numerical_features),
    ], remainder='passthrough')

# Concatenate features
# NOTE(review): TF-IDF and the scaler are fit before the train/test split, so
# test-set statistics leak into the training features. Reported accuracies are
# therefore optimistic; fitting both inside a Pipeline would remove the leak.
additional_features = preprocessor.fit_transform(model_inputs)
X_combined = np.hstack((tfidf_matrix.toarray(), additional_features))

# Split Data for Training and Testing
X = X_combined
y = label_codes.rename('Label')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Hyperparameter Tuning for SVM
# `gamma` is only used by the RBF kernel, so the grid is split per kernel.
# Crossing gamma with kernel='linear' would refit three identical linear
# models per fold without changing which model is selected.
svm_params = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
    },
]
# 5-fold cross-validated search scored on accuracy; class weights are balanced
# inside SVC.
svm_model = GridSearchCV(SVC(class_weight='balanced'), svm_params, cv=5, scoring='accuracy')
svm_model.fit(X_train, y_train)


In [None]:
# Hyperparameter Tuning for KNN
# Search space: neighbourhood size, vote weighting and distance metric.
knn_params = dict(
    n_neighbors=[3, 5, 10],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)
# 5-fold cross-validated search scored on accuracy.
knn_model = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    scoring='accuracy',
    cv=5,
)
knn_model.fit(X_train, y_train)

In [None]:
# Hyperparameter Tuning for Random Forest
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
# The forest is seeded (same seed as the train/test split) so that the chosen
# hyperparameters and the reported accuracy are reproducible across runs;
# without a seed every candidate is fit on different random bootstraps.
rf_model = GridSearchCV(
    RandomForestClassifier(class_weight='balanced', random_state=42),
    rf_params,
    cv=5,
    scoring='accuracy',
)
rf_model.fit(X_train, y_train)

In [None]:
# Evaluate Models
# Pull the refit winner out of each grid search.
svm_best_model, knn_best_model, rf_best_model = (
    search.best_estimator_ for search in (svm_model, knn_model, rf_model)
)

# Score each winner on the held-out split.
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_best_model.predict(X_test))
knn_accuracy = accuracy_score(y_true=y_test, y_pred=knn_best_model.predict(X_test))
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_best_model.predict(X_test))

In [None]:
# Define the Recommendation Function
def recommend_cosmetics(skin_type, label_filter, rank_filter, brand_filter, price_range, ingredient_input=None, num_recommendations=10):
    """Return up to ``num_recommendations`` rows of ``df`` matching the filters.

    Reads the notebook-level ``df``, ``tfidf`` and ``tfidf_matrix``; row ``i``
    of ``tfidf_matrix`` must correspond to ``df.iloc[i]``.

    Parameters
    ----------
    skin_type : str
        Name of a column of ``df``; only rows where that column equals 1 are kept.
    label_filter : object
        Value compared for equality with ``df['Label']``; ``'All'`` disables
        the filter.
    rank_filter : sequence of two numbers
        Inclusive ``(low, high)`` bounds on ``df['Rank']``.
    brand_filter : object
        Value compared for equality with ``df['Brand']``; ``'All'`` disables
        the filter.
    price_range : sequence of two numbers
        Inclusive ``(low, high)`` bounds on ``df['Price']``.
    ingredient_input : str, optional
        Free-text ingredient query. When given, the filtered candidates are
        ranked by TF-IDF cosine similarity to the query, the
        ``num_recommendations`` most similar are kept, and candidates with
        zero similarity are dropped.
    num_recommendations : int, default 10
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df``, sorted by ``Rank`` in ascending order.
        Empty when nothing satisfies the filters.
    """
    # One boolean mask over the full frame, so that surviving rows can be
    # mapped to row positions in `tfidf_matrix` regardless of df's index labels.
    mask = df[skin_type] == 1
    if label_filter != 'All':
        mask = mask & (df['Label'] == label_filter)
    mask = mask & df['Rank'].between(rank_filter[0], rank_filter[1])
    if brand_filter != 'All':
        mask = mask & (df['Brand'] == brand_filter)
    mask = mask & df['Price'].between(price_range[0], price_range[1])

    positions = np.flatnonzero(mask.to_numpy())

    # Rank by ingredient similarity among the filtered candidates only.
    # Taking the top-N over the whole catalogue first and intersecting with the
    # filters afterwards discards good matches inside the filtered set and
    # often leaves nothing. The similarity call is skipped when no candidate
    # survived the filters.
    if ingredient_input and len(positions) > 0:
        query_vec = tfidf.transform([ingredient_input])
        scores = cosine_similarity(query_vec, tfidf_matrix[positions]).ravel()
        # Stable sort on the negated scores: most similar first, ties in df order.
        top = np.argsort(-scores, kind='stable')[:num_recommendations]
        # A zero score means no shared vocabulary with the query.
        top = top[scores[top] > 0]
        positions = positions[top]

    recommended_products = df.iloc[positions]

    # NOTE(review): ascending order returns the lowest 'Rank' values first;
    # confirm that a lower Rank is the better one for this dataset.
    return recommended_products.sort_values(by=['Rank']).head(num_recommendations)


In [None]:
# Recommendation Accuracy Evaluation
def evaluate_recommendations(recommended_df, ground_truth_labels):
    """Compute macro precision and recall of the recommended rows' labels.

    Parameters
    ----------
    recommended_df : pandas.DataFrame
        Recommended rows; its ``'Label'`` column is used as the prediction.
    ground_truth_labels : pandas.Series
        Reference labels, looked up by ``recommended_df.index`` (every index
        label of ``recommended_df`` must be present).

    Returns
    -------
    tuple of float
        ``(precision, recall)``, both macro-averaged. ``(0.0, 0.0)`` when
        ``recommended_df`` is empty, since there is nothing to score.

    Notes
    -----
    If ``ground_truth_labels`` is the same column that
    ``recommended_df['Label']`` was sliced from, both inputs are identical and
    the metrics are trivially perfect; pass independent reference labels for a
    meaningful result.
    """
    # The scorers have no defined value for zero samples, so answer directly.
    if len(recommended_df) == 0:
        return 0.0, 0.0

    true_labels = ground_truth_labels.loc[recommended_df.index]
    predicted_labels = recommended_df['Label']
    # zero_division=0 returns the same value as the default for a class with no
    # predicted or true samples, but without emitting a warning.
    precision = precision_score(true_labels, predicted_labels, average='macro', zero_division=0)
    recall = recall_score(true_labels, predicted_labels, average='macro', zero_division=0)
    return precision, recall

In [None]:
# Compare Model Performances
# One print call, one line per model (newline-separated), reporting the
# held-out accuracy of each tuned classifier.
print(
    "Model Accuracies:",
    f"SVM Accuracy: {svm_accuracy}",
    f"KNN Accuracy: {knn_accuracy}",
    f"Random Forest Accuracy: {rf_accuracy}",
    sep="\n",
)


In [None]:
# Sample Recommendation
# Example query: combination skin, any label or brand, Rank in [1, 5],
# Price in [10, 50], narrowed by a free-text ingredient query.
sample_skin_type = 'Combination'
sample_label_filter = 'All'
sample_rank_filter = (1, 5)
sample_brand_filter = 'All'
sample_price_range = (10, 50)
sample_ingredient_input = 'Aloe Vera, Vitamin E'

recommended_products = recommend_cosmetics(
    skin_type=sample_skin_type,
    label_filter=sample_label_filter,
    rank_filter=sample_rank_filter,
    brand_filter=sample_brand_filter,
    price_range=sample_price_range,
    ingredient_input=sample_ingredient_input,
)
print("\nRecommended Products:")
# Show only the columns relevant to a reader of the recommendation.
print(recommended_products.loc[:, ['Label', 'Brand', 'Name', 'Ingredients', 'Rank']])

In [None]:
# Evaluate Recommendation Accuracy
# NOTE(review): recommended_products is a slice of df, so its 'Label' column
# and df['Label'] agree on every recommended row; this measures
# self-consistency rather than recommendation quality.
precision, recall = evaluate_recommendations(
    recommended_df=recommended_products,
    ground_truth_labels=df['Label'],
)
print(
    "\nRecommendation Accuracy Metrics:",
    f"Precision: {precision}",
    f"Recall: {recall}",
    sep="\n",
)