In [279]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, entry) for entry in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/skincare-products-clean-dataset/skincare_products_clean.csv


In [280]:
# Section 1: Imports & Configuration
import pandas as pd
import numpy as np
import random
import re
import string
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib

In [281]:
# Section 2: Load & Clean Legit Data
DATA_PATH = '/kaggle/input/skincare-products-clean-dataset/skincare_products_clean.csv'

# Read the raw catalogue and reshape it in one chain:
#   1. discard the URL column when it is present,
#   2. rename the source columns to the names used by the rest of the notebook,
#   3. keep only the four modelling columns,
#   4. strip every character except digits and dots from the price text and parse it
#      as float, then tag every row as legitimate (label 0).
df_legit = (
    pd.read_csv(DATA_PATH, engine='python', encoding='latin1')
    .drop(columns=['product_url'], errors='ignore')
    .rename(columns={'clean_ingreds': 'ingredients', 'product_type': 'category'})
    .loc[:, ['product_name', 'ingredients', 'price', 'category']]
    .assign(
        price=lambda frame: (
            frame['price']
            .astype(str)
            .str.replace(r'[^\d\.]', '', regex=True)
            .astype(float)
        ),
        label=0,  # Legitimate
    )
)

In [282]:
# Section 3: Prepare Vocabularies
# Unique, lower-cased ingredient tokens observed in the legitimate products, in order of
# first appearance. Missing values and empty strings (e.g. produced by a trailing comma)
# are removed so they can never be sampled into a synthetic ingredient list, where a
# float NaN would make ','.join(...) raise TypeError.
# NOTE(review): tokens are split on commas only; if the CSV stores ingredients as a
# serialized list, brackets/quotes would stay inside the tokens — confirm against the data.
ingredient_tokens = (
    df_legit['ingredients']
    .str.split(',')
    .explode()
    .str.strip()
    .str.lower()
    .dropna()
)
real_tokens = ingredient_tokens[ingredient_tokens != ''].unique().tolist()

# Candidate chemicals used to "contaminate" synthetic counterfeit ingredient lists.
chemical_list = [
    'ethylhexyl methoxycinnamate', 'butylparaben', 'methylparaben', 'propylparaben',
    'phenoxyethanol', 'benzyl alcohol', 'isopropyl myristate', 'cyclopentasiloxane',
    'cyclohexasiloxane', 'dimethicone', 'triethanolamine', 'quaternium-15',
    'formaldehyde', 'polyethylene glycol', 'sodium lauryl sulfate',
    'sodium laureth sulfate', 'ammonium lauryl sulfate', 'triclosan',
    'toluene', 'lead acetate', 'phthalates', 'diethylhexyl phthalate',
    'retinyl palmitate', 'paraffinum liquidum', 'lanolin', 'hydroquinone',
    'resorcinol', 'hydroxyethylcellulose', 'magnesium stearate', 'silica',
    'zinc oxide', 'titanium dioxide'
]

# Keep only the chemicals that never occur as a token in the legitimate data.
# A set gives constant-time membership checks; real_tokens stays a list because
# random.sample needs a sequence.
real_token_set = set(real_tokens)
fake_chems = [chem for chem in chemical_list if chem.lower() not in real_token_set]

# Pools the synthetic generator draws from, plus the per-category median price that
# serves as the reference for price ratios.
orig_names = df_legit['product_name'].unique().tolist()
orig_types = df_legit['category'].unique().tolist()
median_price_map = df_legit.groupby('category')['price'].median().to_dict()

In [283]:
# Section 4: Generate Synthetic Counterfeit Data (200 samples)
# A private, seeded generator makes the synthetic set identical on every
# "Restart & Run All" without altering the global `random` state used by other cells.
synth_rng = random.Random(42)

synthetic = []
for _ in range(200):
    # Name and category are drawn independently from the legitimate catalogue.
    name = synth_rng.choice(orig_names)
    cat = synth_rng.choice(orig_types)
    med = median_price_map[cat]
    # Counterfeits are priced at 10-50% of the category's median legitimate price.
    fake_price = round(synth_rng.uniform(0.1, 0.5) * med, 2)

    # Target 20-30 ingredients, ~80% real tokens and the remainder from fake_chems.
    # Each request is capped at the pool size because sample() raises ValueError when
    # asked for more items than the population contains.
    count = synth_rng.randint(20, 30)
    n_real = min(int(count * 0.8), len(real_tokens))
    n_fake = min(count - int(count * 0.8), len(fake_chems))
    real_sel = synth_rng.sample(real_tokens, n_real)
    fake_sel = synth_rng.sample(fake_chems, n_fake)

    ings = real_sel + fake_sel
    synth_rng.shuffle(ings)
    synthetic.append({
        'product_name': name,
        'ingredients': ','.join(ings),
        'price': fake_price,
        'category': cat,
        'label': 1
    })
df_synth = pd.DataFrame(synthetic)

In [284]:
# Section 5: Combine & Introduce Label Noise
# Stack legitimate and synthetic rows under a fresh 0..n-1 index.
df_all = pd.concat([df_legit, df_synth], ignore_index=True)

# Flip the label of 2% of the rows. The generator is seeded so the same rows are
# flipped on every run, which keeps the saved CSV and all downstream metrics reproducible.
n_noise = int(0.02 * len(df_all))
noise_rng = np.random.default_rng(42)
noise_idx = noise_rng.choice(df_all.index, size=n_noise, replace=False)
df_all.loc[noise_idx, 'label'] = 1 - df_all.loc[noise_idx, 'label']

# Persist the noisy, combined dataset next to the notebook.
df_all.to_csv('skincare_combined_noisy.csv', index=False)

In [285]:
# Section 6: Feature Engineering
# NOTE(review): every encoder/vectorizer/scaler in this cell is fitted on all of df_all,
# i.e. before the rows are divided by train_test_split in the next section.

def _fit_dense_tfidf(texts, vocab_cap):
    """Fit an English-stop-word TF-IDF vectorizer capped at `vocab_cap` terms.

    Returns the fitted vectorizer together with the dense document-term matrix.
    """
    vectorizer = TfidfVectorizer(max_features=vocab_cap, stop_words='english')
    return vectorizer, vectorizer.fit_transform(texts).toarray()


# Price relative to the category median; a category missing from the map falls back to
# the row's own price, which yields a ratio of 1.
df_all['price_ratio'] = [
    row_price / median_price_map.get(row_cat, row_price)
    for row_price, row_cat in zip(df_all['price'], df_all['category'])
]
# Number of comma-separated entries in the ingredient string.
df_all['num_ingredients'] = [len(text.split(',')) for text in df_all['ingredients']]

# One-hot category (unknown categories encode as all zeros at transform time).
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = ohe.fit_transform(df_all[['category']])

# Text features: 50 terms for product names, 100 for ingredient strings.
tf_name, X_name = _fit_dense_tfidf(df_all['product_name'], 50)
tf_ing, X_ing = _fit_dense_tfidf(df_all['ingredients'], 100)

# Column layout of X: [num_ingredients, price_ratio | category | name tf-idf | ingredient tf-idf]
X_num = df_all[['num_ingredients', 'price_ratio']].to_numpy()
X = np.concatenate((X_num, X_cat, X_name, X_ing), axis=1)

# Rescale every column to [0, 1] and pull out the target vector.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
y = df_all['label'].to_numpy()

In [286]:
# Section 7: Train/Test Split
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class proportions
# the same in both parts, and the fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [287]:
# Section 8: Train Classifier
# 100-tree random forest with a fixed seed; fit() returns the estimator itself, so the
# construction and training can be written as one expression.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
clf

In [288]:
# Section 9: Persist Artifacts
# Everything the inference function needs, bundled under fixed keys and written to a
# single joblib file in the working directory.
artifact_bundle = dict(
    ohe=ohe,
    tf_name=tf_name,
    tf_ing=tf_ing,
    scaler=scaler,
    clf=clf,
    median_price_map=median_price_map,
)
joblib.dump(artifact_bundle, 'skincare_counterfeit_artifacts.pkl')

['skincare_counterfeit_artifacts.pkl']

In [289]:
# Section 10: Inference Function
def predict_counterfeit(name, ings, price, category, artifacts=None):
    """Score a single product listing with the persisted counterfeit classifier.

    The feature row is assembled as
    ``[num_ingredients, price_ratio] + one-hot(category) + tfidf(name) + tfidf(ings)``
    and passed through the stored scaler before it reaches the classifier.

    Parameters
    ----------
    name : str
        Product name, fed to the ``'tf_name'`` vectorizer.
    ings : str
        Comma-separated ingredient string; the number of comma-separated entries is
        used as a feature and the text is fed to the ``'tf_ing'`` vectorizer.
    price : float
        Listed price, divided by the category's entry in ``'median_price_map'``.
    category : str
        Product category, used for the median-price lookup and one-hot encoding.
    artifacts : dict, optional
        Already-loaded bundle with the keys ``'ohe'``, ``'tf_name'``, ``'tf_ing'``,
        ``'scaler'``, ``'clf'`` and ``'median_price_map'``. When omitted, the bundle is
        read from ``'skincare_counterfeit_artifacts.pkl'`` on every call; pass it in
        to avoid re-reading the file when scoring many products.

    Returns
    -------
    dict
        ``{'is_counterfeit': bool, 'probability': float}`` where ``probability`` is
        column 1 of the classifier's ``predict_proba`` output.
    """
    # joblib files are pickles: only load a bundle that this notebook wrote itself.
    art = joblib.load('skincare_counterfeit_artifacts.pkl') if artifacts is None else artifacts

    # An unknown category falls back to the product's own price (ratio 1). A zero
    # reference price would divide by zero, so it is treated as the same neutral ratio.
    reference_price = art['median_price_map'].get(category, price)
    pr = price / reference_price if reference_price else 1.0
    num_ings = len(ings.split(','))

    # The encoder is given a one-row frame under its fitted column name.
    cf = art['ohe'].transform(pd.DataFrame([{art['ohe'].feature_names_in_[0]: category}]))
    nf = art['tf_name'].transform([name]).toarray()
    inf = art['tf_ing'].transform([ings]).toarray()
    x_num = np.array([[num_ings, pr]])

    Xv = np.hstack([x_num, cf, nf, inf])
    Xs = art['scaler'].transform(Xv)

    # Convert to a plain float so the result prints and serialises like a built-in number.
    prob = float(art['clf'].predict_proba(Xs)[0, 1])
    pred = art['clf'].predict(Xs)[0]
    return {'is_counterfeit': bool(pred), 'probability': prob}

In [290]:
# Section 11: Example Usage
# Score three randomly chosen rows from each source frame and print the raw result dicts.
demo_groups = (
    ("Legitimate Samples:", df_legit),
    ("\nSynthetic Samples:", df_synth),
)
for heading, frame in demo_groups:
    print(heading)
    for row_label in random.sample(list(frame.index), 3):
        row = frame.loc[row_label]
        print(predict_counterfeit(
            row['product_name'], row['ingredients'], row['price'], row['category']
        ))

Legitimate Samples:
{'is_counterfeit': False, 'probability': 0.26}
{'is_counterfeit': False, 'probability': 0.03}
{'is_counterfeit': False, 'probability': 0.04}

Synthetic Samples:
{'is_counterfeit': True, 'probability': 0.73}
{'is_counterfeit': True, 'probability': 0.92}
{'is_counterfeit': True, 'probability': 0.9}


In [291]:
# Section: Performance Evaluation on Test Set

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)
import pandas as pd

# Evaluate the classifier exactly as it was persisted: reload the bundle from disk and
# take the model out of it. X_test / y_test come from the train/test split cell above.
art = joblib.load('skincare_counterfeit_artifacts.pkl')
clf = art['clf']

# Hard class predictions and class-1 scores for the held-out rows.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Threshold-based metrics all share the (y_true, y_pred) signature; ROC AUC needs scores.
acc, prec, rec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test, y_proba)

# Confusion matrix with readable row/column captions (rows = actual, columns = predicted).
conf_mat = confusion_matrix(y_test, y_pred)
conf_df = pd.DataFrame(
    conf_mat,
    index=['Actual Legit (0)', 'Actual Fake (1)'],
    columns=['Predicted Legit (0)', 'Predicted Fake (1)'],
)

# Per-class precision / recall / F1 as formatted text.
report = classification_report(y_test, y_pred, target_names=['Legit', 'Fake'])

# Report: one line per scalar metric (4 decimals), then the two tables.
for caption, score in (
    ("Accuracy:   ", acc),
    ("Precision:  ", prec),
    ("Recall:     ", rec),
    ("F1 Score:   ", f1),
    ("ROC AUC:    ", roc_auc),
):
    print(caption, round(score, 4))
print("\nConfusion Matrix:\n", conf_df)
print("\nClassification Report:\n", report)

Accuracy:    0.9701
Precision:   0.9487
Recall:      0.8605
F1 Score:    0.9024
ROC AUC:     0.9728

Confusion Matrix:
                   Predicted Legit (0)  Predicted Fake (1)
Actual Legit (0)                  223                   2
Actual Fake (1)                     6                  37

Classification Report:
               precision    recall  f1-score   support

       Legit       0.97      0.99      0.98       225
        Fake       0.95      0.86      0.90        43

    accuracy                           0.97       268
   macro avg       0.96      0.93      0.94       268
weighted avg       0.97      0.97      0.97       268

