# **MedTourEasy - Meghna Chaturvedi**

**Project:** Analysis of Chemical Components (Cosmetics)

**Author:** Meghna Chaturvedi

**Date:** September 28, 2025

# Import + Set-Up

In [None]:
# Setup: imports and utilities
import os, math, json, re
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, Markdown

# Deterministic seed for reproducibility
RANDOM_STATE = 42

print("Libraries loaded. Set RANDOM_STATE =", RANDOM_STATE)

Libraries loaded. Set RANDOM_STATE = 42


# Load Dataset

In [None]:
# Load dataset
from google.colab import files
uploaded = files.upload()

# Load dataset (ensure cosmetics.csv uploaded / present)
FILE = "cosmetics.csv"
if not os.path.exists(FILE):
    raise FileNotFoundError(f"{FILE} not found. Upload it to this environment or specify correct path.")

# Read CSV (string-first to inspect)
df = pd.read_csv(FILE, dtype=str)
df.columns = [c.strip() for c in df.columns]  # tidy colnames

# Convert numeric columns if present
numcols = ['Price', 'Rank', 'Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']
for c in numcols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

display(Markdown(f"**Data shape:** {df.shape[0]} rows × {df.shape[1]} cols"))
display(df.head())


Saving cosmetics.csv to cosmetics (1).csv


**Data shape:** 1472 rows × 11 cols

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1


#Ingredients cleaning utilities

In [None]:
# Functions to clean and tokenize ingredient lists
def clean_ingredient_text(s):
    if pd.isna(s) or s is None:
        return ''
    t = str(s).lower()
    # normalize separators
    t = re.sub(r'\s*/\s*', ',', t)
    t = re.sub(r'\s*;\s*', ',', t)
    # remove parenthetical content like (ci 77491)
    t = re.sub(r'\([^)]*\)', '', t)
    # unify whitespace and commas
    t = re.sub(r',\s*,+', ',', t)
    t = re.sub(r'\s+', ' ', t).strip()
    t = re.sub(r'(^,|,$)', '', t)
    return t

def tokenize_ingredients(cleaned):
    tokens = [p.strip() for p in cleaned.split(',') if p.strip()]
    return tokens

# Apply to dataframe
if 'Ingredients' not in df.columns:
    raise KeyError("No 'Ingredients' column found in dataset. Rename or provide it.")
df['ingredients_clean'] = df['Ingredients'].astype(str).apply(clean_ingredient_text)
df['ingredients_tokens'] = df['ingredients_clean'].apply(tokenize_ingredients)

display(df[['Name','Brand','ingredients_clean']].head(6))


Unnamed: 0,Name,Brand,ingredients_clean
0,Crème de la Mer,LA MER,"algae extract, mineral oil, petrolatum, glycer..."
1,Facial Treatment Essence,SK-II,"galactomyces ferment filtrate , butylene glyco..."
2,Protini™ Polypeptide Cream,DRUNK ELEPHANT,"water, dicaprylyl carbonate, glycerin, ceteary..."
3,The Moisturizing Soft Cream,LA MER,"algae extract, cyclopentasiloxane, petrolatum,..."
4,Your Skin But Better™ CC+™ Cream with SPF 50+,IT COSMETICS,"water, snail secretion filtrate, phenyl trimet..."
5,The Water Cream,TATCHA,"water, saccharomyces,camellia sinensis leaf,cl..."


#Build DTM

In [None]:
# document-term matrix (DTM) using CountVectorizer that treats comma-separated pieces as tokens
vectorizer = CountVectorizer(token_pattern='[^,]+')  # tokens = comma-separated segments
X_counts = vectorizer.fit_transform(df['ingredients_clean'].fillna(''))
terms = vectorizer.get_feature_names_out()
print("DTM shape:", X_counts.shape)

# Reduce very rare terms to control dimensionality (tweak min_doc_freq)
min_doc_freq = 3
term_docfreq = (X_counts > 0).sum(axis=0).A1
keep_mask = term_docfreq >= min_doc_freq
terms_kept = terms[keep_mask]
X_reduced = X_counts[:, keep_mask]
print("Reduced DTM shape:", X_reduced.shape, f"(terms kept >= {min_doc_freq} docs)")

DTM shape: (1472, 6651)
Reduced DTM shape: (1472, 2372) (terms kept >= 3 docs)


#Remove duplicate ingredient vectors

In [None]:
# To speed t-SNE and reduce visual clutter, remove exact duplicate ingredient vectors (binary signature)
from scipy.sparse import csr_matrix

# convert to binary presence/absence matrix
X_bin = (X_reduced > 0).astype(int)

# create signatures by rows: to save memory we can hash rows using bytes
def row_signature(sparse_row):
    # return a bytes signature (fast enough for modest datasets)
    return bytes(sparse_row.toarray().tobytes())

signatures = [row_signature(X_bin[i]) for i in range(X_bin.shape[0])]
unique_sig_to_first_index = {}
unique_indices = []
for idx, sig in enumerate(signatures):
    if sig not in unique_sig_to_first_index:
        unique_sig_to_first_index[sig] = idx
        unique_indices.append(idx)

print(f"Products: {X_bin.shape[0]} → unique ingredient-vectors: {len(unique_indices)}")
X_unique = X_reduced[unique_indices, :]

Products: 1472 → unique ingredient-vectors: 1236


#SVD and t-SNE

In [None]:
svd_components = 50
if X_unique.shape[1] > svd_components:
    svd = TruncatedSVD(n_components=svd_components, random_state=RANDOM_STATE)
    X_svd = svd.fit_transform(X_unique)
    print("SVD reduced to", svd_components, "components. Explained variance sum:", svd.explained_variance_ratio_.sum())
else:
    X_svd = X_unique.toarray()

perplexity = min(30, max(5, int(X_svd.shape[0] / 10)))
print("Running t-SNE with perplexity:", perplexity)
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=RANDOM_STATE, init='pca', learning_rate='auto')
X_emb = tsne.fit_transform(X_svd)
print("t-SNE finished. Embedding shape:", X_emb.shape)

SVD reduced to 50 components. Explained variance sum: 0.44612516413924874
Running t-SNE with perplexity: 30
t-SNE finished. Embedding shape: (1236, 2)


#Embeddings dataframe & map to product metadata

In [None]:
# Map embeddings back to original products (we removed duplicates earlier)
emb_df = pd.DataFrame(X_emb, columns=['tsne1','tsne2'])
emb_df['orig_idx'] = [int(i) for i in unique_indices]  # row indices in original df
# bring product metadata
emb_df['Name'] = df.iloc[emb_df['orig_idx']]['Name'].values
emb_df['Brand'] = df.iloc[emb_df['orig_idx']]['Brand'].values
emb_df['Price'] = df.iloc[emb_df['orig_idx']]['Price'].values
emb_df['Rank'] = df.iloc[emb_df['orig_idx']]['Rank'].values

display(emb_df.head())

Unnamed: 0,tsne1,tsne2,orig_idx,Name,Brand,Price,Rank
0,-3.125509,-43.083805,0,Crème de la Mer,LA MER,175,4.1
1,-16.030149,2.715127,1,Facial Treatment Essence,SK-II,179,4.1
2,23.355379,-5.470935,2,Protini™ Polypeptide Cream,DRUNK ELEPHANT,68,4.4
3,-2.89731,-46.148869,3,The Moisturizing Soft Cream,LA MER,175,3.8
4,41.864647,3.396981,4,Your Skin But Better™ CC+™ Cream with SPF 50+,IT COSMETICS,38,4.1


#Interactive t-SNE plot

In [None]:
# Interactive scatter: color by brand, hover shows product name + price + rank
fig = px.scatter(emb_df, x='tsne1', y='tsne2', color='Brand',
                 hover_data=['Name','Price','Rank'], title='t-SNE ingredient similarity',
                 height=650)
fig.update_traces(marker=dict(size=8, line=dict(width=0.4, color='DarkSlateGrey')))
fig.update_layout(legend={'itemsizing': 'constant'})
fig.show()

#Ingrediants Tokens

In [None]:
from collections import Counter
tokens = [t for toks in df['ingredients_tokens'] for t in toks]
freq = Counter(tokens)
topN = 80
freq_items = freq.most_common(topN)
wc_df = pd.DataFrame(freq_items, columns=['ingredient','count']).sort_values('count', ascending=False)

# A simple circular layout for a 'word-bubble' like view
wc_df['angle'] = np.linspace(0, 2*np.pi, len(wc_df))
wc_df['x'] = np.cos(wc_df['angle']) * (1 + np.log1p(wc_df['count']))
wc_df['y'] = np.sin(wc_df['angle']) * (1 + np.log1p(wc_df['count']))

fig = px.scatter(wc_df, x='x', y='y', size='count', text='ingredient', size_max=60,
                 title=f'Top {topN} ingredients (bubble-text)', height=600)
fig.update_traces(textposition='middle center', marker=dict(opacity=0.7))
fig.update_layout(showlegend=False)
fig.show()

#Product Comparison

In [None]:
def compare_products(name_a, name_b, show_lists=True):
    a = df[df['Name'] == name_a]
    b = df[df['Name'] == name_b]
    if a.empty or b.empty:
        print("One or both names not found. Use df['Name'].unique() to inspect available product names.")
        return
    a_tokens = set(a.iloc[0]['ingredients_tokens'])
    b_tokens = set(b.iloc[0]['ingredients_tokens'])
    common = sorted(a_tokens & b_tokens)
    only_a = sorted(a_tokens - b_tokens)
    only_b = sorted(b_tokens - a_tokens)
    print(f"Product A: {name_a}  — Brand: {a.iloc[0].get('Brand','')}")
    print(f"Product B: {name_b}  — Brand: {b.iloc[0].get('Brand','')}")
    print('\\nShared ingredients (%d):' % len(common))
    print(common)
    print('\\nOnly in A (%d):' % len(only_a))
    print(only_a[:200])
    print('\\nOnly in B (%d):' % len(only_b))
    print(only_b[:200])
compare_products('Crème de la Mer', 'Protini™ Polypeptide Cream')


Product A: Crème de la Mer  — Brand: LA MER
Product B: Protini™ Polypeptide Cream  — Brand: DRUNK ELEPHANT
\nShared ingredients (3):
['glycerin', 'sodium benzoate', 'water']
\nOnly in A (39):
['alcohol denat.', 'algae extract', 'aluminum distearate', 'benzyl salicylate', 'beta-carotene', 'calcium gluconate', 'citral', 'citric acid', 'citronellol', 'citrus aurantifolia extract', 'copper gluconate', 'cyanocobalamin', 'decyl oleate', 'eucalyptus globulus leaf oil', 'fragrance.', 'geraniol', 'helianthus annuus seedcake', 'hydroxycitronellal', 'isohexadecane', 'lanolin alcohol', 'limonene', 'linalool', 'magnesium gluconate', 'magnesium stearate', 'magnesium sulfate', 'medicago sativa seed powder', 'microcrystalline wax', 'mineral oil', 'niacin', 'octyldodecanol', 'panthenol', 'paraffin', 'petrolatum', 'prunus amygdalus dulcis seed meal', 'sesamum indicum seed oil', 'sesamum indicum seed powder', 'sodium gluconate', 'tocopheryl succinate', 'zinc gluconate']
\nOnly in B (61):
['1', '2-hexaned

#SAVE

In [None]:
# Save embeddings for downstream dashboard or quick loads (CSV)
OUT_EMB = "tsne_embeddings_medtoureasy.csv"
emb_df.to_csv(OUT_EMB, index=False)
print("Saved embeddings to", OUT_EMB)

# Quick KPI summary
kpis = {
    'total_products': int(df.shape[0]),
    'avg_price': float(df['Price'].mean(skipna=True)),
    'median_price': float(df['Price'].median(skipna=True)),
    'avg_rank': float(pd.to_numeric(df['Rank'], errors='coerce').mean(skipna=True))
}
display(Markdown("**KPI summary**"))
display(kpis)

Saved embeddings to tsne_embeddings_medtoureasy.csv


**KPI summary**

{'total_products': 1472,
 'avg_price': 55.58423913043478,
 'median_price': 42.5,
 'avg_rank': 4.153260869565218}