In [34]:
import pandas as pd
import numpy as np
import pickle

In [35]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [36]:
# Read the augmented training set and replace missing prices with 0 so the
# price column contains no NaN values. The preview below shows the first rows.
training_data = pd.read_csv("augmented_training_data.csv").assign(
    price=lambda frame: frame['price'].fillna(0)
)
training_data.head()

Unnamed: 0,brand,name,price,rating,product_type,quantity_sold,recommendation
0,rejuva minerals,Multi Purpose Powder - Blush & Eye,0.0,3.1,blush,7,no
1,marienatie,Mineral Blush,0.0,1.3,blush,4,no
2,lotus cosmetics usa,Creme to Powder Blush,0.0,1.9,blush,8,no
3,glossier,Cloud Paint,22.0,3.2,blush,5,no
4,nyx,Sweet Cheeks Blush Palette,20.0,2.7,blush,7,no


In [43]:
# One-hot encode the brand column. Dense output is requested so the result can
# be stacked with the other NumPy feature arrays; handle_unknown='ignore' makes
# brands not seen during fitting encode to all zeros instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
brand_column = training_data[['brand']]
brand_encoded = ohe.fit(brand_column).transform(brand_column)

# Persist the fitted encoder to disk.
with open("ohe.pkl", "wb") as encoder_file:
    pickle.dump(ohe, encoder_file)

# DataFrame view of the encoded brands (not referenced by the visible cells).
bdf = pd.DataFrame(data=brand_encoded)



In [38]:
# TF-IDF features over product names, capped at a 50-term vocabulary.
# Missing names are replaced with "" so the vectorizer only sees strings.
tfidf = TfidfVectorizer(max_features=50)
product_names = training_data['name'].fillna("")
name_tfidf_sparse = tfidf.fit_transform(product_names)
name_vectorized = name_tfidf_sparse.toarray()

# Persist the fitted vectorizer to disk.
with open("tfidf.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [39]:
# Rescale price with a min-max scaler fitted on the training prices.
price_column = training_data[['price']]
scaler = MinMaxScaler().fit(price_column)
price_scaled = scaler.transform(price_column)

# Persist the fitted scaler to disk.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [40]:
# Concatenate the three feature groups column-wise, in the order
# brand one-hot | name TF-IDF | scaled price. Rows stay aligned with training_data.
feature_blocks = (brand_encoded, name_vectorized, price_scaled)
feature_matrix = np.hstack(feature_blocks)

In [41]:
# Fit a cosine-distance nearest-neighbour index over the feature matrix.
# The default query size is 6 neighbours.
nn_model = NearestNeighbors(metric='cosine', n_neighbors=6).fit(feature_matrix)

# Persist the fitted model to disk.
with open("nn_model.pkl", "wb") as model_file:
    pickle.dump(nn_model, model_file)

In [42]:

query_index = 641  # row of training_data used as the "purchased" product
TOP_K = 5          # number of recommendations to print

query_vector = feature_matrix[query_index].reshape(1, -1)


def _normalized_name(raw_name):
    """Return a stripped, lower-cased product name; a missing name becomes ''.

    The name column can contain NaN (the TF-IDF cell fills it with ""), and
    calling .strip() on a NaN float would raise AttributeError.
    """
    return "" if pd.isna(raw_name) else str(raw_name).strip().lower()


query_name = _normalized_name(training_data.iloc[query_index]['name'])

# Start with the model's default neighbour count. Because the query product
# and duplicate names are filtered out below, that may leave fewer than TOP_K
# items; in that case the search is widened (doubling the candidate count)
# until enough unique products are found or every row has been considered.
n_rows = feature_matrix.shape[0]
n_candidates = min(nn_model.n_neighbors, n_rows)
while True:
    distances, indices = nn_model.kneighbors(query_vector, n_neighbors=n_candidates)

    seen_names = set()
    unique_recs = []
    for idx in indices[0]:
        rec = training_data.iloc[idx]
        name = _normalized_name(rec['name'])

        # Skip rows that share the query's name and any name already collected.
        if name != query_name and name not in seen_names:
            seen_names.add(name)
            unique_recs.append(rec)

    if len(unique_recs) >= TOP_K or n_candidates >= n_rows:
        break
    n_candidates = min(n_candidates * 2, n_rows)


print("You bought:")
print(training_data.iloc[query_index][['brand', 'name']])

print("\n Recommended Products:")

for rec in unique_recs[:TOP_K]:
    print(f"- {rec['brand']} — {rec['name']} ({rec['product_type']}) • ${rec['price']}")


You bought:
brand                           nyx
name     Lip Lustre Glossy Lip Tint
Name: 641, dtype: object

 Recommended Products:
- nyx — Epic Ink Lip Dye (lipstick) • $7.0
- nyx — Lip Lingerie (lipstick) • $7.0
- nyx — Strictly Vinyl Lip Gloss (lipstick) • $8.0
- nyx — Duo Chromatic Lip Gloss (lipstick) • $8.0
- nyx — V'Amped Up! Lip Top Coat (lipstick) • $6.0


In [45]:
# Load the product data CSV into its own frame, separate from training_data.
product_csv_path = "product_data.csv"
testdf = pd.read_csv(product_csv_path)

In [50]:
# Display the column labels of testdf.
testdf.columns

Index(['Unnamed: 0', 'id', 'brand', 'name', 'price', 'price_sign', 'currency',
       'image_link', 'product_link', 'website_link', 'description', 'rating',
       'category', 'product_type', 'tag_list', 'created_at', 'updated_at',
       'product_api_url', 'api_featured_image', 'product_colors'],
      dtype='object')