In [None]:
# 2. Fetch Data
# Request only the fields the later cells read.
PRODUCT_FIELDS = ["_id", "name", "description", "category"]
cursor = collection.find({}, {field: 1 for field in PRODUCT_FIELDS})
products = list(cursor)

# Pass the column list explicitly: a DataFrame built from a list of dicts only
# gets columns for keys that actually occur, so a field missing from every
# document (or an empty result) would otherwise leave the column out and make
# the preprocessing cell fail with a KeyError. With `columns=` the column is
# always present and simply holds NaN where the value is absent.
df = pd.DataFrame(products, columns=PRODUCT_FIELDS)

print(f"Loaded {len(df)} products")
df.head()

In [None]:
# 3. Preprocess Data
# Build one lowercase text field per product: name, description and category
# joined by single spaces, with missing values treated as empty strings.
combined_text = df['name'].fillna('')
for extra_column in ('description', 'category'):
    combined_text = combined_text + " " + df[extra_column].fillna('')
df['text'] = combined_text.str.lower()

# String form of every _id, in row order, so a row position returned by the
# model can be translated back to a product id.
id_strings = df['_id'].astype(str)
product_ids = id_strings.tolist()

In [None]:
# 4. Vectorization (Convert text to numbers)
# TF-IDF over the combined text column; English stop words are removed and the
# vocabulary is limited to 5000 features.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Learn the vocabulary and produce the document-term matrix in one pass.
tfidf_matrix = vectorizer.fit_transform(df['text'])

print(f"Matrix shape: {tfidf_matrix.shape}")

In [None]:
# 5. Train Nearest Neighbors Model
# Brute-force cosine-distance search over the TF-IDF rows.
#
# kneighbors() raises ValueError when the requested neighbour count exceeds
# the number of fitted samples, so a catalogue with fewer than 5 products
# would produce a model that fails on every default query. Clamp the default
# to the number of rows actually being fitted.
n_neighbors = min(5, tfidf_matrix.shape[0])
knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', algorithm='brute')
knn.fit(tfidf_matrix)

print("Model trained successfully")

In [None]:
# 6. Save Artifacts
# Pickle the fitted vectorizer, the KNN model and the product-id list into the
# artifacts directory (relative to the current working directory).
ARTIFACT_DIR = 'artifacts'

# exist_ok=True replaces the check-then-create pair: it is a no-op when the
# directory is already there and avoids the gap between the check and the
# creation.
os.makedirs(ARTIFACT_DIR, exist_ok=True)

artifacts = {
    'vectorizer.pkl': vectorizer,
    'model.pkl': knn,
    'product_ids.pkl': product_ids,
}
for filename, obj in artifacts.items():
    with open(os.path.join(ARTIFACT_DIR, filename), 'wb') as f:
        pickle.dump(obj, f)

# Report the location that was actually written instead of a hard-coded path,
# which is only correct for one particular working directory.
print(f"Artifacts saved to {os.path.abspath(ARTIFACT_DIR)}")

In [None]:
# 7. Test Prediction
# Sanity-check the fitted model by querying it with a product that is already
# in the catalogue.
test_idx = 0
query_text = df.iloc[test_idx]['text']
query_vec = vectorizer.transform([query_text])

# The query row is part of the fitted index, so it comes back as its own
# nearest neighbour (cosine distance 0). Ask for one extra neighbour, capped at
# the number of rows available, and drop the query row from the results.
n_query = min(knn.n_neighbors + 1, len(df))
distances, indices = knn.kneighbors(query_vec, n_neighbors=n_query)

# Filter by row position rather than dropping the first hit: a product with
# identical text can tie at distance 0 and be ranked ahead of the query row.
recommended = [i for i in indices[0] if i != test_idx][:knn.n_neighbors]

print(f"Query: {df.iloc[test_idx]['name']}")
print("Recommendations:")
for i in recommended:
    print(f" - {df.iloc[i]['name']} ({product_ids[i]})")