# Storing Data in MongoDB Atlas

In [1]:
# Load the product dataset from disk

import pandas as pd

# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source.
dataset_path = '../data/datasets.pkl'
df = pd.read_pickle(dataset_path)
print("Dataset loaded successfully:", df.shape)
df.head()

Dataset loaded successfully: (1262, 19)


Unnamed: 0,title,brand,feature,rank,date,asin,imageURL,imageURLHighRes,description,price,also_view,also_buy,fit,details,similar_item,tech1,gender,material,category
69,Buxton Heiress Pik-Me-Up Framed Case,Buxton,"['Leather', 'Imported', 'synthetic lining', 'F...","43,930inClothing,Shoesamp;Jewelry(",5 star,B00007GDFV,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,['Authentic crunch leather with rich floral em...,16.95,"[B07C9V84JD, B01J6JE05G, B07J11WZ5Y, B07JJQFHS...","[B07C9V84JD, B01J6JE05G, B07JJQFHS5, B003EGITU...","class=""a-normal a-align-center a-spacing-smal...",,,,female,Leather,Accessories
352,Disguise Tiny Treats Pink Leopard,,"['polyester', 'You can return this item for an...","5,843,505inClothing,ShoesJewelry(",5 star,B0002C6NL6,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,['A grrreat pink jumpsuit with attached tail a...,25.99,,,,,,,female,Polyester,Dresses/Jumpsuits
410,Dream PJ's Blue - Large - Part #: 25BLG,Ethical/Spot,['Product Dimensions:\n \n8...,"17,183,425inClothing,Shoesamp;Jewelry(",5 star,B0002TKBSU,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,"['SOFT AND CUDDLY, SWEET DREAM PAJAMAS IN SOFT...",15.99,,,,,,,unisex,Unknown,Undergarments/Sleepwear
512,Buxton Heiress Pik-Me-Up Framed Case,Buxton,"['Leather', 'Imported', 'synthetic lining', 'F...","43,930inClothing,Shoesamp;Jewelry(",5 star,B00007GDFV,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,['Authentic crunch leather with rich floral em...,16.95,"[B07C9V84JD, B01J6JE05G, B07J11WZ5Y, B07JJQFHS...","[B07C9V84JD, B01J6JE05G, B07JJQFHS5, B003EGITU...","class=""a-normal a-align-center a-spacing-smal...",,,,female,Leather,Accessories
795,Disguise Tiny Treats Pink Leopard,,"['polyester', 'You can return this item for an...","5,843,505inClothing,ShoesJewelry(",5 star,B0002C6NL6,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,['A grrreat pink jumpsuit with attached tail a...,25.99,,,,,,,female,Polyester,Dresses/Jumpsuits


In [2]:
# Build Document Objects

from haystack import Document


def _value_or_default(row, key, default=""):
    """Return ``row[key]``, or ``default`` when the key is absent or the value is missing.

    ``Series.get`` only falls back to its default for a missing key. A column
    that exists but holds NaN/None would otherwise be passed through unchanged
    and end up in the stored metadata.
    """
    value = row.get(key, default)
    # Only test scalars: pd.isna on a list-like cell returns an array, which
    # cannot be used as a single truth value.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    return value


documents = []
for index, row in df.iterrows():
    # The description is stringified, then surrounding square brackets and
    # single quotes are trimmed. A missing description becomes an empty string
    # instead of the literal text "nan".
    description = str(_value_or_default(row, "description")).strip("[]").strip("''")
    doc = Document(
        # Text that will be embedded: title on the first line, description below.
        content=f"{_value_or_default(row, 'title')}\n{description}",
        meta={
            "asin": _value_or_default(row, "asin"),
            "title": _value_or_default(row, "title"),
            "brand": _value_or_default(row, "brand"),
            # A missing price falls back to 0, the default the lookup already declared.
            "price": float(_value_or_default(row, "price", 0)),
            "gender": _value_or_default(row, "gender"),
            "material": _value_or_default(row, "material"),
            "category": _value_or_default(row, "category")
        }
    )
    documents.append(doc)

print(f"Built {len(documents)} documents.")

Built 1262 documents.


In [3]:
# MongoDB connection setup

import os
from getpass import getpass

# Prompt for the connection string only when the environment does not already
# define it, so the secret is never written into the notebook itself.
if os.environ.get("MONGO_CONNECTION_STRING") is None:
    os.environ["MONGO_CONNECTION_STRING"] = getpass(
        "Masukkan MongoDB Connection String Anda: "
    )

In [4]:
# Setup Document Store

from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore

# No connection string is passed here; presumably the store picks it up from the
# MONGO_CONNECTION_STRING environment variable set earlier (confirm against the
# integration's documentation).
document_store = MongoDBAtlasDocumentStore(
    database_name="depato_store",
    collection_name="products",
    vector_search_index="vector_index",
    full_text_search_index="search_index",
)

# Run a real query before reporting success. Depending on the integration
# version, constructing the store may not contact the server, so a bad
# connection string or a missing collection would otherwise surface only later.
existing_count = document_store.count_documents()

print("Connected to MongoDB Atlas successfully!")
print(f"Collection currently holds {existing_count} documents.")

Connected to MongoDB Atlas successfully!


In [5]:
# Build the embedding + storing pipeline

from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy

embedder_model = "sentence-transformers/all-mpnet-base-v2"

# Stage 1 computes an embedding for every document; stage 2 writes the result
# to the document store, asking it to overwrite documents that already exist.
document_embedder = SentenceTransformersDocumentEmbedder(model=embedder_model)
document_writer = DocumentWriter(
    document_store=document_store,
    policy=DuplicatePolicy.OVERWRITE,
)

pipeline = Pipeline()
pipeline.add_component("embedder", document_embedder)
pipeline.add_component("writer", document_writer)
pipeline.connect("embedder", "writer")

print(f"Embedding documents using: {embedder_model}")

# Only the embedder needs external input; the writer is fed by the connection above.
pipeline_inputs = {"embedder": {"documents": documents}}
pipeline.run(pipeline_inputs)

print("Documents stored successfully to MongoDB Atlas.")

  from .autonotebook import tqdm as notebook_tqdm


Embedding documents using: sentence-transformers/all-mpnet-base-v2


Batches: 100%|██████████| 40/40 [00:13<00:00,  2.90it/s]


Documents stored successfully to MongoDB Atlas.


In [6]:
# Store Materials & Categories in Separate Collections

from pymongo import MongoClient


def _replace_collection_contents(collection, docs):
    """Empty ``collection`` and repopulate it with ``docs``.

    The collection is cleared first so repeated runs do not accumulate
    duplicates. The insert is skipped when ``docs`` is empty because
    ``insert_many`` rejects an empty list.
    """
    collection.delete_many({})
    if docs:
        collection.insert_many(docs)


client = MongoClient(os.environ["MONGO_CONNECTION_STRING"])
db = client.depato_store

# Distinct, non-missing values of each lookup column.
materials = df["material"].dropna().unique().tolist()
categories = df["category"].dropna().unique().tolist()

material_docs = [{"name": m} for m in materials]
category_docs = [{"name": c} for c in categories]

# Clear the old collections first so re-running the cell does not create duplicates.
_replace_collection_contents(db.materials, material_docs)
_replace_collection_contents(db.categories, category_docs)

print(f"Stored {len(material_docs)} materials and {len(category_docs)} categories successfully.")

Stored 30 materials and 10 categories successfully.
