# Introduction

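The goal of this notebook is to implement a similar-products search for the televisions dataset: product names are embedded with a sentence-transformer model, stored in a vector database, and queried by text similarity.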

In [None]:
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions


In [2]:
# Sentence-embedding checkpoint; the same name is passed to the Chroma embedding function further down.
# NOTE(review): this was described as one of the best models on the MTEB benchmark —
# confirm against the current leaderboard before relying on that claim.
model_name = 'sentence-transformers/paraphrase-MiniLM-L6-v2'

# Load the checkpoint with sentence-transformers.
model = SentenceTransformer(model_name)

In [None]:
# CSV with the television listings, addressed relative to the notebook's working directory.
data_path = Path("..//data//Televisions.csv")

# Read the CSV into a DataFrame and preview the first rows.
df = pd.read_csv(data_path)
df.head()

In [4]:
# Chroma client whose data is kept under the relative path ../televisions_db.
chroma_client = chromadb.PersistentClient(path="..//televisions_db")

In [5]:
# Embedding function built from the model name configured above; the collection
# uses it to turn text into vectors.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_name,
)

# Open the "televisions_db" collection (the vector database), creating it on
# first use, and attach the embedding function to it.
collection = chroma_client.get_or_create_collection(
    embedding_function=sentence_transformer_ef,
    name="televisions_db",
)

In [None]:
# Add all the data to the vector database. ChromaDB automatically converts and stores the text as vector embeddings. This may take a few minutes.
# Index every product name in the vector database. ChromaDB converts the text to
# vector embeddings automatically, so this may take a few minutes.
#
# upsert (rather than add) keeps this cell idempotent: the collection lives in a
# persistent store and is reopened with get_or_create_collection, so on a re-run
# the row ids already exist. upsert inserts new ids and overwrites existing ones,
# which also lets changes in the CSV reach the index.
collection.upsert(
    documents=df["name"].tolist(),
    # Keep the product link next to each document so query hits can be traced back.
    metadatas=[{"url": url} for url in df["link"].tolist()],
    # Chroma ids must be strings; the DataFrame index is used as the id.
    ids=[str(row_id) for row_id in df.index.tolist()],
)

In [8]:
# Look up the five stored product names closest to a free-text query, asking
# Chroma to return the documents together with their distances and metadata.
results = collection.query(
    include=["documents", "distances", "metadatas"],
    n_results=5,
    query_texts=["samsung 42 inch ultra"],
)

# Show only the matched product names.
results["documents"]

[['Samsung 1m 09cm (43") Smart Monitor with Built-in Speaker - LS43AM704UWXXL',
  'Samsung The Serif Series 138 cm (55 inches) 4K Ultra HD Smart QLED TV QA55LS01TAKXXL (Cloud White)',
  'Vu 215cm (85 inches) The Masterpiece 4K Ultra HD Android QLED TV 85QPX (Armani Gold) (2020 Model)',
  'Samsung The Serif Series 123 cm (49 Inches) 4K Ultra HD Smart QLED TV QA49LS01TAKXXL (Cloud White)',
  'Samsung 32-Inch(80Cm) 3840 x 2160 Pixels M7 4K UHD Smart Monitor, Type-C, Smart TV apps, TV Plus, Office 365, Apple Airpla...']]