In [81]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from langchain.embeddings import GPT4AllEmbeddings

In [82]:
with open('FinalProductsList.txt', 'r', encoding="utf8") as f:
    document = f.readlines()

In [83]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(document)]

In [84]:
# model = Doc2Vec(documents, vector_size=100, min_count=1, workers=4, epochs=10)
model = Doc2Vec(documents, vector_size=50, min_count=1, workers=4, epochs=5)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [85]:
print(f"Corpus Count: {model.corpus_count}")
print(f"Vector Size: {model.vector_size}")
print(f"Number of Document Vectors: {len(model.dv)}")
print(f"Number of Word Vectors: {len(model.wv)}")

Corpus Count: 19
Vector Size: 50
Number of Document Vectors: 19
Number of Word Vectors: 77


In [86]:
query = "Show me watches under Rs. 200"
query_vector = model.infer_vector(query.split())

In [87]:
similar_documents = model.dv.most_similar([query_vector])
print("Similar Documents:", similar_documents)

Similar Documents: [(16, 0.06442073732614517), (14, 0.060320742428302765), (0, 0.05943944305181503), (10, 0.0587504543364048), (18, 0.05813738331198692), (2, 0.05607957765460014), (6, 0.055554721504449844), (17, 0.05519576743245125), (9, 0.053619056940078735), (11, 0.05275766924023628)]


In [88]:
for idx, score in similar_documents:
    print(f"Similarity: {score} Document Text: {document[idx]}")


Similarity: 0.06442073732614517 Document Text: Product 17: Product Name = Qiunery YESURPRISE Unisex Antique Case Vintage Brass Rib Chain Quartz Pocket Watch Train, Product Category = Watches Sunglasses Jewellery/Watches/Men/Fashion, Brand Name = No Brand, Seller Name = Qiunery, URL = https://www.daraz.pk/products/qiunery-yesur-i202992571-s1402460810.html?search=1, Price Details = Original: Rs. 1790, Discounted: Rs. 1250, Positive Seller Ratings = 82%, Ship on Time = 100%, Return Policy = 14 days free & easy return (Change of mind is not applicable)

Similarity: 0.060320742428302765 Document Text: Product 15: Product Name = BUISNESS WATCH CLASSIC LUXURY ROUND DILE STEEL STRAPS WITH SILVER HAND|WRIST BRACELT CHAIN FOR BOYS MENS, Product Category = Watches Sunglasses Jewellery/Watches/Men/Business, Brand Name = No Brand, Seller Name = Finds Shop, URL = https://www.daraz.pk/products/-i438130449-s2118958341.html?search=1, Price Details = Original: Rs. 999, Discounted: Rs. 457 | Original: Rs

In [89]:
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=0)


In [90]:
loader = TextLoader("FinalProductsList.txt")
docs = loader.load()
splits = text_splitter.split_documents(docs)

Created a chunk of size 463, which is longer than the specified 200
Created a chunk of size 675, which is longer than the specified 200
Created a chunk of size 561, which is longer than the specified 200
Created a chunk of size 708, which is longer than the specified 200
Created a chunk of size 550, which is longer than the specified 200
Created a chunk of size 526, which is longer than the specified 200
Created a chunk of size 638, which is longer than the specified 200
Created a chunk of size 502, which is longer than the specified 200
Created a chunk of size 490, which is longer than the specified 200
Created a chunk of size 533, which is longer than the specified 200
Created a chunk of size 715, which is longer than the specified 200
Created a chunk of size 697, which is longer than the specified 200
Created a chunk of size 455, which is longer than the specified 200
Created a chunk of size 511, which is longer than the specified 200
Created a chunk of size 552, which is longer tha

In [91]:
# # %%
# # Use Sentence Transformers to generate embeddings for example sentences
# sentences = ["This is an example sentence", "Each sentence is converted"]
# sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
# embeddings = sentence_model.encode(sentences)
# print("Embeddings Shape:", embeddings.shape)


In [92]:
persist_directory = 'chroma/ProductsSearch'
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=GPT4AllEmbeddings(),
    persist_directory=persist_directory
)
vectordb.persist()

In [93]:
docs = vectordb.similarity_search_with_score(query, k=3)
for result in docs:
    print("\n")
    print(result[1])
    print(result[0].page_content)



0.94700026512146
Product 03: Product Name = Sports Digtal Fashion Watch Women Men Square LED Watch Silicone Electronic Watch Women's Watches Clock, Product Category = Watches Sunglasses Jewellery/Watches/Men/Sports, Brand Name = No Brand, Seller Name = Modern watch company, URL = https://www.daraz.pk/products/led-i431801201-s2265468211.html?search=1, Price Details = Original: Rs. 899, Discounted: Rs. 300 | Original: Rs. 650, Discounted: Rs. 199, Positive Seller Ratings = 79%, Ship on Time = 97%, Return Policy = 14 days free & easy return (Change of mind is not applicable)


0.94700026512146
Product 03: Product Name = Sports Digtal Fashion Watch Women Men Square LED Watch Silicone Electronic Watch Women's Watches Clock, Product Category = Watches Sunglasses Jewellery/Watches/Men/Sports, Brand Name = No Brand, Seller Name = Modern watch company, URL = https://www.daraz.pk/products/led-i431801201-s2265468211.html?search=1, Price Details = Original: Rs. 899, Discounted: Rs. 300 | Origina

In [94]:
with open("search_results.txt", "w") as file:
    for result in docs:
        # Write a separator for each result
        file.write("\n")
        file.write(str(result[1]) + "\n")  # Convert the title to string before writing
        file.write(result[0].page_content + "\n")  # Write the page content

In [95]:
# with open("search_results.txt", "w") as file:
#     response_number = 1
#     for result in docs:
#         file.write("response{} =".format(response_number))
#         # file.write(str(result[1]) + "\n")  # Convert the title to string before writing
#         file.write(result[0].page_content + "\n")  
#         response_number += 1