In [50]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from langchain.embeddings import GPT4AllEmbeddings

In [51]:
with open('DarazData.txt', 'r', encoding="utf8") as f:
    document = f.readlines()

In [52]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(document)]

In [53]:
# model = Doc2Vec(documents, vector_size=100, min_count=1, workers=4, epochs=10)
model = Doc2Vec(documents, vector_size=50, min_count=1, workers=4, epochs=5)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [54]:
print(f"Corpus Count: {model.corpus_count}")
print(f"Vector Size: {model.vector_size}")
print(f"Number of Document Vectors: {len(model.dv)}")
print(f"Number of Word Vectors: {len(model.wv)}")

Corpus Count: 488
Vector Size: 50
Number of Document Vectors: 488
Number of Word Vectors: 93


In [55]:
query = "What are your refund policies?"
query_vector = model.infer_vector(query.split())

In [56]:
similar_documents = model.dv.most_similar([query_vector])
print("Similar Documents:", similar_documents)

Similar Documents: [(383, 0.27928614616394043), (73, 0.2224072515964508), (36, 0.20126929879188538), (58, 0.17943215370178223), (72, 0.16449636220932007), (152, 0.1568303108215332), (157, 0.1521310657262802), (400, 0.14925546944141388), (392, 0.14541473984718323), (84, 0.14480553567409515)]


In [57]:
for idx, score in similar_documents:
    print(f"Similarity: {score} Document Text: {document[idx]}")


Similarity: 0.27928614616394043 Document Text: When can I return my item? Return request must be raised within 14 days for Daraz Mall items and non-Daraz Mall items from the date of delivery. You can initiate the return request once the order status has been updated to “Delivered”. If the order has been delivered and the status has not updated, it will be updated within 24-48 hours. The return window will only begin when the status has been updated. Item is damaged, defective, or expired Item is incorrect, counterfeit or not as advertised Item is of incorrect size or does not fit you Item, freebie, or accessory is missing You changed your mind Men's/Women's Bags Luggages and Suitcases Men's/Women's Fashion Bedding Bath Exceptions are Duffel bags, innerwear and overseas products Note: Our marketplace items' 14-day return policy covers all orders placed on October 25th and onwards.

Similarity: 0.2224072515964508 Document Text: What is the return policy for Daraz Mart products? Item is d

In [58]:
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=800, chunk_overlap=0)


In [59]:
loader = TextLoader("DarazData.txt")
docs = loader.load()
splits = text_splitter.split_documents(docs)

Created a chunk of size 26958, which is longer than the specified 800
Created a chunk of size 1422, which is longer than the specified 800
Created a chunk of size 1350, which is longer than the specified 800
Created a chunk of size 868, which is longer than the specified 800
Created a chunk of size 871, which is longer than the specified 800
Created a chunk of size 1285, which is longer than the specified 800
Created a chunk of size 2500, which is longer than the specified 800
Created a chunk of size 1005, which is longer than the specified 800
Created a chunk of size 2032, which is longer than the specified 800
Created a chunk of size 945, which is longer than the specified 800
Created a chunk of size 976, which is longer than the specified 800
Created a chunk of size 860, which is longer than the specified 800
Created a chunk of size 823, which is longer than the specified 800
Created a chunk of size 1537, which is longer than the specified 800
Created a chunk of size 911, which is l

In [60]:
# # %%
# # Use Sentence Transformers to generate embeddings for example sentences
# sentences = ["This is an example sentence", "Each sentence is converted"]
# sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
# embeddings = sentence_model.encode(sentences)
# print("Embeddings Shape:", embeddings.shape)


In [61]:
persist_directory = 'chroma/DarazSearch'
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=GPT4AllEmbeddings(),
    persist_directory=persist_directory
)
vectordb.persist()

In [62]:
docs = vectordb.similarity_search_with_score(query, k=3)
for result in docs:
    print("\n")
    print(result[1])
    print(result[0].page_content)



0.6251944899559021
Refund Process:


0.805809497833252
Refund Voucher


0.8660830855369568
Free & Easy Returns Policy:


In [63]:
# with open("search_results.txt", "w") as file:
#     for result in docs:
#         # Write a separator for each result
#         file.write("\n")
#         file.write(str(result[1]) + "\n")  # Convert the title to string before writing
#         file.write(result[0].page_content + "\n")  # Write the page content

In [64]:
print("question = \"" + query + "\"")
count = 1
for result in docs:
    print(f"response" + str(count) + " = \"" + result[0].page_content + "\"")  
    count+=1

question = "What are your refund policies?"
response1 = "Refund Process:"
response2 = "Refund Voucher"
response3 = "Free & Easy Returns Policy:"


In [65]:
# with open("search_results.txt", "w") as file:
#     response_number = 1
#     for result in docs:
#         file.write("response{} =".format(response_number))
#         # file.write(str(result[1]) + "\n")  # Convert the title to string before writing
#         file.write(result[0].page_content + "\n")  
#         response_number += 1