# Using Sentence Embedding

In [48]:
# %%
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import re

search_query = "What are your refund policies?"
search_query = re.sub(r'\bDaraz\b\s*', '', search_query, flags=re.IGNORECASE)
file_name = "DarazDataMain.txt"
chunkSize = 1500

# %%
# Load the data from the text file
with open(file_name, 'r', encoding="utf8") as f:
    products = f.readlines()

# %%
# Initialize the sentence transformer model for generating embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each product
product_embeddings = model.encode(products, convert_to_tensor=True)

In [49]:
# %%
# Define a function to search for similar products
def search_products(query, k):
    query_embedding = model.encode(query, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(query_embedding, product_embeddings)[0]
    top_results = torch.topk(cosine_scores, k=k)

    print("Query:", query)
    for score, idx in zip(top_results[0], top_results[1]):
        print("\nScore:", score.item())
        print("Product Details:", products[idx])

# %%
search_products(search_query, 2)

Query: What are your refund policies?

Score: 0.5706478953361511
Product Details: 1.19 “Returns and Refunds Policy” shall mean the applicable Company policies which govern the procedure for returns and refunds of Products by Customers on the relevant Channels located at Returns and Refunds Policy of Pakistan.


Score: 0.549439549446106
Product Details: Home & Living Bedding & Bath, Furniture & Lighting, Kitchen & Dining, Home Décor, Home Improvements, Household & Home Storage Supplies, Lawn & Garden,Other Accessories Change of mind isnot applicable for return and refund. If the item received is damaged, defective, incorrect, or incomplete, a refund will be issued based on Daraz's assessment. Note: For device-related issues after usage or expiration of return policy period, please check if the item is covered under Seller orBrand Warranty. Refer to our Warranty Policy for information on the different warranty types and ways to contact the seller/manufacturer. Items that are non-returnab

# Using VectorDB

In [50]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, NLTKTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from langchain.embeddings import GPT4AllEmbeddings

In [51]:
query = search_query
loader = TextLoader(file_name)
docs = loader.load()

In [52]:
chunker01 = "Fixed-size (in characters) Overlapping Sliding Window"

text_splitter = CharacterTextSplitter(separator="\n", chunk_size=chunkSize, chunk_overlap=20)
splits = text_splitter.split_documents(docs)

persist_directory = 'chroma/' + str(file_name) + " - Chunker = " + chunker01
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=GPT4AllEmbeddings(),
    persist_directory=persist_directory
)
vectordb.persist()
docs01 = vectordb.similarity_search_with_score(query, k=2)

Created a chunk of size 26958, which is longer than the specified 1500
Created a chunk of size 2500, which is longer than the specified 1500
Created a chunk of size 2032, which is longer than the specified 1500
Created a chunk of size 1537, which is longer than the specified 1500
Created a chunk of size 1662, which is longer than the specified 1500
Created a chunk of size 2178, which is longer than the specified 1500
Created a chunk of size 1608, which is longer than the specified 1500
Created a chunk of size 2486, which is longer than the specified 1500
Created a chunk of size 2510, which is longer than the specified 1500
Created a chunk of size 1754, which is longer than the specified 1500
Created a chunk of size 3013, which is longer than the specified 1500


In [53]:
chunker02 = "Recursive Structure Aware Splitting"

text_splitter = RecursiveCharacterTextSplitter(separators = ["\n\n", "\n"], chunk_size=chunkSize, chunk_overlap=20)
splits = text_splitter.split_documents(docs)

persist_directory = 'chroma/' + str(file_name) + " - Chunker = " + chunker02
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=GPT4AllEmbeddings(),
    persist_directory=persist_directory
)
vectordb.persist()
docs02 = vectordb.similarity_search_with_score(query, k=2)

In [54]:
# import spacy

# chunker03 = "Sentence splitter"

# def tokenize_and_chunk(text, spacy_model="en_core_web_sm", overlap=0, stride=1):
#     nlp = spacy.load(spacy_model)
#     sentences = list(nlp(text).sents)
#     chunks = []

#     for i in range(0, len(sentences), stride):
#         chunk_text = " ".join(str(sent) for sent in sentences[i:i + overlap + 1])
#         chunks.append(chunk_text)

#     return chunks

# # Now using the 'page_content' attribute to get the document text
# splits = [tokenize_and_chunk(doc.page_content) for doc in docs]

# # Assuming `file_name` is defined earlier in your code.
# persist_directory = 'chroma/' + str(file_name) + " - Chunker = " + chunker03
# vectordb = Chroma.from_documents(
#     documents=splits,
#     embedding=GPT4AllEmbeddings(),
#     persist_directory=persist_directory
# )
# vectordb.persist()
# docs03 = vectordb.similarity_search_with_score(query, k=2)


In [55]:
# chunker03 = "Structure Aware Splitting (by Sentence, Paragraph)"

# splits = docs.split(".")

# persist_directory = 'chroma/' + str(file_name) + " - Chunker = " + chunker03
# vectordb = Chroma.from_documents(
#     documents=splits,
#     embedding=GPT4AllEmbeddings(),
#     persist_directory=persist_directory
# )
# vectordb.persist()
# docs03 = vectordb.similarity_search_with_score(query, k=2)

In [56]:
chunker04 = "NLP Chunking: Tracking Topic Changes"

text_splitter = NLTKTextSplitter(chunk_size=chunkSize)#, separator="\n")
splits = text_splitter.split_documents(docs)

persist_directory = 'chroma/' + str(file_name) + " - Chunker = " + chunker04
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=GPT4AllEmbeddings(),
    persist_directory=persist_directory
)
vectordb.persist()
docs04 = vectordb.similarity_search_with_score(query, k=2)

Created a chunk of size 2157, which is longer than the specified 1500
Created a chunk of size 1692, which is longer than the specified 1500
Created a chunk of size 1967, which is longer than the specified 1500
Created a chunk of size 1830, which is longer than the specified 1500


In [57]:
# print("question = \"" + query + "\"")

# print("\n" + chunker01)
# count = 1
# for result in docs01:
#     print(f"response" + str(count) + " = \"" + result[0].page_content + "\"")  
#     count+=1


# print("\n" + chunker02)
# count = 1
# for result in docs02:
#     print(f"response" + str(count) + " = \"" + result[0].page_content + "\"")  
#     count+=1

# print("\n" + chunker03)
# count = 1
# for result in docs03:
#     print(f"response" + str(count) + " = \"" + result[0].page_content + "\"")  
#     count+=1

# print("\n" + chunker04)
# count = 1
# for result in docs04:
#     print(f"response" + str(count) + " = \"" + result[0].page_content + "\"")  
#     count+=1

In [58]:
print("question = \"" + query + "\"")
# Open a file to write the output
with open("output.txt", "w") as file:
    # file.write(f"question = \"{query}\"\n")

    # Handle the output for chunker01
    # file.write(f"\n{chunker01}\n")
    count = 1
    for result in docs01:
        file.write(f"response{count} = \"{result[0].page_content}\"\n")
        count += 1

    # Handle the output for chunker02
    # file.write(f"\n{chunker02}\n")
    count = 1
    for result in docs02:
        file.write(f"response{count} = \"{result[0].page_content}\"\n")
        count += 1

    # # Handle the output for chunker03
    # file.write(f"\n{chunker03}\n")
    # count = 1
    # for result in docs03:
    #     file.write(f"response{count} = \"{result[0].page_content}\"\n")
    #     count += 1

    # Handle the output for chunker04
    # file.write(f"\n{chunker04}\n")
    count = 1
    for result in docs04:
        file.write(f"response{count} = \"{result[0].page_content}\"\n")
        count += 1

question = "What are your refund policies?"


In [61]:
# import re
# import textwrap

# def clean_text(text):
#     # Remove any image file references
#     text = re.sub(r"\S+\.(png|jpg|jpeg|gif)\s*", "", text)

#     # Normalize spacing issues
#     text = re.sub(r"\s+", " ", text).strip()

#     # Correct common typographical errors
#     text = re.sub(r"isnot", "is not", text)
#     text = re.sub(r"orBrand", "or Brand", text)
#     text = re.sub(r"ourWarranty", "our Warranty", text)

#     # Remove redundant response indicators
#     text = re.sub(r"response\d+\s*=\s*\"", "", text)

#     # Deduplicate text
#     lines = text.split('.')
#     seen = set()
#     unique_lines = []
#     for line in lines:
#         line_clean = line.strip()
#         if line_clean not in seen:
#             seen.add(line_clean)
#             unique_lines.append(line_clean)

#     # Reconstruct text with clean lines
#     cleaned_text = '. '.join(unique_lines).strip()
#     if not cleaned_text.endswith('.'):
#         cleaned_text += '.'

#     # Split into multiple lines with a reasonable width
#     wrapped_text = textwrap.fill(cleaned_text, width=100)

#     return wrapped_text

# # Example usage with a read from a file and clean
# with open('output.txt', 'r') as file:
#     raw_text = file.read()

# cleaned_text = clean_text(raw_text)

# # Optionally, write the cleaned text back to the file or use it directly
# with open('output.txt', 'w') as file:
#     file.write(cleaned_text)

# print("Text cleaned and saved to 'output.txt'")


Text cleaned and saved to 'output.txt'


In [63]:
import re
import textwrap

def clean_text(text):
    # Remove any image file references
    text = re.sub(r"\S+\.(png|jpg|jpeg|gif)\s*", "", text)

    # Normalize spacing issues
    text = re.sub(r"\s+", " ", text).strip()

    # Correct common typographical errors
    text = re.sub(r"isnot", "is not", text)
    text = re.sub(r"orBrand", "or Brand", text)
    text = re.sub(r"ourWarranty", "our Warranty", text)

    # Remove redundant response indicators
    text = re.sub(r"response\d+\s*=\s*\"", "", text)

    # Remove numbers followed by a dot, e.g., "1."
    text = re.sub(r"\d+\.", "", text)

    # Remove numbers followed directly by a dash, e.g., "2-"
    text = re.sub(r"\d+-", "", text)

    # Deduplicate text
    lines = text.split('.')
    seen = set()
    unique_lines = []
    for line in lines:
        line_clean = line.strip()
        if line_clean not in seen:
            seen.add(line_clean)
            unique_lines.append(line_clean)

    # Reconstruct text with clean lines
    cleaned_text = '. '.join(unique_lines).strip()
    if not cleaned_text.endswith('.'):
        cleaned_text += '.'

    # Split into multiple lines with a reasonable width
    wrapped_text = textwrap.fill(cleaned_text, width=100)

    return wrapped_text

# Example usage with a read from a file and clean
with open('output.txt', 'r') as file:
    raw_text = file.read()

cleaned_text = clean_text(raw_text)

# Optionally, write the cleaned text back to the file or use it directly
with open('output.txt', 'w') as file:
    file.write(cleaned_text)

print("Text cleaned and saved to 'output.txt'")


Text cleaned and saved to 'output.txt'


In [66]:
# # %%
# import re
# import numpy as np
# import gensim
# from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# from sklearn.metrics.pairwise import cosine_similarity

# search_query = "What are your refund policies?"
# search_query = re.sub(r'\bDaraz\b\s*', '', search_query, flags=re.IGNORECASE)
# file_name = "DarazDataMain.txt"
# chunkSize = 1500

# # %%
# # Load the data from the text file
# with open("output.txt", 'r', encoding="utf8") as f:
#     products = f.read().split('\n')

# # %%
# # Prepare the documents for training
# documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(products)]

# # Train a Doc2Vec model
# model = Doc2Vec(documents, vector_size=100, min_count=1, workers=4, epochs=5)
# # model = Doc2Vec(documents, vector_size=50, min_count=1, workers=4, epochs=5)

# # %%
# # Define a function to search for similar products
# def search_products(query, k):
#     # Generate query embedding
#     query_embedding = model.infer_vector(query.split())
    
#     # Generate embeddings for all products
#     product_embeddings = np.array([model.infer_vector(doc.words) for doc in documents])
    
#     # Calculate cosine similarities between the query and all product embeddings
#     similarities = cosine_similarity([query_embedding], product_embeddings)[0]
    
#     # Get the top k most similar products
#     top_indices = np.argsort(similarities)[::-1][:k]

#     print("Query:", query)
#     for index in top_indices:
#         print("\nScore:", similarities[index])
#         print("Product Details:", ' '.join(documents[index].words))

# # %%
# search_products(search_query, 2)

Query: What are your refund policies?

Score: 0.18136615
Product Details: Home Storage Supplies, Lawn & Garden,Other Accessories Change of mind is not applicable for return

Score: 0.17416027
Product Details: overseas). For overseas products, please refer to the product page to check the applicable return


In [65]:
# # %%
# from sentence_transformers import SentenceTransformer, util
# import numpy as np
# import torch
# import re

# search_query = "What are your refund policies?"
# search_query = re.sub(r'\bDaraz\b\s*', '', search_query, flags=re.IGNORECASE)
# file_name = "DarazDataMain.txt"
# chunkSize = 1500

# # %%
# # Load the data from the text file
# with open("output.txt", 'r', encoding="utf8") as f:
#     products = f.readlines()

# # %%
# # Initialize the sentence transformer model for generating embeddings
# model = SentenceTransformer('all-MiniLM-L6-v2')

# # Generate embeddings for each product
# product_embeddings = model.encode(products, convert_to_tensor=True)

# # %%
# # Define a function to search for similar products
# def search_products(query, k):
#     query_embedding = model.encode(query, convert_to_tensor=True)
#     cosine_scores = util.pytorch_cos_sim(query_embedding, product_embeddings)[0]
#     top_results = torch.topk(cosine_scores, k=k)

#     print("Query:", query)
#     for score, idx in zip(top_results[0], top_results[1]):
#         print("\nScore:", score.item())
#         print("Product Details:", products[idx])

# # %%
# search_products(search_query, 2)

Query: What are your refund policies?

Score: 0.6342750787734985
Product Details: policy? Refund against Return orders: the refund is processed if your return claim is deemed valid.


Score: 0.5977258682250977
Product Details: and refund. Items that are non-returnable: Any custom-made items" response1 = Does Daraz refund



In [62]:
# Read the entire content of the file and save it to the variable 'content'
with open('output.txt', 'r') as file:
    content = file.readlines()

# # Optionally, print the content to verify it's loaded correctly
# for line in content:
#     print(line)

Home & Living Bedding & Bath, Furniture & Lighting, Kitchen & Dining, Home Décor, Home Improvements,

Household & Home Storage Supplies, Lawn & Garden,Other Accessories Change of mind is not applicable

for return and refund. If the item received is damaged, defective, incorrect, or incomplete, a

refund will be issued based on Daraz's assessment. Note: For device-related issues after usage or

expiration of return policy period, please check if the item is covered under Seller or Brand

Warranty. Refer to our Warranty Policy for information on the different warranty types and ways to

contact the seller/manufacturer. Items that are non-returnable: Any custom-made items" Home & Living

Bedding & Bath, Furniture & Lighting, Kitchen & Dining, Home Décor, Home Improvements, Household &

Home Storage Supplies, Lawn & Garden,Other Accessories Change of mind is not applicable for return

and refund. Items that are non-returnable: Any custom-made items" response1 = Does Daraz refund

shipping

In [59]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from transformers.generation import GenerationConfig
# import warnings
# warnings.filterwarnings("ignore")
# import textwrap

# !pip install tiktoken
# !pip install tiktoken transformers_stream_generator einops optimum auto-gptq
# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-1_8B-Chat", trust_remote_code=True)
# from transformers import pipeline
# pipe = pipeline("text-generation", model="Qwen/Qwen-1_8B-Chat", device_map="auto", trust_remote_code=True)

# question = "What are your refund policies?"
# with open('/kaggle/input/ita-proj/output.txt', 'r') as file:
#     content = file.readlines()

# def get_completion(prompt):
#     messages = [{
#         "role": "user", 
#         "content": prompt }]
#     prompt2 = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     outputs = pipe(prompt2, max_new_tokens=400, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
#     return outputs[0]["generated_text"]

# prompt = f"""
# Based on the following information:\n\n
# 1. {content}\n\n
# Please provide a detailed answer to the question: {question}.
# Your answer should integrate the essence of the entire content, providing a unified answer that leverages the \
# diverse perspectives or data points provided by the entire content.
# """

# response = get_completion(prompt)
# print(response)

Based on the following information:


1. Product 12: Product Name = High Quality Wrist Watch For Men & Boys| Decent Wrist Leather Strap Attractive Dial, Product Category = Watches Sunglasses Jewellery/Watches/Men/Fashion, Brand Name = No Brand, Seller Name = Maal-Lo, URL = https://www.daraz.pk/products/-i409485404-s1960964280.html?search=1, Price Details = Original: Rs. 1000, Discounted: Rs. 699 | Original: Rs. 1000, Discounted: Rs. 900 | Original: Rs. 1000, Discounted: Rs. 799 | Original: Rs. 1000, Discounted: Rs. 580 | Original: Rs. 1000, Discounted: Rs. 590 | Original: Rs. 1000, Discounted: Rs. 580, Positive Seller Ratings = 90%, Ship on Time = 98%, Return Policy = 14 days free & easy return (Change of mind is not applicable)


2. Product 08: Product Name = OMG's Stylish watch for men , steel Built Design , Heavy Weight Watch in Fashion and for Casual use, Product Category = Watches Sunglasses Jewellery/Watches/Men/Fashion, Brand Name = No Brand, Seller Name = OMGs, URL = https://www.daraz.pk/products/-i433228448-s2139698887.html?search=1, Price Details = Original: Rs. 2500, Discounted: Rs. 2199, Positive Seller Ratings = 96%, Ship on Time = 100%, Return Policy = 14 days free & easy return (Change of mind is not applicable)


Please provide a detailed answer to the question: Show me watches from sellers with more than 90% positive ratings..
Your answer should integrate the essence of all three responses, providing a unified answer that leverages the diverse perspectives or data points provided by three responses.
<|im_end|>
<|im_start|>assistant
Based on the given information, we can identify watches from sellers who have more than 90% positive ratings by analyzing the product details and seller ratings provided.
For example, the product "OMG's Stylish watch for men" has a price of Rs. 2500 and an average rating of 96%. Additionally, the seller "OMGs" has received positive ratings of 96% on their website.
On the other hand, the product "High Quality Wrist Watch For Men & Boys" has a price of Rs. 1000 and an average rating of 100%. The seller "Maal-Lo" has received negative ratings of 90% on their website.
Therefore, based on the analysis, watches from sellers with more than 90% positive ratings include the "OMG's Stylish watch for men" and "High Quality Wrist Watch For Men & Boys". These watches are likely to be well-received by customers due to their high quality and stylish design.