In [146]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch
import re, os, json, csv
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, NLTKTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer 
from langchain.embeddings import GPT4AllEmbeddings
import os
import shutil
import spacy
import pandas as pd
import re
import textwrap

In [147]:
# Run configuration.
# searchType picks the corpus ("Product", "Main" or "Seller"); it is compared
# case-insensitively by the cells below.
chunkSize = 1500  # chunk size handed to the first-stage text splitters

searchType, query = (
    "Product",
    "Show me android cables from sellers with more than 90% positive ratings.",
)

# Alternative runs -- uncomment one pair to switch corpus:
# searchType, query = "Main", "What are your refund policies?"
# searchType, query = "Seller", "Can I make product bundles on Daraz?"

# Strip the marketplace name (plus the whitespace that follows it) from the query.
query = re.compile(r'\bDaraz\b\s*', re.IGNORECASE).sub('', query)

In [148]:
# Remove the cleaned-output file left behind by an earlier run, if it exists.
_stale_cleaned_output = "/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project/outputCleaned.txt"
if os.path.exists(_stale_cleaned_output):
    os.remove(_stale_cleaned_output)

In [149]:
# spaCy English pipeline used to lemmatize subjects and queries.
nlp = spacy.load('en_core_web_sm')

# Product categories that can be recognised in a query.
# Order matters: the subject lookup returns the first entry that matches.
subjects = [
    "Phone Cases",
    "Power Banks",
    "iPhone Cables",
    "Android Cables",
    "Wall Chargers",
    "Wireless Chargers",
    "Tablet Accessories",
    "Car Chargers",
    "Screen Protectors",
    "Phone Camera Flash",
    "Lights",
    "Selfie Sticks",
    "Bluetooth Headphones",
    "Wireless Earbuds",
    "Mono Headsets",
    "Headphones",
    "Wired Headsets",
    "Smartwatches",
    "Fitness",
    "Trackers",
    "Fitness Tracker",
    "Virtual Reality",
    "Memory Cards",
    "Lenses",
    "Tripods",
    "Monopods",
    "Camera Cases",
    "Camera",
    "Gimbals",
    "Batteries",
    "Cooling Pads",
    "Keyboards",
    "Watches",
]

# Column names of the product CSV, in the order parse_line emits its fields.
headers = [
    "Product Number",
    "Product Name",
    "Product Category",
    "Brand Name",
    "Seller Name",
    "Price Details",
    "Positive Seller Ratings",
    "Ship on Time",
    "Return Policy",
]

def is_paragraph_break(line):
    """Return True when the line holds nothing but whitespace."""
    return not line.strip()

def is_unwanted_line(line):
    """Return True when the last non-whitespace character of the line is a colon."""
    trimmed = line.rstrip()
    return trimmed.endswith(":")

def process_files(folder_path, output_file):
    """Merge the paragraphs of every ``.txt`` file in a folder into one file.

    Consecutive non-blank lines are joined with single spaces into one
    paragraph; lines ending with a colon are dropped; paragraphs shorter than
    100 characters are discarded. The surviving paragraphs are written to
    ``output_file``, one per line, as UTF-8.

    Args:
        folder_path: Directory containing the source ``.txt`` files.
        output_file: Path of the merged text file to (over)write.
    """
    # os.listdir returns names in arbitrary order; sort them so the merged
    # corpus (and therefore the chunks built from it) is reproducible.
    files = sorted(f for f in os.listdir(folder_path) if f.endswith('.txt'))
    all_text = []

    for file in files:
        current_paragraph = []
        with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
            for line in f:
                if is_unwanted_line(line):
                    continue  # skip lines ending with a colon
                if is_paragraph_break(line):
                    if current_paragraph:
                        all_text.append(" ".join(current_paragraph))
                        current_paragraph = []
                else:
                    current_paragraph.append(line.strip())
        # Flush the last paragraph when the file does not end with a blank line.
        if current_paragraph:
            all_text.append(" ".join(current_paragraph))

    # Keep only paragraphs of at least 100 characters.
    all_text = [paragraph for paragraph in all_text if len(paragraph) >= 100]

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("\n".join(all_text))

def normalize_subjects(subjects):
    """Build a lookup from normalized subject to the original subject string.

    Each subject is lower-cased, run through the spaCy pipeline, and its token
    lemmas are joined with hyphens to form the key.
    """
    return {
        '-'.join(token.lemma_ for token in nlp(name.lower())): name
        for name in subjects
    }

def find_subject_in_query(query, subjects):
    """Return the first subject whose normalized form occurs in the query.

    The query is lower-cased and lemmatized, and its lemmas are joined with
    hyphens, mirroring normalize_subjects. Subjects are tried in list order;
    the string "No subject found" is returned when none matches.
    """
    lookup = normalize_subjects(subjects)
    lemmatized_query = '-'.join(token.lemma_ for token in nlp(query.lower()))
    hits = (original for key, original in lookup.items() if key in lemmatized_query)
    return next(hits, "No subject found")

# Read each file in the directory
def read_product_files(directory):
    """Load every ``.txt`` product file in ``directory`` as a list of JSON objects.

    Each file is expected to hold JSON objects separated by a blank line; the
    objects are wrapped into a JSON array before decoding. Files that fail to
    decode are reported with ``print`` and skipped.

    Args:
        directory: Folder containing the product ``.txt`` files.

    Returns:
        A list with one entry per successfully decoded file (in sorted
        file-name order); each entry is the list of objects found in that file.
    """
    products_data = []
    # Sorted so that the running product numbers assigned downstream are stable
    # between runs (os.listdir order is arbitrary).
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(directory, filename)
        # Explicit UTF-8: the rest of the pipeline reads and writes UTF-8.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = file.read()
        # Turn "{...}\n\n{...}" into a valid JSON array.
        corrected_data = '[' + data.replace('}\n\n{', '},\n{') + ']'
        try:
            products_data.append(json.loads(corrected_data))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {filename}: {e}")
    return products_data

# Extract product description specifically looking more robustly
def extract_description(description_text):
    """Return the text between "Product Description:" and the next "<br/>".

    When no "<br/>" follows the marker, everything after the marker is
    returned. The result is stripped of surrounding whitespace. If the marker
    is absent, "Description not found." is returned.
    """
    _, marker, remainder = description_text.partition("Product Description:")
    if not marker:
        return "Description not found."
    # Cut at the first line break tag, if any.
    body, _, _ = remainder.partition("<br/>")
    return body.strip()

# Write the consolidated product info to an output file
def write_product_info(products_data, output_file):
    """Write one summary line per product to ``output_file``.

    Args:
        products_data: List of products; each product is a list of dict
            segments that are merged (later segments win) into one record.
        output_file: Path of the text file to (over)write.

    Each line has the form ``Product NN: Product Name = ..., Product Category
    = ..., ...`` with NN being the 1-based, zero-padded position of the
    product. Missing fields are rendered as ``N/A``. The product description
    is intentionally not part of the line.
    """
    # UTF-8 explicitly: this file is read back with encoding='utf-8' later on.
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for i, product in enumerate(products_data, start=1):
            product_dict = {}
            for segment in product:
                product_dict.update(segment)

            product_name = product_dict.get("Product Name", "N/A")
            category_path = product_dict.get("Category", "N/A").replace('"', '')
            brand_name = product_dict.get("Brand Name", "N/A")
            seller_name = product_dict.get("Seller Name", "N/A")
            url = product_dict.get("URL", "N/A")
            price_info = product_dict.get("Price Info", [])
            # Each price entry is indexed positionally: [1] original, [2] discounted.
            price_details = " | ".join(
                f"Original: {p[1]}, Discounted: {p[2]}" for p in price_info
            )
            additional_info = product_dict.get("Additional Info", {})
            positive_ratings = additional_info.get("Positive Seller Ratings", "N/A")
            ship_on_time = additional_info.get("Ship on Time", "N/A")
            return_policy = product_dict.get("Return Policy", {})
            return_details = f"{return_policy.get('Title', 'N/A')} ({return_policy.get('Subtitle', 'N/A')})"

            product_entry = f"Product {i:02d}: Product Name = {product_name}, Product Category = {category_path}, Brand Name = {brand_name}, Seller Name = {seller_name}, URL = {url}, Price Details = {price_details}, Positive Seller Ratings = {positive_ratings}, Ship on Time = {ship_on_time}, Return Policy = {return_details}\n"
            outfile.write(product_entry)

# Function to parse each line of the text file into structured data
def parse_line(line):
    """Split one ``Product N: field = value, ...`` line into a CSV row.

    Args:
        line: A product summary line starting with ``Product <number>:``.

    Returns:
        A list of nine strings: product label, name, category, brand, seller,
        price details, positive seller ratings, ship-on-time and return
        policy. Fields that are not found come back as empty strings. The URL
        is not captured.

    Raises:
        ValueError: If the line does not start with ``Product <number>:``.
    """
    # Every field is delimited by a lookahead on the label of the next field,
    # so commas inside a value do not end it early.
    pattern = re.compile(
        r"Product Name = (?P<Product_Name>.*?)(?=, Product Category =)|"
        r"Product Category = (?P<Product_Category>.*?)(?=, Brand Name =)|"
        r"Brand Name = (?P<Brand_Name>.*?)(?=, Seller Name =)|"
        r"Seller Name = (?P<Seller_Name>.*?)(?=, URL =)|"
        r"Price Details = (?P<Price_Details>.*?)(?=, Positive Seller Ratings =)|"
        r"Positive Seller Ratings = (?P<Positive_Seller_Ratings>.*?)(?=, Ship on Time =)|"
        r"Ship on Time = (?P<Ship_on_Time>.*?)(?=, Return Policy =)|"
        r"Return Policy = (?P<Return_Policy>.*?)(?=, Product \d+:|, URL =|$)"
    )

    # The product number is the line prefix; fail loudly on malformed lines
    # instead of raising AttributeError on a None match.
    number_match = re.match(r"Product (\d+):", line)
    if number_match is None:
        raise ValueError(
            f"Line does not start with 'Product <number>:': {line[:60]!r}"
        )
    product_number = number_match.group(1)

    # Each match fills exactly one named group; collect the non-empty ones.
    data = {
        key: value
        for match in pattern.finditer(line)
        for key, value in match.groupdict().items()
        if value is not None
    }

    field_order = (
        "Product_Name", "Product_Category", "Brand_Name", "Seller_Name",
        "Price_Details", "Positive_Seller_Ratings", "Ship_on_Time", "Return_Policy",
    )
    return ["Product " + product_number] + [data.get(field, "") for field in field_order]

def extract_info_simple(query, products=None):
    """Extract filter phrases ("limitations") from a free-text query.

    Args:
        query: The user's question.
        products: Optional DataFrame with a ``Seller Name`` column. When
            omitted, the notebook-level ``products_df`` is used.

    Returns:
        A list of strings in this order: price / percentage matches, rating
        matches, ``"top-rated sellers"``, ``"ship on time"``, and one
        ``"sold by <seller>"`` entry per seller name found in the query.
    """
    if products is None:
        products = products_df  # frame built by the product-search cell

    # Missing seller names load as NaN; drop them (and blank names) because
    # `nan in "text"` raises TypeError and "" is contained in every query.
    seller_names = [
        name
        for name in products['Seller Name'].dropna().astype(str).str.lower().unique().tolist()
        if name.strip()
    ]

    # Patterns for limitations
    price_pattern = r"Rs\.\s*\d+|\d+\s*%|between\s*Rs\.\s*\d+\s*and\s*Rs\.\s*\d+"
    rating_pattern = r"more than \d{1,3}% positive ratings|less than \d{1,3}% positive ratings|\d{1,3}% positive ratings|\d{1,3}%"
    time_pattern = r"ship on time"

    query_lower = query.lower()

    limitations = re.findall(price_pattern, query)
    limitations.extend(re.findall(rating_pattern, query))
    if "top-rated sellers" in query_lower or "highly rated sellers" in query_lower:
        limitations.append("top-rated sellers")
    if re.search(time_pattern, query, re.IGNORECASE):
        limitations.append("ship on time")

    # Sellers explicitly named in the query.
    limitations.extend(
        f"sold by {seller}" for seller in seller_names if seller in query_lower
    )

    return limitations

def load_data(filepath):
    """Load the product CSV and derive numeric columns used for filtering.

    Adds ``Discounted Price`` (the lowest discounted price listed in
    ``Price Details``) and converts ``Positive Seller Ratings`` and
    ``Ship on Time`` from percentage strings to numbers. Values that are
    missing or cannot be parsed become NaN instead of raising.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with the derived/converted columns.
    """
    data = pd.read_csv(filepath)

    def _lowest_discounted_price(details):
        # Missing cells arrive as NaN (float), not as strings.
        if not isinstance(details, str):
            return float('nan')
        # Accept thousands separators ("Rs. 1,299") so the price is not truncated.
        prices = re.findall(r'Discounted: Rs\.\s*(\d+(?:,\d{3})*)', details)
        if not prices:
            return float('nan')
        return min(int(price.replace(',', '')) for price in prices)

    data['Discounted Price'] = data['Price Details'].apply(_lowest_discounted_price)

    for column in ('Positive Seller Ratings', 'Ship on Time'):
        # astype(str) keeps this working for already-numeric columns;
        # errors='coerce' maps missing/unparseable values to NaN.
        data[column] = pd.to_numeric(
            data[column].astype(str).str.rstrip('%'), errors='coerce'
        )
    return data

def parse_limitation(limitation):
    """Turn a limitation string into a single-entry filter dictionary.

    The string may be one phrase or several quoted phrases joined with
    ``', '`` (as built by the product-search cell). Only the first rule that
    applies is used, in this priority: price range, exact price, seller,
    top-rated sellers, percentage rating, ship on time.

    Args:
        limitation: The limitation text.

    Returns:
        One of ``{'price_range': (low, high)}``, ``{'price_exact': n}``,
        ``{'seller_name': s}``, ``{'top_rated_sellers': n}``,
        ``{'ship_on_time': 100}``, or ``None`` when nothing is recognised.
    """
    if not limitation:
        return None

    # Read the numbers from the phrase they belong to, so other numbers in the
    # joined string (e.g. a leading '90%') cannot be picked up by mistake.
    range_match = re.search(
        r'between\s*Rs\.\s*(\d+)\s*and\s*Rs\.\s*(\d+)', limitation
    )
    if range_match:
        return {'price_range': (int(range_match.group(1)), int(range_match.group(2)))}

    price_match = re.search(r'Rs\.\s*(\d+)', limitation)
    if price_match:
        return {'price_exact': int(price_match.group(1))}

    # Stop at the quote that closes the phrase in the joined string.
    seller_match = re.search(r"sold by\s+([^']+)", limitation)
    if seller_match:
        return {'seller_name': seller_match.group(1).strip()}

    if 'top-rated sellers' in limitation:
        return {'top_rated_sellers': 90}

    percent_match = re.search(r'(\d+)\s*%', limitation)
    if percent_match:
        return {'top_rated_sellers': int(percent_match.group(1))}

    if 'ship on time' in limitation:
        return {'ship_on_time': 100}

    return None  # unrecognised input

def filter_productsTwo(data, limitation_dict):
    """Return the product numbers of the rows that satisfy a parsed limitation.

    Args:
        data: DataFrame with ``Product Number``, ``Discounted Price``,
            ``Seller Name``, ``Positive Seller Ratings`` and ``Ship on Time``.
        limitation_dict: Single-entry dict as produced by parse_limitation,
            or None.

    Returns:
        List of ``Product Number`` values. An empty list is returned when
        there is no limitation, when its key is not recognised, or when no
        row matches.
        NOTE(review): filter_products treats an empty list as "keep every
        product", so "no match" and "no filter" are indistinguishable there.
    """
    if not limitation_dict:
        return []

    key, value = next(iter(limitation_dict.items()))
    if key == 'price_exact':
        mask = data['Discounted Price'] == value
    elif key == 'price_range':
        mask = (data['Discounted Price'] >= value[0]) & (data['Discounted Price'] <= value[1])
    elif key == 'seller_name':
        # regex=False: seller names are literal text and may contain regex
        # metacharacters such as "+" or "(".
        mask = data['Seller Name'].str.contains(value, case=False, na=False, regex=False)
    elif key == 'top_rated_sellers':
        mask = data['Positive Seller Ratings'] >= value
    elif key == 'ship_on_time':
        mask = data['Ship on Time'] == value
    else:
        # Unknown filter: behave like "no limitation" instead of failing on an
        # unbound variable.
        return []
    return data.loc[mask, 'Product Number'].tolist()

def filter_products(input_filename, output_filename, matching_product_numbers):
    """Copy the product lines whose identifier is in ``matching_product_numbers``.

    Args:
        input_filename: Text file with one ``Product NN: ...`` line per product.
        output_filename: File to (over)write with the selected lines.
        matching_product_numbers: Identifiers such as ``"Product 01"``. When
            empty, every input line is copied.
    """
    # UTF-8 on both sides: the product files are read as UTF-8 elsewhere in
    # the notebook.
    with open(input_filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    if matching_product_numbers:
        wanted = set(matching_product_numbers)  # constant-time membership test
        # The identifier is everything before the first colon ("Product XX").
        matching_lines = [
            line for line in lines if line.split(':', 1)[0].strip() in wanted
        ]
    else:
        # No identifiers given: keep all lines.
        matching_lines = lines

    with open(output_filename, 'w', encoding='utf-8') as file:
        file.writelines(matching_lines)

In [150]:
# import pandas as pd

# # Function to load data, handle NaN values, and save back to CSV
# def load_dataTwo(filepath, output_filepath):
#     # Load data
#     data = pd.read_csv(filepath)

#     # Iterate through each column in DataFrame
#     for column in data.columns:
#         # Check data type of the column
#         if data[column].dtype == 'float64' or data[column].dtype == 'int64':
#             # For numerical columns, fill NaNs with the mean of the column
#             data[column].fillna(data[column].mean(), inplace=True)
#         else:
#             # For categorical columns, fill NaNs with the mode of the column (most frequent value)
#             mode_value = data[column].mode()[0]
#             data[column].fillna(mode_value, inplace=True)

#     # Save the modified DataFrame back to a new CSV file
#     data.to_csv(output_filepath, index=False)

#     return data

# # Specify the path for the output file
# output_file_path = 'ProcessedFinalProductsList.csv'

# # Use the function to load your data and save it to a new CSV
# data = load_dataTwo('FinalProductsList.csv', output_file_path)

In [151]:
# Build the text corpus for the selected search type and leave its path in `file_name`.

def load_dataTwo(filepath, output_filepath):
    """Load a CSV, fill missing values, save the result and return it.

    Numeric (float64/int64) columns are filled with the column mean, all other
    columns with the most frequent value. Columns without any value are left
    as they are.
    """
    data = pd.read_csv(filepath)
    for column in data.columns:
        if data[column].dtype == 'float64' or data[column].dtype == 'int64':
            # Assign back instead of fillna(inplace=True) on a column selection,
            # which is not guaranteed to modify the frame.
            data[column] = data[column].fillna(data[column].mean())
        else:
            modes = data[column].mode()
            if not modes.empty:
                data[column] = data[column].fillna(modes[0])
    data.to_csv(output_filepath, index=False)
    return data


# Change this to the path of your project folder.
PROJECT_DIR = '/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project'

search_mode = searchType.lower()

if search_mode == "main":
    folder_path = os.path.join(PROJECT_DIR, 'DarazDataMain')
    output_file = 'DarazDataMain.txt'
    process_files(folder_path, output_file)
    file_name = "DarazDataMain.txt"
elif search_mode == "seller":
    folder_path = os.path.join(PROJECT_DIR, 'DarazDataSeller')
    output_file = 'DarazDataSeller.txt'
    process_files(folder_path, output_file)
    file_name = "DarazDataSeller.txt"
elif search_mode == "product":
    result = find_subject_in_query(query, subjects)
    if result == "No subject found":
        # Without a category there is no product folder to read.
        raise ValueError(f"No known product category found in query: {query!r}")
    # Folder names use hyphens instead of spaces, e.g. "Android-Cables".
    result = '-'.join(result.split())
    directory_path = 'products/' + str(result)
    print(directory_path)

    # 1) Raw product files -> one summary line per product.
    products_data = read_product_files(directory_path)
    output_file = 'FinalProductsList.txt'
    write_product_info(products_data, output_file)

    # 2) Summary lines -> CSV.
    input_file_path = 'FinalProductsList.txt'
    output_csv_path = 'FinalProductsList.csv'
    with open(input_file_path, 'r', encoding='utf-8') as file, \
        open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
        for line in file:
            if line.strip():  # skip blank lines
                writer.writerow(parse_line(line))

    # Frame consulted by extract_info_simple for seller names.
    products_df = pd.read_csv('FinalProductsList.csv')
    products_df = products_df.replace('N/A', np.nan)
    numeric_cols = products_df.select_dtypes(include=[np.number]).columns
    products_df[numeric_cols] = products_df[numeric_cols].fillna(products_df[numeric_cols].mean())

    # 3) Regenerate the NaN-free CSV on every run. Loading it without
    #    rebuilding it first would use a leftover file from an earlier run
    #    (possibly another category) or fail when the file does not exist.
    output_file_path = 'ProcessedFinalProductsList.csv'
    load_dataTwo('FinalProductsList.csv', output_file_path)

    # 4) Query -> limitation -> matching product numbers.
    extracted_info = extract_info_simple(query)
    # parse_limitation receives all phrases as one quoted, comma-separated
    # string, e.g. "'90%', 'more than 90% positive ratings'".
    limitation = "', '".join(extracted_info)
    limitation = f"'{limitation}'"

    data = load_data(output_file_path)
    parsed_limitation = parse_limitation(limitation)
    matching_product_numbers = filter_productsTwo(data, parsed_limitation)

    # 5) Keep only the matching summary lines.
    filter_products('FinalProductsList.txt', 'ProductsList.txt', matching_product_numbers)

    file_name = "ProductsList.txt"
else:
    # Fail here rather than later with an undefined `file_name`.
    raise ValueError(
        f"Unsupported searchType {searchType!r}; expected 'Main', 'Seller' or 'Product'"
    )



products/Android-Cables


In [152]:
# Read the corpus (one candidate per line, newline kept) and embed every line once.
with open(file_name, 'r', encoding="utf8") as f:
    products = list(f)

model = SentenceTransformer('all-MiniLM-L6-v2')
product_embeddings = model.encode(products, convert_to_tensor=True)

def search_products(query, k):
    """Print the ``k`` corpus lines most similar to ``query``.

    Similarity is the cosine score between the query embedding and the
    precomputed ``product_embeddings``. Fewer than ``k`` results are printed
    when the corpus has fewer lines.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(query_embedding, product_embeddings)[0]
    # torch.topk raises when k exceeds the number of scores.
    k = min(k, cosine_scores.shape[0])
    top_results = torch.topk(cosine_scores, k=k)

    print("Query:", query)
    for score, idx in zip(top_results[0], top_results[1]):
        print("\nScore:", score.item())
        print("Product Details:", products[idx])

# Show the two closest corpus lines for the current query.
search_products(query, k=2)

Query: Show me android cables from sellers with more than 90% positive ratings.

Score: 0.646823525428772
Product Details: Product 41: Product Name = micro charging cable for android, Product Category = Mobiles & Tablets/Mobile Accessories/Cables & Converters, Brand Name = No Brand, Seller Name = Wonder gadgets & accessories, URL = https://www.daraz.pk/products/-i485464709-s2280408721.html?search=1, Price Details = Original: Rs. 212, Discounted: Rs. 101, Positive Seller Ratings = 91%, Ship on Time = 100%, Return Policy = 14 days free & easy return (Change of mind is not applicable)


Score: 0.6350957155227661
Product Details: Product 22: Product Name = Type C Cable 6A:Charging and Data Transfer for Android Devices AZee Brandz, Product Category = Mobiles & Tablets/Mobile Accessories/Cables & Converters, Brand Name = No Brand, Seller Name = AZee Brandz, URL = https://www.daraz.pk/products/c-6a-i466560132-s2242082073.html?search=1, Price Details = Original: Rs. 200, Discounted: Rs. 99, Po

In [153]:
# Load the corpus file as LangChain documents. The same file is read as UTF-8
# by the embedding cell, so request UTF-8 here instead of the platform default.
loader = TextLoader(file_name, encoding="utf-8")
docs = loader.load()

In [154]:
import os
import re
import shutil

def manage_chroma_folders(directory):
    """Pick the next Chroma folder number and delete the existing Chroma folders.

    Sub-folders of ``directory`` named ``chromaDB<number>`` determine the
    highest number in use; the next number (highest + 1, or 1 when none
    exists) is stored in the global ``chromaCounter``. Afterwards every
    sub-folder whose name starts with ``chroma`` is removed recursively.

    Args:
        directory: Folder to scan.

    Returns:
        The new value of ``chromaCounter`` (also available through the global,
        as before).
    """
    global chromaCounter
    chromaCounter = 0  # reset before scanning

    # Matches 'chromaDB' followed immediately by a number, and nothing else.
    pattern = re.compile(r'^chromaDB(\d+)$')

    highest_number = 0
    folders_to_delete = []
    for item in os.listdir(directory):
        full_path = os.path.join(directory, item)
        if not os.path.isdir(full_path):
            continue
        match = pattern.match(item)
        if match:
            highest_number = max(highest_number, int(match.group(1)))
        if item.startswith("chroma"):
            folders_to_delete.append(full_path)

    chromaCounter = highest_number + 1

    # Delete only after the scan so the counter reflects every existing folder.
    for full_path in folders_to_delete:
        shutil.rmtree(full_path)

    return chromaCounter

# Usage example:
# Reset the Chroma folders in the project directory, then record the counter
# chosen for this run.
directory = "/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project/"
manage_chroma_folders(directory)

with open('FinalWorking/chromaCounter.txt', 'w') as file:
    file.write("Current Chroma Counter = " + str(chromaCounter))

In [155]:
chunker01 = "Fixed-size (in characters) Overlapping Sliding Window"

# Split on newlines into chunks of at most chunkSize characters, 20 overlapping.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=chunkSize, chunk_overlap=20)
splits = text_splitter.split_documents(docs)

persist_directory = 'chromaDB' + str(chromaCounter) + '/'
# Every strategy persists into the same directory; a dedicated collection
# keeps this strategy's chunks separate, so each search only sees the chunks
# produced by its own splitter.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=GPT4AllEmbeddings(),
    collection_name="chunker01",
    persist_directory=persist_directory
)
vectordb.persist()
docs01 = vectordb.similarity_search_with_score(query, k=5)

In [156]:
chunker02 = "Recursive Structure Aware Splitting"

# Prefer paragraph breaks, then single newlines, when cutting chunks.
text_splitter = RecursiveCharacterTextSplitter(separators = ["\n\n", "\n"], chunk_size=chunkSize, chunk_overlap=20)
splits = text_splitter.split_documents(docs)

persist_directory = 'chromaDB' + str(chromaCounter) + '/'
# Use a dedicated collection: with the shared directory and the default
# collection name, this search would also return chunks stored by other
# chunking strategies (showing up as duplicate hits).
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=GPT4AllEmbeddings(),
    collection_name="chunker02",
    persist_directory=persist_directory
)
vectordb.persist()
docs02 = vectordb.similarity_search_with_score(query, k=5)

In [157]:
# Sentence-based chunking is only run for the prose corpora (Main / Seller).
if searchType.lower() != "product":
    chunker03 = "NLP Chunking: Tracking Topic Changes"

    text_splitter = NLTKTextSplitter(chunk_size=chunkSize)#, separator="\n")
    splits = text_splitter.split_documents(docs)

    persist_directory = 'chromaDB' + str(chromaCounter) + '/'
    # Dedicated collection so this search is not mixed with the chunks that
    # other chunking strategies stored in the same directory.
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=GPT4AllEmbeddings(),
        collection_name="chunker03",
        persist_directory=persist_directory
    )
    vectordb.persist()
    docs03 = vectordb.similarity_search_with_score(query, k=5)

In [158]:
if searchType.lower() == "product":
    print("question = \"" + query + "\"")

    # Collect the hits of both chunking strategies (docs03 is not produced for
    # product searches).
    with open("output.txt", "w") as file:
        for result in docs01:
            file.write(result[0].page_content+ "\n")

        for result in docs02:
            file.write(result[0].page_content+ "\n")

    def remove_duplicates(input_file, output_file):
        """Copy ``input_file`` to ``output_file`` keeping the first copy of each line.

        Line order is preserved. I/O errors propagate to the caller: if they
        were swallowed, the next cell would read a missing or outdated
        ``outputCleaned.txt``.
        """
        with open(input_file, 'r') as file:
            lines = file.readlines()

        # dict keys keep insertion order, so this drops repeats but not order.
        unique_lines = list(dict.fromkeys(lines))

        with open(output_file, 'w') as file:
            file.writelines(unique_lines)

    remove_duplicates('output.txt', 'outputCleaned.txt')

    # os.rename("output.txt", "outputCleaned.txt")

    # def clean_and_deduplicate_file(filepath):
    # # Read the file and filter lines
    #     with open(filepath, 'r') as file:
    #         lines = file.readlines()
        
    #     # Filter lines that start with "Product"
    #     filtered_lines = [line for line in lines if line.startswith('Product')]
        
    #     # Remove duplicates by converting the list to a set and back to a list
    #     unique_lines = list(set(filtered_lines))

    #     # Write the unique lines back to the file
    #     with open(filepath, 'w') as file:
    #         file.writelines(unique_lines)

    # # Path to the file
    # file_path = 'outputCleaned.txt'
    # # Clean and deduplicate the file
    # clean_and_deduplicate_file(file_path)

    # remove_duplicates('outputCleaned.txt', 'outputCleaned.txt')


question = "Show me android cables from sellers with more than 90% positive ratings."


In [159]:
if searchType.lower() == "main" or searchType.lower() == "seller":
    print("question = \"" + query + "\"")

    # Dump the hits of every chunking strategy; numbering restarts at 1 for
    # each strategy.
    with open("output.txt", "w") as file:
        for hits in (docs01, docs02, docs03):
            for count, result in enumerate(hits, start=1):
                file.write(f"response{count} = \"{result[0].page_content}\"\n")

    def clean_text(text):
        """Clean the dumped responses into one de-duplicated, wrapped text.

        Steps, in order: drop image file references, collapse whitespace, fix
        three known run-together words, strip the ``responseN = "..."``
        wrappers, drop list numbering ("1." / "2-"), remove repeated
        sentences (split on "."), and wrap the result at 100 columns.
        """
        # Remove any image file references
        text = re.sub(r"\S+\.(png|jpg|jpeg|gif)\s*", "", text)

        # Normalize spacing issues
        text = re.sub(r"\s+", " ", text).strip()

        # Correct common typographical errors
        text = re.sub(r"isnot", "is not", text)
        text = re.sub(r"orBrand", "or Brand", text)
        text = re.sub(r"ourWarranty", "our Warranty", text)

        # Remove the response wrappers: the `responseN = "` prefix together with
        # the closing quote of the previous response, then the closing quote of
        # the last response, so no stray '"' is left in the text.
        text = re.sub(r"(?:\"\s*)?response\d+\s*=\s*\"", " ", text)
        text = re.sub(r"\"\s*$", "", text)

        # Remove numbers followed by a dot, e.g., "1."
        text = re.sub(r"\d+\.", "", text)

        # Remove numbers followed directly by a dash, e.g., "2-"
        text = re.sub(r"\d+-", "", text)

        # Deduplicate sentences while keeping their first-seen order
        seen = set()
        unique_lines = []
        for line in text.split('.'):
            line_clean = line.strip()
            if line_clean not in seen:
                seen.add(line_clean)
                unique_lines.append(line_clean)

        # Reconstruct text with clean lines
        cleaned_text = '. '.join(unique_lines).strip()
        if not cleaned_text.endswith('.'):
            cleaned_text += '.'

        # Split into multiple lines with a reasonable width
        return textwrap.fill(cleaned_text, width=100)

    with open('output.txt', 'r') as file:
        raw_text = file.read()

    cleaned_text = clean_text(raw_text)

    with open('outputCleaned.txt', 'w') as file:
        file.write(cleaned_text)




In [160]:
# Second-stage retrieval: index only the cleaned first-stage results and pick
# the two best matches for the query.
loader = TextLoader("outputCleaned.txt")
docs = loader.load()

text_splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=0)
splits = text_splitter.split_documents(docs)

persist_directory = 'chromaDB' + str(chromaCounter) + '/'
# Dedicated collection: the persist directory is shared with the first-stage
# indexes, and with the default collection this search would also run over
# their chunks instead of only the cleaned results.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=GPT4AllEmbeddings(),
    collection_name="final_answer",
    persist_directory=persist_directory
)
vectordb.persist()
docs = vectordb.similarity_search_with_score(query, k=2)

Created a chunk of size 780, which is longer than the specified 200
Created a chunk of size 465, which is longer than the specified 200
Created a chunk of size 495, which is longer than the specified 200
Created a chunk of size 572, which is longer than the specified 200
Created a chunk of size 854, which is longer than the specified 200
Created a chunk of size 481, which is longer than the specified 200
Created a chunk of size 493, which is longer than the specified 200
Created a chunk of size 494, which is longer than the specified 200


In [161]:
# Echo the question followed by the numbered final responses.
print(f'question = "{query}"')
for count, result in enumerate(docs, start=1):
    print(f'response{count} = "{result[0].page_content}"')

question = "Show me android cables from sellers with more than 90% positive ratings."
response1 = "Product 41: Product Name = micro charging cable for android, Product Category = Mobiles & Tablets/Mobile Accessories/Cables & Converters, Brand Name = No Brand, Seller Name = Wonder gadgets & accessories, URL = https://www.daraz.pk/products/-i485464709-s2280408721.html?search=1, Price Details = Original: Rs. 212, Discounted: Rs. 101, Positive Seller Ratings = 91%, Ship on Time = 100%, Return Policy = 14 days free & easy return (Change of mind is not applicable)"
response2 = "Product 22: Product Name = Type C Cable 6A:Charging and Data Transfer for Android Devices AZee Brandz, Product Category = Mobiles & Tablets/Mobile Accessories/Cables & Converters, Brand Name = No Brand, Seller Name = AZee Brandz, URL = https://www.daraz.pk/products/c-6a-i466560132-s2242082073.html?search=1, Price Details = Original: Rs. 200, Discounted: Rs. 99, Positive Seller Ratings = 100%, Ship on Time = 60%, Retur

In [162]:
# Delete the merged corpus files produced for the Main / Seller searches, when present.
for _corpus_path in (
    "/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project/DarazDataMain.txt",
    "/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project/DarazDataSeller.txt",
):
    if os.path.exists(_corpus_path):
        os.remove(_corpus_path)

# if os.path.exists("/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project/output.txt"):
#     os.remove("/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project/output.txt")

# if os.path.exists("/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project/FinalProductsList.csv"):
#     os.remove("/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project/FinalProductsList.csv")

# if os.path.exists("/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project/FinalProductsList.txt"):
#     os.remove("/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project/FinalProductsList.txt")

# if os.path.exists("/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project/ProductsList.txt"):
#     os.remove("/Users/moiz/Library/CloudStorage/OneDrive-InstituteofBusinessAdministration/IBA/6th - Spring 2024/ITA/Project/ProductsList.txt")

In [163]:
# with open('output.txt', 'r') as file:
#     content = file.readlines()

### See Img1 in Images Folder