# Find All Products From Products Folder

In [1]:
import os
import json

# Define the path to the directory containing the text files
directory_path = 'Extracted Files/Phone Cases'

# Output file where consolidated data will be saved
output_file = 'FinalProductsList.txt'

# Read each file in the directory
def read_product_files(directory):
    products_data = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r') as file:
                data = file.read()
                corrected_data = '[' + data.replace('}\n\n{', '},\n{') + ']'
                try:
                    product_info = json.loads(corrected_data)
                    products_data.append(product_info)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON from {filename}: {e}")
    return products_data

# Extract product description specifically looking more robustly
def extract_description(description_text):
    # Attempt to extract the portion after "Product Description"
    desc_start = description_text.find("Product Description:")
    if desc_start != -1:
        # Extract starting from the found index through the end of the description
        desc_substr = description_text[desc_start:]
        desc_end = desc_substr.find("<br/>")
        if desc_end != -1:
            return desc_substr[len("Product Description:"):desc_end].strip()
        else:
            return desc_substr[len("Product Description:"):].strip()
    return "Description not found."

# Write the consolidated product info to an output file
def write_product_info(products_data, output_file):
    with open(output_file, 'w') as outfile:
        for i, product in enumerate(products_data, start=1):
            product_dict = {}
            for segment in product:
                product_dict.update(segment)

            product_name = product_dict.get("Product Name", "N/A")
            category_path = product_dict.get("Category", "N/A").replace('"', '')
            brand_name = product_dict.get("Brand Name", "N/A")
            seller_name = product_dict.get("Seller Name", "N/A")
            url = product_dict.get("URL", "N/A")
            price_info = product_dict.get("Price Info", [])
            price_details = " | ".join([f"Original: {p[1]}, Discounted: {p[2]}" for p in price_info])
            # description = extract_description(product_dict.get("desc", "").replace("<br/>", "\n"))
            additional_info = product_dict.get("Additional Info", {})
            positive_ratings = additional_info.get("Positive Seller Ratings", "N/A")
            ship_on_time = additional_info.get("Ship on Time", "N/A")
            return_policy = product_dict.get("Return Policy", {})
            return_details = f"{return_policy.get('Title', 'N/A')} ({return_policy.get('Subtitle', 'N/A')})"

            # product_entry = f"Product {i:02d}: Product Name = {product_name}, Product Category = {category_path}, Brand Name = {brand_name}, Seller Name = {seller_name}, URL = {url}, Price Details = {price_details}, Description = {description}, Positive Seller Ratings = {positive_ratings}, Ship on Time = {ship_on_time}, Return Policy = {return_details}\n"
            product_entry = f"Product {i:02d}: Product Name = {product_name}, Product Category = {category_path}, Brand Name = {brand_name}, Seller Name = {seller_name}, URL = {url}, Price Details = {price_details}, Positive Seller Ratings = {positive_ratings}, Ship on Time = {ship_on_time}, Return Policy = {return_details}\n"
            outfile.write(product_entry)

# Main function to handle operations
def main():
    products_data = read_product_files(directory_path)
    write_product_info(products_data, output_file)
    print("Data consolidation complete.")

if __name__ == "__main__":
    main()

Data consolidation complete.


# FinalProductsList.txt To FinalProductsList.csv

In [2]:
import csv
import re

# Path to the input text file
input_file_path = 'FinalProductsList.txt'

# Output CSV file
output_csv_path = 'FinalProductsList.csv'

# Column headers for the CSV file
headers = [
    "Product Number", "Product Name", "Product Category", "Brand Name", "Seller Name", 
    "Price Details", "Positive Seller Ratings", "Ship on Time", "Return Policy"
]

# Function to parse each line of the text file into structured data
def parse_line(line):
    # Prepare regex pattern with lookahead assertions to capture fields correctly
    pattern = re.compile(
        r"Product Name = (?P<Product_Name>.*?)(?=, Product Category =)|"
        r"Product Category = (?P<Product_Category>.*?)(?=, Brand Name =)|"
        r"Brand Name = (?P<Brand_Name>.*?)(?=, Seller Name =)|"
        r"Seller Name = (?P<Seller_Name>.*?)(?=, URL =)|"
        r"Price Details = (?P<Price_Details>.*?)(?=, Positive Seller Ratings =)|"
        r"Positive Seller Ratings = (?P<Positive_Seller_Ratings>.*?)(?=, Ship on Time =)|"
        r"Ship on Time = (?P<Ship_on_Time>.*?)(?=, Return Policy =)|"
        r"Return Policy = (?P<Return_Policy>.*?)(?=, Product \d+:|, URL =|$)"
    )

    # Extract product number separately
    product_number = re.match(r"Product (\d+):", line).group(1)

    # Find all matches in the line
    matches = pattern.finditer(line)
    data = {k: v for m in matches for k, v in m.groupdict().items() if v is not None}

    # Constructing the row based on required headers
    return [
        "Product " + product_number,
        data.get("Product_Name", ""),
        data.get("Product_Category", ""),
        data.get("Brand_Name", ""),
        data.get("Seller_Name", ""),
        data.get("Price_Details", ""),
        data.get("Positive_Seller_Ratings", ""),
        data.get("Ship_on_Time", ""),
        data.get("Return_Policy", "")
    ]

# Reading the text file and writing to CSV
with open(input_file_path, 'r', encoding='utf-8') as file, \
     open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)  # Writing headers to the CSV file
    
    for line in file:
        if line.strip():  # Ensuring the line has content
            row = parse_line(line)
            writer.writerow(row)  # Writing the parsed data as a row in the CSV file

print("CSV file has been created successfully.")


CSV file has been created successfully.


# Extracting Information From Queries Regarding Products

In [3]:
import spacy

# Load the English NLP model from spaCy
nlp = spacy.load('en_core_web_sm')

# Example queries, including your latest example about shoes
queries = [
    "Show me watches under Rs. 500.",
    "Can you list watches between Rs. 1000 and Rs. 2000?",
    "Show me all Samsung watches available.",
    "Which watches do you have from Jianuo?",
    "List watches sold by New-Gen.",
    "Show me the top-rated sellers who sell watches.",
    "Show me the top-rated sellers who sell shoes.",
    "Show me watches from sellers with more than 90% positive ratings.",
    "Which watches are sold by highly rated sellers?",
    "Can you find watches that always ship on time?",
    "I'm looking for sports watches priced between Rs. 1500 and Rs. 2500, sold by top-rated sellers.",
    "What are the latest smartwatches available under Rs. 5000?",
    "List all luxury watches.",
    "Help me find a watch for a gift under Rs. 2000.",
    "List all the shoes available in blue colour."
]

# Keywords that are likely subjects in retail-related queries
keywords = ["watch", "watches", "shoes", "smartwatches", "luxury watches", "sports watches"]

# Function to clean and extract the subject from the query
def extract_subject(query):
    doc = nlp(query)
    for chunk in doc.noun_chunks:
        cleaned_chunk = ' '.join([token.text for token in chunk if token.pos_ in ['NOUN', 'PROPN']])
        if any(keyword in cleaned_chunk.lower() for keyword in keywords):
            return cleaned_chunk
    return "No clear subject found"

# Process each query and print the subject
for query in queries:
    subject = extract_subject(query)
    print(f"Query: '{query}' -> Subject: {subject}")


Query: 'Show me watches under Rs. 500.' -> Subject: watches
Query: 'Can you list watches between Rs. 1000 and Rs. 2000?' -> Subject: watches
Query: 'Show me all Samsung watches available.' -> Subject: No clear subject found
Query: 'Which watches do you have from Jianuo?' -> Subject: No clear subject found
Query: 'List watches sold by New-Gen.' -> Subject: List watches
Query: 'Show me the top-rated sellers who sell watches.' -> Subject: watches
Query: 'Show me the top-rated sellers who sell shoes.' -> Subject: shoes
Query: 'Show me watches from sellers with more than 90% positive ratings.' -> Subject: No clear subject found
Query: 'Which watches are sold by highly rated sellers?' -> Subject: watches
Query: 'Can you find watches that always ship on time?' -> Subject: watches
Query: 'I'm looking for sports watches priced between Rs. 1500 and Rs. 2500, sold by top-rated sellers.' -> Subject: sports watches
Query: 'What are the latest smartwatches available under Rs. 5000?' -> Subject: smar

In [4]:
# import re
# import pandas as pd

# products_df = pd.read_csv('FinalProductsList.csv')

# def extract_info_simple(query):
#     # Define keywords for subject identification
#     subject_keywords = ["watch", "watches", "smartwatch", "luxury watch"]
#     brand_names = products_df['Brand Name'].str.lower().unique().tolist()
#     seller_names = products_df['Seller Name'].str.lower().unique().tolist()

#     # Patterns for limitations
#     price_pattern = r"Rs\.\s*\d+|\d+\s*%|between\s*Rs\.\s*\d+\s*and\s*Rs\.\s*\d+"
#     # Updated rating pattern to be more specific and catch contexts like "more than 90%"
#     rating_pattern = r"more than \d{1,3}% positive ratings|less than \d{1,3}% positive ratings|\d{1,3}% positive ratings|\d{1,3}%"
#     time_pattern = r"ship on time"
    
#     # Find subjects
#     subjects = [keyword for keyword in subject_keywords if keyword in query.lower()]
#     subjects.extend([brand for brand in brand_names if brand in query.lower()])
    
#     # Find limitations
#     limitations = re.findall(price_pattern, query)
#     limitations.extend(re.findall(rating_pattern, query))
#     if "top-rated sellers" in query.lower() or "highly rated sellers" in query.lower():
#         limitations.append("top-rated sellers")
#     if re.search(time_pattern, query, re.IGNORECASE):
#         limitations.append("ship on time")

#     # Check if there are specific seller names mentioned
#     for seller in seller_names:
#         if seller in query.lower():
#             limitations.append(f"sold by {seller}")

#     # return {"subjects": subjects, "limitations": limitations}
#     return limitations

# # Redefining example queries
# query_examples = [
#     "Show me watches under Rs. 500",
#     "Can you list watches between Rs. 1000 and Rs. 2000?",
#     "Show me all Samsung watches available.",
#     "List watches sold by New-Gen.",
#     "Show me the top-rated sellers who sell watches.",
#     "Show me watches from sellers with more than 90% positive ratings.",
#     "Which watches are sold by highly rated sellers?",
#     "Can you find watches that always ship on time?",
#     "I'm looking for sports watches priced between Rs. 1500 and Rs. 2500, sold by top-rated sellers.",
#     "What are the latest smartwatches available under Rs. 5000?",
#     "List all luxury watches.",
#     "Help me find a watch for a gift under Rs. 2000."
# ]

# # Apply function to each example query
# extracted_info = [extract_info_simple(query) for query in query_examples]
# extracted_info


In [5]:
import re
import pandas as pd

products_df = pd.read_csv('FinalProductsList.csv')

def extract_info_simple(query):
    # Define keywords for subject identification
    subject_keywords = ["watch", "watches", "smartwatch", "luxury watch"]
    brand_names = products_df['Brand Name'].str.lower().unique().tolist()
    seller_names = products_df['Seller Name'].str.lower().unique().tolist()

    # Patterns for limitations
    price_pattern = r"Rs\.\s*\d+|\d+\s*%|between\s*Rs\.\s*\d+\s*and\s*Rs\.\s*\d+"
    # Updated rating pattern to be more specific and catch contexts like "more than 90%"
    rating_pattern = r"more than \d{1,3}% positive ratings|less than \d{1,3}% positive ratings|\d{1,3}% positive ratings|\d{1,3}%"
    time_pattern = r"ship on time"
    
    # Find subjects
    subjects = [keyword for keyword in subject_keywords if keyword in query.lower()]
    subjects.extend([brand for brand in brand_names if brand in query.lower()])
    
    # Find limitations
    limitations = re.findall(price_pattern, query)
    limitations.extend(re.findall(rating_pattern, query))
    if "top-rated sellers" in query.lower() or "highly rated sellers" in query.lower():
        limitations.append("top-rated sellers")
    if re.search(time_pattern, query, re.IGNORECASE):
        limitations.append("ship on time")

    # Check if there are specific seller names mentioned
    for seller in seller_names:
        if seller in query.lower():
            limitations.append(f"sold by {seller}")

    # return {"subjects": subjects, "limitations": limitations}
    return limitations

# Redefining example queries
query = "Show me watches from sellers with more than 90% positive ratings."
extracted_info = extract_info_simple(query)
extracted_info


['90%', 'more than 90% positive ratings']

# Finding Products Based On Limitations

In [6]:
import pandas as pd
import re

def load_data(filepath):
    """Load the product data from a CSV file and preprocess it."""
    data = pd.read_csv(filepath)
    data['Discounted Price'] = data['Price Details'].apply(
        lambda x: min(map(int, re.findall(r'Discounted: Rs\. (\d+)', x)))
    )
    data['Positive Seller Ratings'] = data['Positive Seller Ratings'].str.rstrip('%').astype(int)
    data['Ship on Time'] = data['Ship on Time'].str.rstrip('%').astype(int)
    return data

def parse_limitation(limitation):
    """Parse the limitation string into a structured dictionary."""
    if 'between Rs.' in limitation:
        low, high = map(int, re.findall(r'\d+', limitation))
        return {'price_range': (low, high)}
    elif 'Rs.' in limitation:
        price = int(re.findall(r'\d+', limitation)[0])
        return {'price_exact': price}
    elif 'sold by' in limitation:
        seller = limitation.split('sold by ')[1].strip()
        return {'seller_name': seller}
    elif 'top-rated sellers' in limitation:
        return {'top_rated_sellers': 90}
    elif '%' in limitation:
        rating = int(re.findall(r'\d+', limitation)[0])
        return {'top_rated_sellers': rating}
    elif 'ship on time' in limitation:
        return {'ship_on_time': 100}
    else:
        return None  # Handle unrecognized input

def filter_products(data, limitation_dict):
    """Apply filters to the data based on parsed limitations."""
    if limitation_dict is None:
        return []
    key, value = next(iter(limitation_dict.items()))
    if key == 'price_exact':
        filtered_data = data[data['Discounted Price'] == value]
    elif key == 'price_range':
        filtered_data = data[(data['Discounted Price'] >= value[0]) & (data['Discounted Price'] <= value[1])]
    elif key == 'seller_name':
        filtered_data = data[data['Seller Name'].str.contains(value, case=False, na=False)]
    elif key == 'top_rated_sellers':
        filtered_data = data[data['Positive Seller Ratings'] >= value]
    elif key == 'ship_on_time':
        filtered_data = data[data['Ship on Time'] == value]
    return filtered_data['Product Number'].tolist()

# Load data
data = load_data('FinalProductsList.csv')

# Example limitation
limitation = "'90%', 'more than 90% positive ratings'"

# Parse and filter products based on the limitation
parsed_limitation = parse_limitation(limitation)
matching_product_numbers = filter_products(data, parsed_limitation)

matching_product_numbers

['Product 01',
 'Product 02',
 'Product 04',
 'Product 05',
 'Product 06',
 'Product 07',
 'Product 10',
 'Product 12',
 'Product 16',
 'Product 17',
 'Product 19',
 'Product 20',
 'Product 22',
 'Product 23',
 'Product 24',
 'Product 26',
 'Product 29',
 'Product 36',
 'Product 37',
 'Product 38',
 'Product 42']

# Shortlisting Products From All Products

In [7]:
def filter_products(input_filename, output_filename, matching_product_numbers):
    with open(input_filename, 'r') as file:
        lines = file.readlines()

    # Prepare to collect matching lines
    matching_lines = []

    # Filter lines based on matching_product_numbers
    if matching_product_numbers:
        # Create a set for faster lookup
        product_set = set(matching_product_numbers)
        for line in lines:
            # Assuming each line starts with a product identifier like "Product XX:"
            product_number = line.split(':', 1)[0].strip()
            if product_number in product_set:
                matching_lines.append(line)
    else:
        # If matching_product_numbers is empty, select all lines
        matching_lines = lines

    # Write the selected lines to the output file
    with open(output_filename, 'w') as file:
        file.writelines(matching_lines)

filter_products('FinalProductsList.txt', 'ProductsList.txt', matching_product_numbers)
