In [None]:
# Import Libraries
import os
import json
from urllib.parse import urlparse
from collections import defaultdict, OrderedDict
from datetime import datetime
import ollama
import pandas as pd
from html import unescape
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import shutil

## PROCESS RAW DATA

In [None]:
## RAW DATA to JSONs
# Split every newline-delimited JSON file in Raw_data into one JSON file per
# record inside Output_JSONs.

# Define the folder paths
input_folder_path = 'Raw_data'
output_folder_path = 'Output_JSONs'
os.makedirs(output_folder_path, exist_ok=True)

# Running index shared by ALL input files. Numbering used to restart at 1 for
# every .txt file, so records of a later file silently overwrote the
# json_object_<n>.json files produced by an earlier one.
json_object_index = 0
# Number of non-empty lines that could not be parsed as JSON
skipped_line_count = 0

# Sorted so that the record numbering is reproducible between runs
for file_name in sorted(os.listdir(input_folder_path)):
    if not file_name.endswith('.txt'):  # Process only text files
        continue
    file_path = os.path.join(input_folder_path, file_name)

    # Parse the file line by line; each line is expected to hold one JSON object
    json_objects = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # blank lines are not records
            try:
                json_objects.append(json.loads(line))
            except json.JSONDecodeError:
                # Keep going (best effort) but remember that data was dropped
                skipped_line_count += 1

    # Save each JSON object to its own file
    generated_files = []
    for json_object in json_objects:
        json_object_index += 1
        output_file_path = os.path.join(output_folder_path, f'json_object_{json_object_index}.json')
        with open(output_file_path, 'w') as outfile:
            json.dump(json_object, outfile)
        generated_files.append(output_file_path)

if skipped_line_count:
    print(f'Skipped {skipped_line_count} line(s) that were not valid JSON')

In [None]:
# Inspect one sample record: list every key that occurs anywhere inside it

# Load the first JSON object written to the Output_JSONs folder
file_path = r'Output_JSONs/json_object_1.json'
with open(file_path, 'r' , encoding='utf-8') as file:
    json_data = json.load(file)

# Function to recursively extract all keys from the JSON object
def extract_keys(obj, keys=None):
    """Collect every dictionary key found anywhere inside a nested JSON value.

    Args:
        obj: A decoded JSON value (dict, list or scalar).
        keys: Optional set to add the keys to. A fresh set is created when
            omitted, so separate calls never share results (a mutable default
            argument would be shared by every call).

    Returns:
        The set of keys found, which is the same object as ``keys`` when one
        was supplied.
    """
    if keys is None:
        keys = set()
    if isinstance(obj, dict):
        for key, value in obj.items():
            keys.add(key)
            extract_keys(value, keys)
    elif isinstance(obj, list):
        for item in obj:
            extract_keys(item, keys)
    return keys

# Extract all keys from the JSON data (extract_keys returns a set)
all_keys = extract_keys(json_data)

# Print all unique keys, one per line; a set has no defined order,
# so the listing order is arbitrary
print("All keys in the JSON file:")
for key in all_keys:
    print(key)

## PRICES PIPELINE

In [None]:
import os
import json
import pandas as pd

# Folder configuration for the price-extraction step
input_folder_path = 'Output_JSONs'
output_folder_path = 'Extracted_info_prices'
os.makedirs(output_folder_path, exist_ok=True)

# Sequence number used to name records that carry no usable "id"
unknown_id_counter = 1

# One {"input_filename", "output_filename"} row per processed record
file_mappings = []

for file_name in os.listdir(input_folder_path):
    # Anything that is not a JSON file is ignored
    if not file_name.endswith('.json'):
        continue

    file_path = os.path.join(input_folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Keep only the identifier fields and the raw price list; fields missing
    # from the record are stored as None
    extracted_data = {
        field: json_data.get(field)
        for field in ("ean", "ean13", "gtins", "upc", "upca", "asins", "prices", "id")
    }

    # The output file is named after the record id, or after a numbered
    # placeholder when the id is missing or empty
    id_number = extracted_data["id"]
    if not id_number:
        id_number = f"unknown_id_{unknown_id_counter}"
        unknown_id_counter += 1

    output_file_name = f'{id_number}.json'
    output_file_path = os.path.join(output_folder_path, output_file_name)
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(extracted_data, outfile, ensure_ascii=False, indent=4)

    # Remember which input file produced which output file
    file_mappings.append({"input_filename": file_name, "output_filename": output_file_name})

# Persist the input -> output mapping as an Excel sheet next to the JSON files
df = pd.DataFrame(file_mappings)
excel_output_path = os.path.join(output_folder_path, 'file_mappings.xlsx')
df.to_excel(excel_output_path, index=False)

In [None]:
import os
import json
import shutil

# Folder configuration: files whose price list is unusable are moved out of
# the input folder into 'Data_with_no_prices'
input_folder_path = 'Extracted_info_prices'
corrupted_folder_path = 'Data_with_no_prices'
os.makedirs(corrupted_folder_path, exist_ok=True)

for file_name in os.listdir(input_folder_path):
    # Only JSON files are checked
    if not file_name.endswith('.json'):
        continue
    file_path = os.path.join(input_folder_path, file_name)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        # A missing key counts as an empty list; None or any other non-list
        # value is rejected
        prices = json_data.get('prices', [])
        if not isinstance(prices, list):
            raise TypeError("'prices' is not a list or is None")

    except (TypeError, json.JSONDecodeError, KeyError) as e:
        # Relocate the offending file and report why
        corrupted_file_path = os.path.join(corrupted_folder_path, file_name)
        shutil.move(file_path, corrupted_file_path)
        print(f"Moved corrupted file {file_name} to {corrupted_folder_path} due to error: {e}")

In [None]:
# Define the folder paths: extracted price records are read from
# 'Extracted_info_prices' and the per-merchant history is written to
# 'Pricing_history' (created here if it does not exist yet)
input_folder_path = 'Extracted_info_prices'
output_folder_path = 'Pricing_history'
os.makedirs(output_folder_path, exist_ok=True)

# Helper function to extract the base domain from a URL
def get_base_domain(url):
    """Return the network location of ``url`` without a leading 'www.'.

    The result is an empty string when the URL has no network location (for
    example when the scheme is missing) and None when parsing raises.
    """
    try:
        host = urlparse(url).netloc
        return host[4:] if host.startswith('www.') else host
    except Exception:
        return None

# Helper function to clean the merchant name
def clean_merchant_name(name):
    """Strip trailing 'Learn more' / 'Store info' text from a merchant name.

    Everything from the first occurrence of either marker onwards is dropped
    and surrounding whitespace is removed. Returns None when the input is
    empty/None or when nothing is left before the marker.
    """
    if not name:
        return None
    for marker in ('Learn more', 'Store info'):
        name = name.split(marker)[0]
    if not name:
        return None
    return name.strip()

# Helper function to parse ISO format with different variations
def parse_iso_date(date_str):
    """Parse an ISO-like timestamp string into a naive ``datetime``.

    Accepted layouts are 'YYYY-MM-DDTHH:MM:SS' with optional fractional
    seconds and an optional literal trailing 'Z'.

    Raises:
        ValueError: if the string matches none of the accepted layouts.
    """
    candidate_formats = (
        "%Y-%m-%dT%H:%M:%S.%fZ",
        "%Y-%m-%dT%H:%M:%S.%f",
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S",
    )
    for pattern in candidate_formats:
        try:
            parsed = datetime.strptime(date_str, pattern)
        except ValueError:
            continue  # try the next layout
        return parsed
    raise ValueError(f"Unknown date format: {date_str}")

# Aggregate the raw price observations of every product into one record per
# merchant and write the result to the output folder.
unique_merchants = set()
for file_name in os.listdir(input_folder_path):
    if not file_name.endswith('.json'):  # Process only JSON files
        continue
    file_path = os.path.join(input_folder_path, file_name)

    # Load the JSON object from the file
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Per-merchant accumulator, created on first access of a merchant key.
    # "amounts" maps an amount to every date on which it was seen.
    merchant_data = defaultdict(lambda: {
        "amountMax": float('-inf'),
        "amountMin": float('inf'),
        "amounts": defaultdict(list),
        "firstDateSeen": None,
        "lastDateSeen": None,
        "sourceURLs": set()
    })

    # `or []` also covers records stored with "prices": null
    for price_entry in json_data.get('prices') or []:
        merchant = clean_merchant_name(price_entry.get('merchant'))
        source_urls = price_entry.get('sourceURLs') or []

        if not merchant:
            # Fall back to the domain of the source URL, but only when there
            # is exactly one URL to take it from.
            if len(source_urls) == 1:
                merchant = get_base_domain(source_urls[0])
            # get_base_domain may return None or '' (unparsable URL or no
            # network location); such entries cannot be attributed to a
            # merchant and previously crashed on `.lower()` or produced a
            # merchant with an empty name.
            if not merchant:
                continue

        merchant_key = merchant.lower()
        entry = merchant_data[merchant_key]

        entry["amountMax"] = max(entry["amountMax"], price_entry["amountMax"])
        entry["amountMin"] = min(entry["amountMin"], price_entry["amountMin"])
        # Observation dates are grouped under the entry's amountMax
        amount = price_entry["amountMax"]
        entry["amounts"][amount].extend(price_entry["dateSeen"])

        # First-seen date: prefer the explicit field, then the last-seen
        # field, then the earliest individual observation.
        try:
            first_date_seen = parse_iso_date(price_entry["firstDateSeen"])
        except KeyError:
            try:
                first_date_seen = parse_iso_date(price_entry["lastDateSeen"])
            except KeyError:
                first_date_seen = min(parse_iso_date(date) for date in price_entry["dateSeen"])

        # Last-seen date: same fallback chain, mirrored.
        try:
            last_date_seen = parse_iso_date(price_entry["lastDateSeen"])
        except KeyError:
            try:
                last_date_seen = parse_iso_date(price_entry["firstDateSeen"])
            except KeyError:
                last_date_seen = max(parse_iso_date(date) for date in price_entry["dateSeen"])

        if not entry["firstDateSeen"] or first_date_seen < entry["firstDateSeen"]:
            entry["firstDateSeen"] = first_date_seen
        if not entry["lastDateSeen"] or last_date_seen > entry["lastDateSeen"]:
            entry["lastDateSeen"] = last_date_seen

        # Store base domains only, dropping URLs with no usable domain
        for url in source_urls:
            domain = get_base_domain(url)
            if domain:
                entry["sourceURLs"].add(domain)

    # Collect unique merchants (across all files, including skipped ones)
    unique_merchants.update(merchant_data.keys())

    # The output file is named after the record id, so records without one
    # are skipped before any output structure is built.
    id_number = json_data.get("id")
    if id_number is None:
        continue

    # Prepare the final extracted data
    final_data = {
        "ean": json_data.get("ean"),
        "ean13": json_data.get("ean13"),
        "upc": json_data.get("upc"),
        "upca": json_data.get("upca"),
        "gtins": json_data.get("gtins"),
        "asins": json_data.get("asins", None),
        "id": id_number,
        "prices": []
    }

    for merchant_key, data in merchant_data.items():
        price_data = {
            "merchant": merchant_key,
            "amountMax": data["amountMax"],
            "amountMin": data["amountMin"],
            "firstDateSeen": data["firstDateSeen"].isoformat(),
            "lastDateSeen": data["lastDateSeen"].isoformat(),
            "sourceURLs": list(data["sourceURLs"])
        }
        # The first amount is written as "currency"/"dateSeen", later ones as
        # "currency_<i>"/"dateSeen_<i>" with i starting at 1. Despite the key
        # name, the stored value is the amount.
        for i, (amount, dates) in enumerate(data["amounts"].items()):
            price_data[f"currency_{i}" if i > 0 else "currency"] = amount
            price_data[f"dateSeen_{i}" if i > 0 else "dateSeen"] = sorted(dates)
        final_data["prices"].append(price_data)

    # Save the aggregated record to the output folder, named after its id
    output_file_path = os.path.join(output_folder_path, f'{id_number}.json')
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(final_data, outfile, indent=2)

    print(f'Extracted data saved to {output_file_path}')

In [None]:
def find_latest_price(merchant_data):
    """Return the most recent observation date and the amount recorded with it.

    A merchant record stores one or more price series: the first under the
    unsuffixed keys "dateSeen"/"currency", further ones under
    "dateSeen_<i>"/"currency_<i>". Every series present is inspected; the
    unsuffixed one used to be ignored because only "dateSeen_0",
    "dateSeen_1", ... were probed (one million probes per call).

    Args:
        merchant_data: Mapping holding the price series of one merchant.

    Returns:
        ``(latest_date, latest_currency)``. Both are None when no series has
        any date; ``latest_currency`` is None when the matching currency key
        is missing. On equal dates the series with the lowest index wins.
    """
    # (sort index, dateSeen key, matching currency key) for each series found
    series = []
    for key in merchant_data:
        if key == "dateSeen":
            series.append((-1, key, "currency"))
        elif key.startswith("dateSeen_"):
            suffix = key[len("dateSeen_"):]
            if suffix.isdigit():
                series.append((int(suffix), key, f"currency_{suffix}"))
    series.sort()

    latest_date = None
    latest_currency = None
    for _, date_key, currency_key in series:
        dates = merchant_data[date_key]
        if isinstance(dates, str):
            # A single date stored as a plain string; max() would otherwise
            # return its largest character.
            dates = [dates]
        if not dates:
            continue
        last_date = max(dates)
        if not latest_date or last_date > latest_date:
            latest_date = last_date
            latest_currency = merchant_data.get(currency_key)

    return latest_date, latest_currency

def process_data(input_data):
    """Reduce a pricing-history record to the latest price of each merchant.

    Args:
        input_data: Record with the identifier fields and a "prices" list of
            per-merchant price series.

    Returns:
        A new dict with the same identifier fields and, under "prices", one
        ``{"merchant", "currency", "dateSeen", "sourceURLs"}`` entry for each
        merchant that has a (truthy) latest date and amount.

    Raises:
        KeyError: if "prices" is missing or an entry has no "merchant".
    """
    # "ean" may be a list, a plain string, None or missing. Indexing with [0]
    # crashed on None / [] and returned only the first character of a string.
    ean_field = input_data.get("ean")
    if isinstance(ean_field, (list, tuple)):
        ean = ean_field[0] if ean_field else None
    else:
        ean = ean_field

    ean13 = input_data.get("ean13")
    upc = input_data.get("upc")
    upca = input_data.get("upca")
    gtins = input_data.get("gtins")
    asins = input_data.get("asins", None)
    id_number = input_data.get("id")

    prices = input_data["prices"]
    result = {
        "ean": [ean],
        "ean13": ean13,
        "upca": upca,
        "upc": upc,
        "gtins": gtins,
        "asins": asins,
        "id": id_number,
        "prices": []
    }

    for price in prices:
        merchant = price["merchant"]
        latest_date, latest_currency = find_latest_price(price)

        # Merchants without a usable latest observation are left out
        if latest_date and latest_currency:
            result["prices"].append({
                "merchant": merchant,
                "currency": latest_currency,
                "dateSeen": latest_date,
                "sourceURLs": price.get("sourceURLs", [])
            })

    return result

# Source and destination folders for the "latest price per merchant" step
input_directory = "Pricing_history"
output_directory = "Latest_Pricing_per_Merchant"
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

for filename in os.listdir(input_directory):
    # Skip everything that is not a JSON file
    if not filename.endswith(".json"):
        continue

    # Read one pricing-history record
    input_file = os.path.join(input_directory, filename)
    with open(input_file, 'r', encoding='utf-8') as f:
        input_data = json.load(f)

    # Reduce it to the latest price of each merchant
    output_data = process_data(input_data)

    # The result is stored under the record's id
    output_file = os.path.join(output_directory, f"{output_data['id']}.json")
    with open(output_file, 'w') as f:
        json.dump(output_data, f, indent=2)

    print(f"Processed {filename} and saved output to {output_file}")

print("All files processed.")

In [None]:
# Input: the folder holding the latest price per merchant for each product
input_directory = "Latest_Pricing_per_Merchant"

# Create output directory for 2024 data (only when it does not exist yet)
output_directory = "Latest_Pricing_per_Merchant_2024"
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Function to filter entries where dateSeen begins with "2024" and get the latest entry for each merchant
def filter_2024_data(input_data, year="2024"):
    """Keep, per merchant, the latest price entry observed in ``year``.

    Args:
        input_data: Record with identifier fields and a "prices" list whose
            entries carry "merchant" and a "dateSeen" string.
        year: Year prefix that "dateSeen" must start with. Defaults to
            "2024", which is the behaviour the function is named after.

    Returns:
        A new dict with the identifier fields and the selected price entries
        under "prices" (empty when nothing was seen in ``year``).

    Raises:
        KeyError: if "prices" is missing or an entry lacks "merchant" or
            "dateSeen".
        ValueError: if two dates of the same merchant must be compared and
            one of them is in an unsupported layout.
    """
    def _parse_seen(date_str):
        # 'YYYY-MM-DD' was the only accepted layout; ISO timestamps raised
        # as soon as a merchant appeared twice. Accept both.
        for fmt in ('%Y-%m-%d',
                    '%Y-%m-%dT%H:%M:%S.%fZ',
                    '%Y-%m-%dT%H:%M:%S.%f',
                    '%Y-%m-%dT%H:%M:%SZ',
                    '%Y-%m-%dT%H:%M:%S'):
            try:
                return datetime.strptime(date_str, fmt)
            except ValueError:
                continue
        raise ValueError(f"Unknown date format: {date_str}")

    # "ean" may be a list, a plain string, None or missing. Indexing with [0]
    # crashed on None / [] and returned only the first character of a string.
    ean_field = input_data.get("ean")
    if isinstance(ean_field, (list, tuple)):
        ean = ean_field[0] if ean_field else None
    else:
        ean = ean_field

    ean13 = input_data.get("ean13")
    upc = input_data.get("upc")
    upca = input_data.get("upca")
    gtins = input_data.get("gtins")
    asins = input_data.get("asins", None)
    id_number = input_data.get("id")

    prices = input_data["prices"]
    year_prefix = str(year)

    # Dictionary to store the latest entry for each merchant
    filtered_prices = {}
    for price in prices:
        date_seen = price["dateSeen"]
        if not date_seen.startswith(year_prefix):
            continue
        merchant = price["merchant"]
        current = filtered_prices.get(merchant)
        if current is None or _parse_seen(date_seen) > _parse_seen(current["dateSeen"]):
            filtered_prices[merchant] = price

    return {
        "ean": [ean],
        "ean13": ean13,
        "upc": upc,
        "upca": upca,
        "gtins": gtins,
        "asins": asins,
        "id": id_number,
        "prices": list(filtered_prices.values())
    }

# Apply the 2024 filter to every JSON file of the input directory
for filename in os.listdir(input_directory):
    if not filename.endswith(".json"):
        continue

    # Read one "latest price per merchant" record
    input_file = os.path.join(input_directory, filename)
    with open(input_file, 'r', encoding='utf-8') as f:
        input_data = json.load(f)

    filtered_data = filter_2024_data(input_data)

    # Nothing observed in 2024: no output file is written for this record
    if not filtered_data["prices"]:
        continue

    # The output keeps the input file name
    output_file = os.path.join(output_directory, filename)
    with open(output_file, 'w') as f:
        json.dump(filtered_data, f, indent=2)

    print(f"Processed {filename} and saved output to {output_file}")

print("All files processed.")

In [None]:
def replace_merchant_names(file_path):
    """Load a pricing JSON file and normalise known merchant names.

    A merchant name is replaced by a canonical name when it contains one of
    the known fragments (case-insensitive substring match). Fragments are
    tried in the order they are listed and the first match wins. Names that
    match nothing are left unchanged. Entries whose merchant is missing or
    not a string are left untouched instead of raising.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        ``(data, updated_prices)``: the loaded document (its price entries
        are modified in place) and the list of its price entries.
    """
    # fragment to look for -> canonical merchant name
    replacements = {
        'amazon': 'amazon',
        'walmart': 'walmart',
        'ebay': 'ebay',
        'bestbuy': 'bestbuy',
        'best buy': 'bestbuy',
        'kmart': 'kmart',
        'target': 'target',
        'newegg': 'newegg',
        'new egg': 'newegg',
        'overstock': 'overstock',
        'lowes': 'lowes',
        'lowe\'s': 'lowes',
        'homedepot': 'homedepot',
        'home depot': 'homedepot',
        'bonanza': 'bonanza',
        'sears': 'sears',
        'kohl': 'kohls',
        'artfire': 'artfire',
        'bkstr': 'bkstr',
        'buya.com': 'buya',
        'discount bandit': 'discount bandit',
        'ebluejay': 'ebluejay',
        'ecrater': 'ecrater',
        'truegether': 'truegether',
        'macys': 'macys',
        'macy\'s': 'macys',
        'macys.com': 'macys',
    }

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    updated_prices = []

    # `or []` also covers documents stored with "prices": null
    for item in data.get('prices') or []:
        raw_name = item.get('merchant')
        # A null merchant used to crash on `.lower()`
        if isinstance(raw_name, str):
            original_name = raw_name.lower()
            for key, value in replacements.items():
                if key in original_name:
                    item['merchant'] = value
                    break
        updated_prices.append(item)

    return data, updated_prices

def process_directory(input_directory, output_directory):
    """Normalise merchant names in every JSON file of a directory.

    Each ``.json`` file of ``input_directory`` is passed through
    ``replace_merchant_names``; its price entries are then reordered so that
    entries of the same merchant are adjacent (merchants in order of first
    appearance) and the document is written under the same file name into
    ``output_directory``, which is created when missing.
    """
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    json_names = (name for name in os.listdir(input_directory) if name.endswith('.json'))
    for file_name in json_names:
        source_path = os.path.join(input_directory, file_name)
        original_data, updated_prices = replace_merchant_names(source_path)

        # Bucket the entries by (normalised) merchant name
        grouped_data = {}
        for item in updated_prices:
            grouped_data.setdefault(item.get('merchant'), []).append(item)

        # Flatten the buckets back into a single list
        regrouped = []
        for bucket in grouped_data.values():
            regrouped.extend(bucket)
        original_data['prices'] = regrouped

        target_path = os.path.join(output_directory, file_name)
        with open(target_path, 'w') as output_file:
            json.dump(original_data, output_file, indent=4)

# Normalise merchant names of the full pricing history; the result goes to
# the intermediate folder 'output'
input_directory = 'Pricing_history'
output_directory = 'output'

process_directory(input_directory, output_directory)

In [None]:
def replace_merchant_names(file_path):
    """Load a pricing JSON file and normalise known merchant names.

    A merchant name is replaced by a canonical name when it contains one of
    the known fragments (case-insensitive substring match). Fragments are
    tried in the order they are listed and the first match wins. Names that
    match nothing are left unchanged. Entries whose merchant is missing or
    not a string are left untouched instead of raising.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        ``(data, updated_prices)``: the loaded document (its price entries
        are modified in place) and the list of its price entries.
    """
    # fragment to look for -> canonical merchant name
    replacements = {
        'amazon': 'amazon',
        'walmart': 'walmart',
        'ebay': 'ebay',
        'bestbuy': 'bestbuy',
        'best buy': 'bestbuy',
        'kmart': 'kmart',
        'target': 'target',
        'newegg': 'newegg',
        'new egg': 'newegg',
        'overstock': 'overstock',
        'lowes': 'lowes',
        'lowe\'s': 'lowes',
        'homedepot': 'homedepot',
        'home depot': 'homedepot',
        'bonanza': 'bonanza',
        'sears': 'sears',
        'kohl': 'kohls',
        'artfire': 'artfire',
        'bkstr': 'bkstr',
        'buya.com': 'buya',
        'discount bandit': 'discount bandit',
        'ebluejay': 'ebluejay',
        'ecrater': 'ecrater',
        'truegether': 'truegether',
        'macys': 'macys',
        'macy\'s': 'macys',
        'macys.com': 'macys',
    }

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    updated_prices = []

    # `or []` also covers documents stored with "prices": null
    for item in data.get('prices') or []:
        raw_name = item.get('merchant')
        # A null merchant used to crash on `.lower()`
        if isinstance(raw_name, str):
            original_name = raw_name.lower()
            for key, value in replacements.items():
                if key in original_name:
                    item['merchant'] = value
                    break
        updated_prices.append(item)

    return data, updated_prices

def process_directory(input_directory, output_directory):
    """Normalise merchant names in every JSON file of a directory.

    Each ``.json`` file of ``input_directory`` is passed through
    ``replace_merchant_names``; its price entries are then reordered so that
    entries of the same merchant are adjacent (merchants in order of first
    appearance) and the document is written under the same file name into
    ``output_directory``, which is created when missing.
    """
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    json_names = (name for name in os.listdir(input_directory) if name.endswith('.json'))
    for file_name in json_names:
        source_path = os.path.join(input_directory, file_name)
        original_data, updated_prices = replace_merchant_names(source_path)

        # Bucket the entries by (normalised) merchant name
        grouped_data = {}
        for item in updated_prices:
            grouped_data.setdefault(item.get('merchant'), []).append(item)

        # Flatten the buckets back into a single list
        regrouped = []
        for bucket in grouped_data.values():
            regrouped.extend(bucket)
        original_data['prices'] = regrouped

        target_path = os.path.join(output_directory, file_name)
        with open(target_path, 'w') as output_file:
            json.dump(original_data, output_file, indent=4)

# Normalise merchant names of the latest-price-per-merchant files; the result
# goes to 'Latest_Pricing_per_Merchant_v2'
input_directory = 'Latest_Pricing_per_Merchant'
output_directory = 'Latest_Pricing_per_Merchant_v2'

process_directory(input_directory, output_directory)

In [None]:
def replace_merchant_names(file_path):
    """Load a pricing JSON file and normalise known merchant names.

    A merchant name is replaced by a canonical name when it contains one of
    the known fragments (case-insensitive substring match). Fragments are
    tried in the order they are listed and the first match wins. Names that
    match nothing are left unchanged. Entries whose merchant is missing or
    not a string are left untouched instead of raising.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        ``(data, updated_prices)``: the loaded document (its price entries
        are modified in place) and the list of its price entries.
    """
    # fragment to look for -> canonical merchant name
    replacements = {
        'amazon': 'amazon',
        'walmart': 'walmart',
        'ebay': 'ebay',
        'bestbuy': 'bestbuy',
        'best buy': 'bestbuy',
        'kmart': 'kmart',
        'target': 'target',
        'newegg': 'newegg',
        'new egg': 'newegg',
        'overstock': 'overstock',
        'lowes': 'lowes',
        'lowe\'s': 'lowes',
        'homedepot': 'homedepot',
        'home depot': 'homedepot',
        'bonanza': 'bonanza',
        'sears': 'sears',
        'kohl': 'kohls',
        'artfire': 'artfire',
        'bkstr': 'bkstr',
        'buya.com': 'buya',
        'discount bandit': 'discount bandit',
        'ebluejay': 'ebluejay',
        'ecrater': 'ecrater',
        'truegether': 'truegether',
        'macys': 'macys',
        'macy\'s': 'macys',
        'macys.com': 'macys',
    }

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    updated_prices = []

    # `or []` also covers documents stored with "prices": null
    for item in data.get('prices') or []:
        raw_name = item.get('merchant')
        # A null merchant used to crash on `.lower()`
        if isinstance(raw_name, str):
            original_name = raw_name.lower()
            for key, value in replacements.items():
                if key in original_name:
                    item['merchant'] = value
                    break
        updated_prices.append(item)

    return data, updated_prices

def process_directory(input_directory, output_directory):
    """Normalise merchant names in every JSON file of a directory.

    Each ``.json`` file of ``input_directory`` is passed through
    ``replace_merchant_names``; its price entries are then reordered so that
    entries of the same merchant are adjacent (merchants in order of first
    appearance) and the document is written under the same file name into
    ``output_directory``, which is created when missing.
    """
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    json_names = (name for name in os.listdir(input_directory) if name.endswith('.json'))
    for file_name in json_names:
        source_path = os.path.join(input_directory, file_name)
        original_data, updated_prices = replace_merchant_names(source_path)

        # Bucket the entries by (normalised) merchant name
        grouped_data = {}
        for item in updated_prices:
            grouped_data.setdefault(item.get('merchant'), []).append(item)

        # Flatten the buckets back into a single list
        regrouped = []
        for bucket in grouped_data.values():
            regrouped.extend(bucket)
        original_data['prices'] = regrouped

        target_path = os.path.join(output_directory, file_name)
        with open(target_path, 'w') as output_file:
            json.dump(original_data, output_file, indent=4)

# Normalise merchant names of the 2024 latest-price files; the result goes to
# 'Latest_Pricing_per_Merchant_2024_v2'
input_directory = 'Latest_Pricing_per_Merchant_2024'
output_directory = 'Latest_Pricing_per_Merchant_2024_v2'

process_directory(input_directory, output_directory)

In [None]:
def merge_prices(prices):
    """Combine several price records of one merchant into a single summary.

    Args:
        prices: Iterable of dicts with "amountMax", "amountMin",
            "firstDateSeen", "lastDateSeen", "sourceURLs" and one or more
            "currency"/"dateSeen" or "currency_<i>"/"dateSeen_<i>" pairs.

    Returns:
        Dict with the overall "amountMax" (starting from 0) and "amountMin"
        (starting from infinity), the smallest "firstDateSeen" and largest
        "lastDateSeen" (compared as given), the de-duplicated "sourceURLs"
        as a list, and "currency_prices": one {"currency", "dateSeen"} dict
        per pair found, in input order.
    """
    highest = 0
    lowest = float('inf')
    earliest = None
    latest = None
    urls = set()
    observations = []

    for record in prices:
        highest = max(highest, record["amountMax"])
        lowest = min(lowest, record["amountMin"])

        if not earliest or record["firstDateSeen"] < earliest:
            earliest = record["firstDateSeen"]
        if not latest or record["lastDateSeen"] > latest:
            latest = record["lastDateSeen"]

        urls.update(record["sourceURLs"])

        for field, amount in record.items():
            if field.startswith("currency_"):
                # Suffixed pair: "currency_<i>" goes with "dateSeen_<i>"
                position = int(field.split('_')[-1])
                seen = record[f"dateSeen_{position}"]
            elif field == "currency":
                # Unsuffixed pair
                seen = record["dateSeen"]
            else:
                continue
            observations.append({"currency": amount, "dateSeen": seen})

    return {
        "amountMax": highest,
        "amountMin": lowest,
        "firstDateSeen": earliest,
        "lastDateSeen": latest,
        "sourceURLs": list(urls),
        "currency_prices": observations,
    }

def group_merchants_data(input_directory, output_directory):
    """Merge the price entries of each merchant in every JSON file.

    For each ``.json`` file of ``input_directory`` the entries under
    "prices" are grouped by their "merchant" value and every group is
    collapsed with ``merge_prices``. The document is written to
    ``output_directory`` as ``<original stem>_modified.json`` with "prices"
    replaced by a list of ``{"merchant", "prices": [merged]}`` dicts.
    """
    # Make sure the destination exists
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for filename in os.listdir(input_directory):
        if not filename.endswith('.json'):
            continue

        with open(os.path.join(input_directory, filename), 'r') as file:
            data = json.load(file)

        # merchant name -> list of its price entries, in order of appearance
        grouped_data = {}
        for price_entry in data['prices']:
            grouped_data.setdefault(price_entry['merchant'], []).append(price_entry)

        # One merged summary per merchant
        new_prices = [
            {"merchant": merchant, "prices": [merge_prices(entries)]}
            for merchant, entries in grouped_data.items()
        ]

        # Shallow copy so that only the "prices" field differs from the input
        new_data = dict(data)
        new_data['prices'] = new_prices

        stem = os.path.splitext(filename)[0]
        output_file = os.path.join(output_directory, f"{stem}_modified.json")
        with open(output_file, 'w') as file:
            json.dump(new_data, file, indent=4)

# Merge the per-merchant entries of the name-normalised pricing history:
# reads the intermediate folder 'output' and writes '<id>_modified.json'
# files to 'output_v1'
input_directory = 'output'  # Replace with your input directory path
output_directory = 'output_v1'  # Replace with your output directory path
group_merchants_data(input_directory, output_directory)

In [None]:
def transform_currency_prices(currency_prices):
    """Number the keys of each {"currency", "dateSeen"} pair.

    The n-th pair (counting from 1) becomes
    ``{"currency_<n>": ..., "dateSeen_<n>": ...}``.

    Returns:
        A new list with one renamed dict per input pair.
    """
    return [
        {
            f"currency_{position}": pair["currency"],
            f"dateSeen_{position}": pair["dateSeen"],
        }
        for position, pair in enumerate(currency_prices, start=1)
    ]

def transform_prices(prices):
    """Renumber the "currency_prices" list of every entry that has one.

    Entries are updated in place; entries without a "currency_prices" key
    are left alone. The same ``prices`` list is returned.
    """
    with_series = (entry for entry in prices if "currency_prices" in entry)
    for entry in with_series:
        entry["currency_prices"] = transform_currency_prices(entry["currency_prices"])
    return prices

def transform_json(data):
    """Apply ``transform_prices`` to the "prices" list of every merchant block.

    ``data["prices"]`` is expected to be a list of dicts that each hold their
    own "prices" list. The document is modified in place and returned.
    """
    for merchant_block in data["prices"]:
        merchant_block.update(prices=transform_prices(merchant_block["prices"]))
    return data

def process_files(input_dir, output_dir):
    """Run ``transform_json`` over every JSON file of a directory.

    Each ``.json`` file of ``input_dir`` is loaded, transformed and written
    under the same name into ``output_dir``, which is created when missing.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(input_dir):
        if not filename.endswith('.json'):
            continue

        with open(os.path.join(input_dir, filename), 'r') as source:
            data = json.load(source)

        transformed_data = transform_json(data)

        with open(os.path.join(output_dir, filename), 'w') as target:
            json.dump(transformed_data, target, indent=4)

# Renumber the currency/dateSeen pairs of the merged history: reads
# 'output_v1' and writes files with the same names to 'output_v2'
if __name__ == "__main__":
    input_dir = 'output_v1'  # Replace with your input directory path
    output_dir = 'output_v2'  # Replace with your output directory path
    process_files(input_dir, output_dir)

In [None]:
import os

def rename_files(directory):
    """Drop the trailing '_modified' marker from JSON file names.

    Every file of ``directory`` whose name ends with '_modified.json' is
    renamed to the same name ending in '.json'. Only that trailing marker is
    removed: a '_modified' occurring earlier in the name is part of the
    identifier and is kept (``str.replace`` used to strip every occurrence).
    """
    suffix = '_modified.json'
    for filename in os.listdir(directory):
        if filename.endswith(suffix):
            new_filename = filename[:-len(suffix)] + '.json'
            old_filepath = os.path.join(directory, filename)
            new_filepath = os.path.join(directory, new_filename)
            os.rename(old_filepath, new_filepath)

# Directory containing the JSON files whose '_modified' name marker is removed
directory = 'output_v2'  # Replace with your directory path

# Rename the files in place
rename_files(directory)

In [None]:
def delete_folders(folders):
    """Recursively delete each existing folder in ``folders``.

    A line is printed for every folder, saying whether it was deleted or did
    not exist.
    """
    for folder in folders:
        if not os.path.exists(folder):
            print(f"Folder '{folder}' does not exist")
            continue
        shutil.rmtree(folder)
        print(f"Deleted folder '{folder}'")

def rename_folder(old_name, new_name):
    """Rename ``old_name`` to ``new_name`` if it exists and report the outcome.

    Args:
        old_name: Current path of the folder.
        new_name: Path the folder should be moved to via ``os.rename``.
    """
    # Guard clause: a missing source is reported, not treated as an error.
    if not os.path.exists(old_name):
        print(f"Folder '{old_name}' does not exist")
        return
    os.rename(old_name, new_name)
    print(f"Renamed folder '{old_name}' to '{new_name}'")

# Clean-up step: remove the 'output' and 'output_v1' folders and give
# 'output_v2' the name 'Pricing_history_v2'.

# Folders to delete (removed recursively by delete_folders)
folders_to_delete = ['output', 'output_v1']

# Folder to rename
old_folder_name = 'output_v2'
new_folder_name = 'Pricing_history_v2'

# Delete specified folders; missing ones are only reported
delete_folders(folders_to_delete)

# Rename the folder; a missing source folder is only reported
rename_folder(old_folder_name, new_folder_name)

## FEATURES Pipeline

In [None]:
# Define the folder paths
input_folder_path = 'Output_JSONs'
output_folder_path = 'Extracted_features'
os.makedirs(output_folder_path, exist_ok=True)

# Counter used to build file names for records that have no "id"
unknown_id_counter = 1

# Iterate through all JSON files in the input folder
for file_name in os.listdir(input_folder_path):
    if file_name.endswith('.json'):  # Process only JSON files
        file_path = os.path.join(input_folder_path, file_name)

        # Load the JSON object from the file
        with open(file_path, 'r') as file:
            json_data = json.load(file)

        # Keep the brand/category/feature fields plus every identifier;
        # keys that are absent from the record are stored as None.
        extracted_data = {
            "brand": json_data.get("brand"),
            "category": json_data.get("categories"),
            "features": [
                {
                    "key": feature.get("key"),
                    "value": feature.get("value")
                } for feature in json_data.get("features", [])
            ],
            "ean": json_data.get("ean"),
            "ean13": json_data.get("ean13"),
            "gtins": json_data.get("gtins"),
            "upc": json_data.get("upc"),
            "upca": json_data.get("upca"),
            "asins": json_data.get("asins", None),
            "id": json_data.get("id"),
            "taxonomy": json_data.get("taxonomy"),
        }

        # Name the output after the record's "id"; records without one get a
        # numbered placeholder. The fallback must bind id_number itself,
        # otherwise the file name below is undefined or reuses the id of the
        # previously processed record and overwrites that record's output.
        if extracted_data["id"]:
            id_number = extracted_data["id"]
        else:
            id_number = f"unknown_id_{unknown_id_counter}"
            unknown_id_counter += 1

        # Save the extracted data to a new JSON file named after id_number
        output_file_path = os.path.join(output_folder_path, f'{id_number}.json')
        with open(output_file_path, 'w') as outfile:
            json.dump(extracted_data, outfile, indent=2)

        print(f'Extracted data saved to {output_file_path}')

## PRODUCT ID PIPELINE

In [None]:
# Source folder with the per-product JSON files and target folder for the
# identifier-only copies
input_folder_path = 'Output_JSONs'
output_folder_path = 'ID_JSONs'
os.makedirs(output_folder_path, exist_ok=True)

# Sequence number used to name outputs for records that carry no "id"
unknown_id_counter = 1

for file_name in os.listdir(input_folder_path):
    # Anything that is not a JSON file is skipped
    if not file_name.endswith('.json'):
        continue

    file_path = os.path.join(input_folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Keep only the identifier fields; a key missing from the record is
    # stored as None
    extracted_data = {
        field: json_data.get(field)
        for field in ("ean", "ean13", "gtins", "upca", "upc", "id", "asins")
    }

    # The output file is named after the record's "id", falling back to a
    # numbered placeholder when the id is missing or empty
    id_number = extracted_data["id"]
    if not id_number:
        id_number = f"unknown_id_{unknown_id_counter}"
        unknown_id_counter += 1

    output_file_path = os.path.join(output_folder_path, f'{id_number}.json')
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(extracted_data, outfile, ensure_ascii=False, indent=4)

## IMAGE URLS Pipeline

In [None]:
import os
import json

# Source folder with the per-product JSON files and target folder for the
# image-URL extracts
input_folder_path = 'Output_JSONs'
output_folder_path = 'ImageURLs_JSONs'
os.makedirs(output_folder_path, exist_ok=True)

# Sequence number used to name outputs for records that carry no "id"
unknown_id_counter = 1

for file_name in os.listdir(input_folder_path):
    # Anything that is not a JSON file is skipped
    if not file_name.endswith('.json'):
        continue

    file_path = os.path.join(input_folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Identifier fields plus the image URL list; a key missing from the
    # record is stored as None
    extracted_data = {
        field: json_data.get(field)
        for field in ("ean", "ean13", "gtins", "upc", "upca", "asins", "id", "imageURLs")
    }

    # The output file is named after the record's "id", falling back to a
    # numbered placeholder when the id is missing or empty
    id_number = extracted_data["id"]
    if not id_number:
        id_number = f"unknown_id_{unknown_id_counter}"
        unknown_id_counter += 1

    output_file_path = os.path.join(output_folder_path, f'{id_number}.json')
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(extracted_data, outfile, ensure_ascii=False, indent=4)

In [None]:
import re
import json
import os

# Merchant patterns: merchant label -> compiled regex that is searched
# anywhere inside an image URL. Order matters because the first matching
# entry wins; 'Others' uses an empty pattern (matches every string) and must
# therefore stay last so it only catches URLs no merchant claimed.
merchant_patterns = {
    'bestbuy': re.compile(r'bbystatic\.com'),
    'lowes': re.compile(r'lowes\.com'),
    'walmart': re.compile(r'walmartimages\.com'),
    'newegg': re.compile(r'neweggimages\.com'),
    'costco': re.compile(r'costco\.com'),
    'kroger': re.compile(r'kroger\.com'),
    'amazon': re.compile(r'amazon\.com'),
    'officedepot': re.compile(r'officedepot\.com'),
    # The alternatives live inside ONE pattern: applying `|` to two str
    # objects raises TypeError. Accepts homedepot.com, homedepotstatic.com
    # and the hyphenated homedepot-static.com spelling.
    'homedepot': re.compile(r'homedepot(?:-?static)?\.com'),
    'target': re.compile(r'target\.scene7\.com'),
    'ebay': re.compile(r'ebayimg\.com'),
    'ajmadison': re.compile(r'ajmadison\.com'),
    'wayfair': re.compile(r'wayfairimages\.com'),
    'overstock': re.compile(r'overstock\.com'),
    'sears': re.compile(r'sears\.com'),
    'kmart': re.compile(r'kmart\.com'),
    'macys': re.compile(r'macysassets\.com'),
    'staples': re.compile(r'staples\.com'),
    'Others': re.compile(r'')
}

def group_image_urls(data, patterns=None):
    """Group the image URLs of one product record by merchant.

    Args:
        data: Product dict. Its ``"imageURLs"`` value is expected to be a
            list of URL strings; a missing key or a ``None``/empty value
            (products without images) yields an empty result instead of an
            error.
        patterns: Optional mapping of merchant label to compiled regex,
            tried in order with the first match winning. Defaults to the
            module-level ``merchant_patterns``.

    Returns:
        Dict mapping merchant label to the list of matching URLs, in input
        order. URLs that match no pattern are collected under ``'Others'``.
        Merchants without any URL are omitted.
    """
    if patterns is None:
        patterns = merchant_patterns

    # The upstream extraction stores None when a product has no images.
    urls = data.get("imageURLs") or []

    grouped_urls = {merchant: [] for merchant in patterns}
    for url in urls:
        for merchant, pattern in patterns.items():
            if pattern.search(url):
                grouped_urls[merchant].append(url)
                break
        else:
            # No pattern claimed the URL; setdefault keeps this safe for
            # pattern tables that do not define an 'Others' entry.
            grouped_urls.setdefault('Others', []).append(url)

    # Remove empty lists
    return {merchant: found for merchant, found in grouped_urls.items() if found}

def process_json_files(input_dir, output_dir):
    """Replace the flat ``imageURLs`` list of every JSON file by a per-merchant grouping.

    Each ``.json`` file in ``input_dir`` is loaded, its ``"imageURLs"`` entry
    is replaced by the result of ``group_image_urls`` and the record is
    written to ``output_dir`` under the same file name. Other files are
    ignored.

    Args:
        input_dir: Folder scanned (non-recursively) for ``.json`` files.
        output_dir: Folder receiving the updated files; created when missing.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Process each file in the input directory
    for filename in os.listdir(input_dir):
        if not filename.endswith('.json'):
            continue

        input_file = os.path.join(input_dir, filename)
        output_file = os.path.join(output_dir, filename)

        # The extraction cell writes these files as UTF-8 with
        # ensure_ascii=False, so they must be decoded as UTF-8 as well
        # instead of with the platform's default encoding.
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Update the JSON data with the grouped URLs
        data["imageURLs"] = group_image_urls(data)

        # json.dump escapes non-ASCII by default, so the bytes written are
        # the same as before; the explicit encoding only documents intent.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4)

# Example usage: group the image URLs of every JSON file in input_dir by
# merchant and write the updated records, under the same names, to output_dir.
input_dir = 'ImageURLs_JSONs'  # Replace with the path to your input directory
output_dir = 'imageurls_jsons_v2'  # Replace with the path to your output directory
process_json_files(input_dir, output_dir)

In [None]:
import os
import json

# Source folder with the per-product JSON files and target folder for the
# review extracts
input_folder_path = 'Output_JSONs'
output_folder_path = 'Extracted_info_reviews'
os.makedirs(output_folder_path, exist_ok=True)

# Sequence number used to name outputs for records that carry no "id"
unknown_id_counter = 1

for file_name in os.listdir(input_folder_path):
    # Anything that is not a JSON file is skipped
    if not file_name.endswith('.json'):
        continue

    file_path = os.path.join(input_folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Identifier fields plus the reviews; a key missing from the record is
    # stored as None
    extracted_data = {
        field: json_data.get(field)
        for field in ("ean", "ean13", "gtins", "upc", "upca", "asins", "id", "reviews")
    }

    # The output file is named after the record's "id", falling back to a
    # numbered placeholder when the id is missing or empty
    id_number = extracted_data["id"]
    if not id_number:
        id_number = f"unknown_id_{unknown_id_counter}"
        unknown_id_counter += 1

    output_file_path = os.path.join(output_folder_path, f'{id_number}.json')
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(extracted_data, outfile, ensure_ascii=False, indent=4)

In [None]:
import os
import subprocess

import sys

# Run the conversion script with the interpreter that backs this kernel, so
# it sees the same installed packages; a bare "python" may be missing from
# PATH or point at a different environment.
completed = subprocess.run([sys.executable, "convert_files_into_json.py"])

# Surface a failing run instead of letting later cells continue silently.
if completed.returncode != 0:
    print(f"convert_files_into_json.py exited with status {completed.returncode}")

## DESCRIPTION PIPELINE

In [None]:
import os
import json
import pandas as pd

# Source folder with the per-product JSON files and target folder for the
# description extracts
input_folder_path = 'Output_JSONs'
output_folder_path = 'Extracted_info_descriptions'
os.makedirs(output_folder_path, exist_ok=True)

# Sequence number used to name outputs for records that carry no "id"
unknown_id_counter = 1

# One row per input file that had to be saved under a placeholder name
unknown_id_files = []

for file_name in os.listdir(input_folder_path):
    # Anything that is not a JSON file is skipped
    if not file_name.endswith('.json'):
        continue

    file_path = os.path.join(input_folder_path, file_name)
    with open(file_path, 'r') as file:
        json_data = json.load(file)

    # Brand, a trimmed copy of every description, then the remaining
    # top-level fields; keys missing from the record are stored as None
    extracted_data = {
        "brand": json_data.get("brand"),
        "descriptions": [
            {field: desc.get(field) for field in ("value", "sourceURLs", "dateSeen")}
            for desc in json_data.get("descriptions", [])
        ],
        **{
            field: json_data.get(field)
            for field in (
                "dimension", "domains", "ean", "ean13", "gtins",
                "upc", "upca", "id", "warranty",
            )
        },
    }

    # The output file is named after the record's "id"; records without one
    # get a numbered placeholder and are remembered for the report below
    id_number = extracted_data["id"]
    if not id_number:
        id_number = f"unknown_id_{unknown_id_counter}"
        unknown_id_files.append({"Input File": file_name, "Output File": f'{id_number}.json'})
        unknown_id_counter += 1

    output_file_path = os.path.join(output_folder_path, f'{id_number}.json')
    with open(output_file_path, 'w') as outfile:
        json.dump(extracted_data, outfile, indent=4)

    print(f'Extracted data saved to {output_file_path}')

# Report which input files ended up under a placeholder name
if not unknown_id_files:
    print('No unknown EAN files were created.')
else:
    df = pd.DataFrame(unknown_id_files)
    df.to_excel('unknown_id_files.xlsx', index=False)
    print('Excel sheet with unknown ID files created: unknown_id_files.xlsx')