In [1]:
# Import Libraries
import os
import json
from urllib.parse import urlparse
from collections import defaultdict
from datetime import datetime

In [2]:
#Extract all keys from the JSON object

# Load the JSON object from the uploaded file
file_path = 'Output_JSONs/json_object_1.json'
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Function to recursively extract all keys from the JSON object
def extract_keys(obj, keys=set()):
    if isinstance(obj, dict):
        for key, value in obj.items():
            keys.add(key)
            extract_keys(value, keys)
    elif isinstance(obj, list):
        for item in obj:
            extract_keys(item, keys)
    return keys

# Extract all keys from the JSON data
all_keys = extract_keys(json_data)

# Print all unique keys
print("All keys in the JSON file:")
for key in all_keys:
    print(key)

All keys in the JSON file:
rating
term
warranty
sizes
quantities
upc
ean
taxonomyLevel2
colors
country
address
numHelpful
mostRecentPriceAvailability
mostRecentPriceCurrency
amountMax
id
monthlyPaymentMax
didPurchase
taxonomy
mostRecentPriceSize
merchants
lastDateSeen
value
gtins
province
amountMin
imageURLs
brand
domains
dateAdded
username
availability
sourceURLs
doRecommend
phone
isSale
reviews
shipping
monthlyPaymentMin
city
size
text
weight
skus
name
financingAndLeasing
firstDateSeen
key
merchant
mostRecentPriceFirstDateSeen
ean13
offer
upca
title
keys
descriptions
replace
features
asins
prices
postalCode
mostRecentPriceDomain
date
dateUpdated
apiURLs
categories
color
manufacturer
currency
taxonomyLevel4
mostRecentPriceAmount
taxonomyLevel1
taxonomyLevel3
primaryImageURLs
returnPolicy
dateSeen
condition
dimension
websiteIDs


In [3]:
# Define the folder paths
input_folder_path = 'Output_JSONs'
output_folder_path = 'Extracted_info_prices'
os.makedirs(output_folder_path, exist_ok=True)

# Iterate through all JSON files in the input folder
for file_name in os.listdir(input_folder_path):
    if file_name.endswith('.json'):  # Process only JSON files
        file_path = os.path.join(input_folder_path, file_name)

        # Load the JSON object from the file
        with open(file_path, 'r') as file:
            json_data = json.load(file)


        # Extract the required fields
        extracted_data = {
            "ean": json_data.get("ean"),
            "prices": json_data.get("prices")
            }


        # Save the extracted data to a new JSON file named with the EAN number
        ean_number = extracted_data["ean"][0] if extracted_data["ean"] else "unknown_ean"
        output_file_path = os.path.join(output_folder_path, f'{ean_number}.json')
        with open(output_file_path, 'w') as outfile:
            json.dump(extracted_data, outfile)

        print(f'Extracted data saved to {output_file_path}')

Extracted data saved to Extracted_info_prices/0719192625952.json
Extracted data saved to Extracted_info_prices/0719192630840.json
Extracted data saved to Extracted_info_prices/0719192630833.json
Extracted data saved to Extracted_info_prices/0719192630949.json
Extracted data saved to Extracted_info_prices/0719192631038.json
Extracted data saved to Extracted_info_prices/unknown_ean.json
Extracted data saved to Extracted_info_prices/0027242921955.json
Extracted data saved to Extracted_info_prices/0719192633056.json
Extracted data saved to Extracted_info_prices/0719192631014.json
Extracted data saved to Extracted_info_prices/0811635020918.json


In [4]:
# Define the folder paths
input_folder_path = 'Extracted_info_prices'
output_folder_path = 'Pricing_history'
os.makedirs(output_folder_path, exist_ok=True)

# Helper function to extract the base domain from a URL
def get_base_domain(url):
    try:
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        return domain
    except Exception:
        return None

# Helper function to parse ISO format with different variations
def parse_iso_date(date_str):
    for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S"):
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            pass
    raise ValueError(f"Unknown date format: {date_str}")

# Iterate through all JSON files in the input folder
for file_name in os.listdir(input_folder_path):
    if file_name == "unknown_ean.json":  # Skip files with unknown EAN number
        continue
    if file_name.endswith('.json'):  # Process only JSON files
        file_path = os.path.join(input_folder_path, file_name)
        
        # Load the JSON object from the file
        with open(file_path, 'r') as file:
            json_data = json.load(file)

        # Dictionary to hold aggregated data
        merchant_data = defaultdict(lambda: {
            "amountMax": float('-inf'),
            "amountMin": float('inf'),
            "amounts": defaultdict(list),
            "firstDateSeen": None,
            "lastDateSeen": None,
            "sourceURLs": set()
        })

        # Iterate through each price entry and aggregate data
        for price_entry in json_data.get('prices', []):
            merchant = price_entry.get('merchant')
            source_urls = price_entry.get('sourceURLs', [])
            
            if not merchant:
                if len(source_urls) == 1:
                    merchant = get_base_domain(source_urls[0])
                else:
                    continue

            merchant_key = merchant[:6].lower()

            merchant_data[merchant_key]["amountMax"] = max(merchant_data[merchant_key]["amountMax"], price_entry["amountMax"])
            merchant_data[merchant_key]["amountMin"] = min(merchant_data[merchant_key]["amountMin"], price_entry["amountMin"])
            amount = price_entry["amountMax"]
            merchant_data[merchant_key]["amounts"][amount].extend(price_entry["dateSeen"])
            
            #first_date_seen = parse_iso_date(price_entry["firstDateSeen"])
            try:
                first_date_seen = parse_iso_date(price_entry["firstDateSeen"])
            except KeyError:
                try:
                    first_date_seen = parse_iso_date(price_entry["lastDateSeen"])
                except KeyError:
                    first_date_seen = min(parse_iso_date(date) for date in price_entry["dateSeen"])


            #last_date_seen = parse_iso_date(price_entry["lastDateSeen"])
            try:
                last_date_seen = parse_iso_date(price_entry["lastDateSeen"])
            except KeyError:
                try:
                    last_date_seen = parse_iso_date(price_entry["firstDateSeen"])
                except KeyError:
                    last_date_seen = max(parse_iso_date(date) for date in price_entry["dateSeen"])
                  
    
            if not merchant_data[merchant_key]["firstDateSeen"] or first_date_seen < merchant_data[merchant_key]["firstDateSeen"]:
                merchant_data[merchant_key]["firstDateSeen"] = first_date_seen
            if not merchant_data[merchant_key]["lastDateSeen"] or last_date_seen > merchant_data[merchant_key]["lastDateSeen"]:
                merchant_data[merchant_key]["lastDateSeen"] = last_date_seen
            
            merchant_data[merchant_key]["sourceURLs"].update([get_base_domain(url) for url in source_urls if get_base_domain(url)])

        # Prepare the final extracted data
        final_data = {
            "ean": json_data.get("ean"),
            "prices": []
        }

        for merchant_key, data in merchant_data.items():
            price_data = {
                "merchant": merchant_key,
                "amountMax": data["amountMax"],
                "amountMin": data["amountMin"],
                "firstDateSeen": data["firstDateSeen"].isoformat(),
                "lastDateSeen": data["lastDateSeen"].isoformat(),
                "sourceURLs": list(data["sourceURLs"])
            }
            for i, (amount, dates) in enumerate(data["amounts"].items()):
                price_data[f"currency_{i}" if i > 0 else "currency"] = amount
                price_data[f"dateSeen_{i}" if i > 0 else "dateSeen"] = sorted(dates)
            final_data["prices"].append(price_data)

        # Save the extracted data to a new JSON file named with the EAN number
        ean_number = json_data.get("ean", ["unknown_ean"])[0]

         # Skip files with unknown EAN number
        if ean_number == "unknown_ean":
            continue

        # Save the updated JSON file to the output folder
        output_file_path = os.path.join(output_folder_path, f'{ean_number}.json')
        with open(output_file_path, 'w') as outfile:
            json.dump(final_data, outfile, indent=2)

        print(f'Extracted data saved to {output_file_path}')

Extracted data saved to Pricing_history/0719192630840.json
Extracted data saved to Pricing_history/0719192633056.json
Extracted data saved to Pricing_history/0719192630949.json
Extracted data saved to Pricing_history/0027242921955.json
Extracted data saved to Pricing_history/0719192630833.json
Extracted data saved to Pricing_history/0719192631014.json
Extracted data saved to Pricing_history/0719192631038.json
Extracted data saved to Pricing_history/0811635020918.json
Extracted data saved to Pricing_history/0719192625952.json
