In [1]:
# Import Libraries
import json
import os
import requests
import ollama

In [7]:
# Define the folder paths
input_folder_path = 'Raw_data'
output_folder_path = 'Output_JSONs'
os.makedirs(output_folder_path, exist_ok=True)

# Iterate through all files in the input folder
for file_name in os.listdir(input_folder_path):
    if file_name.endswith('.txt'):  # Process only text files
        file_path = os.path.join(input_folder_path, file_name)
        
        # Initialize a list to store JSON objects
        json_objects = []

        # Read and process the file
        with open(file_path, 'r') as file:
            for line in file:
                # Attempt to parse each line as JSON
                try:
                    json_object = json.loads(line)
                    json_objects.append(json_object)
                except json.JSONDecodeError:
                    # Handle or log the error if necessary
                    pass

        # Save each JSON object to a separate file with the file name as a prefix
        base_file_name = os.path.splitext(file_name)[0]
        for i, json_object in enumerate(json_objects):
            output_file_path = os.path.join(output_folder_path, f'json_object_{i+1}.json')
            with open(output_file_path, 'w') as outfile:
                json.dump(json_object, outfile)

        # Output the list of generated file paths
        generated_files = [os.path.join(output_folder_path, f'json_object_{i+1}.json') for i in range(len(json_objects))]
        print(f'Generated files for {file_name}:', generated_files)

Generated files for 299678_1.txt: ['Output_JSONs/json_object_1.json', 'Output_JSONs/json_object_2.json', 'Output_JSONs/json_object_3.json', 'Output_JSONs/json_object_4.json', 'Output_JSONs/json_object_5.json', 'Output_JSONs/json_object_6.json', 'Output_JSONs/json_object_7.json', 'Output_JSONs/json_object_8.json', 'Output_JSONs/json_object_9.json', 'Output_JSONs/json_object_10.json']


In [3]:
#Extract all keys from the JSON object

# Load the JSON object from the uploaded file
file_path = 'Output_JSONs/json_object_1.json'
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Function to recursively extract all keys from the JSON object
def extract_keys(obj, keys=set()):
    if isinstance(obj, dict):
        for key, value in obj.items():
            keys.add(key)
            extract_keys(value, keys)
    elif isinstance(obj, list):
        for item in obj:
            extract_keys(item, keys)
    return keys

# Extract all keys from the JSON data
all_keys = extract_keys(json_data)

# Print all unique keys
print("All keys in the JSON file:")
for key in all_keys:
    print(key)

All keys in the JSON file:
id
apiURLs
title
returnPolicy
date
lastDateSeen
categories
dateSeen
replace
dimension
offer
address
taxonomyLevel3
keys
financingAndLeasing
firstDateSeen
sourceURLs
condition
weight
doRecommend
mostRecentPriceCurrency
websiteIDs
upca
isSale
mostRecentPriceAvailability
warranty
monthlyPaymentMin
currency
features
size
mostRecentPriceAmount
gtins
imageURLs
merchant
merchants
taxonomyLevel2
upc
brand
dateUpdated
manufacturer
key
asins
monthlyPaymentMax
dateAdded
amountMax
color
primaryImageURLs
descriptions
quantities
taxonomyLevel4
term
colors
ean13
shipping
ean
amountMin
rating
country
taxonomy
postalCode
name
mostRecentPriceFirstDateSeen
reviews
didPurchase
prices
sizes
taxonomyLevel1
skus
numHelpful
mostRecentPriceDomain
domains
value
city
phone
province
text
mostRecentPriceSize
username
availability


In [6]:
# Define the folder paths
input_folder_path = 'Output_JSONs'
output_folder_path = 'Extracted_info_descriptions'
os.makedirs(output_folder_path, exist_ok=True)

# Iterate through all JSON files in the input folder
for file_name in os.listdir(input_folder_path):
    if file_name.endswith('.json'):  # Process only JSON files
        file_path = os.path.join(input_folder_path, file_name)

        # Load the JSON object from the file
        with open(file_path, 'r') as file:
            json_data = json.load(file)

        # Extract the required fields
        extracted_data = {
            "brand": json_data.get("brand"),
            "descriptions": [
                {
                    "value": desc.get("value"),
                    "sourceURLs": desc.get("sourceURLs"),
                    "dateSeen": desc.get("dateSeen")
                } for desc in json_data.get("descriptions", [])
            ],
            "dimension": json_data.get("dimension"),
            "domains": json_data.get("domains"),
            "ean": json_data.get("ean"),
            "ean13": json_data.get("ean13"),
            "warranty": json_data.get("warranty")
        }

        # Save the extracted data to a new JSON file named with the EAN number
        ean_number = extracted_data["ean"][0] if extracted_data["ean"] else "unknown_ean"
        output_file_path = os.path.join(output_folder_path, f'{ean_number}.json')
        with open(output_file_path, 'w') as outfile:
            json.dump(extracted_data, outfile,indent=4)

        print(f'Extracted data saved to {output_file_path}')

Extracted data saved to Extracted_info_descriptions/0719192625952.json
Extracted data saved to Extracted_info_descriptions/0719192630840.json
Extracted data saved to Extracted_info_descriptions/0719192630833.json
Extracted data saved to Extracted_info_descriptions/0719192630949.json
Extracted data saved to Extracted_info_descriptions/0719192631038.json
Extracted data saved to Extracted_info_descriptions/unknown_ean.json
Extracted data saved to Extracted_info_descriptions/0027242921955.json
Extracted data saved to Extracted_info_descriptions/0719192633056.json
Extracted data saved to Extracted_info_descriptions/0719192631014.json
Extracted data saved to Extracted_info_descriptions/0811635020918.json


In [5]:
# Define the folder paths
input_folder_path = 'Extracted_info_descriptions'
output_folder_path = 'Updated_Description_JSONs'
os.makedirs(output_folder_path, exist_ok=True)

# Function to generate a combined description using the Mistral model from Ollama
def generate_combined_description(descriptions):
    prompt = "Generate a single combined product description from the following descriptions:\n\n"
    for desc in descriptions:
        prompt += f"- {desc['value']}\n"

    response = ollama.chat(model='mistral', messages=[
        {'role': 'user', 'content': prompt},
    ])

    if response and 'message' in response and 'content' in response['message']:
        combined_description = response['message']['content'].strip()
        return combined_description
    else:
        return None

# Iterate through all JSON files in the input folder
for file_name in os.listdir(input_folder_path):
    if file_name == "unknown_ean.json":  # Skip files with unknown EAN number
        continue
    if file_name.endswith('.json'):  # Process only JSON files
        file_path = os.path.join(input_folder_path, file_name)

        # Load the JSON object from the file
        with open(file_path, 'r') as file:
            json_data = json.load(file)

        # Extract the descriptions
        descriptions = json_data.get("descriptions", [])

        # Generate a new combined description
        combined_description = generate_combined_description(descriptions)
        
        if combined_description:
            # Replace the descriptions with the new combined description
            json_data["descriptions"] = [{"value": combined_description}]

            # Save the extracted data to a new JSON file named with the EAN number
            ean_number = json_data.get("ean")[0]

            # Save the updated JSON file to the output folder
            output_file_path = os.path.join(output_folder_path, f'{ean_number}.json')
            with open(output_file_path, 'w') as outfile:
                json.dump(json_data, outfile, indent=4)

            print(f'Updated data saved to {output_file_path}')

Updated data saved to Updated_Description_JSONs/0719192630840.json
Updated data saved to Updated_Description_JSONs/0719192633056.json
Updated data saved to Updated_Description_JSONs/0719192630949.json
Updated data saved to Updated_Description_JSONs/0027242921955.json
Updated data saved to Updated_Description_JSONs/0719192630833.json
Updated data saved to Updated_Description_JSONs/0719192631014.json
Updated data saved to Updated_Description_JSONs/0719192631038.json
Updated data saved to Updated_Description_JSONs/0811635020918.json
Updated data saved to Updated_Description_JSONs/0719192625952.json
