In [12]:
# Import Libraries
import json
import os
import requests
import ollama



In [18]:
# Define the folder paths
input_folder_path = 'Output_JSONs'
output_folder_path = 'Extracted_features'
os.makedirs(output_folder_path, exist_ok=True)


def _ean_file_stem(ean):
    """Pick the output file name (without extension) for a product.

    Args:
        ean: Raw "ean" value from the product JSON. It may be a list of
            codes, a single code, or missing/empty.

    Returns:
        The first EAN code as a string, or "unknown_ean" when no usable
        code is available.
    """
    # A bare string is the code itself; indexing it would keep only its
    # first character, so only sequences are indexed.
    first = ean[0] if isinstance(ean, (list, tuple)) and ean else ean
    return str(first) if first else "unknown_ean"


def _extract_fields(json_data):
    """Build the reduced product record that is written to the output folder.

    Args:
        json_data: Parsed product JSON object (a dict).

    Returns:
        A dict with the keys "brand", "category", "features", "ean",
        "ean13" and "taxonomy". "category" is read from the input's
        "categories" key; each feature is reduced to its "key"/"value" pair.
    """
    # `.get("features", [])` still returns None when the key is present but
    # null, so fall back explicitly before iterating.
    raw_features = json_data.get("features") or []
    return {
        "brand": json_data.get("brand"),
        "category": json_data.get("categories"),
        "features": [
            {"key": feature.get("key"), "value": feature.get("value")}
            for feature in raw_features
            if isinstance(feature, dict)  # ignore malformed entries
        ],
        "ean": json_data.get("ean"),
        "ean13": json_data.get("ean13"),
        "taxonomy": json_data.get("taxonomy"),
    }


# Iterate through all JSON files in the input folder.
# Sorted because os.listdir returns entries in arbitrary order; this makes
# runs reproducible and fixes which input wins when several products map to
# the same output name (e.g. several files without an EAN all end up in
# unknown_ean.json, the last one overwriting the others).
for file_name in sorted(os.listdir(input_folder_path)):
    if not file_name.endswith('.json'):  # Process only JSON files
        continue

    file_path = os.path.join(input_folder_path, file_name)

    # Load the JSON object from the file (explicit encoding so the result
    # does not depend on the platform's default locale)
    with open(file_path, 'r', encoding='utf-8') as source_file:
        json_data = json.load(source_file)

    # Extract the required fields
    extracted_data = _extract_fields(json_data)

    # Save the extracted data to a new JSON file named with the EAN number
    ean_number = _ean_file_stem(extracted_data["ean"])
    output_file_path = os.path.join(output_folder_path, f'{ean_number}.json')
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(extracted_data, outfile, indent=2)

    print(f'Extracted data saved to {output_file_path}')

Extracted data saved to Extracted_features/0719192625952.json
Extracted data saved to Extracted_features/0719192630840.json
Extracted data saved to Extracted_features/0719192630833.json
Extracted data saved to Extracted_features/0719192630949.json
Extracted data saved to Extracted_features/0719192631038.json
Extracted data saved to Extracted_features/unknown_ean.json
Extracted data saved to Extracted_features/0027242921955.json
Extracted data saved to Extracted_features/0719192633056.json
Extracted data saved to Extracted_features/0719192631014.json
Extracted data saved to Extracted_features/0811635020918.json


In [22]:
import os
import json
import ollama

# Folder locations for this stage: the source folder that is read and the
# destination folder that is written.
input_folder_path, output_folder_path = 'Extracted_features', 'key_features'

# Make sure the destination exists; an already-present folder is not an error.
os.makedirs(output_folder_path, exist_ok=True)

# Function to generate the top 10 feature keys using the Mistral model from Ollama
def generate_top_feature_keys(taxonomy):
    """Ask the Mistral model (via Ollama) for the top 10 feature keys of a taxonomy.

    Args:
        taxonomy: Taxonomy text that is interpolated into the prompt.

    Returns:
        A list of the distinct, non-blank lines of the model's reply, each
        stripped of surrounding whitespace and kept in the order the model
        produced them; or None when the response carries no message content.
    """
    prompt = f"Based on the taxonomy '{taxonomy}', define the top 10 feature keys for a product in this category."

    response = ollama.chat(model='mistral', messages=[
        {'role': 'user', 'content': prompt},
    ])

    if not (response and 'message' in response and 'content' in response['message']):
        return None

    reply_lines = response['message']['content'].strip().split('\n')
    stripped_lines = (line.strip() for line in reply_lines)
    # De-duplicate with dict.fromkeys rather than set(): a set discards the
    # model's ordering and, with string hash randomisation, yields a different
    # order on every run.
    return list(dict.fromkeys(line for line in stripped_lines if line))

# Function to extract feature information using the Mistral model from Ollama
def extract_feature_information(features, top_features):
    """Ask the Mistral model (via Ollama) to fill the top feature keys from product features.

    Args:
        features: Iterable of mappings, each providing a 'key' and a 'value' entry.
        top_features: Iterable of feature-key strings to be filled in.

    Returns:
        The model's reply, stripped and split on newlines into a list of
        strings, or None when the response carries no message content.
    """
    # The prompt is assembled from four parts: intro, one bullet per wanted
    # key, a separator heading, and one "key: value" bullet per known feature.
    intro = "Based on the following top feature keys, find and fill in the information from the given features:\n\n"
    wanted_keys = "".join(f"- {name}\n" for name in top_features)
    separator = "\nFrom the given features:\n\n"
    known_values = "".join(f"- {item['key']}: {item['value']}\n" for item in features)
    prompt = intro + wanted_keys + separator + known_values

    user_message = {'role': 'user', 'content': prompt}
    response = ollama.chat(model='mistral', messages=[user_message])

    # Guard clause: anything without message content counts as "no answer".
    if not (response and 'message' in response and 'content' in response['message']):
        return None

    return response['message']['content'].strip().split('\n')

# Iterate through all JSON files in the input folder.
# For each product: look up (or generate and cache) the top feature keys for
# its taxonomy, ask the model to fill them from the product's features, and
# write the reduced record to the output folder under the same file name.
for file_name in os.listdir(input_folder_path):
    if file_name == "unknown_ean.json":  # Skip files with unknown EAN number
        continue
    if not file_name.endswith('.json'):  # Process only JSON files
        continue

    file_path = os.path.join(input_folder_path, file_name)

    # Load the JSON object from the file (explicit encoding so behaviour does
    # not depend on the platform's default locale)
    with open(file_path, 'r', encoding='utf-8') as source_file:
        json_data = json.load(source_file)

    # Extract the taxonomy: the first entry when a list is stored, the whole
    # value when a plain string is stored (indexing a string would keep only
    # its first character).
    taxonomy = json_data.get("taxonomy") or []
    if not taxonomy:
        continue
    if not isinstance(taxonomy, str):
        taxonomy = taxonomy[0]
    if not isinstance(taxonomy, str) or not taxonomy:
        continue  # nothing usable to build a prompt or a cache file name from

    # Build the cache file name for this taxonomy. Spaces and ">" are mapped
    # to "_" exactly as before; in addition, path separators and other
    # characters that are not allowed in file names on common platforms are
    # mapped to "_" so that a taxonomy such as "Audio/Video" cannot break
    # open(). Names that contained none of these characters are unchanged.
    cache_stem = taxonomy.replace(" ", "_").replace(">", "_")
    for reserved_char in '\\/:*?"<|':
        cache_stem = cache_stem.replace(reserved_char, "_")
    top_features_file = os.path.join(output_folder_path, f'{cache_stem}_top_features.txt')

    # Check if the top features file exists for this taxonomy
    if os.path.exists(top_features_file):
        # Load the top features from the file, ignoring blank lines
        with open(top_features_file, 'r', encoding='utf-8') as cache_file:
            top_features = [line.strip() for line in cache_file if line.strip()]
    else:
        # Generate the top 10 feature keys using Mistral
        top_features = generate_top_feature_keys(taxonomy)

        if top_features:
            # Save the top features to a file. UTF-8 is required here: model
            # output may contain characters the platform default cannot encode.
            with open(top_features_file, 'w', encoding='utf-8') as cache_file:
                for feature_key in top_features:
                    cache_file.write(f'{feature_key}\n')

    if not top_features:
        continue

    # Extract the features; a present-but-null value is treated as "none"
    features = json_data.get("features") or []

    # Use Mistral to fill in the information for the top features
    filled_features = extract_feature_information(features, top_features)
    if not filled_features:
        continue

    # Turn every "key: value" reply line into a feature entry; lines without
    # the ": " separator (headings, blank lines, commentary) are dropped.
    new_features = []
    for reply_line in filled_features:
        if ": " in reply_line:
            key, value = reply_line.split(": ", 1)
            new_features.append({"key": key, "value": value})

    # Create the new JSON object with the top 10 features
    new_json_data = {
        "brand": json_data.get("brand"),
        "category": json_data.get("category"),
        "ean": json_data.get("ean"),
        "ean13": json_data.get("ean13"),
        "features": new_features
    }

    # Save the new JSON file to the output folder
    output_file_path = os.path.join(output_folder_path, file_name)
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(new_json_data, outfile, indent=4)

    print(f'Updated data saved to {output_file_path}')

Updated data saved to key_features/0719192630840.json
Updated data saved to key_features/0719192633056.json
Updated data saved to key_features/0719192630949.json
Updated data saved to key_features/0027242921955.json
Updated data saved to key_features/0719192630833.json
Updated data saved to key_features/0719192631014.json
Updated data saved to key_features/0719192631038.json
Updated data saved to key_features/0811635020918.json
Updated data saved to key_features/0719192625952.json
