In [9]:
from datasets import load_dataset

# Load the "paraloq/json_data_extraction" dataset. The summary displayed by the
# following cell shows a single "train" split.
ds = load_dataset("paraloq/json_data_extraction")

In [10]:
# Display the dataset summary (splits, feature names and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['title', 'topic', 'item_id', 'schema', 'item', 'text', 'medium', '__index_level_0__'],
        num_rows: 484
    })
})

In [8]:
import json
import os
import random

# Export settings (tune these instead of editing the code below).
ITEMS_DIR = "items"   # output folder, relative to the working directory
NUM_SAMPLES = 100     # number of dataset rows to export
SEED = 42             # fixed seed so every run selects the same rows

# Ensure the output directory exists
os.makedirs(ITEMS_DIR, exist_ok=True)

# Extract the "item" and "title" features from the dataset
items = ds["train"]["item"]
titles = ds["train"]["title"]

# Seed the global RNG for reproducibility
random.seed(SEED)

# Select NUM_SAMPLES random rows and their corresponding titles.
# The count is capped at the dataset size because random.sample() raises
# ValueError when asked for more elements than the population holds.
random_indices = random.sample(range(len(items)), min(NUM_SAMPLES, len(items)))
random_items = [items[i] for i in random_indices]
random_titles = [titles[i] for i in random_indices]

# Write each item to its own pretty-printed .json file
for i, (item, title) in enumerate(zip(random_items, random_titles)):
    item_dict = json.loads(item)  # each "item" value is a string containing JSON
    # Replace every non-alphanumeric character so the title is safe in a file name
    sanitized_title = "".join(c if c.isalnum() else "_" for c in title)
    file_number = f"{i+1:03}"  # 1-based running number, zero-padded to three digits
    with open(os.path.join(ITEMS_DIR, f"{file_number}_{sanitized_title}.json"), "w", encoding="utf-8") as f:
        json.dump(item_dict, f, indent=4)

In [1]:
import os
import json
import ollama

# Directory scanned for the source JSON files (relative to the working directory)
json_directory = "items"

# Directory that receives the generated "<name>_instruction.md" files.
# It is the same folder as the JSON inputs. NOTE: the solution-generation cell
# looks for the instruction files inside its own `json_directory`, so keep the
# two folders identical unless that cell is changed as well.
output_directory = "items"

# Prompt template file read by load_prompt(). generate_markdown_from_json()
# replaces the "{{json_content}}" placeholder in its text.
prompt_file_path = "prompt_to_generate_instructions.md"

# Function to load the prompt template from disk
def load_prompt(path=None):
    """Return the full text of the prompt template file.

    Args:
        path: Optional path of the template file. When omitted, the
            module-level ``prompt_file_path`` is used, so existing
            ``load_prompt()`` calls keep working unchanged.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    if path is None:
        path = prompt_file_path
    # Decode explicitly as UTF-8 so the prompt text does not depend on the
    # platform's locale encoding.
    with open(path, "r", encoding="utf-8") as prompt_file:
        return prompt_file.read()

# Function to call the local LLM via Ollama
def generate_markdown_from_json(json_content, prompt_template, model='gemma2:27b'):
    """Ask the LLM to produce Markdown for one JSON document.

    Args:
        json_content: JSON-serialisable object inserted into the prompt.
        prompt_template: Prompt text; every "{{json_content}}" occurrence is
            replaced by the pretty-printed JSON (4-space indent).
        model: Name of the Ollama model to query. Defaults to the model the
            notebook was written for.

    Returns:
        The ``['message']['content']`` field of the chat response
        (presumably the Markdown text; the model's output is not validated).

    Raises:
        RuntimeError: If Ollama reports a response error. The original
            ``ollama.ResponseError`` is attached as ``__cause__``.
    """
    # Insert the JSON content into the prompt
    final_prompt = prompt_template.replace("{{json_content}}", json.dumps(json_content, indent=4))

    # Only the chat call itself is guarded; extracting the content below
    # cannot raise a ResponseError.
    try:
        response = ollama.chat(model=model, messages=[{'role': 'user', 'content': final_prompt}])
    except ollama.ResponseError as e:
        raise RuntimeError(f"Error while generating markdown: {e.error}") from e

    return response['message']['content']

# Load the prompt template once; every file reuses it
prompt_template = load_prompt()

# Walk the JSON files in sorted order. os.listdir() returns names in arbitrary
# order, so sorting makes the processing (and log) order repeatable.
for filename in sorted(os.listdir(json_directory)):
    # Only source items are processed. Generated "<name>_solution.json" files are
    # recognised by their suffix, the same test the solution-generation cell
    # uses. A substring test on "_solution" would also drop source items whose
    # title merely contains that text.
    if not filename.endswith(".json") or filename.endswith("_solution.json"):
        continue

    json_path = os.path.join(json_directory, filename)

    # Name and location of the Markdown file produced for this item
    output_filename = f"{os.path.splitext(filename)[0]}_instruction.md"
    output_path = os.path.join(output_directory, output_filename)

    # Existing outputs are kept, so an interrupted run can be resumed without
    # calling the LLM again for files that are already done.
    if os.path.exists(output_path):
        print(f"Skipping {output_filename} as it already exists.")
        continue

    # Load the JSON content
    with open(json_path, "r", encoding="utf-8") as json_file:
        json_content = json.load(json_file)

    # Generate markdown content using the LLM
    markdown_content = generate_markdown_from_json(json_content, prompt_template)

    # Write the markdown content to the .md file. The text encoding is left at
    # the platform default on purpose: load_instruction() in the solution cell
    # reads these files back with the default encoding as well.
    with open(output_path, "w") as md_file:
        md_file.write(markdown_content)

    print(f"Generated {output_filename} from {filename}")

Skipping 001_TV_Show_Bloopers_Reel_instruction.md as it already exists.
Skipping 002_Customer_Loyalty_Program_instruction.md as it already exists.
Skipping 003_Healthcare_Insurance_Policy_instruction.md as it already exists.
Skipping 004_Software_Patch_Management_instruction.md as it already exists.
Skipping 005_Employee_Performance_Appraisal_instruction.md as it already exists.
Skipping 006_Purchase_Order_instruction.md as it already exists.
Skipping 007_Product_Price_Comparison_Chart_instruction.md as it already exists.
Skipping 008_Discount_Promotion_instruction.md as it already exists.
Skipping 009_Cloud_Service_Subscription_instruction.md as it already exists.
Skipping 010_Shopping_Cart_instruction.md as it already exists.
Skipping 011_Video_Game_Fan_Art_Item_instruction.md as it already exists.
Skipping 012_Machine_Maintenance_History_instruction.md as it already exists.
Skipping 013_Machine_Operator_Training_Manual_instruction.md as it already exists.
Skipping 014_Traveler_s_Ele

In [2]:
import os
import json
import ollama

# Directory holding the source JSON files, their "<name>_instruction.md" files
# and the "<name>_solution.json" files this cell writes
json_directory = "items"

# Prompt template file read by load_prompt(). NOTE: this rebinds the same
# `prompt_file_path` name that the instruction-generation cell sets to a
# different file, so the template that gets loaded depends on which cell ran last.
prompt_file_path = "prompt_to_generate_solution.md"

# Function to load the prompt template from disk
def load_prompt(path=None):
    """Return the full text of the prompt template file.

    Args:
        path: Optional path of the template file. When omitted, the
            module-level ``prompt_file_path`` is used, so existing
            ``load_prompt()`` calls keep working unchanged.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    if path is None:
        path = prompt_file_path
    # Decode explicitly as UTF-8 so the prompt text does not depend on the
    # platform's locale encoding.
    with open(path, "r", encoding="utf-8") as prompt_file:
        return prompt_file.read()

# Read one instruction Markdown file into memory
def load_instruction(instruction_file_path):
    """Return the entire text of the file at ``instruction_file_path``.

    The file is opened in text mode with the platform's default encoding.
    """
    with open(instruction_file_path, mode="r") as source:
        contents = source.read()
    return contents

# Function to call the local LLM via Ollama
def apply_instructions_to_json(json_content, instruction_content, prompt_template, model='gemma2:27b'):
    """Ask the LLM to apply textual instructions to a JSON document.

    Args:
        json_content: JSON-serialisable object; replaces "{{json_content}}"
            in the template, pretty-printed with a 4-space indent.
        instruction_content: Instruction text; replaces
            "{{instruction_content}}" in the template.
        prompt_template: Prompt text containing the two placeholders.
        model: Name of the Ollama model to query. Defaults to the model the
            notebook was written for.

    Returns:
        The dict parsed from the first JSON object found in the model's reply.

    Raises:
        RuntimeError: If Ollama reports a response error (the original
            ``ollama.ResponseError`` is attached as ``__cause__``).
        ValueError: If the reply contains no parseable JSON object. The
            message includes the raw reply for debugging.
    """
    # Insert the JSON content and instruction content into the prompt
    final_prompt = prompt_template.replace("{{json_content}}", json.dumps(json_content, indent=4))
    final_prompt = final_prompt.replace("{{instruction_content}}", instruction_content)

    # Call the LLM using Ollama's chat function
    try:
        response = ollama.chat(model=model, messages=[{'role': 'user', 'content': final_prompt}])
    except ollama.ResponseError as e:
        raise RuntimeError(f"Error while applying instructions: {e.error}") from e
    output_text = response['message']['content']

    # Skip code fences or introductory text in front of the JSON object
    json_start = output_text.find('{')
    if json_start == -1:
        raise ValueError(
            f"Failed to parse LLM output as JSON. Error: no JSON object found\nLLM Output:\n{output_text}"
        )

    # raw_decode() parses one complete JSON value and ignores whatever follows
    # it, so closing fences or remarks after the object (even ones containing
    # braces) no longer break parsing. Whenever the text from the first '{' to
    # the last '}' is valid JSON, this returns exactly that object.
    try:
        modified_json, _ = json.JSONDecoder().raw_decode(output_text[json_start:])
    except json.JSONDecodeError as e:
        raise ValueError(
            f"Failed to parse LLM output as JSON. Error: {e}\nLLM Output:\n{output_text}"
        ) from e
    return modified_json

# Load the prompt template once; every file reuses it
prompt_template = load_prompt()

# Walk the JSON files in sorted order. os.listdir() returns names in arbitrary
# order, so sorting makes the processing (and log) order repeatable.
for filename in sorted(os.listdir(json_directory)):
    # Only source items are processed; generated "<name>_solution.json" files are skipped
    if not filename.endswith(".json") or filename.endswith("_solution.json"):
        continue

    # Errors are reported per file so that one bad LLM reply does not abort
    # the whole batch; the loop simply moves on to the next file.
    try:
        json_path = os.path.join(json_directory, filename)
        base_name = os.path.splitext(filename)[0]
        instruction_filename = f"{base_name}_instruction.md"
        instruction_path = os.path.join(json_directory, instruction_filename)

        solution_filename = f"{base_name}_solution.json"
        solution_path = os.path.join(json_directory, solution_filename)

        # Existing solutions are kept, so an interrupted run can be resumed
        if os.path.exists(solution_path):
            print(f"Solution file {solution_filename} already exists for {filename}. Skipping.")
            continue

        # Nothing to apply when no instruction file was generated for this item
        if not os.path.exists(instruction_path):
            print(f"Instruction file {instruction_filename} not found for {filename}. Skipping.")
            continue

        # Load the JSON content
        with open(json_path, "r", encoding="utf-8") as json_file:
            json_content = json.load(json_file)

        # Load the instruction content
        instruction_content = load_instruction(instruction_path)

        # Generate the modified JSON using the LLM
        modified_json = apply_instructions_to_json(json_content, instruction_content, prompt_template)

        # Write the modified JSON next to its source file
        with open(solution_path, "w", encoding="utf-8") as json_file:
            json.dump(modified_json, json_file, indent=4)

        print(f"Generated {solution_filename} by applying instructions to {filename}")
    except Exception as e:
        print(f"An error occurred while processing {filename}: {e}")



Solution file 001_TV_Show_Bloopers_Reel_solution.json already exists for 001_TV_Show_Bloopers_Reel.json. Skipping.
Solution file 002_Customer_Loyalty_Program_solution.json already exists for 002_Customer_Loyalty_Program.json. Skipping.
Solution file 003_Healthcare_Insurance_Policy_solution.json already exists for 003_Healthcare_Insurance_Policy.json. Skipping.
Solution file 004_Software_Patch_Management_solution.json already exists for 004_Software_Patch_Management.json. Skipping.
Solution file 005_Employee_Performance_Appraisal_solution.json already exists for 005_Employee_Performance_Appraisal.json. Skipping.
Solution file 006_Purchase_Order_solution.json already exists for 006_Purchase_Order.json. Skipping.
Solution file 007_Product_Price_Comparison_Chart_solution.json already exists for 007_Product_Price_Comparison_Chart.json. Skipping.
Solution file 008_Discount_Promotion_solution.json already exists for 008_Discount_Promotion.json. Skipping.
Solution file 009_Cloud_Service_Subscr