In [25]:
import os
import json
import google.generativeai as genai
from dotenv import load_dotenv
from tqdm import tqdm  # Progress bar
import time


In [26]:
# Read GOOGLE_API_KEY from the environment. load_dotenv() runs first so that a
# local .env file can supply the variable.
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")

# Fail fast with an actionable message instead of letting an unset (None) or
# empty key reach the Gemini client and surface as an error in a later cell.
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

In [27]:
# Initialize the Gemini client with the API key loaded earlier in the notebook.
# The model id is kept in one named constant so it is easy to find and change.
GEMINI_MODEL_NAME = "gemini-1.5-flash"  # Gemini 1.5 Flash (not the Pro model)
genai.configure(api_key=google_api_key)
model = genai.GenerativeModel(GEMINI_MODEL_NAME)

In [28]:
# Paths used by the generation step: the JSON file that is read, and the
# JSONL file that the generated examples are written to.
input_file: str = "../../datasets/qa_pairs.json"
output_file: str = "./my_dataset1.jsonl"

In [29]:
# System prompt stored as the "system" turn of each generated training example.
# The sentences are listed one per item and joined with single spaces.
system_message = " ".join(
    (
        "You are an expert business consultant specializing in Washington state business laws and benefits.",
        "Help users choose the best business structure: LLC, Non-Profit, or S-Corp.",
        "Ask clarifying questions to understand their goals, liability concerns, tax preferences, and operational structure.",
        "Provide recommendations with clear reasoning, considering Washington-specific tax laws, registration requirements, and business incentives.",
        "Avoid suggesting General Partnerships, Sole Proprietorships, or C-Corps.",
    )
)


In [30]:
# Pause applied after every Gemini request.
# NOTE(review): presumably chosen to stay under the API's per-minute rate
# limit — confirm against the quota of the key in use.
REQUEST_INTERVAL_SECONDS = 4.2


def _response_text(response):
    """Return the generated text of a Gemini response, or None if it has none.

    `response.text` raises ValueError when the response carries no usable text
    part (for example, when the prompt or the candidate was blocked), so a
    plain truthiness check on the attribute cannot detect that case.
    """
    try:
        text = response.text
    except ValueError:
        return None
    return text or None


# Load the whole input file; json.load reads it as a single JSON document
# (the loop below iterates over it, so it is expected to be an array).
with open(input_file, "r", encoding="utf-8") as f:
    entries = json.load(f)

skipped_count = 0

# NOTE(review): only the user instruction is sent to Gemini; `system_message`
# is stored in the saved example but is not part of the generation request.
with open(output_file, "w", encoding="utf-8") as fout:
    for entry in tqdm(entries, desc="Enhancing Responses", unit="pair"):
        user_message = entry["instruction"]

        # Generate a response for this instruction using Gemini
        response = model.generate_content([
            {"role": "user", "parts": [{"text": user_message}]}
        ])
        # Throttle after every request, including ones that end up skipped
        time.sleep(REQUEST_INTERVAL_SECONDS)

        enhanced_response = _response_text(response)
        if enhanced_response is None:
            # Never write a placeholder error string as an assistant turn:
            # it would end up in the fine-tuning data as a "correct" answer.
            skipped_count += 1
            tqdm.write(
                f"Skipping entry with no generated text: {str(user_message)[:80]!r}"
            )
            continue

        # Chat-style record (system / user / assistant) for Mistral fine-tuning
        mistral_entry = {
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message},
                {"role": "assistant", "content": enhanced_response}
            ]
        }

        # One JSON object per line (JSONL); flush so that completed examples
        # are on disk even if a later request fails.
        fout.write(json.dumps(mistral_entry) + "\n")
        fout.flush()

print(f"\n✅ Fine-tuning dataset saved to {output_file}")
if skipped_count:
    print(f"⚠️ Skipped {skipped_count} of {len(entries)} entries with no generated text")

Enhancing Responses: 100%|██████████| 620/620 [1:10:40<00:00,  6.84s/pair]


✅ Fine-tuning dataset saved to ./my_dataset1.jsonl





In [None]:

# Input and output file names for the JSONL -> JSON conversion.
# NOTE: this rebinds `input_file` / `output_file`, which the dataset-generation
# cell earlier in the notebook also uses; re-run that cell's path definitions
# before generating again.
input_file = "./business_qlora.jsonl"
output_file = "./my_dataset.json"

data = []
# NOTE(review): `system_string` is defined here but is not added to the
# converted messages below — confirm whether a system turn was intended.
system_string = "You are an expert business consultant helping users choose the best business structure: LLC, Non-Profit, or S-Corp. Ask clarifying questions if needed, analyze their needs, and recommend the best option with clear reasoning. Avoid suggesting General Partnerships, Sole Proprietorships, or C-Corps."

# Read the JSONL file line by line and convert each record to the
# {"messages": [user, assistant]} chat format.
with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line is not valid JSON and would make json.loads raise
            continue
        entry = json.loads(line)
        formatted_entry = {
            "messages": [
                {"role": "user", "content": entry["instruction"]},
                {"role": "assistant", "content": entry["response"]}
            ]
        }
        data.append(formatted_entry)

# Write all converted records as a single indented JSON array
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)

print(f"Converted JSONL to JSON and saved to {output_file}")


Converted JSONL to JSON and saved to ./my_dataset.json
