In [1]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from typing import List, Dict
import random
import shutil

import sys
sys.path.append('../scripts')  

from jsontocsv import update_csv_from_json

In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Read the API key once and pass it to the client explicitly; when the key is
# None the client falls back to its own OPENAI_API_KEY lookup, exactly as a
# bare OpenAI() call would.
api_key = os.getenv("OPENAI_API_KEY")
openai = OpenAI(api_key=api_key)

In [3]:
# System prompt: frames the model as a generator of Nepali-English code-mixed
# smart home control commands intended for fine-tuning data.
system = '''You are an assistant trained to generate Nepali-English code-mixed smart home control commands for fine-tuning a language model. 
Your task is to follow the updated guidelines and create commands as provided in the examples.'''

In [4]:
# User prompt: spells out the generation guidelines (language mix, room names,
# the multi_room_control intent, the JSON structure, two worked examples) and
# asks for 50 commands per request as an array of JSON objects.
user = '''1. Guideline for Commands:
   - Commands should be **Nepali-English code-mixed**, with some in **full English** and **Nepali-dominant**, but written in **Romanized Nepali** (transliterated Nepali to English):
     - **50% Code-Mixed:** e.g., "study room ko batti balnu" or "baranda ko lights dim gara".
     - **30% Fully English:** e.g., "Turn off the lights in the study room and balcony."
     - **20% Nepali-Dominant (Transliterated):** e.g., "padhney kotha ra baranda ko batti balera rakhnus."
   - **Do not** generate commands in fully native Nepali script (e.g., "बालकोनी" or "स्टडी रूम"). Use Romanized Nepali instead.
   - Incorporate natural variations in grammar, phrasing, and spelling.
   - Commands may include requests or polite phrasing. For instance:
     - Study room ra bardali ko batti off garna milcha?
     - Can you brighten the study area and baranda lights?
   - A single command can include different actions for different rooms. For example:
     - "Study room ko light color red rakhnu ra baranda ko light off gara."
     - "Adjust the brightness of the bardali light to 50% and turn off the study room lights."

2. Room Names:
   - Generate commands for the following rooms:
     - **Study Room**: `padhney kotha`, `study`.
     - **Balcony**: `bardali`, `baranda`.

3. Intent:
   - Generate commands exclusively for **multi_room_control** intent.
   - Each command should involve the two rooms.
   - Example: "Turn off the lights in the study room and balcony."

4. JSON Output Requirements:
   - Translate Nepali transliterations to English room names in the `actions` section.
   - Follow this consistent JSON structure:
     {
      "command": "<natural language command>",
      "intent": "multi_room_control",
      "rooms": [
          "<list_of_room_names>"
      ],
      "actions": [
          {
              "room": "<room_name>",
              "action": "<turn_on | turn_off | adjust_brightness | change_color>",
              "brightness": "<optional: 0 to 1>",
              "color": "<optional: red | blue | green | etc.>"
          }
      ]
    }

5. Examples:
   - **Code-Mixed:** 
     ```json
     {
       "command": "Baranda ko light on ra padhney kotha ko batti off garnus.",
       "intent": "multi_room_control",
       "rooms": ["balcony", "study room"],
       "actions": [
           {
               "room": "balcony",
               "action": "turn_on"
           },
           {
               "room": "study room",
               "action": "turn_off"
           }
       ]
     }
     ```
   - **Fully English:**
     ```json
     {
       "command": "Turn off the lights in the study room and change color to red in balcony.",
       "intent": "multi_room_control",
       "rooms": ["study room", "balcony"],
       "actions": [
           {
               "room": "study room",
               "action": "turn_off"
           },
           {
               "room": "balcony",
               "action": "change_color",
               "color": "red"
           }
       ]
     }
     ```

6. Generate Variations:
   - Generate 50 commands as an array of JSON objects following these guidelines, ensuring diversity and natural phrasing.'''


In [5]:
def builder(system, user):
    """Assemble the chat payload: the system message first, then the user message."""
    system_message = {"role": "system", "content": system}
    user_message = {"role": "user", "content": user}
    return [system_message, user_message]

In [6]:
def generate_dataset(system, user, model="gpt-4o", temperature=0.8):
    """Request one batch of generated commands from the chat-completions API.

    Uses the module-level ``openai`` client and the ``builder`` helper to send
    a system + user message pair.

    Args:
        system: System prompt text.
        user: User prompt text.
        model: Chat model name. Defaults to "gpt-4o", the value that used to be
            hard-coded here.
        temperature: Sampling temperature. Defaults to 0.8, the value that used
            to be hard-coded here.

    Returns:
        The text content of the first completion choice, returned as-is (it is
        not parsed as JSON here). If anything raises, the error is printed and
        an empty list is returned, so callers can rely on a truthiness check.
    """
    try:
        response = openai.chat.completions.create(
            model=model,
            messages=builder(system, user),
            temperature=temperature,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Deliberately best-effort: report the failure and hand back a falsy
        # value instead of interrupting the calling loop.
        print(f"Error generating dataset: {str(e)}")
        return []

In [7]:
# Persist a generated dataset to disk as JSON
def save_dataset(dataset: List[Dict], filename: str):
    """Write ``dataset`` to ``filename`` as UTF-8 JSON.

    The output is indented by 4 spaces and non-ASCII characters are written
    unescaped. An existing file at ``filename`` is overwritten.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        out_file.write(json.dumps(dataset, ensure_ascii=False, indent=4))

In [8]:
def tojsonDump(source_file="light_automation_dataset.json",
               root_path="../data/jsonRawAdditionalMulti"):
    """Archive a copy of ``source_file`` in ``root_path`` under a numbered name.

    The copy is named ``light_automation_dataset_<n>.json``, where ``n`` starts
    at (number of entries already in ``root_path``) + 1 and is increased until
    the name is unused, so an existing archive file is never overwritten.

    Args:
        source_file: File to archive. Defaults to the file name used by the
            generation loop.
        root_path: Destination directory; created if it does not exist.

    Returns:
        None. Success or failure is reported on stdout; errors are caught and
        printed rather than raised.
    """
    try:
        # Ensure the directory exists
        os.makedirs(root_path, exist_ok=True)

        # A single code path for empty and non-empty directories: an empty
        # directory gives index 1. The previous empty-directory branch never
        # assigned `fname`, so its success message raised and was reported as
        # a failure even though the file had already been moved.
        num = len(os.listdir(root_path)) + 1
        fname = f"light_automation_dataset_{num}.json"
        # The entry count can lag behind the highest index in use (e.g. after
        # a file was deleted), so step forward past names that are taken.
        while os.path.exists(os.path.join(root_path, fname)):
            num += 1
            fname = f"light_automation_dataset_{num}.json"

        # Copy straight to the destination; no intermediate file is left in
        # the working directory if something goes wrong.
        shutil.copy(source_file, os.path.join(root_path, fname))
        print("Moved successfully to jsonDump. File: ", fname)
    except Exception as e:
        print(f"Error moving to {root_path}", str(e))

In [10]:
# if __name__ == "__main__":
#     dataset = generate_dataset(system, user)
    
#     if dataset:
#         #Save the dataset
#         dataset1 = dataset[7:]
#         dataset1 = dataset1[:-3]
#         dataset1 = json.loads(dataset1)
#         save_dataset(dataset1, "light_automation_dataset.json")
#         print(f"Successfully generated and saved {len(dataset1)} samples to light_automation_dataset.json")
#         tojsonDump()
#         # update_csv_from_json("light_automation_dataset.json", "annotator_data.csv")
#     else:
#         print("Failed to generate dataset")

In [11]:
N_ITERATIONS = 17  # number of generation requests issued by this cell
OUTPUT_FILE = "light_automation_dataset.json"  # latest batch; overwritten each iteration


def _extract_json_text(raw):
    """Return the JSON payload of a model reply, without any Markdown code fence.

    Handles a reply wrapped in a ``` / ```json fence (with or without text
    around it) as well as a reply that is bare JSON.
    """
    fence_start = raw.find("```")
    if fence_start == -1:
        return raw.strip()
    # Skip the rest of the opening fence line (e.g. the "json" language tag).
    line_end = raw.find("\n", fence_start)
    if line_end == -1:
        return raw.strip()
    fence_end = raw.find("```", line_end)
    if fence_end == -1:
        fence_end = len(raw)
    return raw[line_end + 1:fence_end].strip()


for i in range(N_ITERATIONS):
    print("Itr: ", i+1)
    dataset = generate_dataset(system, user)

    if not dataset:
        print("Failed to generate dataset")
        continue

    # The reply is free text, so one malformed batch must not abort the
    # remaining iterations: report it and move on.
    try:
        dataset1 = json.loads(_extract_json_text(dataset))
    except json.JSONDecodeError as e:
        print(f"Failed to parse generated dataset as JSON: {e}")
        continue

    # Save the dataset, then archive a numbered copy of it
    save_dataset(dataset1, OUTPUT_FILE)
    print(f"Successfully generated and saved {len(dataset1)} samples to {OUTPUT_FILE}")
    tojsonDump()
    # update_csv_from_json("light_automation_dataset.json", "annotator_data.csv")

Itr:  1
Successfully generated and saved 45 samples to light_automation_dataset.json
Moved successfully to jsonDump. File:  light_automation_dataset_155.json
Itr:  2
Successfully generated and saved 49 samples to light_automation_dataset.json
Moved successfully to jsonDump. File:  light_automation_dataset_156.json
Itr:  3
Successfully generated and saved 45 samples to light_automation_dataset.json
Moved successfully to jsonDump. File:  light_automation_dataset_157.json
Itr:  4
Successfully generated and saved 46 samples to light_automation_dataset.json
Moved successfully to jsonDump. File:  light_automation_dataset_158.json
Itr:  5
Successfully generated and saved 45 samples to light_automation_dataset.json
Moved successfully to jsonDump. File:  light_automation_dataset_159.json
Itr:  6
Successfully generated and saved 49 samples to light_automation_dataset.json
Moved successfully to jsonDump. File:  light_automation_dataset_160.json
Itr:  7
Successfully generated and saved 47 samples 