In [4]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from typing import List, Dict
import random
import shutil

import sys
sys.path.append('../scripts')  

from jsontocsv import update_csv_from_json

In [2]:
# Load variables from a .env file (python-dotenv) before the client is created.
load_dotenv()

# Client used by the generation cell further down; built with no explicit arguments.
openai = OpenAI()

# Value of OPENAI_API_KEY from the environment, or None when it is not set.
api_key = os.environ.get("OPENAI_API_KEY")

## Version 3 of the prompt

In [3]:
# System message for the chat request: frames the model as a generator of
# Nepali-English code-mixed smart home commands for fine-tuning data.
# NOTE: the indentation at the start of the second line is inside the string
# literal, so it is part of the text passed as the system message.
system = '''You are an assistant trained to generate Nepali-English code-mixed smart home control commands for fine-tuning a language model. 
            Your task is to follow the guidelines and create commands as provided in the examples.'''

In [4]:
# User message for the chat request: the full generation brief, covering the
# language mix, room-name spelling variants, intents and parameters, the
# required JSON shape, four worked examples, and a request for 50 commands.
# Everything between the triple quotes is passed as the user message content.
# NOTE(review): section 3 lists `turn_on` / `turn_off` among the intents, while
# the example JSON structure lists `single_room_control` in the top-level
# `intent` field instead - confirm which vocabulary the dataset should use.
user = '''1. Guideline for Commands:
   - Commands should be **Nepali-English code-mixed**, with some in **full English** and **Nepali-dominant** for diversity:
     - **50% Code-Mixed:** e.g., "bhancha ko batti balnu" or "kitchen ko light on gara".
     - **30% Fully English:** e.g., "Turn on the kitchen lights".
     - **20% Nepali-Dominant:** e.g., "Baithak kothaa ko batti balera rakhnus."
   - Incorporate natural variations in grammar, phrasing, and spelling.
   - Commands may include requests or polite phrasing. For instance:
     - Sutney kotha ko batti off garna milcha?
     - Can you please turn off the bedroom lights?
     - Could you change the light color to blue in bhaancha, please?

2. Room Names:
   - Include the following **room names** with variations in Nepali spellings and transliterations:
     - **Generic Room**: `kotha`, `kothaa`.
     - **Kitchen**: `bhancha`, `bhaanchaa`, `bhanxa`, `bhancha kotha`, `bhaanchaa kotha`, `bhansa`, `bhansaa`.
     - **Living Room**: `baithak`, `baithak kotha`, `baithak kothaa`.
     - **Bedroom**: `sutney kotha`, `kotha`.
     - **Store Room**: `store`.
     - **Corridor**: `baato`, `bato`.
     - **Balcony**: `bardali`, `baranda`.
     - **Terrace**: `kausi`, `chat`, `chhat`.
     - **Study Room**: `padhney kotha`.
     - **Worship/Prayer Room**: `puja kotha`, `pooja kotha`, `mandir`.

3. Intents:
   - Include the following **intents**:
     - `turn_on`, `turn_off`, `change_color`, `adjust_brightness`, and `multi_room_control`.
   - Commands should involve **one or two rooms only** to ensure clarity and consistency. Avoid commands that reference more than two rooms in a single request.
   - Use parameters such as:
     - **Brightness levels**: `low`, `medium`, `high`, or numerical values between 0 to 1 (e.g., `0.2` for dim, `1` for bright).
     - **Colors**: `red`, `blue`, `green`, etc.
     - Include **multi-room commands** only for up to two rooms (e.g., turning on lights in both the `kitchen` and `living room`).

4. JSON Output Requirements:
   - **Consistent Keys:** Always include `command`, `intent`, `rooms`, and `actions`. Use optional keys (`brightness`, `color`) only if relevant.
   - Commands should involve a maximum of **two rooms per JSON output**, with separate actions or intents clearly defined for each room.
   - Translate Nepali transliterations to English room names in the `actions` section.
   - Example JSON Structure:
     {
      "command": "<command>",
      "intent": "<single_room_control | multi_room_control | adjust_brightness | change_color>",
      "rooms": [
          "<list_of_room_names>"
      ],
      "actions": [
          {
              "room": "<room_name>",
              "action": "<turn_on | turn_off | adjust_brightness | change_color>",
              "brightness": "<optional: 0 to 1>",
              "color": "<optional: red | blue | green | etc.>"
          },
          ...
      ]
    }

  Example 1:
  {
        "command": "Sutney kotha ko lights off garera baithak kotha ko lights ujyalo banaidinus.",
        "intent": "multi_room_control",
        "rooms": [
            "bedroom",
            "living room"
        ],
        "actions": [
            {
                "room": "bedroom",
                "action": "turn_off"
            },
            {
                "room": "living room",
                "action": "adjust_brightness",
                "brightness": 1.0
            }
        ]
    }

    Example 2:
    {
        "command": "Living room lights switch on and bhaancha ko lights ali dim garna milcha?",
        "intent": "multi_room_control",
        "rooms": [
            "living room",
            "kitchen"
        ],
        "actions": [
            {
                "room": "living room",
                "action": "turn_on"
            },
            {
                "room": "kitchen",
                "action": "adjust_brightness",
                "brightness": 0.3
            }
        ]
    }

    Example 3: 
    {
        "command": "Puja kotha ko light color yellow garnu, please.",
        "intent": "change_color",
        "rooms": ["prayer room"],
        "actions": [
            {
                "room": "prayer room",
                "action": "change_color",
                "color": "yellow"
            }
        ]
    }

    Example 4:
    {
        "command": "Can you dim the terrace lights?",
        "intent": "single_room_control",
        "rooms": ["terrace"],
        "actions": [
            {
                "room": "terrace", 
                "action": "adjust_brightness", 
                "brightness": 0.3
            }
        ]
    }

5. Generate Variations:
   - Generate 50 command as an array of JSON objects following these guidelines, ensuring diversity and natural phrasing.
'''

In [5]:
def builder(system, user):
    """Assemble the two-message chat payload: system message first, then user message."""
    system_message = {"role": "system", "content": system}
    user_message = {"role": "user", "content": user}
    return [system_message, user_message]

In [6]:
# One call that builds the messages, queries the model and returns its reply
def generate_dataset(system, user):
    """Request a batch of commands from the chat model and return the raw reply.

    Args:
        system: Text used as the system message.
        user: Text used as the user message.

    Returns:
        The content of the first choice exactly as received (it is not parsed
        here), or an empty list when anything in the request raised.
    """
    try:
        completion = openai.chat.completions.create(
            model="gpt-4o",
            messages=builder(system, user),
            temperature=0.8,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:
        # Best-effort: report the problem and hand back a falsy value so the
        # caller's truthiness check takes its failure branch.
        print(f"Error generating dataset: {err!s}")
        return []

In [7]:
# Persist a generated dataset to disk in JSON format
def save_dataset(dataset: List[Dict], filename: str):
    """Write `dataset` to `filename` as UTF-8 JSON with 4-space indentation.

    Non-ASCII characters are written as-is instead of being escaped, and an
    existing file at `filename` is overwritten.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(dataset, out_file, indent=4, ensure_ascii=False)

In [8]:
# Archives a copy of the newly generated JSON file in the ../data/jsonRaw folder.
def tojsonDump(src="light_automation_dataset.json", root_path="../data/jsonRaw"):
    """Copy the latest dataset file into the archive folder under a numbered name.

    The copy is named ``light_automation_dataset_<n>.json``. ``n`` starts at the
    number of entries already in ``root_path`` plus one and is increased until
    the name is unused, so an existing archive file is never overwritten.

    Args:
        src: Path of the file to archive. It is left in place.
        root_path: Archive directory. It is created if it does not exist.

    Returns:
        None. The outcome is reported with ``print``; failures are caught and
        reported instead of being raised.
    """
    try:
        os.makedirs(root_path, exist_ok=True)

        num = len(os.listdir(root_path)) + 1
        fname = f"light_automation_dataset_{num}.json"
        # The entry count alone is not collision-proof (files may have been
        # removed or added by hand), so skip names that are already taken.
        while os.path.exists(os.path.join(root_path, fname)):
            num += 1
            fname = f"light_automation_dataset_{num}.json"

        # Copy straight to the destination so that a failure cannot leave a
        # stray intermediate file in the working directory.
        shutil.copy(src, os.path.join(root_path, fname))
        print("Moved successfully to jsonDump. File: ", fname)
    except Exception as e:
        print("Error moving to jsonDump: ", str(e))

In [11]:
def extract_json_payload(raw):
    """Return the JSON text of a model reply without its Markdown code fence.

    Replies are usually wrapped as "```json ... ```", but the fence may also be
    bare ("```"), be followed by a trailing newline, be preceded or followed by
    a sentence of prose, or be missing altogether. Slicing a fixed number of
    characters off both ends only works for the first of these.

    Args:
        raw: Reply text as returned by the model.

    Returns:
        The text between the first opening fence and the last closing fence,
        or the whole reply when it contains no fence; surrounding whitespace
        is removed in both cases.
    """
    text = raw.strip()
    fence_start = text.find("```")
    if fence_start == -1:
        return text

    # Skip the rest of the opening fence line (it may carry a language tag).
    body = text[fence_start + 3:].partition("\n")[2]
    fence_end = body.rfind("```")
    if fence_end != -1:
        body = body[:fence_end]
    return body.strip()


if __name__ == "__main__":
    dataset = generate_dataset(system, user)

    if dataset:
        try:
            dataset1 = json.loads(extract_json_payload(dataset))
        except json.JSONDecodeError as e:
            # Nothing is saved or archived when the reply is not valid JSON, so
            # a stale light_automation_dataset.json is not pushed downstream.
            print(f"Failed to parse generated dataset as JSON: {e}")
        else:
            # Save the dataset, archive a numbered copy, then update the CSV.
            save_dataset(dataset1, "light_automation_dataset.json")
            print(f"Successfully generated and saved {len(dataset1)} samples to light_automation_dataset.json")
            tojsonDump()
            update_csv_from_json("light_automation_dataset.json", "annotator_data.csv")
    else:
        print("Failed to generate dataset")

Successfully generated and saved 47 samples to light_automation_dataset.json
Moved successfully to jsonDump. File:  light_automation_dataset_141.json
CSV file 'annotator_data.csv' updated successfully! 47 rows added.


In [None]:
# for i in range(5):
#     print("Itr: ", i+1)
#     dataset = generate_dataset(system, user)
    
#     if dataset:
#         #Save the dataset
#         dataset1 = dataset[7:]
#         dataset1 = dataset1[:-3]
#         dataset1 = json.loads(dataset1)
#         save_dataset(dataset1, "light_automation_dataset.json")
#         print(f"Successfully generated and saved {len(dataset1)} samples to light_automation_dataset.json")
#         tojsonDump()
#         update_csv_from_json("light_automation_dataset.json", "annotator_data.csv")
#         print("\n")
#     else:
#         print("Failed to generate dataset")

In [7]:
# import os
# os.listdir('../data/jsonRaw')

['light_automation_dataset_1.json',
 'light_automation_dataset_10.json',
 'light_automation_dataset_100.json',
 'light_automation_dataset_101.json',
 'light_automation_dataset_102.json',
 'light_automation_dataset_103.json',
 'light_automation_dataset_104.json',
 'light_automation_dataset_105.json',
 'light_automation_dataset_106.json',
 'light_automation_dataset_107.json',
 'light_automation_dataset_108.json',
 'light_automation_dataset_109.json',
 'light_automation_dataset_11.json',
 'light_automation_dataset_110.json',
 'light_automation_dataset_111.json',
 'light_automation_dataset_112.json',
 'light_automation_dataset_113.json',
 'light_automation_dataset_114.json',
 'light_automation_dataset_115.json',
 'light_automation_dataset_116.json',
 'light_automation_dataset_117.json',
 'light_automation_dataset_118.json',
 'light_automation_dataset_119.json',
 'light_automation_dataset_12.json',
 'light_automation_dataset_120.json',
 'light_automation_dataset_121.json',
 'light_automatio