In [24]:
import os
import json
import random
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split

In [25]:
# Loading the dataset

root = "../../data"
filename = "restructured_Clean_FINAL.json"

jsonFile = os.path.join(root, filename)

with open(jsonFile, "r", encoding="utf-8") as f:
    data = json.load(f)

In [26]:
# Defining the split ratios
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

In [27]:
# Organizing the dataset by (intent, room, action)
grouped_data = defaultdict(list)

for entry in data:
    # Create a key that represents the entire action sequence for multi-room commands
    room_key = tuple(sorted(entry["rooms"]))  # Sorted tuple to ensure consistency
    action_key = tuple(sorted((action["room"], action["action"]) for action in entry["actions"]))
    key = (entry["intent"], room_key, action_key)

    grouped_data[key].append(entry)

In [54]:
len(grouped_data)

479

In [63]:
## testing
howmany = 0
for i in range(len(grouped_data)-2):
    temp1 = dict(grouped_data)
    key1 = list(temp1.keys())[i]
    if  len(grouped_data[key1]) == 1:
        howmany += 1
        print(key1, ": ", len(grouped_data[key1]))
print("Keys with only one value: ",howmany)

('multi_room_control', ('corridor', 'terrace'), (('corridor', 'turn_on'), ('terrace', 'adjust_brightness'))) :  1
('multi_room_control', ('balcony', 'store room'), (('balcony', 'turn_on'), ('store room', 'adjust_brightness'))) :  1
('multi_room_control', ('bathroom', 'living room'), (('bathroom', 'adjust_brightness'), ('living room', 'turn_on'))) :  1
('multi_room_control', ('balcony', 'balcony'), (('balcony', 'adjust_brightness'), ('balcony', 'turn_on'))) :  1
('multi_room_control', ('balcony', 'corridor'), (('balcony', 'adjust_brightness'), ('corridor', 'turn_off'))) :  1
('multi_room_control', ('corridor', 'study room'), (('corridor', 'turn_on'), ('study room', 'adjust_brightness'))) :  1
('multi_room_control', ('prayer room', 'study room'), (('prayer room', 'turn_off'), ('study room', 'turn_on'))) :  1
('single_room_control', ('kitchen',), (('kitchen', 'adjust_brightness'), ('kitchen', 'change_color'))) :  1
('multi_room_control', ('kitchen', 'prayer room'), (('kitchen', 'turn_off'

In [28]:
# Initializing the split sets
train_set, val_set, test_set = [], [], []

In [30]:
# Perform stratified split while keeping multi-room commands intact
for key, samples in grouped_data.items():
    num_samples = len(samples)

    # Warning for underrepresented classes
    if num_samples < 10:
        print(f"⚠ Warning: Category {key} has only {num_samples} samples. It may not split well.")

    # Perform stratified train-val-test split
    train, temp = train_test_split(samples, test_size=(val_ratio + test_ratio), random_state=42)
    val, test = train_test_split(temp, test_size=(test_ratio / (test_ratio + val_ratio)), random_state=42)

    train_set.extend(train)
    val_set.extend(val)
    test_set.extend(test)



ValueError: With n_samples=1, test_size=0.3 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [31]:
# Function to compute category distribution in the dataset
def compute_distribution(dataset, name):
    counter = Counter((entry["intent"], tuple(sorted(entry["rooms"])), 
                       tuple(sorted((action["room"], action["action"]) for action in entry["actions"]))) 
                      for entry in dataset)
    
    print(f"\n{name} set distribution:")
    for category, count in counter.items():
        print(f"  {category}: {count} samples")

In [32]:
# Display distributions
compute_distribution(train_set, "Train")
compute_distribution(val_set, "Validation")
compute_distribution(test_set, "Test")


Train set distribution:
  ('single_room_control', ('kitchen',), (('kitchen', 'turn_off'),)): 160 samples
  ('single_room_control', ('store room',), (('store room', 'turn_on'),)): 266 samples
  ('multi_room_control', ('kitchen', 'living room'), (('kitchen', 'adjust_brightness'), ('living room', 'adjust_brightness'))): 78 samples
  ('single_room_control', ('balcony',), (('balcony', 'turn_off'),)): 224 samples
  ('single_room_control', ('study room',), (('study room', 'change_color'),)): 230 samples
  ('single_room_control', ('bedroom',), (('bedroom', 'adjust_brightness'),)): 302 samples
  ('single_room_control', ('kitchen',), (('kitchen', 'change_color'),)): 498 samples

Validation set distribution:
  ('single_room_control', ('kitchen',), (('kitchen', 'turn_off'),)): 34 samples
  ('single_room_control', ('store room',), (('store room', 'turn_on'),)): 58 samples
  ('multi_room_control', ('kitchen', 'living room'), (('kitchen', 'adjust_brightness'), ('living room', 'adjust_brightness'))):

In [33]:
# Save the split datasets
with open("train.json", "w", encoding="utf-8") as f:
    json.dump(train_set, f, indent=4)
with open("val.json", "w", encoding="utf-8") as f:
    json.dump(val_set, f, indent=4)
with open("test.json", "w", encoding="utf-8") as f:
    json.dump(test_set, f, indent=4)

print(f"\n✅ Dataset split complete: {len(train_set)} train, {len(val_set)} val, {len(test_set)} test samples")


✅ Dataset split complete: 1758 train, 378 val, 384 test samples
