In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the labeled dataset inside Kaggle's input directory.
# Adjust the path if the dataset is attached under a different name.
csv_file = "/kaggle/input/labeled-dataset/correct_dataset_sorted.csv"

# Read the entire CSV into a DataFrame; the split below works on this frame.
df = pd.read_csv(csv_file)

# Split the dataset tag by tag so that every tag with more than one row is
# represented in the training set and, where it has enough rows, in the
# validation and test sets as well.
# NOTE: with its default settings groupby skips rows whose 'tag' is missing.
grouped = df.groupby("tag")

# Each list collects {"tag", "pattern", "response"} records for one split.
train_data, validation_data, test_data = [], [], []

for tag, group in grouped:
    patterns = group["pattern"].tolist()
    responses = group["response"].tolist()

    # A tag with a single example cannot be split: keep it for training.
    if len(patterns) <= 1:
        train_data.append({"tag": tag, "pattern": patterns[0], "response": responses[0]})
        continue

    # First cut: 80% of the tag's rows go to training, the rest is held out.
    # Patterns and responses are passed together so they stay aligned.
    fit_patterns, held_patterns, fit_responses, held_responses = train_test_split(
        patterns, responses, train_size=0.8, random_state=42
    )

    if len(held_patterns) > 1:
        # Second cut: divide the held-out rows evenly between validation and test.
        val_patterns, eval_patterns, val_responses, eval_responses = train_test_split(
            held_patterns, held_responses, train_size=0.5, random_state=42
        )
    else:
        # Only one held-out row: it goes to validation and test gets nothing.
        val_patterns, val_responses = held_patterns, held_responses
        eval_patterns, eval_responses = [], []

    # Append this tag's records to the matching split, in train/val/test order.
    for bucket, split_patterns, split_responses in (
        (train_data, fit_patterns, fit_responses),
        (validation_data, val_patterns, val_responses),
        (test_data, eval_patterns, eval_responses),
    ):
        bucket.extend(
            {"tag": tag, "pattern": p, "response": r}
            for p, r in zip(split_patterns, split_responses)
        )

# Turn the collected records into DataFrames, write them as CSV files and
# report how many samples each split contains.
#
# The column list is passed explicitly so that a split that received no rows
# still becomes a DataFrame with the expected columns. Without it, an empty
# list yields a DataFrame with no columns at all, and the CSV written from it
# has no header line and cannot be parsed when it is loaded again.
output_columns = ["tag", "pattern", "response"]
train_df = pd.DataFrame(train_data, columns=output_columns)
validation_df = pd.DataFrame(validation_data, columns=output_columns)
test_df = pd.DataFrame(test_data, columns=output_columns)

# Save in Kaggle's output folder (since /kaggle/input is read-only).
# 'utf-8-sig' writes a UTF-8 byte-order mark at the start of each file
# (presumably so spreadsheet tools detect the encoding correctly).
for frame, path in (
    (train_df, "/kaggle/working/train_data.csv"),
    (validation_df, "/kaggle/working/validation_data.csv"),
    (test_df, "/kaggle/working/test_data.csv"),
):
    frame.to_csv(path, index=False, encoding="utf-8-sig")

print(f"Train Data saved: {len(train_df)} samples")
print(f"Validation Data saved: {len(validation_df)} samples")
print(f"Test Data saved: {len(test_df)} samples")


Train Data saved: 645665 samples
Validation Data saved: 80709 samples
Test Data saved: 80711 samples
