In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used in the following
# cells all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Configuration for the down-sampling step.
dataset_path = "/content/drive/MyDrive/correct_dataset_sorted.csv"  # Replace with your actual path
reduced_path = "/content/drive/MyDrive/reduced_dataset.csv"
total_samples = 100_000
random_state = 42


def allocate_proportional(counts, total):
    """Split ``total`` across categories in proportion to ``counts``.

    Uses exact integer arithmetic with the largest-remainder method, so the
    quotas sum to exactly ``total`` instead of losing a few rows to per-category
    truncation.

    Args:
        counts: Series mapping category -> number of available rows.
        total: Desired overall number of rows.

    Returns:
        Integer Series (same index as the non-empty categories of ``counts``)
        with the number of rows to draw per category. Every category gets at
        least 1 row and never more than it has. If ``total`` is at least the
        number of available rows, the full counts are returned. If ``total`` is
        smaller than the number of categories, the one-row minimum wins and the
        sum exceeds ``total``.
    """
    counts = counts[counts > 0].astype("int64")
    available = int(counts.sum())
    total = int(total)
    if total >= available:
        # Cannot draw more rows than exist: keep everything.
        return counts

    scaled = counts * total
    floor_sizes = scaled // available
    sizes = floor_sizes.clip(lower=1)  # ensure at least 1 sample per category

    shortfall = total - int(sizes.sum())
    if shortfall > 0:
        # Hand the rows lost to flooring to the categories with the largest
        # remainders. Categories already bumped from 0 to 1 are excluded, as is
        # anything that is already at capacity.
        eligible = (floor_sizes >= 1) & (sizes < counts)
        remainders = (scaled % available)[eligible]
        top_up = remainders.sort_values(ascending=False, kind="stable").index[:shortfall]
        sizes.loc[top_up] += 1
    return sizes


def stratified_sample(frame, column, sizes, seed):
    """Draw ``sizes[name]`` rows from each ``column`` group of ``frame``.

    Groups are iterated explicitly rather than through ``groupby.apply``, so the
    grouping column is always kept and the group label does not have to be read
    from ``x.name`` inside a lambda.
    """
    parts = [
        group.sample(n=int(sizes.get(name, 0)), random_state=seed)
        for name, group in frame.groupby(column)
    ]
    if not parts:
        return frame.iloc[0:0]
    return pd.concat(parts)


# Load the dataset
df = pd.read_csv(dataset_path)

# Count occurrences of each emotion
emotion_counts = df["tag"].value_counts()

# Per-emotion sample sizes following the original distribution
emotion_sample_sizes = allocate_proportional(emotion_counts, total_samples)

# Sample data while maintaining emotion proportions
df_reduced = stratified_sample(df, "tag", emotion_sample_sizes, random_state)

# Shuffle dataset so rows are not grouped by emotion
df_reduced = df_reduced.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Save the reduced dataset
df_reduced.to_csv(reduced_path, index=False)

# Report the real row count rather than a hard-coded "100K".
print(f"✅ Dataset reduced to {len(df_reduced):,} samples while preserving emotion distribution.")

print(df_reduced["tag"].value_counts())


  df_reduced = df.groupby("tag", group_keys=False).apply(lambda x: x.sample(n=emotion_sample_sizes[x.name], random_state=42))


✅ Dataset reduced to 100K samples while preserving emotion distribution.
tag
joy         27316
sadness     24049
fear        19833
neutral     19728
anger        3796
surprise     3383
disgust      1890
unknown         1
Name: count, dtype: int64


In [None]:
import pandas as pd

# Source (reduced dataset) and destination (tag-ordered copy) on Drive.
file_path = "/content/drive/MyDrive/reduced_dataset.csv"
output_file = "/content/drive/MyDrive/reduced_dataset_sorted.csv"

# Read the CSV, then order the rows by 'tag'; ignore_index=True renumbers the
# rows 0..n-1 in the sorted order.
df = pd.read_csv(file_path)
df_sorted = df.sort_values("tag", ignore_index=True)

# Write the ordered rows back out without the index column.
df_sorted.to_csv(output_file, index=False)

print(f"🎉✅ Dataset sorted by 'tag' column and saved to: {output_file}")

# Quick look at the result: dimensions and the first rows.
print(f"📊 Sorted dataset shape: {df_sorted.shape}")
print(df_sorted.head())


🎉✅ Dataset sorted by 'tag' column and saved to: /content/drive/MyDrive/reduced_dataset_sorted.csv
📊 Sorted dataset shape: (99996, 3)
                                             pattern  \
0  Thank you for reminding me of that, Buddy. Bes...   
1  I'm feeling frustrated and abused. It's like I...   
2  Well, it all started with a breakdown in commu...   
3  The guilt stems from a sense of shame I feel a...   
4  I love my parents, and I don't want to see the...   

                                            response    tag  
0  Insecurity is a common feeling, especially whe...  anger  
1  Hello, Fabozzi. I'm here to listen and support...  anger  
2  The breakdown in communication must have made ...  anger  
3  Cletus, I can sense the pain and self-judgment...  anger  
4  That's a beautiful approach, Adalard. It shows...  anger  


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the tag-sorted dataset produced earlier.
csv_file = "/content/drive/MyDrive/reduced_dataset_sorted.csv"  # Update with your actual path if needed
df = pd.read_csv(csv_file)

# Each tag is split independently: 80% train, and the remaining 20% halved
# into validation and test.
grouped = df.groupby("tag")


def _as_records(tag, patterns, responses):
    """Pair up patterns and responses as row dicts labelled with ``tag``."""
    return [{"tag": tag, "pattern": p, "response": r} for p, r in zip(patterns, responses)]


train_data, validation_data, test_data = [], [], []

for tag, group in grouped:
    patterns = group["pattern"].tolist()
    responses = group["response"].tolist()

    # A tag with a single example cannot be split; it goes to training only.
    if len(patterns) <= 1:
        train_data.append({"tag": tag, "pattern": patterns[0], "response": responses[0]})
        continue

    # First cut: training (80%) versus held-out (20%).
    train_p, held_p, train_r, held_r = train_test_split(
        patterns, responses, train_size=0.8, random_state=42
    )

    if len(held_p) > 1:
        # Second cut: held-out rows halved into validation and test.
        val_p, test_p, val_r, test_r = train_test_split(
            held_p, held_r, train_size=0.5, random_state=42
        )
    else:
        # Only one held-out row: it becomes validation, test gets nothing.
        val_p, val_r = held_p, held_r
        test_p, test_r = [], []

    train_data.extend(_as_records(tag, train_p, train_r))
    validation_data.extend(_as_records(tag, val_p, val_r))
    test_data.extend(_as_records(tag, test_p, test_r))

train_df = pd.DataFrame(train_data)
validation_df = pd.DataFrame(validation_data)
test_df = pd.DataFrame(test_data)

# Write all three splits first, then report their sizes.
splits = (
    (train_df, "/content/drive/MyDrive/train_data.csv", "Train"),
    (validation_df, "/content/drive/MyDrive/validation_data.csv", "Validation"),
    (test_df, "/content/drive/MyDrive/test_data.csv", "Test"),
)

for frame, path, _ in splits:
    frame.to_csv(path, index=False, encoding='utf-8-sig')

for frame, _, label in splits:
    print(f"{label} Data saved: {len(frame)} samples")


Train Data saved: 79994 samples
Validation Data saved: 10000 samples
Test Data saved: 10002 samples


In [None]:
# Preview the first five rows of the reduced dataset; `df_reduced` must already
# exist in the kernel (it is created by the dataset-reduction cell).
print(df_reduced.head(5))

                                             pattern  \
0  I plan to continue practicing self-care, Buddy...   
1  I've started writing consistently and sharing ...   
2  Definitely, Buddy. One surprising thing has be...   
3  That sounds like a challenge, but it's somethi...   
4  I have been trying to budget and cut down on u...   

                                            response   tag  
0  It's wonderful to hear that you have found cop...   joy  
1  It's great to hear that you have taken those i...  fear  
2  That's a profound insight, Ot. Grief indeed ha...   joy  
3  I admire your willingness to embrace this chal...  fear  
4  I hear you. It can be quite disheartening when...  fear  
