Prepare the dataset: copy the labeled dataset as a backup, set aside 10 examples where label = 1 to pass as few-shot examples, and use the rest of the dataset as the test split, saved both with labels (for later validation) and unlabeled (for inference).

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Step 1: Read the labeled dataset from the Excel workbook into a DataFrame
source_path = 'manual_label_batch1.xlsx'
print("Loading labeled dataset...")
df = pd.read_excel(source_path)

# Quick sanity check: frame dimensions and column names
print(f"Original dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Number of rows carrying each distinct value of the 'label' column
label_counts = df['label'].value_counts()
print(f"\nLabel distribution:")
print(label_counts)

Loading labeled dataset...
Original dataset shape: (177, 4)
Columns: ['id', 'text', 'similarity_score', 'label']

Label distribution:
label
0      107
1       38
2        8
6        6
3        3
9        3
7        3
5        2
7,8      2
4        2
4,8      1
2,9      1
1,4      1
Name: count, dtype: int64


In [2]:
# Step 2: Duplicate the labeled frame and persist the duplicate as a CSV backup
backup_path = 'labeled_data_backup.csv'
print("\nSaving full labeled dataset as backup...")
df_backup = df.copy()
# index=False leaves the DataFrame's row index out of the written file
df_backup.to_csv(backup_path, index=False)


Saving full labeled dataset as backup...


In [3]:
# Step 3: Extract examples where label = 1 for few-shot learning
N_FEWSHOT = 10     # number of label = 1 rows reserved for the few-shot prompt
FEWSHOT_SEED = 42  # fixed seed so the same rows are drawn on every run

print(f"\nExtracting {N_FEWSHOT} examples where label = 1 for few-shot prompts")

# Only rows whose label compares equal to 1 are eligible. Multi-label values
# (e.g. the "1,4" entry in the label distribution) do not match and therefore
# remain in the test split.
positive_rows = df[df['label'] == 1]

# Fail early with a descriptive message instead of the generic error that
# DataFrame.sample raises when asked for more rows than are available.
if len(positive_rows) < N_FEWSHOT:
    raise ValueError(
        f"Need {N_FEWSHOT} rows with label = 1 for few-shot examples, "
        f"but only {len(positive_rows)} are available"
    )

burnout_examples = positive_rows.sample(n=N_FEWSHOT, random_state=FEWSHOT_SEED)
# Independent copy so later changes to one frame cannot affect the other
fewshot_examples = burnout_examples.copy()


Extracting 10 examples where label = 1 for few-shot prompts


In [4]:
# Write the few-shot rows to CSV with their labels kept, then report the frame's shape
fewshot_path = 'fewshot_examples.csv'
fewshot_examples.to_csv(fewshot_path, index=False)
print(f"Few-shot examples saved: {fewshot_examples.shape}")

Few-shot examples saved: (10, 4)


In [5]:
# Step 4: Build the test set from every row that was not sampled as a few-shot example
# Row labels (index values) of the sampled few-shot rows
fewshot_indices = burnout_examples.index

# Dropping those row labels from the full frame leaves the test set
test_data = df.drop(index=fewshot_indices)

# Per-label row counts of the remaining data
test_label_counts = test_data['label'].value_counts()
print(f"Test set shape: {test_data.shape}")
print(f"Test set label distribution:")
print(test_label_counts)

Test set shape: (167, 4)
Test set label distribution:
label
0      107
1       28
2        8
6        6
3        3
9        3
7        3
5        2
7,8      2
4        2
4,8      1
2,9      1
1,4      1
Name: count, dtype: int64


In [6]:
# Step 5: Write the test set to CSV with the label column kept (for validation later)
labeled_test_path = 'test_data_with_labels.csv'
test_data.to_csv(labeled_test_path, index=False)
print("\nTest data with labels saved: test_data_with_labels.csv")



Test data with labels saved: test_data_with_labels.csv


In [7]:
# Step 6: Remove the label column and write the result to CSV (for actual inference)
test_data_unlabeled = test_data.drop('label', axis='columns')
unlabeled_test_path = 'test_data_unlabeled.csv'
test_data_unlabeled.to_csv(unlabeled_test_path, index=False)
print("Test data without labels saved: test_data_unlabeled.csv")


Test data without labels saved: test_data_unlabeled.csv


In [8]:
# Recap: row counts for each split, followed by the list of files written above
divider = "="*60
print("SUMMARY")
print(divider)
print(f"Original dataset: {df.shape[0]} rows")
print(f"Few-shot examples (label=1): {fewshot_examples.shape[0]} rows")
print(f"Test set: {test_data.shape[0]} rows")
# Only labels 0 and 1 are broken out here; other label values are not itemized
print(f"  - Label 0: {(test_data['label'] == 0).sum()} rows")
print(f"  - Label 1: {(test_data['label'] == 1).sum()} rows")
print("\nFiles created:")
for file_note in (
    "  1. labeled_data_backup.csv - Full dataset with labels (backup)",
    "  2. fewshot_examples.csv - 10 examples where label=1 (for prompts)",
    "  3. test_data_with_labels.csv - Test set WITH labels (for validation)",
    "  4. test_data_unlabeled.csv - Test set WITHOUT labels (for inference)",
):
    print(file_note)
print(divider)

SUMMARY
Original dataset: 177 rows
Few-shot examples (label=1): 10 rows
Test set: 167 rows
  - Label 0: 107 rows
  - Label 1: 28 rows

Files created:
  1. labeled_data_backup.csv - Full dataset with labels (backup)
  2. fewshot_examples.csv - 10 examples where label=1 (for prompts)
  3. test_data_with_labels.csv - Test set WITH labels (for validation)
  4. test_data_unlabeled.csv - Test set WITHOUT labels (for inference)
