Prepare the dataset: copy the labeled dataset, then save 10 examples where label = 1 from the labeled data to pass as few-shot examples. The rest of the dataset becomes the test split, which is saved both with labels (for validation) and unlabeled (for inference).

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Step 1: Read the manually labeled spreadsheet and report its shape, columns and label counts
print("Loading labeled dataset...")
df = pd.read_excel('manual_label_batch1.xlsx')

print(
    f"Original dataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    "\nLabel distribution:",
    sep="\n",
)
print(df['label'].value_counts())

Loading labeled dataset...
Original dataset shape: (177, 4)
Columns: ['id', 'text', 'similarity_score', 'label']

Label distribution:
label
0      107
1       38
2        8
6        6
3        3
9        3
7        3
5        2
7,8      2
4        2
4,8      1
2,9      1
1,4      1
Name: count, dtype: int64


In [2]:
# Step 2: Keep an independent copy of the labeled data and persist it as a CSV backup
print("\nSaving full labeled dataset as backup...")
df_backup = df.copy()
df_backup.to_csv(path_or_buf='labeled_data_backup.csv', index=False)


Saving full labeled dataset as backup...


In [3]:
# Step 3: Draw a seeded sample of 10 rows whose label equals 1; these become the few-shot examples
print("\nExtracting 10 examples where label = 1 for few-shot prompts")
burnout_examples = df.loc[df['label'].eq(1)].sample(n=10, random_state=42)
# Separate copy so later edits to the few-shot frame do not touch the sampled rows
fewshot_examples = burnout_examples.copy()


Extracting 10 examples where label = 1 for few-shot prompts


In [4]:
# Write the few-shot examples to disk with their labels kept
fewshot_examples.to_csv(path_or_buf='fewshot_examples.csv', index=False)
print("Few-shot examples saved: {}".format(fewshot_examples.shape))

Few-shot examples saved: (10, 4)


In [5]:
# Step 4: Every row that was not sampled as a few-shot example goes into the test set
# Index labels of the sampled few-shot rows
fewshot_indices = burnout_examples.index

# Dropping those index labels from the full frame leaves the test split
test_data = df.drop(index=fewshot_indices)

print(
    f"Test set shape: {test_data.shape}",
    "Test set label distribution:",
    sep="\n",
)
print(test_data['label'].value_counts())

Test set shape: (167, 4)
Test set label distribution:
label
0      107
1       28
2        8
6        6
3        3
9        3
7        3
5        2
7,8      2
4        2
4,8      1
2,9      1
1,4      1
Name: count, dtype: int64


In [6]:
# Step 5: Persist the test split with its labels kept, so predictions can be validated later
test_data.to_csv(path_or_buf='test_data_with_labels.csv', index=False)
print("\n" + "Test data with labels saved: test_data_with_labels.csv")



Test data with labels saved: test_data_with_labels.csv


In [7]:
# Step 6: Strip the label column and persist the result; this is the file used for inference
test_data_unlabeled = test_data.drop(columns='label')
test_data_unlabeled.to_csv(path_or_buf='test_data_unlabeled.csv', index=False)
print("Test data without labels saved: test_data_unlabeled.csv")


Test data without labels saved: test_data_unlabeled.csv


In [8]:
# Recap of the split sizes and the files written by the cells above.
# Only labels 0 and 1 are broken out here; rows with any other label are not listed.
print("\n".join([
    "SUMMARY",
    "=" * 60,
    f"Original dataset: {df.shape[0]} rows",
    f"Few-shot examples (label=1): {fewshot_examples.shape[0]} rows",
    f"Test set: {test_data.shape[0]} rows",
    f"  - Label 0: {(test_data['label'] == 0).sum()} rows",
    f"  - Label 1: {(test_data['label'] == 1).sum()} rows",
    "\nFiles created:",
    "  1. labeled_data_backup.csv - Full dataset with labels (backup)",
    "  2. fewshot_examples.csv - 10 examples where label=1 (for prompts)",
    "  3. test_data_with_labels.csv - Test set WITH labels (for validation)",
    "  4. test_data_unlabeled.csv - Test set WITHOUT labels (for inference)",
    "=" * 60,
]))

SUMMARY
Original dataset: 177 rows
Few-shot examples (label=1): 10 rows
Test set: 167 rows
  - Label 0: 107 rows
  - Label 1: 28 rows

Files created:
  1. labeled_data_backup.csv - Full dataset with labels (backup)
  2. fewshot_examples.csv - 10 examples where label=1 (for prompts)
  3. test_data_with_labels.csv - Test set WITH labels (for validation)
  4. test_data_unlabeled.csv - Test set WITHOUT labels (for inference)


In [10]:
!pip install anthropic

Collecting anthropic
  Downloading anthropic-0.77.1-py3-none-any.whl.metadata (28 kB)
Downloading anthropic-0.77.1-py3-none-any.whl (397 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/397.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m389.1/397.9 kB[0m [31m12.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m397.9/397.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.77.1


In [12]:
# SECURITY: the API key must never be written into the notebook (neither in code nor in a
# string/comment), because notebooks are shared and committed together with their contents.
# The key that used to be pasted in this cell has to be treated as leaked: revoke it in the
# Anthropic console and issue a new one.
import os
from getpass import getpass

import pandas as pd
import anthropic
import time
from tqdm import tqdm
import json

# Initialize Anthropic client.
# The key is taken from the ANTHROPIC_API_KEY environment variable; when that is not set the
# user is prompted for it (the typed value is not echoed and is not stored in the notebook).
CLAUDE = os.environ.get("ANTHROPIC_API_KEY") or getpass("Anthropic API key: ")
client = anthropic.Anthropic(api_key=CLAUDE)


In [13]:
# Model parameters used by call_claude_api for every request
MODEL_SONNET = "claude-sonnet-4-20250514"  # Claude Sonnet 4
TEMPERATURE = 0  # forwarded as `temperature` to client.messages.create
# Forwarded as `max_tokens` to client.messages.create.
# NOTE(review): the CoT prompts ask for step-by-step reasoning; 100 tokens may cut the reply
# off before the final (A)/(B) label is produced — confirm this budget is sufficient.
MAX_TOKENS = 100

In [14]:
# Zero-shot prompting templates.
#
# ZERO_SHOT_PROMPT is a str.format template with a single `{text}` placeholder; classify_dataset
# fills it with the (truncated) post text. The model is asked to answer with "(A)" or "(B)",
# which are the markers parse_output searches for.
# NOTE(review): some sentences in this template are hard-wrapped mid-sentence, whereas the
# other templates in this notebook keep them on one line — confirm whether that is intended.

ZERO_SHOT_PROMPT = """Title: "Classification of Work-Related Burnout and Stress in Cybersecurity Professionals"

Definition: In this task, we ask you to classify the input text into two options:

(A): Work-related burnout/stress: The poster discussed burnout or work-related stress related to their own mental health in the past or present.
The context of burnout can be related to work, career, job responsibilities, workplace environment, or professional life in cybersecurity.

(B): No work-related burnout/stress: Burnout or stress used in a context unrelated to work or mental health. Or work-related burnout/stress
in hypothetical situations when the poster is not discussing their own experience in the past and present.

Emphasis & Caution: Discussions of hypothetical situations such as fear of burnout or future/imaginary circumstances should NOT be labeled as (A).

Things to avoid: All input must be classified into one of the options. If you cannot pick then choose the option with higher probability. The output
must be either (A) or (B) but not both.

Input: {text}

Output:"""

# Zero-shot chain-of-thought template: a str.format template with a single `{text}` placeholder
# (filled by classify_dataset) followed by the instruction "Let's think about it step by step."
# placed between the input and the "Output:" cue.
# NOTE(review): the reply is capped at MAX_TOKENS tokens and parse_output scans the whole reply
# for "(A)"/"(B)"; step-by-step answers may mention both labels or be cut short — verify.
ZERO_SHOT_COT_PROMPT = """Title: "Classification of Work-Related Burnout and Stress in Cybersecurity Professionals"

Definition: In this task, we ask you to classify the input text into two options:

(A): Work-related burnout/stress: The poster discussed burnout or work-related stress related to their own mental health in the past or present. The context of burnout can be related to work, career, job responsibilities, workplace environment, or professional life in cybersecurity.

(B): No work-related burnout/stress: Burnout or stress used in a context unrelated to work or mental health. Or work-related burnout/stress in hypothetical situations when the poster is not discussing their own experience in the past and present.

Emphasis & Caution: Discussions of hypothetical situations such as fear of burnout or future/imaginary circumstances should NOT be labeled as (A).

Things to avoid: All input must be classified into one of the options. If you cannot pick then choose the option with higher probability. The output must be either (A) or (B) but not both.

Input: {text}

Let's think about it step by step.

Output:"""

In [15]:
#Helper function for few shot

def create_fewshot_examples(fewshot_df, text_column):
    """Format the few-shot rows as "Input: ... / Output: (A)|(B)" blocks.

    Args:
        fewshot_df: DataFrame holding the example texts and a 'label' column.
        text_column: Name of the column in ``fewshot_df`` that contains the text.

    Returns:
        A single string with one block per row, blocks separated by a blank line.
        Rows whose label is numerically equal to 1 get "(A)", all other rows "(B)".
        An empty frame yields an empty string.
    """
    def _is_positive(label):
        # A label column that mixes numbers with entries such as "7,8" comes back from a CSV
        # round-trip as strings, so "1" must count as the positive class just like 1 does.
        # Anything that is not a plain number (multi-labels, missing values) is not positive.
        try:
            return float(label) == 1
        except (TypeError, ValueError):
            return False

    blocks = []
    for text, label in zip(fewshot_df[text_column], fewshot_df['label']):
        tag = "(A)" if _is_positive(label) else "(B)"
        blocks.append(f"Input: {text}\nOutput: {tag}")
    return "\n\n".join(blocks)

def create_fewshot_prompt(fewshot_examples_text, is_cot=False):
    """Build the few-shot prompt template.

    The returned string is a ``str.format`` template: it contains exactly one ``{text}``
    placeholder, which the caller fills with the post to classify.

    Args:
        fewshot_examples_text: The already formatted examples to embed in the prompt.
        is_cot: When True, append the "Let's think about it step by step." instruction
            after the input.

    Returns:
        The prompt template as a string.
    """
    cot_instruction = "\n\nLet's think about it step by step." if is_cot else ""

    # The examples are free text and may contain "{" or "}". Because the result is later run
    # through str.format, unescaped braces would be read as replacement fields and raise
    # KeyError/ValueError. Doubling them makes str.format emit them literally.
    escaped_examples = fewshot_examples_text.replace("{", "{{").replace("}", "}}")

    prompt = f"""Title: "Classification of Work-Related Burnout and Stress in Cybersecurity Professionals"

Definition: In this task, we ask you to classify the input text into two options:

(A): Work-related burnout/stress: The poster discussed burnout or work-related stress related to their own mental health in the past or present. The context of burnout can be related to work, career, job responsibilities, workplace environment, or professional life in cybersecurity.

(B): No work-related burnout/stress: Burnout or stress used in a context unrelated to work or mental health. Or work-related burnout/stress in hypothetical situations when the poster is not discussing their own experience in the past and present.

Emphasis & Caution: Discussions of hypothetical situations such as fear of burnout or future/imaginary circumstances should NOT be labeled as (A).

Things to avoid: All input must be classified into one of the options. If you cannot pick then choose the option with higher probability. The output must be either (A) or (B) but not both.

Here are some examples:

{escaped_examples}

Now classify the following:

Input: {{text}}{cot_instruction}

Output:"""

    return prompt


In [16]:
def truncate_text(text, max_chars=10000):
    """Cap ``text`` at ``max_chars`` characters.

    Text that is longer than the cap is cut to its first ``max_chars`` characters and
    marked with a trailing "..."; shorter text is returned unchanged.
    """
    fits = len(text) <= max_chars
    return text if fits else text[:max_chars] + "..."

In [17]:
def call_claude_api(prompt, model=MODEL_SONNET):
    """Send ``prompt`` to the Claude messages API and return the stripped reply text.

    Up to three attempts are made. After a failed attempt that is not the last one, the
    function sleeps (2 seconds at first, doubling each time) before trying again.
    Failures never raise: once the attempts are used up the function returns
    "ERROR: RATE_LIMIT" if the last failure was a rate limit, otherwise
    "ERROR: <exception message>".
    """
    max_retries = 3
    retry_delay = 2
    attempt = 0

    while attempt < max_retries:
        try:
            response = client.messages.create(
                model=model,
                max_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
                messages=[{"role": "user", "content": prompt}],
            )
            return response.content[0].text.strip()
        except anthropic.RateLimitError as err:
            print(f"Rate limit hit on attempt {attempt + 1}: {str(err)}")
            failure = "ERROR: RATE_LIMIT"
        except Exception as err:
            print(f"Attempt {attempt + 1} failed: {str(err)}")
            failure = f"ERROR: {str(err)}"

        # Reached only after a failed attempt: give up on the last one, otherwise back off.
        if attempt == max_retries - 1:
            return failure
        time.sleep(retry_delay)
        retry_delay *= 2
        attempt += 1

    return "ERROR"

In [18]:
def parse_output(output):
    """Extract the predicted option from a raw model reply.

    Resolution order (case-insensitive):
      1. Explicit "(A)" / "(B)" markers. If the reply starts with a marker, that marker is
         the answer. Otherwise the reply is treated as reasoning followed by a conclusion
         and the last marker wins.
      2. A bare "A" or "B" at the very start, but only when it stands alone (end of reply,
         end of line, or followed by punctuation such as ")", ".", ":", ",", "-").

    Args:
        output: Raw text returned by the model (or an error string from the API helper).

    Returns:
        "A", "B", or "PARSE_ERROR" when no label can be identified (including non-string
        input).
    """
    import re  # function-scope import keeps this cell self-contained

    if not isinstance(output, str):
        return "PARSE_ERROR"

    text = output.strip().upper()

    # All explicit markers in order of appearance.
    marked = re.findall(r"\(([AB])\)", text)
    if marked:
        # "(A) because ... unlike (B)" -> the leading marker is the verdict;
        # "... so it is not (A). Answer: (B)" -> the concluding marker is the verdict.
        if re.match(r"\([AB]\)", text):
            return marked[0]
        return marked[-1]

    # A plain prefix test would read "Analysis ..." as A and "Based on ..." as B, so the
    # bare letter is accepted only when it is a standalone token.
    bare = re.match(r"([AB])[ \t]*(?:[).:,\-]|\n|$)", text)
    if bare:
        return bare.group(1)

    return "PARSE_ERROR"

In [19]:
def classify_dataset(test_df, text_column, prompt_template, model, approach_name):
    """Run one prompting approach over every row of ``test_df``.

    For each row the text is stringified and truncated, inserted into ``prompt_template``
    via ``str.format(text=...)``, sent to the model, and the reply is parsed into a label.
    A one-second pause follows every request.

    Returns:
        A DataFrame with one row per input and the columns 'index', 'text' (the original,
        untruncated value), 'raw_output', 'predicted_label', 'approach' and 'model'.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Running {approach_name} with {model}")
    print(f"{banner}")

    records = []
    progress = tqdm(test_df.iterrows(), total=len(test_df), desc="Classifying")
    for row_id, row in progress:
        original_text = row[text_column]
        prompt = prompt_template.format(text=truncate_text(str(original_text)))

        # Query the model, then reduce its reply to a label
        raw_output = call_claude_api(prompt, model)

        records.append(dict(
            index=row_id,
            text=original_text,
            raw_output=raw_output,
            predicted_label=parse_output(raw_output),
            approach=approach_name,
            model=model,
        ))

        # Pause between requests to stay under the API rate limits
        time.sleep(1)

    return pd.DataFrame(records)

In [None]:
def main():
    """Run the selected prompting experiments and save their predictions.

    Reads 'fewshot_examples.csv' and 'test_data_unlabeled.csv', builds the few-shot prompt
    templates, runs every entry of ``experiments`` through classify_dataset, and writes
    'results_<approach>.csv' per experiment plus 'all_results_combined.csv'.

    The ground-truth file is deliberately not read here: inference only needs the unlabeled
    texts, and the previous version loaded the labels without ever using them.
    """
    # Configuration - CHANGE THIS TO MATCH THE TEXT COLUMN NAME
    TEXT_COLUMN = 'text'

    # Load data
    print("Loading data...")
    fewshot_df = pd.read_csv('fewshot_examples.csv')
    test_df = pd.read_csv('test_data_unlabeled.csv')

    print(f"Few-shot examples: {len(fewshot_df)}")
    print(f"Test samples: {len(test_df)}")

    # Create few-shot examples text
    fewshot_examples_text = create_fewshot_examples(fewshot_df, TEXT_COLUMN)
    fewshot_prompt = create_fewshot_prompt(fewshot_examples_text, is_cot=False)

    fewshot_cot_prompt = create_fewshot_prompt(fewshot_examples_text, is_cot=True)

    # Choose which experiments to run.
    # Every entry ends with a comma on purpose: without it, enabling two adjacent entries
    # makes Python read "(...)(...)" as a call and fail with "'tuple' object is not callable".
    # NOTE: MODEL_OPUS is not defined in the cells shown here; define it before enabling
    # any of the Opus entries.
    experiments = [
        # Uncomment the experiments you want to run

        # Zero-shot with Claude Sonnet-ran this
        # (ZERO_SHOT_PROMPT, MODEL_SONNET, "Zero-Shot_Claude-Sonnet"),

        # Zero-shot with Claude Opus (more powerful but more expensive)
        # (ZERO_SHOT_PROMPT, MODEL_OPUS, "Zero-Shot_Claude-Opus"),

        # Zero-shot with CoT
        # (ZERO_SHOT_COT_PROMPT, MODEL_SONNET, "Zero-Shot-CoT_Claude-Sonnet"),
        # (ZERO_SHOT_COT_PROMPT, MODEL_OPUS, "Zero-Shot-CoT_Claude-Opus"),

        # Few-shot
        (fewshot_prompt, MODEL_SONNET, "Few-Shot_Claude-Sonnet"),
        # (fewshot_prompt, MODEL_OPUS, "Few-Shot_Claude-Opus"),

        # Few-shot with CoT
        # (fewshot_cot_prompt, MODEL_SONNET, "Few-Shot-CoT_Claude-Sonnet"),
        # (fewshot_cot_prompt, MODEL_OPUS, "Few-Shot-CoT_Claude-Opus"),
    ]

    # Run experiments
    all_results = []
    for prompt_template, model, approach_name in experiments:
        results_df = classify_dataset(test_df, TEXT_COLUMN, prompt_template, model, approach_name)
        all_results.append(results_df)

        # Save individual results right away so a later failure does not lose this run
        results_df.to_csv(f'results_{approach_name}.csv', index=False)
        print(f"Saved: results_{approach_name}.csv")

    # Combine all results (skipped when no experiment was enabled)
    if all_results:
        combined_results = pd.concat(all_results, ignore_index=True)
        combined_results.to_csv('all_results_combined.csv', index=False)
        print("\nAll results saved to: all_results_combined.csv")

if __name__ == "__main__":
    main()

Loading data...
Few-shot examples: 10
Test samples: 167

Running Few-Shot_Claude-Sonnet with claude-sonnet-4-20250514


Classifying:  10%|▉         | 16/167 [01:15<11:00,  4.37s/it]