In [3]:
!pip install anthropic




In [4]:
import pandas as pd
import json
from anthropic import Anthropic
import time
from tqdm import tqdm

# Load your data
print("Loading data...")
big_data = pd.read_csv('posts_processed2.csv')
labeled_data = pd.read_excel('manual_label_batch1.xlsx')

print(f" Big dataset: {len(big_data)} rows")
print(f" Labeled dataset: {len(labeled_data)} rows")

Loading data...
 Big dataset: 18653 rows
 Labeled dataset: 177 rows


In [5]:
# STRICT BINARY: Only label 1 is relevant
labeled_data['binary_label'] = (labeled_data['label'] == 1).astype(int)

print("\nLabel distribution:")
print(f"Label 1 (In-scope): {(labeled_data['binary_label'] == 1).sum()} posts")
print(f"All others (Out-of-scope): {(labeled_data['binary_label'] == 0).sum()} posts")

# Get examples
n_examples_per_class = 8
label_1_examples = labeled_data[labeled_data['binary_label'] == 1].sample(min(n_examples_per_class, (labeled_data['binary_label'] == 1).sum()), random_state=42)
other_examples = labeled_data[labeled_data['binary_label'] == 0].sample(min(n_examples_per_class, (labeled_data['binary_label'] == 0).sum()), random_state=42)

print(f"\nUsing {len(label_1_examples)} Label-1 and {len(other_examples)} other examples")



Label distribution:
Label 1 (In-scope): 38 posts
All others (Out-of-scope): 139 posts

Using 8 Label-1 and 8 other examples


In [6]:
# Create prompt
def create_strict_binary_prompt(label_1_ex, other_ex):
    prompt = """You are an expert at identifying work-related burnout, chronic stress, and
    workload mental pressure and burnout in Reddit posts from cybersecurity professionals.

Your task is to classify posts as either:
- Label 1 (IN-SCOPE): Posts SPECIFICALLY about work-related burnout, chronic stress from job demands,
overwhelming workload, work pressure, long hours, on-call exhaustion, or feeling burnt out from work.
- Label 0 (OUT-OF-SCOPE): Everything else including: imposter syndrome, general anxiety (not work-stress), PTSD,
mental health disorders, job search issues, work-life balance tips,
stress relief advice, technical questions, news, products, memes, etc.

IMPORTANT: Be STRICT. Only classify as Label 1 if the post is clearly about work burnout,
chronic work stress, or mental burnout and stress.

Here are examples of Label 1 (IN-SCOPE) posts:

"""

    for i, row in label_1_ex.iterrows():
        prompt += f"Example {i+1}:\n\"{row['text'][:400]}...\"\nLabel: 1\n\n"

    prompt += "\nHere are examples of Label 0 (OUT-OF-SCOPE) posts:\n\n"

    for i, row in other_ex.iterrows():
        prompt += f"Example {i+1}:\n\"{row['text'][:400]}...\"\nLabel: 0\n\n"

    prompt += """\nNow classify the following post. Respond with ONLY a JSON object:
{
  "label": 0 or 1,
  "confidence": a number between 0.0 and 1.0,
  "reasoning": "brief explanation"
}

Post to classify:
"""
    return prompt

In [7]:
base_prompt = create_strict_binary_prompt(label_1_examples, other_examples)
client = Anthropic()

# Test on small sample first
print("\n" + "="*60)
print("TESTING ON 10 SAMPLES:")
print("="*60)

test_sample = big_data.sample(20, random_state=42)



TESTING ON 10 SAMPLES:


In [11]:
def classify_post_icl(text, base_prompt, max_retries=3):
    full_prompt = base_prompt + f"\"{text}\""

    for attempt in range(max_retries):
        try:
            response = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=300,
                temperature=0,
                messages=[{"role": "user", "content": full_prompt}]
            )

            response_text = response.content[0].text.strip()

            if '{' in response_text and '}' in response_text:
                json_start = response_text.index('{')
                json_end = response_text.rindex('}') + 1
                json_str = response_text[json_start:json_end]
                result = json.loads(json_str)

                return {
                    'label': result.get('label', -1),
                    'confidence': result.get('confidence', 0.5),
                    'reasoning': result.get('reasoning', '')
                }
            else:
                return {'label': -1, 'confidence': 0.5, 'reasoning': 'Parse error'}

        except Exception as e:
            if attempt < max_retries - 1:
                time.sleep(2)
            else:
                return {'label': -1, 'confidence': 0.5, 'reasoning': f'Error: {str(e)}'}

for idx, row in test_sample.iterrows():
    print(f"\n--- Sample {idx} ---")
    print(f"Text: {row['cleaned_text'][:150]}...")
    result = classify_post_icl(row['cleaned_text'], base_prompt)
    print(f"Label: {result['label']} | Confidence: {result['confidence']:.2f}")
    print(f"Reasoning: {result['reasoning']}")
    time.sleep(1)

print("\n" + "="*60)
print("Proceed with full dataset classification? (type 'yes' to continue)")


--- Sample 16852 ---
Text: what are the best data breach reports to compare to the verizon dbir i ve wondered for a while now i ve been reading the vdbir almost yearly and only ...
Label: -1 | Confidence: 0.50
Reasoning: Error: "Could not resolve authentication method. Expected either api_key or auth_token to be set. Or for one of the `X-Api-Key` or `Authorization` headers to be explicitly omitted"

--- Sample 17679 ---
Text: cybersecurity education survey...
Label: -1 | Confidence: 0.50
Reasoning: Error: "Could not resolve authentication method. Expected either api_key or auth_token to be set. Or for one of the `X-Api-Key` or `Authorization` headers to be explicitly omitted"

--- Sample 2146 ---
Text: what are aspects of cybersecurity most people hate but are super important hi when i started my cybersecurity career i started out as a penetration te...
Label: -1 | Confidence: 0.50
Reasoning: Error: "Could not resolve authentication method. Expected either api_key or auth_token to be 

In [12]:
import os

# Classify the FULL dataset
print("\n" + "="*60)
print("CLASSIFYING FULL DATASET:")
print("="*60)
print(f"Total posts to classify: {len(big_data)}")
print("Estimated time: ~5 hours (1 request per second)")
print("Progress will be saved every 500 posts")

# Create a results directory if it doesn't exist
os.makedirs('icl_results', exist_ok=True)

# Check if we have any saved progress
progress_files = [f for f in os.listdir('icl_results') if f.startswith('progress_')]
start_index = 0

if progress_files:
    # Find the latest progress file
    latest_progress = max(progress_files)
    print(f"\nFound existing progress file: {latest_progress}")

    # Load existing results
    existing_results = pd.read_csv(f'icl_results/{latest_progress}')
    results = existing_results.to_dict('records')
    start_index = len(results)

    print(f"Resuming from index {start_index}")
else:
    results = []


CLASSIFYING FULL DATASET:
Total posts to classify: 18653
Estimated time: ~5 hours (1 request per second)
Progress will be saved every 500 posts


In [None]:
# Process all posts with progress bar
batch_size = 100  # Process in batches for easier management
save_interval = 500  # Save progress every 500 posts

print(f"\nStarting classification from index {start_index}...")

for i in tqdm(range(start_index, len(big_data)), desc="Classifying posts"):
    row = big_data.iloc[i]

    # Classify the post
    result = classify_post_icl(row['cleaned_text'], base_prompt)

    # Store the result
    results.append({
        'index': i,
        'label': result['label'],
        'confidence': result['confidence'],
        'reasoning': result['reasoning']
    })

    # Rate limiting - wait 1 second between API calls
    time.sleep(1)

    # Save progress periodically
    if (i + 1) % save_interval == 0:
        temp_results_df = pd.DataFrame(results)
        progress_file = f'icl_results/progress_batch_{i+1}.csv'
        temp_results_df.to_csv(progress_file, index=False)
        print(f"\n Progress saved at {i+1} posts to {progress_file}")

print("\n Classification complete!")



Starting classification from index 0...


Classifying posts:   0%|          | 28/18653 [02:20<25:53:32,  5.00s/it]

In [None]:
# Convert results to dataframe
results_df = pd.DataFrame(results)

# Merge results with original data
print("\n" + "="*60)
print("MERGING RESULTS WITH ORIGINAL DATA:")
print("="*60)

big_data['icl_label'] = results_df['label'].values
big_data['icl_confidence'] = results_df['confidence'].values
big_data['icl_reasoning'] = results_df['reasoning'].values

In [None]:
# Calculate summary statistics
print("\n" + "="*60)
print("IN-CONTEXT LEARNING RESULTS:")
print("="*60)

total_label_1 = (big_data['icl_label'] == 1).sum()
total_others = (big_data['icl_label'] == 0).sum()
total_errors = (big_data['icl_label'] == -1).sum()

print(f"Label 1 (Work burnout/stress/workload): {total_label_1} posts ({total_label_1/len(big_data)*100:.1f}%)")
print(f"Label 0 (All others - Out-of-scope): {total_others} posts ({total_others/len(big_data)*100:.1f}%)")
print(f"Classification errors: {total_errors} posts ({total_errors/len(big_data)*100:.1f}%)")

print(f"\nAverage confidence score: {big_data['icl_confidence'].mean():.3f}")

In [None]:
# Confidence distribution
print("\nConfidence distribution:")
for threshold in [0.6, 0.7, 0.8, 0.9]:
    high_conf = (big_data['icl_confidence'] >= threshold).sum()
    print(f"  Predictions with ≥{threshold:.1f} confidence: {high_conf} ({high_conf/len(big_data)*100:.1f}%)")

# Save all results
print("\n" + "="*60)
print("SAVING FINAL RESULTS:")
print("="*60)

In [None]:
# Confidence distribution
print("\nConfidence distribution:")
for threshold in [0.6, 0.7, 0.8, 0.9]:
    high_conf = (big_data['icl_confidence'] >= threshold).sum()
    print(f"  Predictions with ≥{threshold:.1f} confidence: {high_conf} ({high_conf/len(big_data)*100:.1f}%)")

# Save all results
print("\n" + "="*60)
print("SAVING FINAL RESULTS:")
print("="*60)

In [None]:
# 2. Only Label 1 posts (work burnout/stress/workload)
label_1_posts = big_data[big_data['icl_label'] == 1]
label_1_output = "reddit_LABEL1_ICL_only.csv"
label_1_posts.to_csv(label_1_output, index=False)
print(f" Label 1 posts saved to: {label_1_output}")
print(f"  Total: {len(label_1_posts)} posts")

In [None]:
# 3. High confidence Label 1 posts (confidence > 0.7)
high_conf_label_1 = big_data[
    (big_data['icl_label'] == 1) &
    (big_data['icl_confidence'] > 0.7)
]
high_conf_output = "reddit_LABEL1_ICL_high_confidence.csv"
high_conf_label_1.to_csv(high_conf_output, index=False)
print(f" High confidence Label 1 saved to: {high_conf_output}")
print(f"  Total: {len(high_conf_label_1)} posts")



In [None]:
# 4. Very high confidence Label 1 posts (confidence > 0.85)
very_high_conf = big_data[
    (big_data['icl_label'] == 1) &
    (big_data['icl_confidence'] > 0.85)
]
very_high_output = "reddit_LABEL1_ICL_very_high_confidence.csv"
very_high_conf.to_csv(very_high_output, index=False)
print(f" Very high confidence Label 1 saved to: {very_high_output}")
print(f"  Total: {len(very_high_conf)} posts")


In [None]:
# 5. Uncertain predictions (confidence between 0.4 and 0.6)
uncertain = big_data[
    (big_data['icl_confidence'] > 0.4) &
    (big_data['icl_confidence'] < 0.6)
]
uncertain_output = "reddit_ICL_uncertain_predictions.csv"
uncertain.to_csv(uncertain_output, index=False)
print(f" Uncertain predictions saved to: {uncertain_output}")
print(f"  Total: {len(uncertain)} posts (recommend manual review)")


In [None]:
# 5. Uncertain predictions (confidence between 0.4 and 0.6)
uncertain = big_data[
    (big_data['icl_confidence'] > 0.4) &
    (big_data['icl_confidence'] < 0.6)
]
uncertain_output = "reddit_ICL_uncertain_predictions.csv"
uncertain.to_csv(uncertain_output, index=False)
print(f"✓ Uncertain predictions saved to: {uncertain_output}")
print(f"  Total: {len(uncertain)} posts (recommend manual review)")


In [None]:
# 6. Posts with classification errors (label = -1)
if total_errors > 0:
    error_posts = big_data[big_data['icl_label'] == -1]
    error_output = "reddit_ICL_classification_errors.csv"
    error_posts.to_csv(error_output, index=False)
    print(f"✓ Classification errors saved to: {error_output}")
    print(f"  Total: {len(error_posts)} posts (need reprocessing)")


In [None]:
# 7. Sample review file - stratified by confidence levels
print("\nCreating stratified sample for quality review...")


In [None]:
sample_parts = []

# High confidence Label 1
if len(high_conf_label_1) > 0:
    sample_parts.append(high_conf_label_1.sample(min(30, len(high_conf_label_1)), random_state=42))

# Low confidence Label 1
low_conf_label_1 = label_1_posts[label_1_posts['icl_confidence'] <= 0.7]
if len(low_conf_label_1) > 0:
    sample_parts.append(low_conf_label_1.sample(min(20, len(low_conf_label_1)), random_state=42))

# Label 0 (others)
label_0_posts = big_data[big_data['icl_label'] == 0]
if len(label_0_posts) > 0:
    sample_parts.append(label_0_posts.sample(min(30, len(label_0_posts)), random_state=42))

# Uncertain predictions
if len(uncertain) > 0:
    sample_parts.append(uncertain.sample(min(20, len(uncertain)), random_state=42))

# Combine samples
sample_review = pd.concat(sample_parts, ignore_index=True)
sample_review.to_csv("reddit_ICL_sample_for_review.csv", index=False)
print(f" Sample review file saved: reddit_ICL_sample_for_review.csv")
print(f"  Contains {len(sample_review)} samples for quality checking")

In [None]:
# Create a summary report
print("\n" + "="*60)
print("CREATING SUMMARY REPORT:")
print("="*60)

summary_report = {
    'Total Posts': len(big_data),
    'Label 1 (Burnout)': total_label_1,
    'Label 1 Percentage': f"{total_label_1/len(big_data)*100:.2f}%",
    'Label 0 (Others)': total_others,
    'Label 0 Percentage': f"{total_others/len(big_data)*100:.2f}%",
    'Classification Errors': total_errors,
    'Average Confidence': f"{big_data['icl_confidence'].mean():.3f}",
    'High Confidence (>0.7) Posts': (big_data['icl_confidence'] > 0.7).sum(),
    'Very High Confidence (>0.85) Posts': (big_data['icl_confidence'] > 0.85).sum(),
    'Uncertain (0.4-0.6) Posts': len(uncertain)
}

summary_df = pd.DataFrame([summary_report]).T
summary_df.columns = ['Value']
summary_df.to_csv("reddit_ICL_summary_report.csv")
print(f" Summary report saved to: reddit_ICL_summary_report.csv")
print("\nSummary:")
print(summary_df)
