# Data Leakage Analysis

Analyzing train_data.csv and validation_data.csv for potential data leakage between training and validation sets.

In [1]:
import pandas as pd
import hashlib

print("Libraries imported successfully!")

Libraries imported successfully!


In [None]:
# Load the datasets - this will trigger auto-split from BBC News Train.csv if needed
from pathlib import Path
import sys
sys.path.append('src')
from utils.data_loader import NewsDataLoader

# Initialize data loader - this will auto-create the split files from BBC News Train.csv
data_loader = NewsDataLoader(\"data/bbc-news-articles-labeled\")

# Now load the split datasets
train_data = pd.read_csv(\"data/bbc-news-articles-labeled/train_data.csv\")
validation_data = pd.read_csv(\"data/bbc-news-articles-labeled/validation_data.csv\")

print(\"Dataset shapes:\")
print(f\"Training set: {train_data.shape}\")
print(f\"Validation set: {validation_data.shape}\")

print(\"\\nColumn check:\")
print(f\"Train columns: {train_data.columns.tolist()}\")
print(f\"Validation columns: {validation_data.columns.tolist()}\")

print(\"\\nFirst few rows of training data:\")
display(train_data.head())"

In [10]:
validation_data['Text'].unique().shape

(370,)

In [3]:
# Check for duplicate ArticleIds between train and validation sets
train_ids = set(train_data['ArticleId'])
validation_ids = set(validation_data['ArticleId'])

# Find overlapping IDs
overlapping_ids = train_ids.intersection(validation_ids)

print("🚨 DATA LEAKAGE ANALYSIS - ARTICLE IDs")
print("=" * 50)
print(f"Training set ArticleIds: {len(train_ids)}")
print(f"Validation set ArticleIds: {len(validation_ids)}")
print(f"Overlapping ArticleIds: {len(overlapping_ids)}")

if overlapping_ids:
    print(f"\n⚠️  CRITICAL: {len(overlapping_ids)} duplicate ArticleIds found!")
    print("Sample overlapping IDs:", list(overlapping_ids)[:10])
else:
    print("\n✅ No duplicate ArticleIds found")

# Calculate leakage percentage
id_leakage_percent = (len(overlapping_ids) / len(validation_ids)) * 100
print(f"ID leakage percentage: {id_leakage_percent:.1f}% of validation set")

🚨 DATA LEAKAGE ANALYSIS - ARTICLE IDs
Training set ArticleIds: 1117
Validation set ArticleIds: 373
Overlapping ArticleIds: 0

✅ No duplicate ArticleIds found
ID leakage percentage: 0.0% of validation set


In [4]:
# Check for duplicate text content (same articles with different IDs)
def get_text_hash(text):
    """Create hash of normalized text for comparison."""
    # Normalize text: lowercase, strip whitespace, remove extra spaces
    normalized = ' '.join(str(text).lower().strip().split())
    return hashlib.md5(normalized.encode()).hexdigest()

# Create text hashes for both datasets
train_data['text_hash'] = train_data['Text'].apply(get_text_hash)
validation_data['text_hash'] = validation_data['Text'].apply(get_text_hash)

# Find overlapping text hashes
train_hashes = set(train_data['text_hash'])
validation_hashes = set(validation_data['text_hash'])
overlapping_hashes = train_hashes.intersection(validation_hashes)

print("🚨 DATA LEAKAGE ANALYSIS - TEXT CONTENT")
print("=" * 50)
print(f"Training set unique texts: {len(train_hashes)}")
print(f"Validation set unique texts: {len(validation_hashes)}")
print(f"Overlapping text hashes: {len(overlapping_hashes)}")

if overlapping_hashes:
    print(f"\n⚠️  CRITICAL: {len(overlapping_hashes)} duplicate texts found!")
    
    # Show examples of duplicate texts
    duplicate_examples = validation_data[validation_data['text_hash'].isin(overlapping_hashes)]
    print(f"\nExample duplicate articles:")
    for i, row in duplicate_examples.head(3).iterrows():
        matching_train = train_data[train_data['text_hash'] == row['text_hash']]
        print(f"  Validation ID {row['ArticleId']} matches Training ID {matching_train.iloc[0]['ArticleId']}")
        print(f"  Text preview: {row['Text'][:100]}...")
        print()
else:
    print("\n✅ No duplicate text content found")

# Calculate text leakage percentage
text_leakage_percent = (len(overlapping_hashes) / len(validation_hashes)) * 100
print(f"Text leakage percentage: {text_leakage_percent:.1f}% of validation set")

🚨 DATA LEAKAGE ANALYSIS - TEXT CONTENT
Training set unique texts: 1094
Validation set unique texts: 370
Overlapping text hashes: 24

⚠️  CRITICAL: 24 duplicate texts found!

Example duplicate articles:
  Validation ID 1115 matches Training ID 2098
  Text preview: pop band busted to  take a break  chart-topping pop band busted have confirmed that they plan to  ta...

  Validation ID 789 matches Training ID 2042

  Validation ID 1937 matches Training ID 636
  Text preview: more power to the people says hp the digital revolution is focused on letting people tell and share ...

Text leakage percentage: 6.5% of validation set


In [5]:
# Comprehensive data leakage summary report
print("📊 DATA LEAKAGE SUMMARY REPORT")
print("=" * 60)

# Basic stats
total_train = len(train_data)
total_validation = len(validation_data)
total_articles = total_train + total_validation

print(f"Dataset Overview:")
print(f"  Training articles: {total_train:,}")
print(f"  Validation articles: {total_validation:,}")
print(f"  Total articles: {total_articles:,}")
print(f"  Train/Val split ratio: {total_train/total_validation:.1f}:1")

# ID-based leakage
id_leakage_count = len(overlapping_ids) if 'overlapping_ids' in locals() else 0
id_leakage_percent = (id_leakage_count / total_validation) * 100 if total_validation > 0 else 0

print(f"\n📋 ArticleId Leakage:")
print(f"  Duplicate ArticleIds: {id_leakage_count}")
print(f"  Leakage rate: {id_leakage_percent:.1f}% of validation set")

# Text-based leakage
text_leakage_count = len(overlapping_hashes) if 'overlapping_hashes' in locals() else 0
text_leakage_percent = (text_leakage_count / total_validation) * 100 if total_validation > 0 else 0

print(f"\n📝 Text Content Leakage:")
print(f"  Duplicate text content: {text_leakage_count}")
print(f"  Leakage rate: {text_leakage_percent:.1f}% of validation set")

# Overall assessment
total_leakage = max(id_leakage_count, text_leakage_count)
overall_leakage_percent = (total_leakage / total_validation) * 100 if total_validation > 0 else 0

print(f"\n🎯 OVERALL ASSESSMENT:")
if total_leakage == 0:
    print("  ✅ NO DATA LEAKAGE DETECTED")
    print("  ✅ Train/validation split is clean")
elif overall_leakage_percent < 5:
    print(f"  ⚠️  MINOR LEAKAGE: {total_leakage} articles ({overall_leakage_percent:.1f}%)")
    print("  📝 Consider removing duplicates but may not significantly impact results")
elif overall_leakage_percent < 10:
    print(f"  🚨 MODERATE LEAKAGE: {total_leakage} articles ({overall_leakage_percent:.1f}%)")
    print("  ⚠️  Should remove duplicates to ensure valid evaluation")
else:
    print(f"  🔥 SEVERE LEAKAGE: {total_leakage} articles ({overall_leakage_percent:.1f}%)")
    print("  🚨 MUST remove duplicates - current evaluation is invalid")

print(f"\nRecommendation:")
if total_leakage == 0:
    print("  Proceed with current train/validation split")
else:
    print(f"  Remove {total_leakage} duplicate articles from validation set")
    print(f"  Clean validation set would have {total_validation - total_leakage} articles")

📊 DATA LEAKAGE SUMMARY REPORT
Dataset Overview:
  Training articles: 1,117
  Validation articles: 373
  Total articles: 1,490
  Train/Val split ratio: 3.0:1

📋 ArticleId Leakage:
  Duplicate ArticleIds: 0
  Leakage rate: 0.0% of validation set

📝 Text Content Leakage:
  Duplicate text content: 24
  Leakage rate: 6.4% of validation set

🎯 OVERALL ASSESSMENT:
  🚨 MODERATE LEAKAGE: 24 articles (6.4%)
  ⚠️  Should remove duplicates to ensure valid evaluation

Recommendation:
  Remove 24 duplicate articles from validation set
  Clean validation set would have 349 articles
