# Fed Minutes Parsing Validation

This notebook provides comprehensive validation of the Fed Minutes parsing results, identifying and analyzing problematic files and overall data quality.

## Contents
1. Setup and Data Loading
2. Overall Quality Assessment
3. Problem File Identification
4. Detailed Problem Analysis
5. Sample File Examination
6. Quality Score Calculation
7. Recommendations and Action Items

## 1. Setup and Data Loading

In [None]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter
from datetime import datetime
from IPython.display import display, HTML, Markdown
import warnings
warnings.filterwarnings('ignore')

# Pandas display: show every column, cap the row count and the cell width
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 50),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)

# Matplotlib / seaborn defaults shared by every figure in this notebook
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

print("✅ Libraries loaded successfully")

In [None]:
# Load the parsed data.
# Candidate locations are tried in order; the first one that exists is used.
candidate_paths = [
    Path('../data/processed/meetings_summary.csv'),
    Path('../fed_minutes_output/meetings_summary.csv'),
    Path('../data/validation/test_results.csv'),
]

data_path = next((path for path in candidate_paths if path.exists()), None)
if data_path is None:
    raise FileNotFoundError("Could not find meetings data. Please run the parser first.")

df = pd.read_csv(data_path)
print(f"📊 Loaded data from {data_path}")


def _parse_json_list(value):
    """Decode a JSON-array string into a Python list.

    Values that are not strings starting with '[' (NaN, numbers, plain text)
    are returned unchanged. A string that looks like a JSON array but fails
    to decode is also returned unchanged, so a single malformed cell cannot
    abort loading the whole dataset.
    """
    if isinstance(value, str) and value.startswith('['):
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return value
    return value


# Parse JSON columns (only those actually present in the CSV)
json_columns = ['attendees', 'decisions', 'topics', 'main_topics', 'board_members']
for col in json_columns:
    if col in df.columns:
        df[col] = df[col].apply(_parse_json_list)

# Convert dates. Unparseable values become NaT instead of raising, so the
# "missing date" checks further down can count them as a data-quality issue.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['year'] = df['date'].dt.year

print(f"\n🔍 Dataset loaded:")
print(f"  - Shape: {df.shape}")
valid_dates = df['date'].dropna()
if valid_dates.empty:
    # NaT has no strftime; report the gap instead of crashing
    print("  - Date range: N/A (no parseable dates)")
else:
    print(f"  - Date range: {valid_dates.min().strftime('%Y-%m-%d')} to {valid_dates.max().strftime('%Y-%m-%d')}")
print(f"  - Columns: {', '.join(df.columns)}")

## 2. Overall Quality Assessment

In [None]:
# Calculate comprehensive quality metrics
def _share(mask):
    """Format the fraction of True values in a boolean Series as 'NN.N%'."""
    return f"{mask.mean()*100:.1f}%"


def _count_and_share(mask):
    """Format a boolean Series as 'K files (NN.N%)'."""
    return f"{mask.sum()} files ({mask.mean()*100:.1f}%)"


# Optional columns are reported as 'N/A' when the parser did not emit them
has_topics = 'num_topics' in df.columns
has_text_length = 'text_length' in df.columns

attendee_counts = df['num_attendees']
decision_counts = df['num_decisions']
meeting_dates = df['date']

completeness_metrics = {
    'Date Extraction Success': _share(meeting_dates.notna()),
    'Files with Attendees': _share(attendee_counts > 0),
    'Files with Decisions': _share(decision_counts > 0),
    'Files with Topics': _share(df['num_topics'] > 0) if has_topics else 'N/A',
}

content_metrics = {
    'Average Attendees': f"{attendee_counts.mean():.1f}",
    'Average Decisions': f"{decision_counts.mean():.1f}",
    'Average Text Length': f"{df['text_length'].mean():,.0f} chars" if has_text_length else 'N/A',
    'Files with Adequate Attendance (≥3)': _share(attendee_counts >= 3),
}

issue_metrics = {
    'Low Attendance (<3)': _count_and_share(attendee_counts < 3),
    'No Decisions': _count_and_share(decision_counts == 0),
    'Short Documents (<1000 chars)': _count_and_share(df['text_length'] < 1000) if has_text_length else 'N/A',
    'Missing Dates': _count_and_share(meeting_dates.isna()),
}

quality_metrics = {
    'Data Completeness': completeness_metrics,
    'Content Quality': content_metrics,
    'Potential Issues': issue_metrics,
}

display(Markdown("## 📊 **Quality Assessment Dashboard**"))

for section_title, section_metrics in quality_metrics.items():
    display(Markdown(f"### {section_title}"))

    # One two-column table per section renders better than raw dict output
    section_table = pd.DataFrame({
        'Metric': list(section_metrics.keys()),
        'Value': list(section_metrics.values()),
    })
    display(section_table)
    print()  # Add spacing

In [None]:
# Visual quality overview: four panels on a single 2x2 figure
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
dist_ax, yearly_ax = axes[0]
scatter_ax, length_ax = axes[1]

# Panel 1: distribution of attendee and decision counts
dist_ax.hist(
    [df['num_attendees'], df['num_decisions']],
    bins=20,
    alpha=0.7,
    label=['Attendees', 'Decisions'],
    color=['blue', 'orange'],
)
dist_ax.set(
    title='Distribution of Attendees and Decisions',
    xlabel='Count',
    ylabel='Frequency',
)
dist_ax.legend()
dist_ax.grid(True, alpha=0.3)

# Panel 2: share of files per year that trip each quality flag
yearly_quality = pd.DataFrame({
    'Low Attendance %': (df['num_attendees'] < 3).groupby(df['year']).mean() * 100,
    'No Decisions %': (df['num_decisions'] == 0).groupby(df['year']).mean() * 100,
})

yearly_quality.plot.bar(ax=yearly_ax, color=['red', 'darkred'], alpha=0.7)
yearly_ax.set(
    title='Quality Issues by Year',
    xlabel='Year',
    ylabel='Percentage of Files',
)
yearly_ax.legend()
yearly_ax.tick_params(axis='x', rotation=45)
yearly_ax.grid(True, alpha=0.3)

# Panel 3: attendees vs decisions, problem files highlighted in red
flagged = (df['num_attendees'] < 3) | (df['num_decisions'] == 0)
point_colors = np.where(flagged, 'red', 'blue').tolist()

scatter_ax.scatter(df['num_attendees'], df['num_decisions'], c=point_colors, alpha=0.6)
scatter_ax.set(
    title='Attendees vs Decisions (Red = Problem Files)',
    xlabel='Number of Attendees',
    ylabel='Number of Decisions',
)
scatter_ax.grid(True, alpha=0.3)

# Dashed guides mark the thresholds used to flag a file
scatter_ax.axvline(x=3, color='red', linestyle='--', alpha=0.7, label='Min Attendees')
scatter_ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.7, label='Min Decisions')
scatter_ax.legend()

# Panel 4: document length distribution, or a placeholder when unavailable
if 'text_length' not in df.columns:
    length_ax.text(
        0.5, 0.5, 'Text length data\nnot available',
        ha='center', va='center', transform=length_ax.transAxes,
        fontsize=12, bbox=dict(boxstyle='round', facecolor='lightgray'),
    )
    length_ax.set_title('Document Length Distribution')
else:
    length_ax.hist(df['text_length'], bins=30, color='green', alpha=0.7, edgecolor='black')
    length_ax.axvline(1000, color='red', linestyle='--', label='Min Length Threshold')
    length_ax.set(
        title='Document Length Distribution',
        xlabel='Text Length (characters)',
        ylabel='Frequency',
    )
    length_ax.legend()
    length_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Problem File Identification

In [None]:
# Define criteria for problem files (boolean masks aligned with df's index)
low_attendance = df['num_attendees'] < 3
no_decisions = df['num_decisions'] == 0
missing_date = df['date'].isna()
if 'text_length' in df.columns:
    short_text = df['text_length'] < 1000
else:
    # Build the all-False fallback on df's own index so it lines up with the
    # other masks even if df does not carry a default RangeIndex.
    short_text = pd.Series(False, index=df.index)

# One column per issue label; a row is a problem file when any flag is set
issue_flags = pd.DataFrame({
    'Low Attendance': low_attendance,
    'No Decisions': no_decisions,
    'Missing Date': missing_date,
    'Short Text': short_text,
})
is_problem = issue_flags.any(axis=1)

# Identify problem files
problem_files = df[is_problem].copy()

# Categorize issues: join only the labels that actually apply to each row.
# Filtering before joining avoids the stray "; ; " separators that appear when
# empty placeholders are joined and then stripped (e.g. Low Attendance + Short Text).
# A plain list comprehension also works when there are no problem rows at all.
problem_files['issues'] = [
    '; '.join(label for label, flagged in zip(issue_flags.columns, row_flags) if flagged)
    for row_flags in issue_flags[is_problem].itertuples(index=False, name=None)
]

display(Markdown(f"## ⚠️ **Found {len(problem_files)} Problem Files** (out of {len(df)} total = {len(problem_files)/len(df)*100:.1f}%)"))

# Issue breakdown (a file can appear under several issue types)
issue_breakdown = {
    'Low Attendance (<3 people)': low_attendance.sum(),
    'No Decisions Found': no_decisions.sum(),
    'Missing Date': missing_date.sum(),
    'Short Text (<1000 chars)': short_text.sum()
}

# Remove zero counts
issue_breakdown = {k: v for k, v in issue_breakdown.items() if v > 0}

breakdown_df = pd.DataFrame([
    {'Issue Type': issue, 'Count': count, 'Percentage': f"{count/len(df)*100:.1f}%"}
    for issue, count in issue_breakdown.items()
])

display(breakdown_df)

# Overlap analysis between the two most common issue types
overlap_analysis = {
    'Low Attendance Only': (low_attendance & ~no_decisions).sum(),
    'No Decisions Only': (~low_attendance & no_decisions).sum(),
    'Both Issues': (low_attendance & no_decisions).sum()
}

if any(overlap_analysis.values()):
    display(Markdown("### Issue Overlap Analysis"))
    overlap_df = pd.DataFrame([
        {'Category': cat, 'Count': count, 'Percentage': f"{count/len(df)*100:.1f}%"}
        for cat, count in overlap_analysis.items() if count > 0
    ])
    display(overlap_df)

In [None]:
# Visualize problem file distribution
if len(problem_files) == 0:
    display(Markdown("### 🎉 **No significant problems found in the dataset!**"))
else:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    count_ax, rate_ax = axes[0]
    pie_ax, compare_ax = axes[1]

    # Panels 1 and 2 both need the per-year breakdown
    if 'year' in problem_files.columns:
        problem_by_year = problem_files.groupby('year').size()
        total_by_year = df.groupby('year').size()
        # Years without any problem file come out as NaN after the division
        problem_rate = (problem_by_year / total_by_year * 100).fillna(0)

        # Panel 1: absolute number of problem files per year
        count_ax.bar(problem_by_year.index, problem_by_year.values, color='coral', alpha=0.7)
        count_ax.set_title('Problem Files by Year')
        count_ax.set_xlabel('Year')
        count_ax.set_ylabel('Number of Problem Files')
        count_ax.grid(True, alpha=0.3)

        # Panel 2: problem files as a share of that year's files
        rate_ax.plot(problem_rate.index, problem_rate.values, marker='o', linewidth=2, color='red')
        rate_ax.set_title('Problem Rate by Year')
        rate_ax.set_xlabel('Year')
        rate_ax.set_ylabel('Problem Rate (%)')
        rate_ax.grid(True, alpha=0.3)

    # Panel 3: relative frequency of each issue type
    if issue_breakdown:
        issue_labels = list(issue_breakdown.keys())
        issue_counts = list(issue_breakdown.values())
        pie_ax.pie(issue_counts, labels=issue_labels, autopct='%1.1f%%', startangle=90)
        pie_ax.set_title('Distribution of Issue Types')

    # Panel 4: average counts for normal files next to problem files
    normal_files = df[~df.index.isin(problem_files.index)]

    compared_columns = {
        'Avg Attendees': 'num_attendees',
        'Avg Decisions': 'num_decisions',
    }
    normal_means = [normal_files[column].mean() for column in compared_columns.values()]
    problem_means = [problem_files[column].mean() for column in compared_columns.values()]

    positions = np.arange(len(compared_columns))
    bar_width = 0.35

    compare_ax.bar(positions - bar_width/2, normal_means,
                   bar_width, label='Normal Files', color='green', alpha=0.7)
    compare_ax.bar(positions + bar_width/2, problem_means,
                   bar_width, label='Problem Files', color='red', alpha=0.7)

    compare_ax.set_title('Normal vs Problem Files Comparison')
    compare_ax.set_xticks(positions)
    compare_ax.set_xticklabels(list(compared_columns.keys()))
    compare_ax.legend()
    compare_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 4. Detailed Problem Analysis

In [None]:
if len(problem_files) > 0:
    # Show detailed problem file list
    display(Markdown("### 📋 **Problem Files Details**"))

    # Prepare display columns; 'text_length' is optional and goes just before 'issues'
    display_cols = ['filename', 'date', 'num_attendees', 'num_decisions']
    if 'text_length' in problem_files.columns:
        display_cols.append('text_length')
    display_cols.append('issues')

    # Sort chronologically while 'date' is still a datetime, then format it
    # for display/CSV. Rows with a missing date sort last.
    problem_display = problem_files[display_cols].sort_values('date').copy()
    problem_display['date'] = problem_display['date'].dt.strftime('%Y-%m-%d')

    # Only claim to show "the first N" when the table is actually truncated
    preview_rows = 20
    if len(problem_display) > preview_rows:
        display(Markdown(f"**Showing first {preview_rows} of {len(problem_display)} problem files:**"))
    else:
        display(Markdown(f"**Showing all {len(problem_display)} problem files:**"))
    display(problem_display.head(preview_rows))

    # Save complete list (not just the preview)
    output_path = Path('../data/validation/problematic_files_detailed.csv')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    problem_display.to_csv(output_path, index=False)
    print(f"\n💾 Complete problem files list saved to: {output_path}")

else:
    display(Markdown("### ✅ **No problem files to analyze!**"))

## 5. Sample File Examination

In [None]:
def examine_file_content(filename, df):
    """Print a human-readable report for one parsed file.

    Looks up the first row of ``df`` whose 'filename' equals ``filename`` and
    prints its date, attendee/decision counts, optional text length, up to
    five attendees, up to three decisions, and a 500-character preview of the
    matching .txt file if one is found in the known TXT directories.

    Args:
        filename: Value to match against the 'filename' column.
        df: DataFrame with at least 'filename', 'date', 'num_attendees' and
            'num_decisions'; 'text_length', 'attendees' and 'decisions' are
            used when present.

    Returns:
        None. All output goes to stdout. Errors are reported as a printed
        message rather than raised, so one bad file does not stop a loop
        over several files.
    """
    try:
        row = df[df['filename'] == filename].iloc[0]

        print(f"\n{'='*60}")
        print(f"📄 FILE: {filename}")
        print('='*60)
        print(f"📅 Date: {row['date'] if pd.notna(row['date']) else 'MISSING'}")
        print(f"👥 Attendees: {row['num_attendees']}")
        print(f"📋 Decisions: {row['num_decisions']}")
        if 'text_length' in row:
            print(f"📝 Text length: {row['text_length']:,} characters")

        # Show attendees if available
        if 'attendees' in row and isinstance(row['attendees'], list) and len(row['attendees']) > 0:
            print(f"\n👥 Attendees list:")
            for i, attendee in enumerate(row['attendees'][:5], 1):
                if isinstance(attendee, dict):
                    name = attendee.get('name', 'Unknown')
                    title = attendee.get('title', '')
                    print(f"   {i}. {name}" + (f" - {title}" if title else ""))
                else:
                    print(f"   {i}. {attendee}")
            if len(row['attendees']) > 5:
                print(f"   ... and {len(row['attendees']) - 5} more")

        # Show decisions if available
        if 'decisions' in row and isinstance(row['decisions'], list) and len(row['decisions']) > 0:
            print(f"\n📋 Decisions:")
            for i, decision in enumerate(row['decisions'][:3], 1):
                if isinstance(decision, dict):
                    # A key may be present with a null value (JSON null), so
                    # fall back on falsy values too and coerce to str before
                    # calling string methods.
                    action = str(decision.get('action') or 'Unknown')
                    subject = str(decision.get('subject') or 'No subject')
                    # Only mark the subject as truncated when it really was
                    ellipsis = '...' if len(subject) > 80 else ''
                    print(f"   {i}. {action.upper()}: {subject[:80]}{ellipsis}")
                else:
                    print(f"   {i}. {decision}")
            if len(row['decisions']) > 3:
                print(f"   ... and {len(row['decisions']) - 3} more")

        # Try to show original text preview
        txt_name = filename.replace('.pdf', '.txt')
        txt_paths = [
            Path(f"../data/raw/TXTs/{txt_name}"),
            Path(f"../TXTs/{txt_name}"),
            Path(f"TXTs/{txt_name}")
        ]

        for txt_path in txt_paths:
            if txt_path.exists():
                with open(txt_path, 'r', encoding='utf-8', errors='ignore') as f:
                    # Only the preview is needed; avoid loading the whole document
                    preview = f.read(500)

                print(f"\n📖 Text Preview (first 500 characters):")
                print("-" * 50)
                print(preview)
                print("-" * 50)
                break
        else:
            print(f"\n❌ Could not find text file for {filename}")

    except IndexError:
        # .iloc[0] on an empty selection: no row carries this filename
        print(f"❌ File {filename} not found in dataset")
    except Exception as e:
        # Deliberately best-effort: report and continue with the next file
        print(f"❌ Error examining file {filename}: {e}")

# Examine sample problem files
if len(problem_files) == 0:
    display(Markdown("## ✅ **No problem files to examine!**"))
else:
    display(Markdown("## 🔍 **Sample Problem File Examination**"))

    # Select diverse sample files
    sample_files = []

    # One representative (the first match) per issue type, without duplicates
    issue_masks = (
        problem_files['num_attendees'] < 3,
        problem_files['num_decisions'] == 0,
    )
    for issue_mask in issue_masks:
        if issue_mask.any():
            representative = problem_files.loc[issue_mask, 'filename'].iloc[0]
            if representative not in sample_files:
                sample_files.append(representative)

    # Top up with the first problem file that has not been picked yet
    if len(problem_files) > len(sample_files):
        not_yet_sampled = problem_files.loc[
            ~problem_files['filename'].isin(sample_files), 'filename'
        ]
        if len(not_yet_sampled) > 0:
            sample_files.append(not_yet_sampled.iloc[0])

    # Limit to 3 files for readability
    del sample_files[3:]

    print(f"📋 Examining {len(sample_files)} sample problem files:")

    for sample_name in sample_files:
        examine_file_content(sample_name, df)

## 6. Quality Score Calculation

In [None]:
# Calculate comprehensive quality score
def calculate_quality_score(df, attendee_target=10, decision_target=5,
                            min_attendees=3, min_text_length=1000):
    """Calculate overall parsing quality score.

    Each component is a value in [0, 1]; the overall score is their weighted
    average expressed as a percentage. Weights always sum to 1.0: when
    'text_length' is absent its 15% is redistributed to the other components.

    Args:
        df: DataFrame with 'date', 'num_attendees' and 'num_decisions'
            columns, and optionally 'text_length'.
        attendee_target: Mean attendee count that earns a full attendee score.
        decision_target: Mean decision count that earns a full decision score.
        min_attendees: Attendee count at or above which a file counts as
            having adequate attendance.
        min_text_length: Character count at or above which a file counts as
            having adequate text.

    Returns:
        Tuple ``(scores, weights, overall_score)``: two dicts keyed by
        component name and the overall score as a float percentage.
    """

    def _unit_score(value):
        """Cap at 1.0; a NaN (no usable data) scores 0.0, not a perfect 1.0."""
        value = float(value)
        # min(1.0, nan) evaluates to 1.0 in Python, so NaN must be handled
        # explicitly or missing data would be rewarded with a full score.
        if pd.isna(value):
            return 0.0
        return min(1.0, value)

    scores = {
        # Share of files with a parsed date
        'date_extraction': _unit_score(df['date'].notna().mean()),
        # Mean counts normalized against their targets, capped at 1.0
        'attendee_quality': _unit_score(df['num_attendees'].mean() / attendee_target),
        'decision_quality': _unit_score(df['num_decisions'].mean() / decision_target),
        # Share of files with adequate attendance
        'content_completeness': _unit_score((df['num_attendees'] >= min_attendees).mean()),
    }

    if 'text_length' in df.columns:
        scores['text_quality'] = _unit_score((df['text_length'] >= min_text_length).mean())
        weights = {
            'date_extraction': 0.25,
            'attendee_quality': 0.20,
            'decision_quality': 0.25,
            'content_completeness': 0.15,
            'text_quality': 0.15,
        }
    else:
        # Redistribute the text-quality weight to the other categories
        weights = {
            'date_extraction': 0.30,
            'attendee_quality': 0.25,
            'decision_quality': 0.30,
            'content_completeness': 0.15,
        }

    # Calculate weighted average as a percentage
    overall_score = sum(scores[name] * weights[name] for name in scores) * 100

    return scores, weights, overall_score

scores, weights, overall_score = calculate_quality_score(df)

display(Markdown("## 🎯 **Overall Quality Assessment**"))

# Display component scores alongside the weight each one carries
component_names = list(scores.keys())
component_df = pd.DataFrame({
    'Component': [name.replace('_', ' ').title() for name in component_names],
    'Score': [f"{scores[name]*100:.1f}%" for name in component_names],
    'Weight': [f"{weights.get(name, 0)*100:.0f}%" for name in component_names],
})

display(component_df)

print(f"\n🏆 **OVERALL QUALITY SCORE: {overall_score:.1f}%**")

# Quality bands, highest threshold first: (minimum score, message, gauge colour).
# Anything below the lowest threshold falls through to the default below.
quality_bands = (
    (90, "🎉 **EXCELLENT** - Ready for analysis and production use!", "green"),
    (80, "✅ **GOOD** - Minor improvements could enhance quality", "orange"),
    (70, "⚠️ **ACCEPTABLE** - Several issues should be addressed", "orange"),
)
assessment = "❌ **NEEDS IMPROVEMENT** - Significant parsing issues detected"
color = "red"
for band_floor, band_message, band_color in quality_bands:
    if overall_score >= band_floor:
        assessment = band_message
        color = band_color
        break

display(Markdown(f"### {assessment}"))

# Create quality score visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Component scores bar chart, coloured by how healthy each component is
components = [name.replace('_', ' ').title() for name in scores]
score_values = [value * 100 for value in scores.values()]

bar_colors = []
for value in score_values:
    if value >= 90:
        bar_colors.append('green')
    elif value >= 70:
        bar_colors.append('orange')
    else:
        bar_colors.append('red')

bars = ax1.barh(components, score_values, color=bar_colors)
ax1.set_title('Quality Component Scores')
ax1.set_xlabel('Score (%)')
ax1.set_xlim(0, 100)
ax1.grid(True, alpha=0.3)

# Add score labels just past the end of each bar
for bar in bars:
    bar_length = bar.get_width()
    label_y = bar.get_y() + bar.get_height()/2
    ax1.text(bar_length + 1, label_y, f'{bar_length:.1f}%', ha='left', va='center')

# Overall score gauge: a pie with a white disc on top reads as a ring
gauge_slices = [overall_score, 100-overall_score]
ax2.pie(gauge_slices,
        colors=[color, 'lightgray'],
        startangle=90,
        counterclock=False)
ax2.add_patch(plt.Circle((0, 0), 0.6, color='white'))
ax2.text(0, 0, f'{overall_score:.1f}%', ha='center', va='center',
         fontsize=20, fontweight='bold')
ax2.set_title('Overall Quality Score')

plt.tight_layout()
plt.show()

## 7. Recommendations and Action Items

In [None]:
# Generate recommendations based on findings
def _priority_for(count):
    """Return 'High' when more than 5% of all files are affected, else 'Medium'."""
    return 'High' if count > len(df) * 0.05 else 'Medium'


recommendations = []

# Attendee extraction
low_att_count = (df['num_attendees'] < 3).sum()
if low_att_count > 0:
    recommendations.append({
        'Priority': _priority_for(low_att_count),
        'Issue': f'{low_att_count} files with low attendance (<3 people)',
        'Recommendation': 'Review attendee extraction patterns; check for OCR issues in PRESENT sections',
        'Action': 'Improve attendee parsing algorithm'
    })

# Decision extraction
no_dec_count = (df['num_decisions'] == 0).sum()
if no_dec_count > 0:
    recommendations.append({
        'Priority': _priority_for(no_dec_count),
        'Issue': f'{no_dec_count} files with no decisions found',
        'Recommendation': 'Expand decision action verb patterns; check for alternative decision formats',
        'Action': 'Enhance decision extraction patterns'
    })

# Date extraction
missing_dates = df['date'].isna().sum()
if missing_dates > 0:
    recommendations.append({
        'Priority': 'Medium',
        'Issue': f'{missing_dates} files with missing dates',
        'Recommendation': 'Add more date format patterns; check document headers and footers',
        'Action': 'Improve date extraction patterns'
    })

# Text extraction (only when the parser reported text lengths)
short_docs = (df['text_length'] < 1000).sum() if 'text_length' in df.columns else 0
if short_docs > 0:
    recommendations.append({
        'Priority': 'Low',
        'Issue': f'{short_docs} files with very short text (<1000 chars)',
        'Recommendation': 'Check PDF extraction quality; may indicate scan quality issues',
        'Action': 'Review PDF-to-text conversion process'
    })

# Add positive notes if quality is high
if overall_score >= 90:
    recommendations.append({
        'Priority': 'Info',
        'Issue': 'High overall quality achieved',
        'Recommendation': 'Dataset is ready for analysis. Consider expanding to full corpus.',
        'Action': 'Proceed with full processing pipeline'
    })

display(Markdown("## 📝 **Recommendations and Action Items**"))

if not recommendations:
    display(Markdown("### ✅ **No specific recommendations - Quality is excellent!**"))
else:
    rec_df = pd.DataFrame(recommendations)
    display(rec_df)

# Summary report
display(Markdown("## 📊 **Validation Summary Report**"))

problem_count = len(problem_files)
if problem_count > 0:
    problem_rate_text = f"{problem_count/len(df)*100:.2f}%"
else:
    problem_rate_text = "0%"

dataset_info = {
    'total_files': len(df),
    'date_range': f"{df['date'].min()} to {df['date'].max()}",
    'problem_files': problem_count,
    'problem_rate': problem_rate_text,
}

summary_report = {
    'validation_date': datetime.now().isoformat(),
    'dataset_info': dataset_info,
    'quality_scores': {name: f"{value*100:.1f}%" for name, value in scores.items()},
    'overall_score': f"{overall_score:.1f}%",
    'assessment': assessment,
    'recommendations': recommendations,
}

# Console digest of the report
summary_lines = [
    f"📋 VALIDATION SUMMARY:",
    f"  • Total Files: {dataset_info['total_files']}",
    f"  • Problem Files: {dataset_info['problem_files']} ({dataset_info['problem_rate']})",
    f"  • Overall Quality Score: {summary_report['overall_score']}",
    f"  • Assessment: {assessment}",
]
for summary_line in summary_lines:
    print(summary_line)

# Save validation report; default=str serialises any value json cannot encode natively
report_path = Path('../data/validation/validation_report.json')
report_path.parent.mkdir(parents=True, exist_ok=True)

with open(report_path, 'w') as report_file:
    json.dump(summary_report, report_file, indent=2, default=str)

print(f"\n💾 Complete validation report saved to: {report_path}")
print("\n🎉 Validation analysis complete!")