# Fed Minutes Data Exploration

This notebook explores the parsed Federal Reserve meeting minutes data to understand patterns, trends, and overall data characteristics.

## Contents
1. Setup and Data Loading
2. Basic Data Overview
3. Temporal Analysis
4. Attendee Analysis
5. Decision Analysis
6. Topic Analysis
7. Financial Data Analysis
8. Summary Statistics

## 1. Setup and Data Loading

In [None]:
import json
import warnings
from collections import Counter
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Notebook-wide configuration: warning filter, pandas display limits, plot defaults.

# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# pandas display: show all columns, cap printed rows at 50.
pd.options.display.max_columns = None
pd.options.display.max_rows = 50

# Plot appearance: reset to Matplotlib's default style first, then apply the
# seaborn palette and the default figure size on top of it.
plt.style.use('default')
sns.set_palette(palette="husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

print("✅ Libraries loaded successfully")

In [None]:
# Load the parser's meeting summary, decode its JSON-encoded list columns,
# and derive calendar features from the meeting date.

# Candidate locations, checked in order; the first one that exists is used.
data_path = Path('../data/processed/meetings_summary.csv')
alt_path = Path('../fed_minutes_output/meetings_summary.csv')

for candidate in (data_path, alt_path):
    if candidate.exists():
        df = pd.read_csv(candidate)
        print(f"📊 Loaded data from {candidate}")
        break
else:
    # Neither location holds the CSV.
    raise FileNotFoundError("Could not find meetings_summary.csv. Please run the parser first.")


def _decode_json_list(cell):
    """Decode a cell that is a string starting with '['; return anything else unchanged."""
    if isinstance(cell, str) and cell.startswith('['):
        return json.loads(cell)
    return cell


# Parse JSON columns (only those actually present in the CSV)
json_columns = ['attendees', 'decisions', 'topics', 'main_topics', 'board_members']
for col in json_columns:
    if col not in df.columns:
        continue
    df[col] = df[col].apply(_decode_json_list)

# Convert dates and create time features
df['date'] = pd.to_datetime(df['date'])
date_parts = df['date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month
df['quarter'] = date_parts.quarter
df['month_year'] = date_parts.to_period('M')

first_date = df['date'].min()
last_date = df['date'].max()

print(f"\n🔍 Dataset Overview:")
print(f"  - Shape: {df.shape}")
print(f"  - Date range: {first_date.strftime('%Y-%m-%d')} to {last_date.strftime('%Y-%m-%d')}")
print(f"  - Total meetings: {len(df):,}")
print(f"  - Columns: {', '.join(df.columns)}")

## 2. Basic Data Overview

In [None]:
# Display basic info
print("📋 Data Info:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()

print("\n📈 Basic Statistics:")
numeric_cols = ['num_attendees', 'num_decisions', 'num_topics', 'text_length', 'total_amount_approved']
# Describe only the columns that exist: the financial section below treats
# 'total_amount_approved' as optional, so a missing column must not raise here.
available_numeric_cols = [col for col in numeric_cols if col in df.columns]
missing_numeric_cols = [col for col in numeric_cols if col not in df.columns]
if missing_numeric_cols:
    print(f"⚠️ Columns not found and skipped: {', '.join(missing_numeric_cols)}")

if available_numeric_cols:
    display(df[available_numeric_cols].describe())
else:
    print("⚠️ None of the expected numeric columns are present")

In [None]:
# Preview the first rows, restricted to the identifying and count columns.
sample_columns = ['filename', 'date', 'meeting_type', 'num_attendees', 'num_decisions', 'num_topics']

print("📄 Sample Records:")
display(df.head()[sample_columns])

## 3. Temporal Analysis

In [None]:
# Meeting frequency over time: per-year counts, per-month counts,
# meeting-type shares, and a cumulative timeline.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Meetings by year
yearly_counts = df.groupby('year').size()
axes[0,0].bar(yearly_counts.index, yearly_counts.values, color='steelblue', alpha=0.7)
axes[0,0].set_title('Meetings per Year')
axes[0,0].set_xlabel('Year')
axes[0,0].set_ylabel('Number of Meetings')
axes[0,0].grid(True, alpha=0.3)

# Meetings by month (months with no meetings are drawn as zero-height bars)
monthly_counts = df.groupby('month').size()
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
axes[0,1].bar(range(1, 13), [monthly_counts.get(i, 0) for i in range(1, 13)],
              color='forestgreen', alpha=0.7)
axes[0,1].set_title('Meetings by Month (All Years)')
axes[0,1].set_xlabel('Month')
axes[0,1].set_ylabel('Total Meetings')
axes[0,1].set_xticks(range(1, 13))
axes[0,1].set_xticklabels(month_names, rotation=45)
axes[0,1].grid(True, alpha=0.3)

# Meeting types distribution
meeting_types = df['meeting_type'].value_counts()
axes[1,0].pie(meeting_types.values, labels=meeting_types.index, autopct='%1.1f%%', startangle=90)
axes[1,0].set_title('Meeting Types Distribution')

# Timeline of meetings
df_sorted = df.sort_values('date')
# Rows without a date cannot be placed on the time axis.
timeline_dates = df_sorted['date'].dropna()
# The y-axis is a running count (1, 2, 3, ...) over the date-ordered meetings.
# The sorted frame's index holds the original row labels, which are not
# cumulative once the rows have been reordered.
meeting_number = np.arange(1, len(timeline_dates) + 1)
axes[1,1].plot(timeline_dates, meeting_number, marker='.', alpha=0.6, markersize=2)
axes[1,1].set_title('Meeting Timeline')
axes[1,1].set_xlabel('Date')
axes[1,1].set_ylabel('Meeting Number (cumulative)')
axes[1,1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 'year' and 'month' become float columns when any date is missing, so values
# are cast to int before being printed or used as a list index.
busiest_year = int(yearly_counts.idxmax())
busiest_month = int(monthly_counts.idxmax())

print(f"\n📊 Temporal Summary:")
print(f"  - Years covered: {int(df['year'].min())} - {int(df['year'].max())}")
print(f"  - Most active year: {busiest_year} ({yearly_counts.max()} meetings)")
print(f"  - Most common month: {month_names[busiest_month - 1]} ({monthly_counts.max()} meetings)")
print(f"  - Average meetings per year: {yearly_counts.mean():.1f}")

## 4. Attendee Analysis

In [None]:
# Attendee statistics: distribution, yearly mean, per-year box plot,
# and the ten names that appear in the most meetings.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Distribution of attendees per meeting
mean_attendees = df['num_attendees'].mean()
axes[0,0].hist(df['num_attendees'], bins=20, color='coral', alpha=0.7, edgecolor='black')
axes[0,0].set_title('Distribution of Attendees per Meeting')
axes[0,0].set_xlabel('Number of Attendees')
axes[0,0].set_ylabel('Frequency')
axes[0,0].axvline(mean_attendees, color='red', linestyle='--',
                  label=f'Mean: {mean_attendees:.1f}')
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)

# Attendees over time
yearly_attendees = df.groupby('year')['num_attendees'].mean()
axes[0,1].plot(yearly_attendees.index, yearly_attendees.values, marker='o', linewidth=2, color='purple')
axes[0,1].set_title('Average Attendees per Meeting by Year')
axes[0,1].set_xlabel('Year')
axes[0,1].set_ylabel('Average Attendees')
axes[0,1].grid(True, alpha=0.3)

# Box plot of attendees by year
years = sorted(df['year'].dropna().unique())
yearly_data = [df[df['year'] == year]['num_attendees'].values for year in years]
axes[1,0].boxplot(yearly_data)
# Tick labels are set on the axis instead of through boxplot(labels=...),
# a keyword Matplotlib 3.9 deprecated in favour of tick_labels; this form
# works on both sides of that rename. Boxes sit at positions 1..N.
axes[1,0].set_xticks(range(1, len(years) + 1))
axes[1,0].set_xticklabels([int(year) for year in years])
axes[1,0].set_title('Attendees Distribution by Year')
axes[1,0].set_xlabel('Year')
axes[1,0].set_ylabel('Number of Attendees')
axes[1,0].tick_params(axis='x', rotation=45)
axes[1,0].grid(True, alpha=0.3)

# Most frequent attendees (if available)
if 'attendees' in df.columns:
    all_attendees = []
    for attendees_list in df['attendees'].dropna():
        if not isinstance(attendees_list, list):
            continue
        # Count each name at most once per meeting so the totals match the
        # "Number of Meetings" axis even if a name is listed twice in one
        # record. dict.fromkeys de-duplicates while keeping first-seen order.
        names_in_meeting = dict.fromkeys(
            attendee['name']
            for attendee in attendees_list
            if isinstance(attendee, dict) and attendee.get('name')
        )
        all_attendees.extend(names_in_meeting)

    if all_attendees:
        top_attendees = Counter(all_attendees).most_common(10)
        names, counts = zip(*top_attendees)

        axes[1,1].barh(range(len(names)), counts, color='lightblue')
        axes[1,1].set_yticks(range(len(names)))
        axes[1,1].set_yticklabels([name[:20] + '...' if len(name) > 20 else name for name in names])
        # barh draws position 0 at the bottom; flip so the top-ranked name is first.
        axes[1,1].invert_yaxis()
        axes[1,1].set_title('Most Frequent Attendees')
        axes[1,1].set_xlabel('Number of Meetings')
        axes[1,1].grid(True, alpha=0.3)
    else:
        axes[1,1].text(0.5, 0.5, 'Attendee details not available',
                       ha='center', va='center', transform=axes[1,1].transAxes)
        axes[1,1].set_title('Attendee Details')
else:
    axes[1,1].text(0.5, 0.5, 'Attendee data not available',
                   ha='center', va='center', transform=axes[1,1].transAxes)
    axes[1,1].set_title('Attendee Details')

plt.tight_layout()
plt.show()

print(f"\n👥 Attendee Summary:")
print(f"  - Average attendees: {mean_attendees:.1f}")
print(f"  - Range: {df['num_attendees'].min()} - {df['num_attendees'].max()}")
print(f"  - Meetings with <5 attendees: {(df['num_attendees'] < 5).sum()}")
print(f"  - Meetings with >50 attendees: {(df['num_attendees'] > 50).sum()}")

## 5. Decision Analysis

In [None]:
# Decision statistics: four panels on one figure, then a text summary.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_dist, ax_trend), (ax_scatter, ax_extra) = axes

# Panel 1: histogram of decisions per meeting with the mean marked
decisions = df['num_decisions']
mean_decisions = decisions.mean()
ax_dist.hist(decisions, bins=20, color='gold', alpha=0.7, edgecolor='black')
ax_dist.axvline(mean_decisions, color='red', linestyle='--',
                label=f'Mean: {mean_decisions:.1f}')
ax_dist.set(title='Distribution of Decisions per Meeting',
            xlabel='Number of Decisions',
            ylabel='Frequency')
ax_dist.legend()
ax_dist.grid(True, alpha=0.3)

# Panel 2: yearly mean of decisions per meeting
yearly_decisions = df.groupby('year')['num_decisions'].mean()
ax_trend.plot(yearly_decisions.index, yearly_decisions.values,
              marker='s', linewidth=2, color='darkgreen')
ax_trend.set(title='Average Decisions per Meeting by Year',
             xlabel='Year',
             ylabel='Average Decisions')
ax_trend.grid(True, alpha=0.3)

# Panel 3: attendees against decisions, annotated with their correlation
ax_scatter.scatter(df['num_attendees'], decisions, alpha=0.6, color='navy')
ax_scatter.set(title='Attendees vs Decisions',
               xlabel='Number of Attendees',
               ylabel='Number of Decisions')
ax_scatter.grid(True, alpha=0.3)

corr = df['num_attendees'].corr(decisions)
ax_scatter.text(0.05, 0.95, f'Correlation: {corr:.3f}',
                transform=ax_scatter.transAxes,
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Panel 4: unanimous rate per year when that column exists, otherwise yearly totals
if 'unanimous_decisions' not in df.columns:
    yearly_total_decisions = df.groupby('year')['num_decisions'].sum()
    ax_extra.bar(yearly_total_decisions.index, yearly_total_decisions.values,
                 color='orange', alpha=0.7)
    ax_extra.set(title='Total Decisions by Year',
                 xlabel='Year',
                 ylabel='Total Decisions')
    ax_extra.grid(True, alpha=0.3)
else:
    # Zero-decision meetings become NaN in the denominator, so their rate is
    # NaN rather than a division by zero.
    nonzero_decisions = df['num_decisions'].replace(0, np.nan)
    df['unanimous_rate'] = df['unanimous_decisions'] / nonzero_decisions
    yearly_unanimous = df.groupby('year')['unanimous_rate'].mean()
    ax_extra.plot(yearly_unanimous.index, yearly_unanimous.values * 100,
                  marker='o', linewidth=2, color='red')
    ax_extra.set(title='Average Unanimous Decision Rate by Year',
                 xlabel='Year',
                 ylabel='Unanimous Rate (%)')
    ax_extra.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

decision_summary_lines = [
    f"\n📋 Decision Summary:",
    f"  - Total decisions: {df['num_decisions'].sum():,}",
    f"  - Average per meeting: {df['num_decisions'].mean():.1f}",
    f"  - Range: {df['num_decisions'].min()} - {df['num_decisions'].max()}",
    f"  - Meetings with no decisions: {(df['num_decisions'] == 0).sum()}",
    f"  - Most decisive year: {yearly_decisions.idxmax()} ({yearly_decisions.max():.1f} avg decisions)",
]
print("\n".join(decision_summary_lines))

## 6. Topic Analysis

In [None]:
# Topic analysis: most common topics, topics-per-meeting distribution,
# yearly mean, and the share of meetings per year with at least one topic.
if 'main_topics' in df.columns:
    # Extract all topics (non-list cells are skipped)
    all_topics = []
    for topics_list in df['main_topics'].dropna():
        if isinstance(topics_list, list):
            all_topics.extend(topics_list)

    if all_topics:
        topic_counts = Counter(all_topics)

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Most common topics
        top_topics = topic_counts.most_common(10)
        topics, counts = zip(*top_topics)

        axes[0,0].barh(range(len(topics)), counts, color='mediumpurple')
        axes[0,0].set_yticks(range(len(topics)))
        axes[0,0].set_yticklabels(topics)
        # barh draws position 0 at the bottom; flip so the top-ranked topic is first.
        axes[0,0].invert_yaxis()
        axes[0,0].set_title('Most Common Topics')
        axes[0,0].set_xlabel('Frequency')
        axes[0,0].grid(True, alpha=0.3)

        # Topics per meeting distribution
        mean_topics = df['num_topics'].mean()
        axes[0,1].hist(df['num_topics'], bins=15, color='teal', alpha=0.7, edgecolor='black')
        axes[0,1].set_title('Distribution of Topics per Meeting')
        axes[0,1].set_xlabel('Number of Topics')
        axes[0,1].set_ylabel('Frequency')
        axes[0,1].axvline(mean_topics, color='red', linestyle='--',
                          label=f'Mean: {mean_topics:.1f}')
        axes[0,1].legend()
        axes[0,1].grid(True, alpha=0.3)

        # Topics over time
        yearly_topics = df.groupby('year')['num_topics'].mean()
        axes[1,0].plot(yearly_topics.index, yearly_topics.values, marker='^', linewidth=2, color='brown')
        axes[1,0].set_title('Average Topics per Meeting by Year')
        axes[1,0].set_xlabel('Year')
        axes[1,0].set_ylabel('Average Topics')
        axes[1,0].grid(True, alpha=0.3)

        # Topic coverage rate
        topic_coverage = df['num_topics'] > 0
        # Grouping the boolean mask by year and taking its mean gives the
        # per-year share directly; this replaces a row-wise groupby.apply
        # that recomputed the same mask for every group.
        yearly_coverage = topic_coverage.groupby(df['year']).mean() * 100
        axes[1,1].bar(yearly_coverage.index, yearly_coverage.values, color='lightcoral', alpha=0.7)
        axes[1,1].set_title('Topic Coverage Rate by Year')
        axes[1,1].set_xlabel('Year')
        axes[1,1].set_ylabel('Coverage Rate (%)')
        axes[1,1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        print(f"\n📚 Topic Summary:")
        print(f"  - Unique topics: {len(topic_counts)}")
        print(f"  - Average topics per meeting: {mean_topics:.1f}")
        print(f"  - Topic coverage rate: {topic_coverage.mean()*100:.1f}%")
        print(f"  - Most common topic: {top_topics[0][0]} ({top_topics[0][1]} times)")
    else:
        print("⚠️ No topic data available")
else:
    print("⚠️ Topic column not found in data")

## 7. Financial Data Analysis

In [None]:
# Financial analysis (if available): amount distribution, yearly totals,
# yearly count of meetings with an approved amount, and the ten largest approvals.
if 'total_amount_approved' in df.columns:
    # Filter out zero amounts
    has_amount = df['total_amount_approved'] > 0
    financial_data = df[has_amount]

    if len(financial_data) > 0:
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Distribution of amounts
        axes[0,0].hist(financial_data['total_amount_approved'], bins=20, color='green', alpha=0.7)
        axes[0,0].set_title('Distribution of Approved Amounts')
        axes[0,0].set_xlabel('Amount ($)')
        axes[0,0].set_ylabel('Frequency')
        axes[0,0].ticklabel_format(style='scientific', axis='x', scilimits=(0,0))
        axes[0,0].grid(True, alpha=0.3)

        # Amounts over time
        yearly_amounts = financial_data.groupby('year')['total_amount_approved'].sum()
        axes[0,1].bar(yearly_amounts.index, yearly_amounts.values, color='darkgreen', alpha=0.7)
        axes[0,1].set_title('Total Approved Amounts by Year')
        axes[0,1].set_xlabel('Year')
        axes[0,1].set_ylabel('Total Amount ($)')
        axes[0,1].ticklabel_format(style='scientific', axis='y', scilimits=(0,0))
        axes[0,1].grid(True, alpha=0.3)

        # Number of meetings with financial decisions.
        # Summing the boolean mask per year counts the matching meetings and
        # keeps years whose count is zero; no row-wise groupby.apply is needed.
        yearly_financial_meetings = has_amount.groupby(df['year']).sum()
        axes[1,0].plot(yearly_financial_meetings.index, yearly_financial_meetings.values,
                       marker='o', linewidth=2, color='red')
        axes[1,0].set_title('Meetings with Financial Decisions by Year')
        axes[1,0].set_xlabel('Year')
        axes[1,0].set_ylabel('Number of Meetings')
        axes[1,0].grid(True, alpha=0.3)

        # Top financial decisions
        top_financial = financial_data.nlargest(10, 'total_amount_approved')
        top_labels = []
        for _, row in top_financial.iterrows():
            filename = str(row['filename'])
            # Add the ellipsis only when the filename was actually shortened.
            short_name = filename[:15] + '...' if len(filename) > 15 else filename
            # A missing date (NaT) does not support strftime, so label it explicitly.
            date_text = row['date'].strftime('%Y-%m-%d') if pd.notna(row['date']) else 'unknown date'
            top_labels.append(f"{short_name}\n{date_text}")

        axes[1,1].barh(range(len(top_financial)), top_financial['total_amount_approved'].values,
                       color='gold')
        axes[1,1].set_yticks(range(len(top_financial)))
        axes[1,1].set_yticklabels(top_labels)
        # barh draws position 0 at the bottom; flip so the largest approval is first.
        axes[1,1].invert_yaxis()
        axes[1,1].set_title('Top Financial Decisions')
        axes[1,1].set_xlabel('Amount ($)')
        axes[1,1].ticklabel_format(style='scientific', axis='x', scilimits=(0,0))
        axes[1,1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        print(f"\n💰 Financial Summary:")
        print(f"  - Meetings with financial decisions: {len(financial_data)}")
        print(f"  - Total amount approved: ${financial_data['total_amount_approved'].sum():,.2f}")
        print(f"  - Average amount per financial meeting: ${financial_data['total_amount_approved'].mean():,.2f}")
        print(f"  - Largest single approval: ${financial_data['total_amount_approved'].max():,.2f}")
    else:
        print("⚠️ No financial decision data available")
else:
    print("⚠️ Financial data column not found")

## 8. Summary Statistics

In [None]:
# Create comprehensive summary: build a nested dict of headline figures,
# print it, and write it to a JSON report.


def _to_builtin(value):
    """Convert a NumPy scalar to the matching built-in Python type; pass others through.

    json.dump cannot encode NumPy scalars natively. With only the `default=str`
    fallback they are written as strings ("123") instead of numbers (123).
    """
    return value.item() if isinstance(value, np.generic) else value


first_date = df['date'].min()
last_date = df['date'].max()
# Inclusive span of calendar years between the first and last meeting.
years_covered = _to_builtin(df['year'].max() - df['year'].min() + 1)

summary_stats = {
    'Dataset Overview': {
        'Total Meetings': len(df),
        'Date Range': f"{first_date.strftime('%Y-%m-%d')} to {last_date.strftime('%Y-%m-%d')}",
        'Years Covered': years_covered,
        'Average Meetings per Year': f"{len(df) / years_covered:.1f}"
    },
    'Content Statistics': {
        'Total Attendees (all meetings)': _to_builtin(df['num_attendees'].sum()),
        'Average Attendees per Meeting': f"{df['num_attendees'].mean():.1f}",
        'Total Decisions': _to_builtin(df['num_decisions'].sum()),
        'Average Decisions per Meeting': f"{df['num_decisions'].mean():.1f}",
        'Total Topics Identified': _to_builtin(df['num_topics'].sum()),
        'Average Topics per Meeting': f"{df['num_topics'].mean():.1f}"
    },
    'Data Quality': {
        'Date Extraction Success': f"{df['date'].notna().mean()*100:.1f}%",
        'Meetings with Attendees': f"{(df['num_attendees'] > 0).mean()*100:.1f}%",
        'Meetings with Decisions': f"{(df['num_decisions'] > 0).mean()*100:.1f}%",
        'Meetings with Topics': f"{(df['num_topics'] > 0).mean()*100:.1f}%",
        'Average Document Length': f"{df['text_length'].mean():,.0f} characters"
    }
}

print("📊 COMPREHENSIVE DATA SUMMARY")
print("="*50)

for category, stats in summary_stats.items():
    print(f"\n{category}:")
    for key, value in stats.items():
        print(f"  • {key}: {value}")

# Save summary to file (json and datetime are imported in the first cell)
summary_report = {
    'generated_at': datetime.now().isoformat(),
    'summary': summary_stats
}

output_path = Path('../data/processed/exploration_summary.json')
output_path.parent.mkdir(parents=True, exist_ok=True)

# default=str stays as a fallback for any value json cannot encode natively.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(summary_report, f, indent=2, default=str)

print(f"\n✅ Summary saved to: {output_path}")
print("\n🎉 Data exploration complete!")