# Bundestag Protocol Data Science Workflow

This notebook demonstrates how to work with extracted Bundestag protocol data using pandas and the specialized helper modules provided with the Bundestag Protocol Extractor.

## Setup and Data Loading

First, we'll import the necessary libraries and set up the environment.

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use the ggplot look for every matplotlib figure drawn in this notebook
plt.style.use("ggplot")

# Raise the pandas display limits (columns, rows, total output width)
for option_name, option_value in {
    "display.max_columns": 30,
    "display.max_rows": 100,
    "display.width": 1000,
}.items():
    pd.set_option(option_name, option_value)

# Import the Bundestag Protocol Extractor utilities
from bundestag_protocol_extractor.utils.pandas_helper import BundestagDataFrames
from bundestag_protocol_extractor.utils.data_quality import DataQualityReporter

## Load the data

Now we'll load the extracted data from the CSV files. You can specify the data directory and the base filename.

In [None]:
# Directory that holds the exported CSV files
data_dir = "../output"

# Helper instance pointed at that directory
btdf = BundestagDataFrames(data_dir=data_dir)

# Load every available dataframe
# (a base filename can optionally be specified when there are multiple exports)
dataframes = btdf.load_csv_data()

# Summarise what was loaded: a header line, then one line per dataframe
print(f"Loaded {len(dataframes)} dataframes:")
for name, df in dataframes.items():
    row_count = len(df)
    print(f"- {name}: {row_count} rows")

## Explore the data structure

Let's examine the structure of the speech data.

In [None]:
# Pull the speeches table out of the loaded data
df_speeches = btdf.get_dataframe("speeches")

# Preview the top five rows
df_speeches.head(n=5)

Let's look at the extraction quality metrics that are available in the speeches dataframe.

In [None]:
# Denominator for every percentage below: the total number of speech rows
n_speeches = len(df_speeches)

# Extraction method distribution: absolute counts, then share of all rows
extraction_methods = df_speeches["extraction_method"].value_counts()
print(
    "Extraction Methods:",
    extraction_methods,
    "\nPercentages:",
    extraction_methods.div(n_speeches).mul(100),
    sep="\n",
)

# Extraction status distribution, reported the same way
extraction_status = df_speeches["extraction_status"].value_counts()
print(
    "\nExtraction Status:",
    extraction_status,
    "\nPercentages:",
    extraction_status.div(n_speeches).mul(100),
    sep="\n",
)

# Summary statistics of the confidence score column
print(
    "\nConfidence Score Statistics:",
    df_speeches["extraction_confidence"].describe(),
    sep="\n",
)

## Create an integrated dataframe

Now let's create an integrated dataframe that combines speeches with related entities like persons and protocols.

In [None]:
# Build the integrated speeches dataframe via the helper
df_integrated = btdf.create_integrated_speeches_df()

# Preview the top five rows
df_integrated.head(n=5)

## Filter data by extraction quality

Now let's filter the data by extraction quality: first keeping only high-quality speeches, then keeping speeches that meet a minimum confidence score.

In [None]:
def format_share(label, subset_size, total_size):
    """Build the '<label>: <n> out of <total> (<pct>%)' summary line.

    Reports 0.0% when ``total_size`` is zero instead of raising
    ZeroDivisionError, so an empty export does not stop the notebook.
    """
    share = subset_size / total_size * 100 if total_size else 0.0
    return f"{label}: {subset_size} out of {total_size} ({share:.1f}%)"


# Both summaries are expressed relative to the full integrated dataframe
total_speeches = len(df_integrated)

# Filter for high-quality speeches (XML extracted, complete)
high_quality = btdf.filter_high_quality(df_integrated)
print(format_share("High-quality speeches", len(high_quality), total_speeches))

# Filter by minimum confidence score
medium_quality = btdf.filter_by_confidence(df_integrated, min_confidence=0.5)
print(format_share("Medium-quality speeches", len(medium_quality), total_speeches))

## Generate quality visualizations

Let's use the DataQualityReporter to generate visualizations of extraction quality.

In [None]:
# Reporter instance; its output directory is the data directory used above
reporter = DataQualityReporter(output_dir=data_dir)

# Build the quality report from the integrated speeches and the protocols table
df_protocols = btdf.get_dataframe("protocols")
quality_report = reporter.generate_quality_report(
    df_speeches=df_integrated,
    protocol_metadata=df_protocols,
)

# Generate the visualizations with save_plots=False (nothing is saved to disk
# from within the notebook)
example_basename = "bundestag_example"
visualizations = reporter.generate_quality_visualizations(
    df_speeches=df_integrated,
    base_filename=example_basename,
    save_plots=False,
)

Let's display the dashboard visualization:

In [None]:
# Display the dashboard visualization.
# No plt.figure() call here: the lookup below draws nothing, so a freshly
# opened figure would only add an empty plot to the cell output.
# NOTE(review): what this entry holds (figure object vs. file path) depends on
# generate_quality_visualizations() — confirm it renders as intended.
visualizations["dashboard"]

## Analyze speech length by extraction method

Let's analyze how speech length varies by extraction method.

In [None]:
# Add text length if not already there
if "text_length" not in df_integrated.columns and "text" in df_integrated.columns:
    df_integrated["text_length"] = df_integrated["text"].str.len()

# Fix the category order once (order of first appearance, missing methods
# excluded) so the boxes and the mean annotations share the same x positions.
method_order = df_integrated["extraction_method"].dropna().unique().tolist()

# Create a box plot on an explicit axes
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(
    x="extraction_method",
    y="text_length",
    data=df_integrated,
    order=method_order,
    palette="Set3",
    ax=ax,
)
ax.set_title("Speech Length by Extraction Method", fontsize=16)
ax.set_xlabel("Extraction Method", fontsize=14)
ax.set_ylabel("Text Length (characters)", fontsize=14)

# Add mean values as text annotations
for position, method in enumerate(method_order):
    mean_val = df_integrated.loc[
        df_integrated["extraction_method"] == method, "text_length"
    ].mean()
    # A method whose lengths are all missing has no mean; int(nan) would raise
    if pd.isna(mean_val):
        continue
    ax.text(
        position,
        mean_val,
        f"Mean: {int(mean_val)}",
        horizontalalignment="center",
        size="medium",
        color="black",
        weight="semibold",
    )

ax.grid(True, linestyle="--", alpha=0.7)
plt.show()

## Analyze data by party

Let's analyze the data by political party.

In [None]:
# Compute the party statistics once and pull out the two mappings used below
party_stats = btdf.get_party_stats(df_integrated)
party_counts = party_stats.get("party_counts", {})
party_percentages = party_stats.get("party_percentages", {})

# Text summary: one line per party (percentage falls back to 0 when absent)
print("Speech count by party:")
for party, count in party_counts.items():
    print(f"{party}: {count} speeches ({party_percentages.get(party, 0):.1f}%)")

# Order the parties by speech count, largest first
parties = list(party_counts)
counts = list(party_counts.values())
descending = np.argsort(counts)[::-1]
sorted_parties = [parties[idx] for idx in descending]
sorted_counts = [counts[idx] for idx in descending]

# Bar chart of speech count by party
plt.figure(figsize=(12, 6))
ax = sns.barplot(x=sorted_parties, y=sorted_counts, palette="Set3")
ax.set_title("Speech Count by Party", fontsize=16)
ax.set_xlabel("Party", fontsize=14)
ax.set_ylabel("Number of Speeches", fontsize=14)

# Write each count just above its bar
for position, value in enumerate(sorted_counts):
    ax.text(position, value + 5, str(value), ha="center", va="bottom", fontsize=10)

plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

## Analyze extraction method by party

Let's see if there are differences in extraction methods across parties.

In [None]:
# Collect extraction method percentages as {method: {party: percentage}}
# (party_stats comes from the party statistics cell above)
method_percentages = {}
for party, methods in party_stats.get("party_methods", {}).items():
    for method, percentage in methods.get("percentages", {}).items():
        method_percentages.setdefault(method, {})[party] = percentage

# Convert to dataframe: one row per party, one column per extraction method
df_methods = pd.DataFrame(method_percentages)

# Display the dataframe
print("Extraction method percentages by party:")
print(df_methods)

# Create a stacked bar chart.
# The axes is created first and handed to DataFrame.plot via ax=: without it
# pandas opens its own figure, and a separately opened plt.figure() would be
# left behind as an empty plot in the cell output.
fig, ax = plt.subplots(figsize=(14, 8))
df_methods.plot(kind="bar", stacked=True, ax=ax, colormap="Set3")
ax.set_title("Extraction Method Distribution by Party", fontsize=16)
ax.set_xlabel("Party", fontsize=14)
ax.set_ylabel("Percentage", fontsize=14)
plt.xticks(rotation=45, ha="right")
ax.legend(title="Extraction Method")
ax.grid(True, linestyle="--", alpha=0.3)
fig.tight_layout()
plt.show()

## Analyze speech length distribution

Let's examine the distribution of speech lengths.

In [None]:
# Get speech length bins
length_bins = btdf.get_speech_length_bins(df_integrated, bin_size=500, max_length=10000)

# Display the binned data
print("Speech length distribution:")
print(length_bins)

# Get columns that contain counts (not percentages, and not the total)
count_columns = [
    col for col in length_bins.columns if not col.endswith("_pct") and col != "total"
]

# Plot the stacked bar chart.
# The axes is created first and handed to DataFrame.plot via ax=: without it
# pandas opens its own figure, and a separately opened plt.figure() would be
# left behind as an empty plot in the cell output.
fig, ax = plt.subplots(figsize=(14, 8))
length_bins[count_columns].plot(kind="bar", stacked=True, ax=ax, colormap="Set3")
ax.set_title("Speech Length Distribution by Extraction Method", fontsize=16)
ax.set_xlabel("Speech Length (characters)", fontsize=14)
ax.set_ylabel("Number of Speeches", fontsize=14)
plt.xticks(rotation=45, ha="right")
ax.legend(title="Extraction Method")
ax.grid(True, linestyle="--", alpha=0.3)
fig.tight_layout()
plt.show()

## Create a multi-index dataframe for hierarchical analysis

Let's create a multi-index dataframe that allows for hierarchical analysis.

In [None]:
# Build the multi-index dataframe via the helper
df_multi = btdf.create_multi_index_df()

# Preview the top five rows
df_multi.head(n=5)

With the multi-index dataframe, we can easily analyze data at different hierarchical levels.

In [None]:
# Aggregations to compute per column for each group
protocol_aggregations = {
    "text_length": ["count", "mean", "median", "min", "max"],
    "extraction_confidence": ["mean", "median"],
    # The mean of each flag column is the proportion of speeches with it set
    "is_xml_extracted": "mean",  # XML extracted speeches
    "is_complete": "mean",  # complete extractions
    "is_high_confidence": "mean",  # high confidence extractions
}

# Group on the outermost index level (statistics by protocol) and aggregate
protocol_stats = df_multi.groupby(level=0).agg(protocol_aggregations)

# Preview the top five rows of the protocol statistics
protocol_stats.head(n=5)

## Save the report and visualizations

Finally, let's save the quality report and visualizations to disk and create an HTML report from them.

In [None]:
# Names used for the saved outputs
report_basename = "bundestag_quality_report"
plots_basename = "bundestag_quality"
html_filename = "bundestag_quality_report.html"

# Save the quality report
report_path = reporter.save_quality_report(quality_report, report_basename)
print(f"Saved quality report to {report_path}")

# Generate the visualizations again, this time with save_plots=True
visualizations = reporter.generate_quality_visualizations(
    df_speeches=df_integrated,
    base_filename=plots_basename,
    save_plots=True,
)
print(f"Saved {len(visualizations)} visualizations")

# Create an HTML report from the report and the regenerated visualizations
html_path = reporter.create_html_report(
    report=quality_report,
    visualizations=visualizations,
    filename=html_filename,
)
print(f"Created HTML report at {html_path}")