In [None]:
````xml
<VSCode.Cell language="markdown">
# Exploratory Analysis: TAEG vs PRIMERA Methods

This notebook provides qualitative analysis and visualization comparing the three approaches for narrative consolidation:

1. **TAEG** - Extractive with temporal graph
2. **PRIMERA-MDS** - Abstractive standard summarization
3. **PRIMERA-Consolidation** - Abstractive with event-based segmentation

## Setup
</VSCode.Cell>

<VSCode.Cell language="python">
# Import required libraries
import sys
from pathlib import Path
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown, HTML

# Add src to path.
# NOTE(review): assumes the notebook runs from a directory that is a sibling
# of src/ — confirm against the repository layout.
# The membership check keeps this cell idempotent: re-running it no longer
# stacks duplicate entries at the front of sys.path.
SRC_DIR = str(Path.cwd().parent / "src")
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

# Configure plotting defaults used by every figure below
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("✅ Setup complete")
</VSCode.Cell>

<VSCode.Cell language="markdown">
## Load Outputs and Evaluation Results
</VSCode.Cell>

<VSCode.Cell language="python">
# Artifact locations, resolved relative to the notebook's working directory
OUTPUT_DIR = Path("..") / "outputs"
EVAL_DIR = OUTPUT_DIR.joinpath("evaluation")

# Load outputs
def load_output(filename):
    """Read a generated summary from OUTPUT_DIR.

    Returns the file's full text (decoded as UTF-8), or None when no file
    with that name exists under OUTPUT_DIR.
    """
    target = OUTPUT_DIR / filename
    if not target.exists():
        return None
    with target.open('r', encoding='utf-8') as handle:
        return handle.read()

# Load evaluation results
def load_evaluation(filename):
    """Parse a JSON results file from EVAL_DIR.

    Returns the decoded JSON content, or None when no file with that name
    exists under EVAL_DIR.
    """
    candidate = EVAL_DIR / filename
    if not candidate.exists():
        return None
    with candidate.open('r', encoding='utf-8') as handle:
        return json.load(handle)

# Load all methods: each entry maps a display label to its summary text
# ("output") and its parsed evaluation results ("eval"); either may be None
# when the corresponding file is absent.
methods = {
    label: {
        "output": load_output(summary_file),
        "eval": load_evaluation(results_file),
    }
    for label, summary_file, results_file in (
        ("TAEG", "taeg_summary_lexrank-ta.txt", "taeg_results.json"),
        ("PRIMERA-MDS", "primera_mds_output.txt", "primera_mds_results.json"),
        ("PRIMERA-Consolidation", "primera_consolidation.txt", "primera_consolidation_results.json"),
    )
}

# Check what's available: one line per method showing whether its summary
# text and its evaluation results were loaded. The checks use truthiness, so
# an empty file/empty JSON object is reported the same as a missing one.
for method, data in methods.items():
    output_status = "✅" if data["output"] else "❌"
    eval_status = "✅" if data["eval"] else "❌"
    print(f"{method:25} - Output: {output_status}  Evaluation: {eval_status}")
</VSCode.Cell>

<VSCode.Cell language="markdown">
## Metric Comparison Visualization
</VSCode.Cell>

<VSCode.Cell language="python">
# Extract metrics into DataFrame
def _nested_score(results, *keys, default=0):
    """Walk `keys` through nested dicts, returning `default` if the path breaks.

    Unlike chained ``.get(key, {})`` calls, this tolerates a section that is
    present but null (or otherwise not a dict) and a leaf that is null.
    """
    node = results
    for key in keys:
        if not isinstance(node, dict):
            return default
        node = node.get(key)
    return default if node is None else node


def extract_metrics(methods_dict):
    """Build a per-method metrics table from loaded evaluation results.

    Args:
        methods_dict: mapping of method name -> dict with an "eval" entry
            holding the parsed evaluation results (or None when unavailable).

    Returns:
        pandas.DataFrame with one row per method whose "eval" is not None and
        the columns "Method", "Kendall's Tau", "ROUGE-1 F1", "ROUGE-2 F1",
        "ROUGE-L F1", "BERTScore F1" and "METEOR". A metric that is missing
        or null in the results is reported as 0. The frame is empty when no
        method has evaluation results.
    """
    # Column label -> key path inside the evaluation results
    metric_paths = {
        "Kendall's Tau": ("kendall_tau",),
        "ROUGE-1 F1": ("rouge", "rouge1", "f1"),
        "ROUGE-2 F1": ("rouge", "rouge2", "f1"),
        "ROUGE-L F1": ("rouge", "rougeL", "f1"),
        "BERTScore F1": ("bertscore", "f1"),
        "METEOR": ("meteor",),
    }

    data = []

    for method, content in methods_dict.items():
        eval_data = content["eval"]
        if eval_data is None:
            continue

        row = {"Method": method}
        for column, path in metric_paths.items():
            row[column] = _nested_score(eval_data, *path)

        data.append(row)

    return pd.DataFrame(data)

# Create DataFrame
df_metrics = extract_metrics(methods)

if not df_metrics.empty:
    display(df_metrics.set_index("Method"))

    # Plot comparison: one bar-chart panel per metric on a 2x3 grid
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    metrics_to_plot = ["Kendall's Tau", "ROUGE-1 F1", "ROUGE-2 F1",
                       "ROUGE-L F1", "BERTScore F1", "METEOR"]

    for idx, metric in enumerate(metrics_to_plot):
        ax = axes[idx]
        df_metrics.plot(x="Method", y=metric, kind="bar", ax=ax, legend=False)
        ax.set_title(metric, fontsize=12, fontweight='bold')
        ax.set_ylabel("Score")
        ax.set_xlabel("")

        # Kendall's Tau lies in [-1, 1]; a fixed 0..1 axis would hide the bar
        # of a method with inverse ordering. Widen the axis only when a
        # negative score is actually present so other panels are unchanged.
        has_negative = bool((df_metrics[metric] < 0).any())
        ax.set_ylim(-1.0 if has_negative else 0.0, 1.0)
        if has_negative:
            ax.axhline(y=0, color='black', linewidth=0.8)

        ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.3, label='0.5 threshold')
        ax.grid(True, alpha=0.3)

        # Rotate x labels
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / "metric_comparison.png", dpi=300, bbox_inches='tight')
    plt.show()

    print("📊 Metrics visualization saved to outputs/metric_comparison.png")
else:
    print("⚠️ No evaluation data available to plot")
</VSCode.Cell>

<VSCode.Cell language="markdown">
## Output Length Comparison
</VSCode.Cell>

<VSCode.Cell language="python">
# Compare output lengths
length_data = []

for method, content in methods.items():
    if content["output"]:
        length_data.append({
            "Method": method,
            "Characters": len(content["output"]),
            "Words": len(content["output"].split()),
            # Rough estimate: every '.', '!' or '?' counts as one sentence end
            "Sentences": content["output"].count('.') + content["output"].count('!') + content["output"].count('?')
        })

df_lengths = pd.DataFrame(length_data)

if not df_lengths.empty:
    display(df_lengths.set_index("Method"))

    # Plot
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    df_lengths.plot(x="Method", y=["Characters", "Words", "Sentences"],
                    kind="bar", ax=ax)
    # Characters, words and sentences differ by orders of magnitude; on a
    # linear axis the sentence bars are practically invisible.
    ax.set_yscale("log")
    ax.set_title("Output Length Comparison", fontsize=14, fontweight='bold')
    ax.set_ylabel("Count (log scale)")
    ax.set_xlabel("")
    ax.legend(title="Unit")
    ax.grid(True, alpha=0.3)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / "length_comparison.png", dpi=300, bbox_inches='tight')
    plt.show()

    print("📊 Length visualization saved to outputs/length_comparison.png")
else:
    # Mirror the metrics cell: say so explicitly instead of printing nothing
    print("⚠️ No output texts available to compare")

<VSCode.Cell language="markdown">
## Sample Text Comparison

Let's examine the first few sentences from each method to assess fluency and style.
</VSCode.Cell>

<VSCode.Cell language="python">
def display_sample(method_name, text, num_sentences=5):
    """Render the opening sentences of a method's output as Markdown.

    Args:
        method_name: label shown as the sample's heading.
        text: full output text; None or an empty string means "not available".
        num_sentences: maximum number of sentences to include.

    Sentences are split naively on '.', '!' and '?'. If the text ends before
    `num_sentences` terminators are found, the trailing unterminated fragment
    is kept so that text without final punctuation is still shown. Whitespace
    inside each sentence (including line breaks) is collapsed to single
    spaces, because a line break inside the sample would break the italic
    Markdown emphasis.
    """
    if not text:
        print(f"⚠️ No output available for {method_name}")
        return

    # Extract first N sentences (naive approach)
    sentences = []
    current_sentence = []

    for char in text:
        current_sentence.append(char)
        if char in '.!?':
            sentences.append(' '.join(''.join(current_sentence).split()))
            current_sentence = []
            if len(sentences) >= num_sentences:
                break

    # After a `break` the buffer is empty, so this only fires when the text
    # ran out first and left an unterminated fragment behind.
    remainder = ' '.join(''.join(current_sentence).split())
    if remainder and len(sentences) < num_sentences:
        sentences.append(remainder)

    sample = ' '.join(sentences)

    display(Markdown(f"### {method_name}"))
    display(Markdown(f"_{sample}_"))
    display(Markdown(f"**Length:** {len(sample)} characters"))
    print()

# Display samples: banner first, then one rendered excerpt per method
print("=" * 80, "SAMPLE OUTPUTS (First 5 sentences)", "=" * 80, "", sep="\n")

for method in methods:
    content = methods[method]
    display_sample(method, content["output"], num_sentences=5)
</VSCode.Cell>

<VSCode.Cell language="markdown">
## Temporal Order Analysis

Examine how well each method preserves chronological order by looking at Kendall's Tau scores.
</VSCode.Cell>

<VSCode.Cell language="python">
# Temporal order comparison
tau_data = []

for method, content in methods.items():
    if content["eval"]:
        tau_data.append({
            "Method": method,
            # A missing score is reported as 0
            "Kendall's Tau": content["eval"].get("kendall_tau", 0)
        })

df_tau = pd.DataFrame(tau_data)

if not df_tau.empty:
    display(df_tau.set_index("Method"))

    # Create interpretation: map each score onto a qualitative band
    print("\n📊 INTERPRETATION:")
    print("="*60)

    for _, row in df_tau.iterrows():
        method = row["Method"]
        tau = row["Kendall's Tau"]

        if tau >= 0.9:
            status = "✅ Excellent - Near-perfect chronological order"
        elif tau >= 0.7:
            status = "✓ Good - Strong chronological preservation"
        elif tau >= 0.5:
            status = "⚠️ Moderate - Some chronological structure"
        elif tau >= 0.3:
            status = "⚠️ Weak - Limited chronological order"
        else:
            status = "❌ Poor - Random or inverse ordering"

        print(f"{method:25} (τ={tau:.3f}): {status}")

    # Plot
    fig, ax = plt.subplots(figsize=(10, 6))
    colors = ['green' if t >= 0.9 else 'orange' if t >= 0.5 else 'red'
              for t in df_tau["Kendall's Tau"]]

    ax.barh(df_tau["Method"], df_tau["Kendall's Tau"], color=colors, alpha=0.7)
    ax.set_xlabel("Kendall's Tau", fontsize=12)
    ax.set_title("Temporal Order Preservation", fontsize=14, fontweight='bold')

    # Kendall's Tau lies in [-1, 1]. A fixed 0..1 axis would hide exactly the
    # "inverse ordering" case described above, so extend the axis to -1
    # whenever a negative score is present.
    has_negative = bool((df_tau["Kendall's Tau"] < 0).any())
    ax.set_xlim(-1.0 if has_negative else 0.0, 1.0)
    if has_negative:
        ax.axvline(x=0, color='black', linewidth=0.8)

    ax.axvline(x=0.9, color='green', linestyle='--', alpha=0.5, label='Excellent (≥0.9)')
    ax.axvline(x=0.5, color='orange', linestyle='--', alpha=0.5, label='Moderate (≥0.5)')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / "temporal_order_comparison.png", dpi=300, bbox_inches='tight')
    plt.show()
</VSCode.Cell>

<VSCode.Cell language="markdown">
## Content Coverage Analysis

Compare ROUGE-L scores, which are based on the longest common subsequence between the generated text and the reference and therefore reflect both content overlap and word order.
</VSCode.Cell>

<VSCode.Cell language="python">
# ROUGE-L comparison (indicates content coverage with order)
rouge_l_data = []

for method, content in methods.items():
    if content["eval"]:
        # `or {}` tolerates a "rouge"/"rougeL" section that is present but
        # null in the results JSON; a missing or null score is reported as 0.
        rouge_section = content["eval"].get("rouge") or {}
        rouge_l_section = rouge_section.get("rougeL") or {}
        rouge_l = rouge_l_section.get("f1") or 0
        rouge_l_data.append({
            "Method": method,
            "ROUGE-L F1": rouge_l
        })

df_rouge_l = pd.DataFrame(rouge_l_data)

if not df_rouge_l.empty:
    display(df_rouge_l.set_index("Method"))

    # Interpretation
    print("\n📊 INTERPRETATION:")
    print("="*60)
    print("ROUGE-L measures the longest common subsequence between generated")
    print("text and reference. Higher scores indicate better content coverage")
    print("AND better preservation of sequential order.\n")

    for _, row in df_rouge_l.iterrows():
        method = row["Method"]
        score = row["ROUGE-L F1"]

        # Qualitative bands for the F1 score
        if score >= 0.9:
            status = "✅ Excellent coverage"
        elif score >= 0.7:
            status = "✓ Good coverage"
        elif score >= 0.5:
            status = "⚠️ Moderate coverage"
        else:
            status = "❌ Limited coverage"

        print(f"{method:25} (F1={score:.3f}): {status}")
</VSCode.Cell>

<VSCode.Cell language="markdown">
## Summary and Recommendations
</VSCode.Cell>

<VSCode.Cell language="python">
# Generate summary recommendations
display(Markdown("## 📝 ANALYSIS SUMMARY"))
display(Markdown("---"))

if not df_metrics.empty:
    # Find best method for each metric: metric name -> (method, score).
    # idxmax returns the first row holding the maximum, so ties go to the
    # method that appears first in df_metrics.
    best_methods = {}

    for metric in ["Kendall's Tau", "ROUGE-1 F1", "ROUGE-2 F1",
                   "ROUGE-L F1", "BERTScore F1", "METEOR"]:
        best_idx = df_metrics[metric].idxmax()
        best_method = df_metrics.loc[best_idx, "Method"]
        best_score = df_metrics.loc[best_idx, metric]
        best_methods[metric] = (best_method, best_score)

    display(Markdown("### Best Performing Method per Metric"))

    for metric, (method, score) in best_methods.items():
        display(Markdown(f"- **{metric}**: {method} ({score:.3f})"))

    display(Markdown("\n### Key Findings"))

    # Temporal order winner
    tau_winner = best_methods["Kendall's Tau"]
    display(Markdown(f"1. **Temporal Order**: {tau_winner[0]} achieves the best chronological preservation (τ={tau_winner[1]:.3f})"))

    # Content coverage winner
    rouge_l_winner = best_methods["ROUGE-L F1"]
    display(Markdown(f"2. **Content Coverage**: {rouge_l_winner[0]} provides the most comprehensive coverage (F1={rouge_l_winner[1]:.3f})"))

    # Semantic similarity winner
    bert_winner = best_methods["BERTScore F1"]
    display(Markdown(f"3. **Semantic Fidelity**: {bert_winner[0]} maintains the best semantic equivalence (F1={bert_winner[1]:.3f})"))

    display(Markdown("\n### Recommendations"))
    display(Markdown("""
- **For chronological accuracy**: Use the method with highest Kendall's Tau
- **For comprehensive narratives**: Use the method with highest ROUGE-L
- **For fluent text**: Consider abstractive methods (PRIMERA) over extractive (TAEG)
- **For exact reproduction**: TAEG's extractive approach provides literal accuracy
    """))

else:
    print("⚠️ No evaluation data available for summary")
</VSCode.Cell>

<VSCode.Cell language="markdown">
## Export Results

Save a comprehensive comparison report.
</VSCode.Cell>

<VSCode.Cell language="python">
# Create comprehensive report: a plain-text file with the metrics table and
# the output-length table (each section only when its frame has data).
report_lines = []
report_lines.append("="*80)
report_lines.append("COMPREHENSIVE COMPARISON REPORT")
report_lines.append("TAEG vs PRIMERA-MDS vs PRIMERA-Consolidation")
report_lines.append("="*80)
report_lines.append("")

# Add metrics table
if not df_metrics.empty:
    report_lines.append("METRICS COMPARISON")
    report_lines.append("-"*80)
    report_lines.append(df_metrics.to_string())
    report_lines.append("")

# Add lengths
if not df_lengths.empty:
    report_lines.append("OUTPUT LENGTH COMPARISON")
    report_lines.append("-"*80)
    report_lines.append(df_lengths.to_string())
    report_lines.append("")

# Save report.
# This cell writes unconditionally, so create the output directory first;
# otherwise open() raises FileNotFoundError when no earlier step has produced
# the directory yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
report_file = OUTPUT_DIR / "analysis_report.txt"
with open(report_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(report_lines))

print(f"✅ Analysis report saved to: {report_file}")
print(f"📊 Visualizations saved to: {OUTPUT_DIR}")
</VSCode.Cell>
````