# HW5 NLP Pipeline - End-to-End Analysis

This notebook walks through the complete pipeline for analyzing entity descriptions across news sources.

## Pipeline Overview
1. Load articles from all news sources
2. Resolve coreferences and extract entity contexts
3. Extract descriptions (appositives, predicates, modifiers)
4. Embed descriptions, cluster them, and visualize the clusters
5. Generate frequency tables
6. Run statistical tests
7. Build a manual evaluation sample and save results


## Setup


In [None]:
# Setup cell: makes the project package importable, pulls in all dependencies,
# prepares output directories, seeds NumPy, and prints the active configuration.

# Add the directory that contains `src` to the import path.
# NOTE(review): assumes the kernel's working directory is this notebook's
# directory, so '..' is the project root -- confirm for your launch setup.
# The path is resolved to an absolute location so imports keep working even if
# the working directory changes later, and the membership check prevents
# duplicate entries from piling up when this cell is re-run.
import sys
from pathlib import Path

PROJECT_ROOT = Path('..').resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# tqdm.auto selects the notebook widget bar when it is available and falls back
# to the plain-text bar otherwise (e.g. headless execution without ipywidgets).
from tqdm.auto import tqdm

# Project imports
from src import config
from src import utils_io
from src import load_articles
from src import coref_contexts
from src import extract_descriptions
from src import embed_cluster
from src import manual_eval_helpers
from src import freq_tables
from src import stats_tests

# Ensure output directories exist
config.ensure_output_dirs()

# Set random seed for reproducibility.
# This seeds NumPy's legacy global RNG only; any other RNG used by the pipeline
# modules (e.g. the stdlib `random` module) is not seeded here.
np.random.seed(config.RANDOM_SEED)

# Echo the active configuration so the run is self-describing.
print(f"Data directory: {config.DATA_DIR}")
print(f"Output directory: {config.OUT_DIR}")
print(f"News sources: {list(config.NEWS_SOURCES.keys())}")


## Step 1: Load Articles


In [None]:
# Load all articles
# articles_df = load_articles.load_all_articles()
# print(f"Loaded {len(articles_df)} articles")
# articles_df.head()


In [None]:
# Article statistics
# stats = load_articles.get_article_stats(articles_df)
# stats


## Step 2: Coreference Resolution


In [None]:
# Load coreference model
# coref_nlp = coref_contexts.load_coref_model()
# print("Coreference model loaded")


In [None]:
# Process articles for target entities
# entity_contexts = coref_contexts.process_articles_for_coref(
#     articles_df, 
#     target_entities=config.TARGET_ENTITIES
# )
# print(f"Extracted contexts for {len(entity_contexts)} entities")


## Step 3: Extract Descriptions


In [None]:
# Load spaCy model for description extraction
# nlp = extract_descriptions.load_spacy_model()
# print(f"Loaded spaCy model: {config.SPACY_MODEL}")


In [None]:
# Extract descriptions from contexts
# descriptions = extract_descriptions.process_contexts_for_descriptions(
#     entity_contexts,
#     nlp
# )
# print(f"Extracted descriptions for {len(descriptions)} entities")


## Step 4: Embedding & Clustering


In [None]:
# Load embedding model
# embed_model = embed_cluster.load_embedding_model()
# print(f"Loaded embedding model: {config.EMBEDDING_MODEL}")


In [None]:
# Run embedding and clustering pipeline
# For each target entity:
# results = {}
# for entity in config.TARGET_ENTITIES:
#     results[entity] = embed_cluster.run_embedding_clustering_pipeline(
#         descriptions[entity],
#         entity_name=entity
#     )
#     print(f"Processed {entity}: {len(results[entity]['labels'])} descriptions")


In [None]:
# Visualize clusters (UMAP)
# TODO: Add UMAP visualization code


## Step 5: Frequency Tables


In [None]:
# Generate frequency tables
# freq_table = freq_tables.compute_cluster_frequencies(
#     results['Trump']['labels'],
#     results['Trump']['sources']
# )
# freq_table


In [None]:
# Normalized frequencies
# norm_freq = freq_tables.compute_normalized_frequencies(freq_table)
# norm_freq


## Step 6: Statistical Tests


In [None]:
# Chi-square test
# contingency = freq_tables.create_contingency_table(
#     results['Trump']['labels'],
#     results['Trump']['sources']
# )
# chi2_results = stats_tests.chi_square_test(contingency)
# print(f"Chi-square statistic: {chi2_results['chi2']:.2f}")
# print(f"P-value: {chi2_results['p_value']:.4f}")


In [None]:
# Effect size
# effect = stats_tests.compute_effect_size(contingency)
# print(f"Cramér's V: {effect:.3f}")


In [None]:
# Pairwise comparisons
# pairwise = stats_tests.pairwise_source_comparisons(
#     results['Trump']['labels'],
#     results['Trump']['sources']
# )
# pairwise


## Step 7: Manual Evaluation


In [None]:
# Create evaluation sample
# eval_sample = manual_eval_helpers.create_evaluation_sample(
#     results['Trump']['descriptions'],
#     results['Trump']['labels'],
#     n_per_cluster=5
# )
# eval_sample.head(10)


In [None]:
# Export for annotation
# manual_eval_helpers.export_for_annotation(
#     eval_sample,
#     config.REPORTS_DIR / 'evaluation_sample.csv'
# )


## Save Results


In [None]:
# Save all results
# utils_io.save_pickle(results, config.PROCESSED_DIR / 'clustering_results.pkl')
# freq_tables.export_frequency_tables({'cluster_freq': freq_table}, config.REPORTS_DIR)
# stats_tests.export_stats_report(chi2_results, config.REPORTS_DIR / 'stats_report.txt')
# print("Results saved!")


## Summary

TODO: Add summary of findings after running the analysis.
