# News Article Frame Analysis - Data Exploration

This notebook explores the news articles dataset and demonstrates the frame detection system.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yourusername/spam_news/blob/main/notebooks/01_data_exploration.ipynb)

## 1. Environment Setup

In [None]:
# Detect if we're running in Google Colab
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("Running in Google Colab")
    # Clone the repository
    !git clone https://github.com/yourusername/spam_news.git
    %cd spam_news
    
    # Install requirements
    !pip install -q -r requirements.txt
    
    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
else:
    print("Running locally")
    # Ensure we're in the project root
    import os
    if os.path.basename(os.getcwd()) == 'notebooks':
        os.chdir('..')

In [None]:
# GPU Detection and Setup
import os

import torch

# Ask torch once whether CUDA is usable and reuse the answer below.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')

if has_cuda:
    # Report name and total memory (in GB) of device 0.
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("No GPU available, using CPU")

# Set device for transformers: device 0 when CUDA is available, otherwise
# an empty value.
os.environ['CUDA_VISIBLE_DEVICES'] = '0' if has_cuda else ''

## 2. Load Data

In [None]:
import json
import pandas as pd
from pathlib import Path

# Configure data paths
# The sample file sits at the same relative path in both environments; only
# the location of the full dataset differs.
SAMPLE_DATA_PATH = Path('data/sample_articles.json')
if IN_COLAB:
    # Update this path to your Google Drive folder containing the full dataset
    FULL_DATA_PATH = Path('/content/drive/MyDrive/spam_news_data/articles.json')
else:
    FULL_DATA_PATH = Path('data/articles.json')

# Load data: prefer the full dataset and fall back to the sample file
if FULL_DATA_PATH.exists():
    data_path = FULL_DATA_PATH
    print(f"Loading full dataset from {FULL_DATA_PATH}")
else:
    data_path = SAMPLE_DATA_PATH
    print(f"Full dataset not found. Loading sample data from {SAMPLE_DATA_PATH}")

# Decode explicitly as UTF-8: without an encoding argument open() falls back
# to the platform's locale encoding, which can corrupt or reject non-ASCII
# article text on some systems.
with open(data_path, 'r', encoding='utf-8') as f:
    articles = json.load(f)

print(f"\nLoaded {len(articles)} articles")

In [None]:
# Build a DataFrame from the loaded article records
df = pd.DataFrame(articles)

# Print a short overview: row count, per-source counts, and the span of
# the 'date' column.
print("Dataset Overview:")
n_articles = len(df)
print(f"Total articles: {n_articles}")
source_counts = df['source'].value_counts().to_dict()
print(f"\nSources: {source_counts}")
date_column = df['date']
print(f"\nDate range: {date_column.min()} to {date_column.max()}")

# Leave the first rows as the cell's displayed value
df.head()

## 3. Explore Human Coding

In [None]:
# Extract human coding data
def extract_frame_counts(df: pd.DataFrame) -> pd.DataFrame:
    """Flatten per-article human coding into one row per coded count.

    Args:
        df: DataFrame with an ``article_id`` column and a ``human_coding``
            column. Each ``human_coding`` value is expected to be a dict
            mapping a frame name to a dict of ``{demographic: count}``.

    Returns:
        DataFrame with columns ``article_id``, ``frame``, ``demographic`` and
        ``count``. The columns are present even when no rows are produced, so
        downstream ``groupby`` calls work on an empty result.

    Notes:
        Only the four frame names listed in ``frame_types`` are read; other
        keys in ``human_coding`` are ignored. Articles whose ``human_coding``
        is not a dict (for example ``None`` or NaN) are skipped.
    """
    frame_types = ['underrepresentation', 'overrepresentation', 'obstacles', 'successes']
    columns = ['article_id', 'frame', 'demographic', 'count']

    results = []
    for record in df.to_dict('records'):
        coding = record['human_coding']
        if not isinstance(coding, dict):
            # Article without usable coding: nothing to extract.
            continue
        for frame in frame_types:
            # A missing or empty frame entry contributes no rows.
            for group, count in (coding.get(frame) or {}).items():
                results.append({
                    'article_id': record['article_id'],
                    'frame': frame,
                    'demographic': group,
                    'count': count
                })

    return pd.DataFrame(results, columns=columns)

coding_df = extract_frame_counts(df)

print("Human Coding Summary:")
# Sum the counts for every (frame, demographic) pair, largest total first
frame_demo_totals = (
    coding_df
    .groupby(['frame', 'demographic'])['count']
    .sum()
    .sort_values(ascending=False)
)
print(frame_demo_totals)

## 4. Text Analysis

In [None]:
# Analyze article lengths: characters and whitespace-separated words
content = df['content']
df['content_length'] = content.str.len()
df['word_count'] = content.str.split().str.len()

char_counts = df['content_length']
word_counts = df['word_count']

print("Article Length Statistics:")
print(f"Average character count: {char_counts.mean():.0f}")
print(f"Average word count: {word_counts.mean():.0f}")
print(f"Min/Max words: {word_counts.min()} / {word_counts.max()}")

In [None]:
# Search for frame-related keywords
import re

# Keywords searched for in the article text, grouped by frame.
frame_keywords = {
    'underrepresentation': ['underrepresented', 'lower rates', 'less than', 'only', 'just'],
    'overrepresentation': ['overrepresented', 'dominate', 'majority', 'most'],
    'obstacles': ['barrier', 'ceiling', 'discrimination', 'harder', 'challenges'],
    'successes': ['first', 'breakthrough', 'achievement', 'milestone', 'appointed']
}

# Keywords grouped by demographic; defined here but not used by this cell.
demographic_keywords = {
    'women': ['women', 'woman', 'female'],
    'men': ['men', 'man', 'male'],
    'white': ['white', 'caucasian'],
    'poc': ['black', 'african american', 'hispanic', 'latino', 'asian', 'people of color']
}

# Count keyword occurrences
# Lower-case the text once instead of once per frame.
content_lower = df['content'].str.lower()
for frame, keywords in frame_keywords.items():
    # Anchor each keyword at a word start so that e.g. 'only' is not counted
    # inside 'commonly' or 'most' inside 'almost'. There is no trailing
    # anchor, so inflected forms such as 'barriers' still match. Keywords are
    # escaped so they are always matched literally.
    pattern = r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + ')'
    df[f'{frame}_keywords'] = content_lower.str.count(pattern)

print("Frame Keyword Frequencies:")
for frame in frame_keywords:
    print(f"{frame}: {df[f'{frame}_keywords'].sum()} occurrences")

## 5. Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Frame distribution: two side-by-side bar charts, drawn only when the
# coding table has rows.
if len(coding_df) > 0:
    fig, ax = plt.subplots(1, 2, figsize=(15, 5))

    # Totals of 'count' per frame and per demographic group
    frame_counts = coding_df.groupby('frame')['count'].sum()
    demo_counts = coding_df.groupby('demographic')['count'].sum()

    # (axes, data, title, x-axis label) for each panel, left to right
    panels = (
        (ax[0], frame_counts, 'Total Frame Counts in Dataset', 'Frame Type'),
        (ax[1], demo_counts, 'Frame Counts by Demographic Group', 'Demographic Group'),
    )
    for panel_ax, totals, title, xlabel in panels:
        totals.plot(kind='bar', ax=panel_ax)
        panel_ax.set_title(title)
        panel_ax.set_xlabel(xlabel)
        panel_ax.set_ylabel('Total Count')

    plt.tight_layout()
    plt.show()

## 6. Save Preprocessed Data

In [None]:
# Save progress to prevent data loss
# Colab writes to Google Drive; a local run writes into data/.
save_path = Path(
    '/content/drive/MyDrive/spam_news_data/preprocessed_data.pkl'
    if IN_COLAB
    else 'data/preprocessed_data.pkl'
)
if IN_COLAB:
    # Create the Drive folder if it does not exist yet
    save_path.parent.mkdir(parents=True, exist_ok=True)

# The coding summary is written next to the pickle, with a .csv suffix
csv_path = save_path.with_suffix('.csv')

# Save DataFrame with additional features
df.to_pickle(save_path)
print(f"Preprocessed data saved to {save_path}")

# Also save coding summary
coding_df.to_csv(csv_path, index=False)
print(f"Coding summary saved to {csv_path}")

## Next Steps

1. Continue to `02_train_model.ipynb` to train the frame detection model
2. Use `03_evaluate_model.ipynb` to evaluate model performance
3. Try `demo_analysis.ipynb` for a quick demo of the complete pipeline