In [None]:
# Import Required Libraries
import pandas as pd
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from pathlib import Path
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set up paths
# Data folder, given relative to the directory the notebook is run from.
data_dir = Path("../data")
print(f"Data directory: {data_dir.absolute()}")
print(f"Directory exists: {data_dir.exists()}")

# Find all parquet files (searched recursively).
# rglob yields paths in filesystem order, which is not stable across machines;
# sorting makes any "take the first match" selection on this list deterministic.
parquet_files = sorted(data_dir.rglob("*.parquet"))
print(f"\nFound {len(parquet_files)} parquet files:")
for file in parquet_files:
    file_stat = file.stat()  # one stat call, reused for size and mtime
    print(f"  - {file.relative_to(data_dir)}")
    print(f"    Size: {file_stat.st_size / (1024*1024):.2f} MB")
    print(f"    Modified: {datetime.fromtimestamp(file_stat.st_mtime)}")
    print()

In [None]:
# Load and explore ACLED data
# Candidate files are those whose file name contains "acled" (case-insensitive);
# the first candidate is loaded with polars and mirrored into pandas.
acled_files = [path for path in parquet_files if 'acled' in path.name.lower()]
if not acled_files:
    print("No ACLED files found")
    acled_df = None
    acled_pd = None
else:
    acled_source = acled_files[0]
    print(f"Loading ACLED data from: {acled_source.name}")
    acled_df = pl.read_parquet(acled_source)

    print(f"\nACLED Dataset Overview:")
    print(f"Shape: {acled_df.shape}")
    print(f"Columns: {acled_df.columns}")

    # pandas copy of the polars frame, used by the plotting cells
    acled_pd = acled_df.to_pandas()

    # One line per column with its polars dtype
    print(f"\nData types:")
    for column_name in acled_df.columns:
        print(f"  {column_name}: {acled_df[column_name].dtype}")

    print(f"\nFirst few rows:")
    print(acled_pd.head())

In [None]:
# Temporal Distribution Analysis
# Draws a 2x2 grid of ACLED event counts: per calendar day, per month number,
# per year, and per weekday. Skipped when the frame or its 'date' column is missing.
if acled_pd is not None and 'date' in acled_pd.columns:
    # Convert date column if it's not already datetime.
    # Done in place so that later cells also see a datetime column.
    if not pd.api.types.is_datetime64_any_dtype(acled_pd['date']):
        acled_pd['date'] = pd.to_datetime(acled_pd['date'])

    # Create temporal plots
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Events Over Time', 'Events by Month', 'Events by Year', 'Events by Day of Week'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )

    # 1. Time series plot: number of events on each calendar day
    daily_counts = acled_pd.groupby(acled_pd['date'].dt.date).size()
    fig.add_trace(
        go.Scatter(x=daily_counts.index, y=daily_counts.values, mode='lines', name='Daily Events'),
        row=1, col=1
    )

    # 2. Monthly distribution: events pooled by month number across all years
    monthly_counts = acled_pd.groupby(acled_pd['date'].dt.month).size()
    fig.add_trace(
        go.Bar(x=monthly_counts.index, y=monthly_counts.values, name='Monthly Distribution'),
        row=1, col=2
    )

    # 3. Yearly distribution
    yearly_counts = acled_pd.groupby(acled_pd['date'].dt.year).size()
    fig.add_trace(
        go.Bar(x=yearly_counts.index, y=yearly_counts.values, name='Yearly Distribution'),
        row=2, col=1
    )

    # 4. Day of week distribution
    # groupby sorts its keys, which would order the weekday names alphabetically
    # (Friday, Monday, ...). Reindex to calendar order; weekdays with no events
    # are shown as 0 instead of being dropped.
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday', 'Sunday']
    dow_counts = (
        acled_pd.groupby(acled_pd['date'].dt.day_name()).size()
        .reindex(weekday_order, fill_value=0)
    )
    fig.add_trace(
        go.Bar(x=dow_counts.index, y=dow_counts.values, name='Day of Week Distribution'),
        row=2, col=2
    )

    fig.update_layout(height=800, title_text="ACLED Temporal Distribution Analysis", showlegend=False)
    fig.show()
else:
    print("Date column not found or ACLED data not available")

In [None]:
# Event Type and Geographical Distribution
# Draws a 2x2 grid (event types, top sub-event types, top admin1 regions,
# fatalities histogram) and prints summary statistics. Each panel and each
# statistic is skipped when its column is absent from the ACLED frame.
if acled_pd is not None:
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Event Types', 'Sub Event Types (Top 15)', 'Admin1 Regions (Top 15)', 'Fatalities Distribution'),
        specs=[[{"type": "pie"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "histogram"}]]
    )

    # 1. Event types pie chart
    if 'event_type' in acled_pd.columns:
        event_type_counts = acled_pd['event_type'].value_counts()
        fig.add_trace(
            go.Pie(labels=event_type_counts.index, values=event_type_counts.values, name="Event Types"),
            row=1, col=1
        )

    # 2. Sub event types bar chart (top 15)
    if 'sub_event_type' in acled_pd.columns:
        sub_event_counts = acled_pd['sub_event_type'].value_counts().head(15)
        fig.add_trace(
            go.Bar(x=sub_event_counts.values, y=sub_event_counts.index, orientation='h', name='Sub Event Types'),
            row=1, col=2
        )
        # Horizontal bars are laid out bottom-up; reverse the axis so the most
        # frequent category is at the top of the "Top 15" list.
        fig.update_yaxes(autorange="reversed", row=1, col=2)

    # 3. Admin1 regions bar chart (top 15)
    if 'admin1' in acled_pd.columns:
        admin1_counts = acled_pd['admin1'].value_counts().head(15)
        fig.add_trace(
            go.Bar(x=admin1_counts.values, y=admin1_counts.index, orientation='h', name='Admin1 Regions'),
            row=2, col=1
        )
        # Same top-down ordering as the sub-event panel
        fig.update_yaxes(autorange="reversed", row=2, col=1)

    # 4. Fatalities distribution
    if 'fatalities' in acled_pd.columns:
        # Filter out extreme outliers for better visualization: only events
        # with at most 100 fatalities are drawn (the statistics below use all rows).
        fatalities_filtered = acled_pd['fatalities'][acled_pd['fatalities'] <= 100]
        fig.add_trace(
            go.Histogram(x=fatalities_filtered, name='Fatalities Distribution', nbinsx=50),
            row=2, col=2
        )

    fig.update_layout(height=1000, title_text="ACLED Event Type and Geographical Distribution", showlegend=False)
    fig.show()

    # Print summary statistics
    print("\n=== ACLED Summary Statistics ===")
    if 'fatalities' in acled_pd.columns:
        print(f"Total fatalities: {acled_pd['fatalities'].sum():,}")
        print(f"Average fatalities per event: {acled_pd['fatalities'].mean():.2f}")
        print(f"Median fatalities per event: {acled_pd['fatalities'].median():.2f}")
        print(f"Max fatalities in single event: {acled_pd['fatalities'].max():,}")

    print(f"Total events: {len(acled_pd):,}")
    if 'date' in acled_pd.columns:
        print(f"Date range: {acled_pd['date'].min()} to {acled_pd['date'].max()}")

    if 'country' in acled_pd.columns:
        # Count once and reuse; value_counts() is sorted by descending frequency.
        country_counts = acled_pd['country'].value_counts()
        print(f"Countries covered: {acled_pd['country'].nunique()}")
        # An empty frame or an all-null column has no "most active" entry;
        # indexing position 0 would raise IndexError.
        if not country_counts.empty:
            print(f"Most active country: {country_counts.index[0]} ({country_counts.iloc[0]:,} events)")
else:
    print("ACLED data not available for analysis")

In [None]:
# Load and analyze Factal data
# Loads the first parquet file whose name contains "factal" (case-insensitive),
# draws a 2x2 overview figure and prints summary statistics. Each panel and
# statistic is skipped when its column is absent.
factal_files = [f for f in parquet_files if 'factal' in f.name.lower()]
if factal_files:
    print(f"Loading Factal data from: {factal_files[0].name}")
    factal_df = pl.read_parquet(factal_files[0])
    factal_pd = factal_df.to_pandas()

    print(f"\nFactal Dataset Overview:")
    print(f"Shape: {factal_df.shape}")
    print(f"Columns: {factal_df.columns}")

    # Normalize the date column as soon as the frame is loaded. This used to
    # happen only when a 'severity' column was also present, which left 'date'
    # unparsed for the date-range summary below and for any later cell that
    # uses `.dt` on it.
    if 'date' in factal_pd.columns and not pd.api.types.is_datetime64_any_dtype(factal_pd['date']):
        factal_pd['date'] = pd.to_datetime(factal_pd['date'])

    # Factal-specific visualizations
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Severity Distribution', 'Theme Distribution', 'Content Length Distribution', 'Temporal Distribution'),
        specs=[[{"type": "bar"}, {"type": "pie"}],
               [{"type": "histogram"}, {"type": "scatter"}]]
    )

    # 1. Severity distribution (bars ordered by severity value)
    if 'severity' in factal_pd.columns:
        severity_counts = factal_pd['severity'].value_counts().sort_index()
        fig.add_trace(
            go.Bar(x=severity_counts.index, y=severity_counts.values, name='Severity Distribution'),
            row=1, col=1
        )

    # 2. Theme distribution (10 most frequent themes)
    if 'theme' in factal_pd.columns:
        theme_counts = factal_pd['theme'].value_counts().head(10)
        fig.add_trace(
            go.Pie(labels=theme_counts.index, values=theme_counts.values, name="Themes"),
            row=1, col=2
        )

    # 3. Content length distribution (characters per item)
    if 'text' in factal_pd.columns:
        text_lengths = factal_pd['text'].str.len()
        fig.add_trace(
            go.Histogram(x=text_lengths, name='Text Length Distribution', nbinsx=50),
            row=2, col=1
        )

    # 4. Temporal scatter plot: one marker per item, severity against date
    if 'date' in factal_pd.columns and 'severity' in factal_pd.columns:
        fig.add_trace(
            go.Scatter(x=factal_pd['date'], y=factal_pd['severity'], mode='markers',
                      name='Severity over Time', opacity=0.6),
            row=2, col=2
        )

    fig.update_layout(height=1000, title_text="Factal Data Distribution Analysis", showlegend=False)
    fig.show()

    # Print Factal summary statistics
    print("\n=== Factal Summary Statistics ===")
    print(f"Total items: {len(factal_pd):,}")
    if 'date' in factal_pd.columns:
        print(f"Date range: {factal_pd['date'].min()} to {factal_pd['date'].max()}")
    if 'severity' in factal_pd.columns:
        print(f"Average severity: {factal_pd['severity'].mean():.2f}")
        print(f"Severity distribution: {factal_pd['severity'].value_counts().sort_index().to_dict()}")
    if 'text' in factal_pd.columns:
        print(f"Average text length: {factal_pd['text'].str.len().mean():.0f} characters")

else:
    print("No Factal files found")
    factal_df = None
    factal_pd = None

In [None]:
# Load and analyze Google News data
# Loads the first parquet file whose name contains "google" (case-insensitive),
# draws a 2x2 overview figure and prints summary statistics. Each panel and
# statistic is skipped when its column is absent.
google_files = [f for f in parquet_files if 'google' in f.name.lower()]
if google_files:
    print(f"Loading Google News data from: {google_files[0].name}")
    google_df = pl.read_parquet(google_files[0])
    google_pd = google_df.to_pandas()

    print(f"\nGoogle News Dataset Overview:")
    print(f"Shape: {google_df.shape}")
    print(f"Columns: {google_df.columns}")

    # Google News specific visualizations
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Article Length Distribution', 'Source Distribution (Top 15)', 'Publication Timeline', 'Articles per Day'),
        specs=[[{"type": "histogram"}, {"type": "bar"}],
               [{"type": "scatter"}, {"type": "bar"}]]
    )

    # 1. Article length distribution (characters per article)
    if 'text' in google_pd.columns:
        text_lengths = google_pd['text'].str.len()
        fig.add_trace(
            go.Histogram(x=text_lengths, name='Article Length Distribution', nbinsx=50),
            row=1, col=1
        )

    # 2. Source distribution
    if 'source' in google_pd.columns:
        source_counts = google_pd['source'].value_counts().head(15)
        fig.add_trace(
            go.Bar(x=source_counts.values, y=source_counts.index, orientation='h', name='Source Distribution'),
            row=1, col=2
        )
        # Horizontal bars are laid out bottom-up; reverse the axis so the most
        # frequent source is at the top of the "Top 15" list.
        fig.update_yaxes(autorange="reversed", row=1, col=2)

    # 3. Publication timeline
    if 'date' in google_pd.columns:
        # Converted in place so that later cells also see a datetime column.
        if not pd.api.types.is_datetime64_any_dtype(google_pd['date']):
            google_pd['date'] = pd.to_datetime(google_pd['date'])

        # Cumulative articles over time (running total of the per-day counts)
        daily_counts = google_pd.groupby(google_pd['date'].dt.date).size().sort_index()
        cumulative_counts = daily_counts.cumsum()

        fig.add_trace(
            go.Scatter(x=cumulative_counts.index, y=cumulative_counts.values, mode='lines',
                      name='Cumulative Articles'),
            row=2, col=1
        )

        # Daily article counts
        fig.add_trace(
            go.Bar(x=daily_counts.index, y=daily_counts.values, name='Daily Articles'),
            row=2, col=2
        )

    fig.update_layout(height=1000, title_text="Google News Data Distribution Analysis", showlegend=False)
    fig.show()

    # Print Google News summary statistics
    print("\n=== Google News Summary Statistics ===")
    print(f"Total articles: {len(google_pd):,}")
    if 'date' in google_pd.columns:
        print(f"Date range: {google_pd['date'].min()} to {google_pd['date'].max()}")
    if 'source' in google_pd.columns:
        # Count once and reuse; value_counts() is sorted by descending frequency.
        all_source_counts = google_pd['source'].value_counts()
        print(f"Number of unique sources: {google_pd['source'].nunique()}")
        # An empty frame or an all-null column has no "most active" entry;
        # indexing position 0 would raise IndexError.
        if not all_source_counts.empty:
            print(f"Most active source: {all_source_counts.index[0]} ({all_source_counts.iloc[0]:,} articles)")
    if 'text' in google_pd.columns:
        print(f"Average article length: {google_pd['text'].str.len().mean():.0f} characters")

else:
    print("No Google News files found")
    google_df = None
    google_pd = None

In [None]:
# Comparative Analysis Across Data Sources
# Summarizes every loaded source (record count, date span, mean text length),
# plots the sources side by side and prints a combined report.


def _as_datetime(series):
    """Return `series` as a datetime Series, parsing it only when needed.

    The caller's DataFrame is not modified: a series that is already a
    datetime dtype is returned as-is, anything else goes through
    `pd.to_datetime`, which returns a new Series.
    """
    if pd.api.types.is_datetime64_any_dtype(series):
        return series
    return pd.to_datetime(series)


def _summarize_source(frame):
    """Collect record count, date span and mean text length for one source.

    Args:
        frame: pandas DataFrame for one source; the optional 'date' and
            'text' columns are read when present.

    Returns:
        dict with keys 'count', 'date_range' and 'avg_text_length'.
        'date_range' is (None, None) when there is no 'date' column or no
        valid date in it; 'avg_text_length' is None when there is no 'text'
        column or its mean length is not a number.
    """
    date_range = (None, None)
    if 'date' in frame.columns:
        dates = _as_datetime(frame['date'])
        first, last = dates.min(), dates.max()
        # min()/max() return NaT for an empty or all-null column; treat that
        # the same as "no dates" so callers only have to check for None.
        if not (pd.isna(first) or pd.isna(last)):
            date_range = (first, last)

    avg_text_length = None
    if 'text' in frame.columns:
        mean_length = frame['text'].str.len().mean()
        # mean() is NaN when no row has text
        if not pd.isna(mean_length):
            avg_text_length = mean_length

    return {
        'count': len(frame),
        'date_range': date_range,
        'avg_text_length': avg_text_length,
    }


# Display name -> frame; a value of None means that source was not loaded.
source_frames = {
    'ACLED': acled_pd,
    'Factal': factal_pd,
    'Google News': google_pd,
}
source_info = {
    source: _summarize_source(frame)
    for source, frame in source_frames.items()
    if frame is not None
}
available_sources = list(source_info)

if available_sources:
    # Create comparative visualization
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Data Volume Comparison', 'Average Text Length Comparison',
                       'Temporal Coverage', 'Combined Timeline'),
        specs=[[{"type": "bar"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "scatter"}]]
    )

    # 1. Data volume comparison
    volumes = [source_info[source]['count'] for source in available_sources]
    fig.add_trace(
        go.Bar(x=available_sources, y=volumes, name='Record Count'),
        row=1, col=1
    )

    # 2. Average text length comparison (only sources that have text)
    sources_with_text = [source for source in available_sources
                        if source_info[source]['avg_text_length'] is not None]
    text_lengths = [source_info[source]['avg_text_length'] for source in sources_with_text]

    if text_lengths:
        fig.add_trace(
            go.Bar(x=sources_with_text, y=text_lengths, name='Avg Text Length'),
            row=1, col=2
        )

    # 3. Temporal coverage (days covered); 0 for sources without usable dates
    coverage_days = []
    for source in available_sources:
        start, end = source_info[source]['date_range']
        coverage_days.append((end - start).days if start is not None else 0)

    fig.add_trace(
        go.Bar(x=available_sources, y=coverage_days, name='Days Covered'),
        row=2, col=1
    )

    # 4. Combined timeline (if date data available)
    # Colors are assigned by position among the available sources.
    colors = ['blue', 'red', 'green']
    for source, color in zip(available_sources, colors):
        frame = source_frames[source]
        if 'date' not in frame.columns:
            continue
        # Parse locally so this cell does not depend on an earlier cell
        # having converted the column already.
        event_dates = _as_datetime(frame['date'])
        daily_counts = frame.groupby(event_dates.dt.date).size()
        fig.add_trace(
            go.Scatter(x=daily_counts.index, y=daily_counts.values, mode='lines',
                      name=source, line=dict(color=color)),
            row=2, col=2
        )

    fig.update_layout(height=1000, title_text="Comparative Analysis of Data Sources", showlegend=True)
    fig.show()

    # Print comprehensive summary
    print("\n" + "="*60)
    print("COMPREHENSIVE DATA SUMMARY")
    print("="*60)

    for source in available_sources:
        info = source_info[source]
        print(f"\n{source}:")
        print(f"  Records: {info['count']:,}")
        if info['date_range'][0] is not None:
            print(f"  Date Range: {info['date_range'][0]} to {info['date_range'][1]}")
            print(f"  Coverage: {(info['date_range'][1] - info['date_range'][0]).days} days")
        if info['avg_text_length'] is not None:
            print(f"  Average Text Length: {info['avg_text_length']:.0f} characters")

    total_records = sum(volumes)
    print(f"\nTOTAL RECORDS ACROSS ALL SOURCES: {total_records:,}")

else:
    print("No data sources available for comparative analysis")

# Data Distribution Analysis Summary

This notebook provides a comprehensive exploratory data analysis (EDA) of the conflict-related datasets stored in the GraphRAG pipeline data folder. The analysis covers:

## Key Insights:

### 📊 **ACLED Data**
- **Temporal patterns**: Shows conflict event distributions over time, seasonal patterns, and day-of-week variations
- **Event classification**: Breaks down different types of conflict events and their frequency
- **Geographic distribution**: Identifies hotspot regions and conflict concentration areas
- **Severity metrics**: Analyzes fatality distributions and event impact

### 🚨 **Factal Data** 
- **Intelligence severity**: Tracks threat level distributions on a 1-4 scale
- **Thematic analysis**: Shows categorization of different threat types
- **Content analysis**: Examines the depth and length of intelligence reports
- **Real-time patterns**: Reveals temporal patterns in threat detection

### 📰 **Google News Data**
- **Media coverage**: Analyzes news article volume and source diversity
- **Content characteristics**: Examines article length and coverage patterns
- **Publication timeline**: Shows news reporting patterns over time
- **Source analysis**: Identifies most active news sources and coverage distribution

### 🔄 **Cross-Source Comparison**
- **Data volume comparison**: Shows relative contribution of each data source
- **Temporal alignment**: Identifies overlap and gaps in temporal coverage
- **Content depth**: Compares text length and information density across sources
- **Complementary coverage**: Demonstrates how different sources provide unique perspectives

## Usage for GraphRAG:
This EDA provides the foundation for understanding data characteristics that will influence:
- **Entity extraction strategies** (based on text length and complexity)
- **Temporal modeling approaches** (based on coverage patterns)
- **Geographic analysis scope** (based on spatial distribution)
- **Multi-source integration methods** (based on complementary patterns)

The insights from this analysis inform the preprocessing, knowledge graph construction, and query optimization strategies in the broader GraphRAG security reporting system.