In [61]:
# Import Required Libraries
import pandas as pd
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from pathlib import Path
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set up paths
data_dir = Path("../data")
print(f"Data directory: {data_dir.absolute()}")
print(f"Directory exists: {data_dir.exists()}")

# Find all parquet files
parquet_files = list(data_dir.rglob("*.parquet"))
print(f"\nFound {len(parquet_files)} parquet files:")
for file in parquet_files:
    print(f"  - {file.relative_to(data_dir)}")
    print(f"    Size: {file.stat().st_size / (1024*1024):.2f} MB")
    print(f"    Modified: {datetime.fromtimestamp(file.stat().st_mtime)}")
    print()

Data directory: c:\Users\matia\OneDrive\Escritorio\Nastia_BSE\Master_Thesis\UN_Conflict_Report\graphrag_pipeline\example_notebooks\..\data
Directory exists: True

Found 11 parquet files:
  - acled\Acled_India_2025-04-03_2025-07-03.parquet
    Size: 0.65 MB
    Modified: 2025-07-03 19:06:38.296592

  - acled\Acled_Sudan_2025-04-03_2025-07-03.parquet
    Size: 0.11 MB
    Modified: 2025-07-03 19:06:38.298276

  - acled\Acled_United_States_2025-04-03_2025-07-03.parquet
    Size: 0.66 MB
    Modified: 2025-07-03 19:06:38.298276

  - factal\Factal_India_2025-04-03_2025-07-03.parquet
    Size: 0.30 MB
    Modified: 2025-07-03 20:04:16.332583

  - factal\Factal_Sudan_2025-04-03_2025-07-03.parquet
    Size: 0.06 MB
    Modified: 2025-07-03 20:00:32.020078

  - factal\Factal_United_States_2025-04-03_2025-07-03.parquet
    Size: 0.56 MB
    Modified: 2025-07-03 20:06:03.056631

  - google_news\google_news_India_2025-06-26_2025-07-01.parquet
    Size: 0.79 MB
    Modified: 2025-07-03 19:06:38.310

In [62]:
# ACLED Distribution Analysis (Forecast Generation Style)
if acled_pd is not None:
    
    def create_distribution_plot(data_column, column_name, top_n=15, chart_title=None):
        """
        Create a horizontal bar chart with forecast_generation styling
        """
        if column_name not in acled_pd.columns:
            print(f"Column '{column_name}' not found in dataset")
            return None
            
        # Get value counts
        value_counts = acled_pd[column_name].value_counts()
        
        # Take top N values if specified
        if top_n and len(value_counts) > top_n:
            value_counts = value_counts.head(top_n)
            title_suffix = f" (Top {top_n})"
        else:
            title_suffix = ""
        
        # Sort in ascending order for proper horizontal bar chart display
        value_counts = value_counts.sort_values(ascending=True)
        
        # Prepare data
        categories = value_counts.index.tolist()
        counts = value_counts.values.tolist()
        
        # Create text labels showing both count and percentage
        total_events = acled_pd[column_name].count()
        text_labels = []
        for count in counts:
            percentage = (count / total_events) * 100
            text_labels.append(f"<b>{count:,}</b> ({percentage:.1f}%)")
        
        # Create color mapping (similar to forecast_generation script)
        colors = []
        max_count = max(counts)
        for count in counts:
            # Color intensity based on relative frequency
            intensity = count / max_count
            if intensity >= 0.8:
                colors.append('#d73600')  # High frequency - dark red
            elif intensity >= 0.1:
                colors.append('#ff6b35')  # Medium-high frequency - orange-red
            elif intensity >= 0.07:
                colors.append('#ffd700')  # Medium frequency - gold
            elif intensity >= 0.05:
                colors.append('#87ceeb')  # Low-medium frequency - sky blue
            else:
                colors.append('#5b9bd5')  # Low frequency - blue
        
        # Create the horizontal bar chart
        fig = go.Figure()
        
        fig.add_trace(go.Bar(
            y=categories,
            x=counts,
            orientation='h',
            marker=dict(
                color=colors,
                line=dict(color='white', width=0.5)
            ),
            text=text_labels,
            textposition='outside',
            textfont=dict(size=14),
            cliponaxis=False,
            hovertemplate=(
                "<b>%{y}</b><br>"
                "Count: %{x:,}<br>"
                "Percentage: %{customdata:.1f}%"
                "<extra></extra>"
            ),
            customdata=[(count / total_events) * 100 for count in counts]
        ))
        
        # Calculate dynamic height based on number of categories
        num_categories = len(categories)
        bar_height = 50
        calculated_height = max(400, num_categories * bar_height)
        
        # Set x-axis range with extra space for labels
        x_range = [0, max(counts) * 1.5]
        
        # Update axes with styling similar to forecast_generation
        fig.update_xaxes(
            showticklabels=True,
            tickfont=dict(size=16),
            showgrid=False,
            gridcolor='lightgray',
            automargin=True,
            title="Number of Events"
        )

        fig.update_yaxes(
            tickfont=dict(size=16),
            tickmode='linear',
            showgrid=False,
            side='left',
            categoryorder='array',
            categoryarray=categories,
            automargin=True,
            title=column_name.replace('_', ' ').title()
        )
        
        # Update layout with styling similar to forecast_generation
        fig.update_layout(
            font=dict(size=16, family="Times New Roman", color="black"),
            uniformtext_minsize=16,
            uniformtext_mode='show',
            margin=dict(l=200, r=100, t=80, b=80),
            height=calculated_height,
            width=1200,
            paper_bgcolor='white',
            plot_bgcolor='white',
            showlegend=False,
            xaxis=dict(range=x_range),
            bargap=0.3
        )
        
        # Add vertical line at median for reference
        median_count = np.median(counts)
        fig.add_vline(
            x=median_count, 
            line_dash="dash", 
            line_color="gray", 
            line_width=1,
            annotation_text=f"Median: {median_count:,.0f}",
            annotation_position="top"
        )
        
        return fig, value_counts
    
    # 1. Event Type Distribution
    print("Creating Event Type Distribution...")
    fig_event_type, event_type_stats = create_distribution_plot('event_type', 'event_type')
    if fig_event_type:
        fig_event_type.show()
        print(f"Event types found: {len(event_type_stats)}")
    
    # 2. Sub Event Type Distribution (Top 15)
    print("\nCreating Sub Event Type Distribution...")
    fig_sub_event_type, sub_event_stats = create_distribution_plot('sub_event_type', 'sub_event_type', top_n=15)
    if fig_sub_event_type:
        fig_sub_event_type.show()
        print(f"Total sub-event types: {acled_pd['sub_event_type'].nunique()}")
        print(f"Showing top 15 sub-event types")
    
    # 3. Fatalities Distribution (Special handling for numerical data)
    print("\nCreating Fatalities Distribution...")
    if 'fatalities' in acled_pd.columns:
        # Create fatality bins for better visualization
        fatalities_data = acled_pd['fatalities'].copy()
        
        # Define fatality ranges
        def categorize_fatalities(x):
            if pd.isna(x) or x == 0:
                return "0 fatalities"
            elif x == 1:
                return "1 fatality"
            elif 2 <= x <= 5:
                return "2-5 fatalities"
            elif 6 <= x <= 10:
                return "6-10 fatalities"
            elif 11 <= x <= 25:
                return "11-25 fatalities"
            elif 26 <= x <= 50:
                return "26-50 fatalities"
            elif 51 <= x <= 100:
                return "51-100 fatalities"
            else:
                return "100+ fatalities"
        
        acled_pd['fatality_category'] = fatalities_data.apply(categorize_fatalities)
        
        fig_fatalities, fatality_stats = create_distribution_plot('fatality_category', 'fatality_category')
        if fig_fatalities:
            fig_fatalities.show()
            
            # Print fatality statistics
            print(f"Total fatalities across all events: {fatalities_data.sum():,}")
            print(f"Average fatalities per event: {fatalities_data.mean():.2f}")
            print(f"Median fatalities per event: {fatalities_data.median():.1f}")
            print(f"Maximum fatalities in single event: {fatalities_data.max():,}")
            print(f"Events with zero fatalities: {(fatalities_data == 0).sum():,} ({(fatalities_data == 0).mean()*100:.1f}%)")
    
    # Print overall summary
    print(f"\n" + "="*60)
    print("COMPREHENSIVE ACLED SUMMARY")
    print("="*60)
    print(f"Total events in merged dataset: {len(acled_pd):,}")
    if 'date' in acled_pd.columns:
        acled_pd['date'] = pd.to_datetime(acled_pd['date'])
        print(f"Date range: {acled_pd['date'].min()} to {acled_pd['date'].max()}")
        print(f"Time span: {(acled_pd['date'].max() - acled_pd['date'].min()).days} days")
    
    if 'country' in acled_pd.columns:
        print(f"Countries covered: {acled_pd['country'].nunique()}")
        top_countries = acled_pd['country'].value_counts().head(3)
        print("Top 3 countries by event count:")
        for country, count in top_countries.items():
            print(f"  {country}: {count:,} events")
    
else:
    print("ACLED data not available for distribution analysis")

Creating Event Type Distribution...


Event types found: 6

Creating Sub Event Type Distribution...


Total sub-event types: 20
Showing top 15 sub-event types

Creating Fatalities Distribution...


Total fatalities across all events: 376
Average fatalities per event: 0.06
Median fatalities per event: 0.0
Maximum fatalities in single event: 29
Events with zero fatalities: 5,777 (96.5%)

COMPREHENSIVE ACLED SUMMARY
Total events in merged dataset: 5,986
Date range: 2025-04-03 00:00:00 to 2025-06-27 00:00:00
Time span: 85 days
Countries covered: 1
Top 3 countries by event count:
  India: 5,986 events


In [67]:
# FACTAL Distribution Analysis (Forecast Generation Style)
if factal_pd is not None:
    
    def create_factal_distribution_plot(data_column, column_name, top_n=15, chart_title=None):
        """
        Create a horizontal bar chart with forecast_generation styling for Factal data
        """
        if column_name not in factal_pd.columns:
            print(f"Column '{column_name}' not found in Factal dataset")
            return None, None
            
        # Handle potential null values and convert to string
        data_series = factal_pd[column_name].fillna('Unknown').astype(str)
        
        # Get value counts
        value_counts = data_series.value_counts()
        
        # Remove 'Unknown' or empty values for cleaner visualization
        value_counts = value_counts[~value_counts.index.isin(['Unknown', 'nan', '', 'None', "N/A", "Air India"])]
        
        # Take top N values if specified
        if top_n and len(value_counts) > top_n:
            value_counts = value_counts.head(top_n)
            title_suffix = f" (Top {top_n})"
        else:
            title_suffix = ""
        
        if len(value_counts) == 0:
            print(f"No valid data found for {column_name}")
            return None, None
        
        # Sort in ascending order for proper horizontal bar chart display
        value_counts = value_counts.sort_values(ascending=True)
        
        # Prepare data
        categories = value_counts.index.tolist()
        counts = value_counts.values.tolist()
        
        # Create text labels showing both count and percentage
        total_events = len(data_series[~data_series.isin(['Unknown', 'nan', '', 'None'])])
        text_labels = []
        for count in counts:
            percentage = (count / total_events) * 100
            text_labels.append(f"<b>{count:,}</b> ({percentage:.1f}%)")
        
        # Create color mapping (similar to forecast_generation script)
        colors = []
        max_count = max(counts)
        for count in counts:
            # Color intensity based on relative frequency
            intensity = count / max_count
            if intensity >= 0.8:
                colors.append("#E02A2A")  # High frequency - saddle brown
            elif intensity >= 0.6:
                colors.append("#FF8F1F")  # Medium-high frequency - sandy brown
            elif intensity >= 0.4:
                colors.append("#FFC01F")  # Medium frequency - goldenrod
            elif intensity >= 0.2:
                colors.append("#53C2EE")  # Low-medium frequency - light sea green
            else:
                colors.append("#477091")  # Low frequency - steel blue
        
        # Create the horizontal bar chart
        fig = go.Figure()
        
        fig.add_trace(go.Bar(
            y=categories,
            x=counts,
            orientation='h',
            marker=dict(
                color=colors,
                line=dict(color='white', width=0.5)
            ),
            text=text_labels,
            textposition='outside',
            textfont=dict(size=16),
            cliponaxis=False,
            hovertemplate=(
                "<b>%{y}</b><br>"
                "Count: %{x:,}<br>"
                "Percentage: %{customdata:.1f}%"
                "<extra></extra>"
            ),
            customdata=[(count / total_events) * 100 for count in counts]
        ))
        
        # Calculate dynamic height based on number of categories
        num_categories = len(categories)
        bar_height = 50
        calculated_height = max(400, num_categories * bar_height)
        
        # Set x-axis range with extra space for labels
        x_range = [0, max(counts) * 1.5]
        
        # Update axes with styling similar to forecast_generation
        fig.update_xaxes(
            showticklabels=True,
            tickfont=dict(size=16),
            showgrid=False,
            gridcolor='lightgray',
            automargin=True,
            title="Number of Articles"
        )

        fig.update_yaxes(
            tickfont=dict(size=16),
            tickmode='linear',
            showgrid=False,
            side='left',
            categoryorder='array',
            categoryarray=categories,
            automargin=True,
            title=column_name.replace('_', ' ').title()
        )
        
        # Update layout with styling similar to forecast_generation
        fig.update_layout(
            font=dict(size=16, family="Times New Roman", color="black"),
            uniformtext_minsize=16,
            uniformtext_mode='show',
            margin=dict(l=200, r=100, t=80, b=80),
            height=calculated_height,
            width=1200,
            paper_bgcolor='white',
            plot_bgcolor='white',
            showlegend=False,
            xaxis=dict(range=x_range),
            bargap=0.3
        )
        
        # Add vertical line at median for reference
        median_count = np.median(counts)
        fig.add_vline(
            x=median_count, 
            line_dash="dash", 
            line_color="gray", 
            line_width=1,
            annotation_text=f"Median: {median_count:,.0f}",
            annotation_position="top"
        )
        
        return fig, value_counts
    
    print("="*60)
    print("FACTAL DISTRIBUTION ANALYSIS")
    print("="*60)
    
    # 1. Severity Distribution
    print("Creating Factal Severity Distribution...")
    severity_col = None
    # Check for different possible severity column names
    for col in ['severity', 'Severity', 'severity_score', 'alert_level']:
        if col in factal_pd.columns:
            severity_col = col
            break
    
    if severity_col:
        fig_severity, severity_stats = create_factal_distribution_plot(severity_col, severity_col)
        if fig_severity:
            fig_severity.show()
            print(f"Severity levels found: {len(severity_stats)}")
        else:
            print(f"No valid severity data found in column '{severity_col}'")
    else:
        print("No severity column found in Factal data")
        print(f"Available columns: {list(factal_pd.columns)}")
    
    # 2. Tag Distribution (Top 15)
    print(f"\nCreating Factal Tag Distribution...")
    tag_col = None
    # Check for different possible tag column names
    for col in ['tag', 'tags', 'Tag', 'Tags', 'category', 'categories']:
        if col in factal_pd.columns:
            tag_col = col
            break
    
    if tag_col:
        fig_tag, tag_stats = create_factal_distribution_plot(tag_col, tag_col, top_n=15)
        if fig_tag:
            fig_tag.show()
            print(f"Total unique tags: {factal_pd[tag_col].nunique()}")
            print(f"Showing top 15 tags")
        else:
            print(f"No valid tag data found in column '{tag_col}'")
    else:
        print("No tag column found in Factal data")
    
    # 3. Theme Distribution (Top 15)
    print(f"\nCreating Factal Theme Distribution...")
    theme_col = None
    # Check for different possible theme column names
    for col in ['theme', 'themes', 'Theme', 'Themes', 'topic', 'topics']:
        if col in factal_pd.columns:
            theme_col = col
            break
    
    if theme_col:
        fig_theme, theme_stats = create_factal_distribution_plot(theme_col, theme_col, top_n=15)
        if fig_theme:
            fig_theme.show()
            print(f"Total unique themes: {factal_pd[theme_col].nunique()}")
            print(f"Showing top 15 themes")
        else:
            print(f"No valid theme data found in column '{theme_col}'")
    else:
        print("No theme column found in Factal data")
    
    # 4. Content Length Distribution (if text data is available)
    print(f"\nAnalyzing Factal Content Length...")
    text_cols = ['content', 'text', 'description', 'summary', 'body']
    text_col = None
    for col in text_cols:
        if col in factal_pd.columns:
            text_col = col
            break
    
    if text_col:
        # Calculate text lengths
        factal_pd['content_length'] = factal_pd[text_col].fillna('').astype(str).str.len()
        
        # Create length categories
        def categorize_length(x):
            if x == 0:
                return "Empty"
            elif 1 <= x <= 100:
                return "Very Short (1-100)"
            elif 101 <= x <= 300:
                return "Short (101-300)"
            elif 301 <= x <= 600:
                return "Medium (301-600)"
            elif 601 <= x <= 1000:
                return "Long (601-1000)"
            elif 1001 <= x <= 2000:
                return "Very Long (1001-2000)"
            else:
                return "Extremely Long (2000+)"
        
        factal_pd['length_category'] = factal_pd['content_length'].apply(categorize_length)
        
        fig_length, length_stats = create_factal_distribution_plot('length_category', 'length_category')
        if fig_length:
            fig_length.show()
            
            # Print length statistics
            print(f"Average content length: {factal_pd['content_length'].mean():.0f} characters")
            print(f"Median content length: {factal_pd['content_length'].median():.0f} characters")
            print(f"Maximum content length: {factal_pd['content_length'].max():,} characters")
            print(f"Articles with empty content: {(factal_pd['content_length'] == 0).sum():,} ({(factal_pd['content_length'] == 0).mean()*100:.1f}%)")
    else:
        print("No text content column found for length analysis")
    
    # Print overall Factal summary
    print(f"\n" + "="*60)
    print("COMPREHENSIVE FACTAL SUMMARY")
    print("="*60)
    print(f"Total articles in merged dataset: {len(factal_pd):,}")
    
    if 'date' in factal_pd.columns or 'published_date' in factal_pd.columns or 'timestamp' in factal_pd.columns:
        date_col = None
        for col in ['date', 'published_date', 'timestamp', 'created_at']:
            if col in factal_pd.columns:
                date_col = col
                break
        
        if date_col:
            try:
                factal_pd[date_col] = pd.to_datetime(factal_pd[date_col])
                print(f"Date range: {factal_pd[date_col].min()} to {factal_pd[date_col].max()}")
                print(f"Time span: {(factal_pd[date_col].max() - factal_pd[date_col].min()).days} days")
            except:
                print(f"Could not parse dates in column '{date_col}'")
    
    if 'country' in factal_pd.columns or 'region' in factal_pd.columns:
        geo_col = 'country' if 'country' in factal_pd.columns else 'region'
        print(f"Geographic regions covered: {factal_pd[geo_col].nunique()}")
        top_regions = factal_pd[geo_col].value_counts().head(3)
        print(f"Top 3 {geo_col}s by article count:")
        for region, count in top_regions.items():
            print(f"  {region}: {count:,} articles")
    
else:
    print("Factal data not available for distribution analysis")

FACTAL DISTRIBUTION ANALYSIS
Creating Factal Severity Distribution...


Severity levels found: 5

Creating Factal Tag Distribution...


Total unique tags: 105
Showing top 15 tags

Creating Factal Theme Distribution...


Total unique themes: 10
Showing top 15 themes

Analyzing Factal Content Length...


Average content length: 267 characters
Median content length: 257 characters
Maximum content length: 1,058 characters
Articles with empty content: 0 (0.0%)

COMPREHENSIVE FACTAL SUMMARY
Total articles in merged dataset: 1,467
Date range: 2025-04-03 00:00:00 to 2025-07-03 00:00:00
Time span: 91 days
Geographic regions covered: 20
Top 3 countrys by article count:
  India: 1,294 articles
  Pakistan: 145 articles
  US: 4 articles


In [64]:
# Load and merge all Google News data files
google_files = [f for f in parquet_files if 'google' in f.name.lower()]
if google_files:
    print(f"Found {len(google_files)} Google News files:")
    
    # Load and combine all Google files with error handling
    all_google_dfs = []
    for file in google_files:
        print(f"  Loading: {file.name}")
        try:
            df = pl.read_parquet(file)
            all_google_dfs.append(df)
            print(f"    Successfully loaded: {df.shape}")
        except Exception as e:
            print(f"    Error loading {file.name}: {e}")
            continue
    
    if all_google_dfs:
        # Merge all Google dataframes
        if len(all_google_dfs) > 1:
            print("Merging all Google News dataframes...")
            google_df = pl.concat(all_google_dfs, how="vertical")
            print(f"Combined shape: {google_df.shape}")
        else:
            google_df = all_google_dfs[0]
            print(f"Single file shape: {google_df.shape}")
        
        print(f"\nCombined Google News Dataset Overview:")
        print(f"Shape: {google_df.shape}")
        print(f"Columns: {google_df.columns}")
        
        # Convert to pandas for plotting
        google_pd = google_df.to_pandas()
        
        # Remove duplicates if any (based on url or title)
        original_size = len(google_pd)
        if 'url' in google_pd.columns:
            google_pd = google_pd.drop_duplicates(subset=['url'])
            print(f"\nRemoved {original_size - len(google_pd)} duplicate records based on URL")
        elif 'title' in google_pd.columns:
            google_pd = google_pd.drop_duplicates(subset=['title'])
            print(f"\nRemoved {original_size - len(google_pd)} duplicate records based on title")
        
        # Remove rows with empty cells in key columns
        size_before_empty_removal = len(google_pd)
        
        # Check for important text columns and remove rows where they are empty
        text_columns_to_check = ['full_text', 'text', 'content', 'description', 'summary', 'title']
        existing_text_cols = [col for col in text_columns_to_check if col in google_pd.columns]
        
        if existing_text_cols:
            print(f"\nChecking for empty cells in text columns: {existing_text_cols}")
            
            # For each text column, show empty statistics before removal
            for col in existing_text_cols:
                empty_count = google_pd[col].isna().sum() + (google_pd[col].astype(str).str.strip() == '').sum()
                empty_pct = (empty_count / len(google_pd)) * 100
                print(f"  {col}: {empty_count:,} empty cells ({empty_pct:.1f}%)")
            
            # Remove rows where primary text column (if exists) is empty
            primary_text_col = None
            for col in ['full_text', 'text', 'content', 'description']:
                if col in google_pd.columns:
                    primary_text_col = col
                    break
            
            if primary_text_col:
                print(f"\nRemoving rows with empty '{primary_text_col}' column...")
                # Remove rows where the primary text column is null or empty string
                google_pd = google_pd[
                    google_pd[primary_text_col].notna() & 
                    (google_pd[primary_text_col].astype(str).str.strip() != '')
                ]
                removed_empty = size_before_empty_removal - len(google_pd)
                print(f"Removed {removed_empty:,} rows with empty '{primary_text_col}' ({removed_empty/size_before_empty_removal*100:.1f}%)")
            
            # Also remove rows where title is empty (if title column exists)
            if 'title' in google_pd.columns:
                size_before_title = len(google_pd)
                google_pd = google_pd[
                    google_pd['title'].notna() & 
                    (google_pd['title'].astype(str).str.strip() != '')
                ]
                removed_title = size_before_title - len(google_pd)
                if removed_title > 0:
                    print(f"Additionally removed {removed_title:,} rows with empty titles")
        
        else:
            print("\nNo standard text columns found for empty cell removal")
        
        print(f"Final dataset size: {len(google_pd):,} records")
        total_removed = original_size - len(google_pd)
        print(f"Total records removed: {total_removed:,} ({total_removed/original_size*100:.1f}%)")
        
    else:
        print("No Google News files could be loaded successfully")
        google_df = None
        google_pd = None
        
else:
    print("No Google News files found")
    google_df = None
    google_pd = None

Found 5 Google News files:
  Loading: google_news_India_2025-06-26_2025-07-01.parquet
    Successfully loaded: (500, 7)
  Loading: google_news_India_2025-07-01_2025-07-03.parquet
    Successfully loaded: (192, 7)
  Loading: google_news_Sudan_2025-07-01_2025-07-03.parquet
    Successfully loaded: (142, 7)
  Loading: google_news_United_States_2025-06-26_2025-07-02.parquet
    Successfully loaded: (500, 7)
  Loading: google_news_United_States_2025-07-02_2025-07-03.parquet
    Successfully loaded: (100, 7)
Merging all Google News dataframes...
Combined shape: (1434, 7)

Combined Google News Dataset Overview:
Shape: (1434, 7)
Columns: ['title', 'google_link', 'source', 'id', 'date', 'decoded_url', 'full_text']

Removed 8 duplicate records based on title

Checking for empty cells in text columns: ['full_text', 'title']
  full_text: 343 empty cells (24.1%)
  title: 0 empty cells (0.0%)

Removing rows with empty 'full_text' column...
Removed 343 rows with empty 'full_text' (24.1%)
Final datase

In [65]:
# GOOGLE NEWS Full Text Character Count Distribution Analysis
if google_pd is not None:
    
    def create_google_character_distribution_plot(text_column, chart_title=None):
        """
        Create a horizontal bar chart showing character count distribution for Google News text data
        """
        if text_column not in google_pd.columns:
            print(f"Column '{text_column}' not found in Google News dataset")
            print(f"Available columns: {list(google_pd.columns)}")
            return None, None
            
        # Calculate character lengths
        print(f"Analyzing character count distribution for '{text_column}' column...")
        
        # Handle null values and calculate text lengths
        text_data = google_pd[text_column].fillna('').astype(str)
        char_lengths = text_data.str.len()
        
        print(f"Text length statistics:")
        print(f"  Total articles: {len(char_lengths):,}")
        print(f"  Average length: {char_lengths.mean():.0f} characters")
        print(f"  Median length: {char_lengths.median():.0f} characters")
        print(f"  Min length: {char_lengths.min():,} characters")
        print(f"  Max length: {char_lengths.max():,} characters")
        print(f"  Articles with empty text: {(char_lengths == 0).sum():,} ({(char_lengths == 0).mean()*100:.1f}%)")
        
        # Create character length categories
        def categorize_char_length(x):
            if x == 0:
                return "Empty (0)"
            elif 1 <= x <= 100:
                return "Very Short (1-100)"
            elif 101 <= x <= 500:
                return "Short (101-500)"
            elif 501 <= x <= 1000:
                return "Medium (501-1000)"
            elif 1001 <= x <= 2000:
                return "Long (1001-2000)"
            elif 2001 <= x <= 5000:
                return "Very Long (2001-5000)"
            elif 5001 <= x <= 10000:
                return "Extremely Long (5001-10000)"
            else:
                return "Ultra Long (10000+)"
        
        # Apply categorization
        google_pd['char_length_category'] = char_lengths.apply(categorize_char_length)
        
        # Get value counts for categories
        value_counts = google_pd['char_length_category'].value_counts()
        
        # Sort in ascending order for proper horizontal bar chart display
        # Custom order for logical progression
        category_order = [
            "Empty (0)",
            "Very Short (1-100)", 
            "Short (101-500)",
            "Medium (501-1000)",
            "Long (1001-2000)",
            "Very Long (2001-5000)",
            "Extremely Long (5001-10000)",
            "Ultra Long (10000+)"
        ]
        
        # Reorder based on category_order, only including existing categories
        ordered_counts = pd.Series(dtype='int64')
        for category in category_order:
            if category in value_counts.index:
                ordered_counts[category] = value_counts[category]
        
        # Prepare data
        categories = ordered_counts.index.tolist()
        counts = ordered_counts.values.tolist()
        
        if len(counts) == 0:
            print("No valid character count data found")
            return None, None
        
        # Create text labels showing both count and percentage
        total_articles = len(google_pd)
        text_labels = []
        for count in counts:
            percentage = (count / total_articles) * 100
            text_labels.append(f"<b>{count:,}</b> ({percentage:.1f}%)")
        
        # Create color mapping (Google-themed colors)
        colors = []
        max_count = max(counts)
        for count in counts:
            # Color intensity based on relative frequency
            intensity = count / max_count
            if intensity >= 0.8:
                colors.append('#EA4335')  # High frequency - Google red
            elif intensity >= 0.6:
                colors.append('#FBBC04')  # Medium-high frequency - Google yellow
            elif intensity >= 0.4:
                colors.append("#F6C43C")  # Medium frequency - Google green
            elif intensity >= 0.2:
                colors.append("#42BFF4")  # Low-medium frequency - Google blue
            else:
                colors.append("#4D94DB")  # Low frequency - Google gray
        
        # Create the horizontal bar chart
        fig = go.Figure()
        
        fig.add_trace(go.Bar(
            y=categories,
            x=counts,
            orientation='h',
            marker=dict(
                color=colors,
                line=dict(color='white', width=0.5)
            ),
            text=text_labels,
            textposition='outside',
            textfont=dict(size=16, family="Times New Roman", color="black"),
            cliponaxis=False,
            hovertemplate=(
                "<b>%{y}</b><br>"
                "Count: %{x:,}<br>"
                "Percentage: %{customdata:.1f}%"
                "<extra></extra>"
            ),
            customdata=[(count / total_articles) * 100 for count in counts],
            name="Google News Character Count Distribution"
        ))
        
        # Calculate dynamic height based on number of categories
        num_categories = len(categories)
        bar_height = 50
        calculated_height = max(400, num_categories * bar_height)
        
        # Set x-axis range with extra space for labels
        x_range = [0, max(counts) * 1.5]
        
        # Update axes with styling similar to other plots
        fig.update_xaxes(
            showticklabels=True,
            tickfont=dict(size=16, family="Times New Roman", color="black"),
            showgrid=False,
            gridcolor='lightgray',
            automargin=True,
            title="Number of Articles"
        )

        fig.update_yaxes(
            tickfont=dict(size=16, family="Times New Roman", color="black"),
            tickmode='linear',
            showgrid=False,
            side='left',
            categoryorder='array',
            categoryarray=categories,
            automargin=True,
            title="Character Length Category"
        )
        
        # Update layout with styling similar to other plots
        fig.update_layout(
            font=dict(size=16, family="Times New Roman", color="black"),
            uniformtext_minsize=16,
            uniformtext_mode='show',
            margin=dict(l=250, r=100, t=80, b=80),
            height=calculated_height,
            width=1200,
            paper_bgcolor='white',
            plot_bgcolor='white',
            showlegend=False,
            xaxis=dict(range=x_range),
            bargap=0.3
        )
        
        # Add vertical line at median for reference
        median_count = np.median(counts)
        fig.add_vline(
            x=median_count, 
            line_dash="dash", 
            line_color="gray", 
            line_width=1,
            annotation_text=f"Median: {median_count:,.0f}",
            annotation_position="top"
        )
        
        return fig, ordered_counts
    
    print("="*60)
    print("GOOGLE NEWS CHARACTER COUNT ANALYSIS")
    print("="*60)
    
    # Check if 'full_text' column exists
    if 'full_text' in google_pd.columns:
        fig_char_count, char_count_stats = create_google_character_distribution_plot('full_text')
        if fig_char_count:
            fig_char_count.show()
            print(f"\nCharacter count categories found: {len(char_count_stats)}")
            
            # Print detailed statistics
            char_lengths = google_pd['full_text'].fillna('').astype(str).str.len()
            print(f"\nDetailed Statistics:")
            print(f"  Standard deviation: {char_lengths.std():.0f} characters")
            print(f"  25th percentile: {char_lengths.quantile(0.25):.0f} characters")
            print(f"  75th percentile: {char_lengths.quantile(0.75):.0f} characters")
            print(f"  95th percentile: {char_lengths.quantile(0.95):.0f} characters")
            
            # Show distribution of categories
            print(f"\nCategory Distribution:")
            for category, count in char_count_stats.items():
                percentage = (count / len(google_pd)) * 100
                print(f"  {category}: {count:,} articles ({percentage:.1f}%)")
        else:
            print("Could not create character count distribution plot")
    else:
        print("'full_text' column not found in Google News data")
        print(f"Available columns: {list(google_pd.columns)}")
        
        # Check for alternative text columns
        alternative_text_cols = ['text', 'content', 'description', 'summary', 'body', 'snippet']
        found_text_col = None
        for col in alternative_text_cols:
            if col in google_pd.columns:
                found_text_col = col
                break
        
        if found_text_col:
            print(f"\nFound alternative text column: '{found_text_col}'")
            print("Creating character count distribution for this column instead...")
            fig_char_count, char_count_stats = create_google_character_distribution_plot(found_text_col)
            if fig_char_count:
                fig_char_count.show()
                print(f"Character count categories found: {len(char_count_stats)}")
        else:
            print("No suitable text column found for character count analysis")
    
    # Print overall Google News summary
    print(f"\n" + "="*60)
    print("COMPREHENSIVE GOOGLE NEWS SUMMARY")
    print("="*60)
    print(f"Total articles in merged dataset: {len(google_pd):,}")
    
    # Check for date information
    date_cols = ['date', 'published_date', 'timestamp', 'pubDate', 'published']
    date_col = None
    for col in date_cols:
        if col in google_pd.columns:
            date_col = col
            break
    
    if date_col:
        try:
            google_pd[date_col] = pd.to_datetime(google_pd[date_col])
            print(f"Date range: {google_pd[date_col].min()} to {google_pd[date_col].max()}")
            print(f"Time span: {(google_pd[date_col].max() - google_pd[date_col].min()).days} days")
        except:
            print(f"Could not parse dates in column '{date_col}'")
    
    # Check for source information
    source_cols = ['source', 'Source', 'publisher', 'site', 'domain']
    source_col = None
    for col in source_cols:
        if col in google_pd.columns:
            source_col = col
            break
    
    if source_col:
        unique_sources = google_pd[source_col].nunique()
        print(f"Unique sources: {unique_sources}")
        if unique_sources > 0:
            top_sources = google_pd[source_col].value_counts().head(3)
            print("Top 3 sources by article count:")
            for source, count in top_sources.items():
                print(f"  {source}: {count:,} articles")
    
else:
    print("Google News data not available for character count analysis")

GOOGLE NEWS CHARACTER COUNT ANALYSIS
Analyzing character count distribution for 'full_text' column...
Text length statistics:
  Total articles: 1,083
  Average length: 5137 characters
  Median length: 3336 characters
  Min length: 72 characters
  Max length: 84,673 characters
  Articles with empty text: 0 (0.0%)



Character count categories found: 7

Detailed Statistics:
  Standard deviation: 6369 characters
  25th percentile: 2126 characters
  75th percentile: 6236 characters
  95th percentile: 13407 characters

Category Distribution:
  Very Short (1-100): 3 articles (0.3%)
  Short (101-500): 32 articles (3.0%)
  Medium (501-1000): 54 articles (5.0%)
  Long (1001-2000): 156 articles (14.4%)
  Very Long (2001-5000): 486 articles (44.9%)
  Extremely Long (5001-10000): 239 articles (22.1%)
  Ultra Long (10000+): 113 articles (10.4%)

COMPREHENSIVE GOOGLE NEWS SUMMARY
Total articles in merged dataset: 1,083
Date range: 2025-06-26 00:00:00 to 2025-07-03 00:00:00
Time span: 7 days
Unique sources: 403
Top 3 sources by article count:
  Times of India: 90 articles
  army.mil: 54 articles
  The Economic Times: 46 articles
