In [1]:
import requests
import zipfile
import io
import pandas as pd
from newspaper import Article
import random
from datetime import datetime
import time
from bs4 import BeautifulSoup
import re

In [2]:
# configuration
THEME = 'politic'
WESTERN_COUNTRY_CODES = ['US', 'CA', 'GB', 'UK', 'IE', 'FR',
                         'DE', 'GM', 'NL', 'BE', 'LU', 'CH', 'AT', 'DK', 'SE', 'NO', 'FI',
                        'IT', 'ES', 'PT', 'IS', 'AU', 'NZ', 'GR', 'CY', 'IL']
MAJOR_SOURCES = ['cnn.com', 'theguardian.com', 'washingtonexaminer.com', 'newsweek.com', 'breitbart.com']
ARTICLES_PER_SOURCE = 5

# GDELT column indices
# GDELT column indices
DATE_COL = 0
THEMES_COL = 3
LOCATIONS_COL = 4
TONE_COL = 7
SOURCE_COL = 9

In [3]:
def get_gdelt_file_list():
    """Scrape the GDELT index page to get all available CSV files."""
    index_url = "http://data.gdeltproject.org/gkg/index.html"
    try:
        response = requests.get(index_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all links ending with .gkg.csv.zip
        file_links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            # Match pattern: digits.gkg.csv.zip
            if re.match(r'^\d+\.gkg\.csv\.zip$', href):
                # Extract the date number
                date_str = href.split('.')[0]
                # Check if date is between 20151108 and 20251108
                if len(date_str) == 8 and 20150101 <= int(date_str) <= 20251109:
                    file_links.append(f"http://data.gdeltproject.org/gkg/{href}")
        
        return sorted(file_links, reverse=True)  # Most recent first
    except Exception as e:
        print(f"Error fetching file list: {e}")
        return []

In [4]:
def download_and_parse_gdelt(file_url):
    """Download and parse a GDELT GKG file."""
    try:
        print(f"Downloading {file_url}...")
        response = requests.get(file_url, timeout=60)
        response.raise_for_status()
        
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            inner_filename = z.namelist()[0]
            with z.open(inner_filename) as f:
                df = pd.read_csv(f, sep="\t", header=None, low_memory=False)
        
        print(f"Loaded {len(df):,} rows")
        return df
    except Exception as e:
        print(f"Error downloading/parsing {file_url}: {e}")
        return None

In [5]:
def filter_gdelt_data(df, theme, country_codes, sources):
    """Filter GDELT data by theme, location, and source."""
    # Filter by theme
    theme_filter = df[THEMES_COL].astype(str).str.contains(theme, case=False, na=False)
    
    # Filter by Western countries
    western_pattern = '|'.join([f'#{code}#' for code in country_codes])
    location_filter = df[LOCATIONS_COL].astype(str).str.contains(western_pattern, case=False, na=False)
    
    # Filter by major sources
    df[SOURCE_COL] = df[SOURCE_COL].astype(str).str.lower()
    source_filter = df[SOURCE_COL].apply(lambda s: any(src in s for src in sources))
    
    # Combine filters
    filtered_df = df[theme_filter & location_filter & source_filter]
    
    return filtered_df

In [6]:
def extract_urls_by_source(df, sources):
    """Extract URLs grouped by news source."""
    urls_by_source = {source: [] for source in sources}
    
    for idx, row in df.iterrows():
        source_urls_col = df.shape[1] - 1
        raw_urls = str(row.iloc[source_urls_col])
        source_name = str(row.iloc[SOURCE_COL]).lower()
        date = str(row.iloc[DATE_COL])
        tone = str(row.iloc[TONE_COL])
        
        # Split multiple URLs
        urls = [u.strip() for u in raw_urls.split(';') if u.strip()]
        
        # Debug: print first few rows
        if idx < 3:
            print(f"Debug - Row {idx}: source='{source_name}', urls={urls[:2]}")
        
        # Categorize by source
        for url in urls:
            for source in sources:
                if source in url.lower():
                    urls_by_source[source].append({
                        'url': url,
                        'date': date,
                        'tone': tone,
                        'source': source
                    })
                    break
    
    print(f"Debug - URLs extracted: {[(k, len(v)) for k, v in urls_by_source.items()]}")
    return urls_by_source

In [7]:
def fetch_headline(url, max_retries=2):
    """Fetch headline from URL using newspaper3k."""
    for attempt in range(max_retries):
        try:
            article = Article(url)
            article.download()
            article.parse()
            
            if article.title:
                return article.title
            return None
        except Exception as e:
            if attempt == max_retries - 1:
                print(f"Failed to fetch {url}: {e}")
                return None
            time.sleep(1)
    return None

In [8]:
def sample_articles_from_source(url_data_list, num_articles):
    """Sample random articles and fetch headlines."""
    if not url_data_list:
        return []
    
    # Sample exactly num_articles (or all if fewer available)
    sample_size = min(len(url_data_list), num_articles)
    sampled = random.sample(url_data_list, sample_size)
    
    articles = []
    
    for url_data in sampled:
        headline = fetch_headline(url_data['url'])
        
        if headline:
            articles.append({
                'Date': url_data['date'],
                'Tone': url_data['tone'],
                'Source': url_data['source'],
                'Headline': headline,
                'URL': url_data['url']
            })
            print(f"✓ {url_data['source']}: {headline[:60]}...")
        else:
            print(f"✗ {url_data['source']}: Failed to fetch headline")
        
        time.sleep(0.5)  # Rate limiting
    
    return articles

In [9]:
def process_single_file(file_url, final_df):
    """Process a single GDELT file and append to final dataframe."""
    try:
        # Download and parse
        df = download_and_parse_gdelt(file_url)
        if df is None or len(df) == 0:
            print("No data in file")
            return final_df
        
        # Filter data
        filtered_df = filter_gdelt_data(df, THEME, WESTERN_COUNTRY_CODES, MAJOR_SOURCES)
        print(f"Filtered to {len(filtered_df):,} relevant rows")
        
        if len(filtered_df) == 0:
            print("No relevant rows after filtering")
            return final_df
        
        # Extract URLs by source
        print("Extracting URLs by source...")
        urls_by_source = extract_urls_by_source(filtered_df, MAJOR_SOURCES)
        
        # Determine minimum number of articles across sources
        source_counts = {src: len(urls) for src, urls in urls_by_source.items() if len(urls) > 0}
        if not source_counts:
            print("No articles found for any source")
            return final_df
        
        min_count = min(min(source_counts.values()), ARTICLES_PER_SOURCE)
        print(f"\nSampling {min_count} articles per source")
        print(f"Available articles per source: {source_counts}")
        
        # Sample articles from each source
        all_articles = []
        for source, url_data_list in urls_by_source.items():
            if len(url_data_list) == 0:
                print(f"Skipping {source} - no articles available")
                continue
            
            print(f"\nProcessing {source}...")
            articles = sample_articles_from_source(url_data_list, min_count)
            
            if len(articles) == 0:
                print(f"Warning: Could not fetch any valid headlines from {source}")
            else:
                all_articles.extend(articles)
                print(f"Successfully fetched {len(articles)} articles from {source}")
        
        # Append to final dataframe
        if all_articles:
            new_df = pd.DataFrame(all_articles)
            final_df = pd.concat([final_df, new_df], ignore_index=True)
            print(f"\nAdded {len(all_articles)} articles. Total articles: {len(final_df)}")
        
        return final_df
        
    except Exception as e:
        print(f"Error in process_single_file (detailed): {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        raise

In [10]:
def main():
    """Main execution function."""
    print("Starting GDELT news scraper...")
    print(f"Articles per source: {ARTICLES_PER_SOURCE}\n")
    
    # Initialize empty dataframe
    final_df = pd.DataFrame(columns=['Date', 'Tone', 'Source', 'Headline', 'URL'])
    
    # Get list of all GDELT files
    print("Fetching list of GDELT files...")
    file_list = get_gdelt_file_list()
    
    if not file_list:
        print("No files found. Exiting.")
        return None
    
    print(f"Found {len(file_list)} GDELT files\n")
    
    # Process all files from target year
    year = '2015'
    files_year = [f for f in file_list if year in f]
    print(f"Found {len(files_year)} files from {year}")

    for i, file_url in enumerate(files_year, 1):
        print(f"Processing file {i}/{len(files_year)}")
        try:
            final_df = process_single_file(file_url, final_df)
        except Exception as e:
            print(f"Error processing file {file_url}: {e}")
            continue

        # Add delay between files
        if i < len(files_year):
            time.sleep(2)
    
    print(f"Scraping complete! Total articles collected: {len(final_df)}")

    # Save to CSV
    if len(final_df) > 0:
        output_file = "2015_political_articles.csv"
        final_df.to_csv(output_file, index=False)
        print(f"\nSaved to {output_file}")

    return final_df

In [11]:
results_df = main()

Starting GDELT news scraper...
Articles per source: 5

Fetching list of GDELT files...
Found 3947 GDELT files

Found 365 files from 2015
Processing file 1/365
Downloading http://data.gdeltproject.org/gkg/20151231.gkg.csv.zip...
Loaded 115,598 rows
Filtered to 223 relevant rows
Extracting URLs by source...
Debug - URLs extracted: [('cnn.com', 53), ('theguardian.com', 80), ('washingtonexaminer.com', 36), ('newsweek.com', 4), ('breitbart.com', 50)]

Sampling 4 articles per source
Available articles per source: {'cnn.com': 53, 'theguardian.com': 80, 'washingtonexaminer.com': 36, 'newsweek.com': 4, 'breitbart.com': 50}

Processing cnn.com...
✓ cnn.com: Obama looks toward legacy in final year...
Failed to fetch http://money.cnn.com/2015/12/31/pf/college/illinois-budget-college-grants/: Article `download()` failed with 502 Server Error: Bad Gateway for url: https://money.cnn.com/2015/12/31/pf/college/illinois-budget-college-grants/ on URL http://money.cnn.com/2015/12/31/pf/college/illinois-bu

Failed to fetch http://europe.newsweek.com/us-preparing-fresh-sanctions-iran-wsj-410300: Article `download()` failed with 403 Client Error: Forbidden for url: http://europe.newsweek.com/us-preparing-fresh-sanctions-iran-wsj-410300 on URL http://europe.newsweek.com/us-preparing-fresh-sanctions-iran-wsj-410300
✗ newsweek.com: Failed to fetch headline
Failed to fetch http://europe.newsweek.com/real-africa-through-lens-african-photographers-407886: Article `download()` failed with 403 Client Error: Forbidden for url: http://europe.newsweek.com/real-africa-through-lens-african-photographers-407886 on URL http://europe.newsweek.com/real-africa-through-lens-african-photographers-407886
✗ newsweek.com: Failed to fetch headline
Failed to fetch http://europe.newsweek.com/how-stop-europes-jewish-exodus-israel-408822: Article `download()` failed with 403 Client Error: Forbidden for url: http://europe.newsweek.com/how-stop-europes-jewish-exodus-israel-408822 on URL http://europe.newsweek.com/how-st

KeyboardInterrupt: 

In [None]:
results_df