In [1]:
import pandas as pd
import re
import os
import pandas as pd
import tqdm.notebook as tqdm
import os
import requests
from bs4 import BeautifulSoup
from zipfile import ZipFile
from io import BytesIO

In [2]:
# Read the CAMEO codes from the text file
cameo = pd.read_csv("cameo.txt", delimiter=':', header=None)

cameo.columns = ['EventType', 'EventDesc']
cameo['EventDesc'] = cameo['EventDesc'].str.strip()
cameo

Unnamed: 0,EventType,EventDesc
0,1,Make public statement
1,10,Make statement
2,11,Decline comment
3,12,Make pessimistic comment
4,13,Make optimistic comment
...,...,...
311,202,Engage in mass killings
312,203,Engage in ethnic cleansing
313,204,Use weapons of mass destruction
314,2041,"Use chemical, biological, or radiological weapons"


SELECT
  MonthYear,
  EventCode,
  QuadClass,
  GoldsteinScale,
  AvgTone,
  ActionGeo_CountryCode,
  Actor1Geo_CountryCode,
  Actor1Geo_ADM1Code,
  SOURCEURL
FROM
  `gdelt-bq.full.events`
WHERE
  (Actor1Geo_CountryCode = "US" OR ActionGeo_CountryCode = "US")
  AND MonthYear >= 201501
  AND MonthYear <= 202106


In [None]:
# Set your destination folder
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)

# GDELT events archive URL
base_url = "http://data.gdeltproject.org/events/"

# Step 1: Get the list of ZIP files
response = requests.get(base_url)
soup = BeautifulSoup(response.content, 'html.parser')

# Step 2: Filter links that end with .zip
zip_links = [
    a['href']
    for a in soup.find_all('a', href=True)
    if a['href'].endswith('.zip') and a['href'].startswith('2')
]

# OPTIONAL: limit number of files to download (for testing)
zip_links = zip_links[:1]  # e.g., first 10 files

# Step 3: Download and extract each ZIP file
for link in zip_links:
    zip_url = base_url + link
    print(f'Downloading: {zip_url}')
    r = requests.get(zip_url)
    
    if r.status_code == 200:
        with ZipFile(BytesIO(r.content)) as z:
            # Assume there's only 1 CSV file per zip
            for file_name in z.namelist():
                print(f'  Extracting: {file_name}')
                z.extract(file_name, data_dir)
    else:
        print(f'  Failed to download: {zip_url} (Status code {r.status_code})')

Downloading: http://data.gdeltproject.org/events/20250513.export.CSV.zip
  Extracting: 20250513.export.CSV


In [None]:
def is_article_id(part):
    # Check if the part starts with 'article_' followed by a UUID and optional .html extension
    article_uuid_pattern = re.compile(
        r'^article_'
        r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'
        r'(\.html)?$',
        re.IGNORECASE
    )
    return bool(article_uuid_pattern.match(part))

def extract_title_from_url(url):
    # Split the URL into parts, stripping any leading/trailing slashes
    parts = url.strip('/').split('/')
    # Iterate from the end to find the first article ID part
    for i in tqdm.tqdm(reversed(range(len(parts)))):
        part = parts[i]
        if is_article_id(part):
            # Found an article ID, check if there's a previous part for the title
            if i > 0:
                title_part = parts[i-1]
                return title_part.replace('-', ' ').strip()
            else:
                continue  # No previous part, keep looking
    # If no article ID found, take the last part and remove any file extension
    last_part = parts[-1]
    if '.' in last_part:
        last_part = last_part.split('.')[0]
    return last_part.replace('-', ' ').strip()

csv_files = [f for f in os.listdir(data_dir) if f.endswith('.csv')]

df_all = pd.DataFrame()
# column_dict = {
#     0: 'event_id',
#     1: 'SQLDATE',
#     2: 'MonthYear',
#     3: 'Year',
#     4: 'FractionDate',
#     5: 'Actor1Code',
#     6: 'Actor1Name',
#     7: 'Actor1CountryCode',
#     8: 'Actor1KnownGroupCode',
#     9: 'Actor1EthnicCode',
#     10: 'Actor1Religion1Code',
#     11: 'Actor1Religion2Code',
#     12: 'Actor1Type1Code',
#     13: 'Actor1Type2Code',
#     14: 'Actor1Type3Code',
#     15: 'Actor2Code',
#     16: 'Actor2Name',
#     17: 'Actor2CountryCode',
#     18: 'Actor2KnownGroupCode',
#     19: 'Actor2EthnicCode',
#     20: 'Actor2Religion1Code',
#     21: 'Actor2Religion2Code',
#     22: 'Actor2Type1Code',
#     23: 'Actor2Type2Code',
#     24: 'Actor2Type3Code',
#     25: 'IsRootEvent',
#     26: 'EventCode',
#     27: 'EventBaseCode',
#     28: 'EventRootCode',
#     29: 'QuadClass',
#     30: 'GoldsteinScale',
#     31: 'NumMentions',
#     32: 'NumSources',
#     33: 'NumArticles',
#     34: 'AvgTone',
#     35: 'Actor1Geo_Type',
#     36: 'Actor1Geo_FullName',
#     37: 'Actor1Geo_CountryCode',
#     38: 'Actor1Geo_ADM1Code',
#     39: 'Actor1Geo_Lat',
#     40: 'Actor1Geo_Long',
#     41: 'Actor1Geo_FeatureID',
#     42: 'Actor2Geo_Type',
#     43: 'Actor2Geo_FullName',
#     44: 'Actor2Geo_CountryCode',
#     45: 'Actor2Geo_ADM1Code',
#     46: 'Actor2Geo_Lat',
#     47: 'Actor2Geo_Long',
#     48: 'Actor2Geo_FeatureID',
#     49: 'ActionGeo_Type',
#     50: 'ActionGeo_FullName',
#     51: 'ActionGeo_CountryCode',
#     52: 'ActionGeo_ADM1Code',
#     53: 'ActionGeo_Lat',
#     54: 'ActionGeo_Long',
#     55: 'ActionGeo_FeatureID',
#     56: 'date',
#     57: 'url',
# }

# Iterate over CSV files, format the DF, and extract title from URL
for csv_file in csv_files:
    file_path = os.path.join(data_dir, csv_file)
    print(f'Importing {csv_file}')
    
    df = pd.read_csv(file_path, delimiter=',', low_memory=False)
    #df = df.rename(columns=column_dict)
    #df = df[column_dict.values()]

    if 'SOURCEURL' in df.columns:  
        df['url_title'] = df['SOURCEURL'].apply(extract_title_from_url)
    else:
        print(f"Warning: 'url' not found for file {csv_file}")
    
    # Append results to the df_all DataFrame
    df_all = pd.concat([df_all, df], ignore_index=True)

# Clean the data by removing duplicates
df_all_clean = df_all.drop_duplicates(subset=['MonthYear', 'url_title']).copy()

# Map the EventCode column in df_all_clean to get the corresponding event descriptions
df_all_clean['EventCode'] = df_all_clean['EventCode'].astype(int)
cameo_dict = dict(zip(cameo['EventType'], cameo['EventDesc']))
df_all_clean['EventDesc'] = df_all_clean['EventCode'].map(cameo_dict)

print('---------------------------------')
print(f"Total rows after cleaning: {len(df_all_clean)}")

df_all_clean

In [None]:
# Save the cleaned data to a CSV file
df_all_clean.to_csv('cleaned_data.csv', index=False)

In [3]:
df_all_clean = pd.read_csv('cleaned_data.csv', low_memory=False)

In [6]:
# Test on the first 10 rows
df_all_clean_mini = df_all_clean.head(10).copy()
df_all_clean_mini.shape

(10, 60)

In [7]:
from newsfeed.utils import fulltext as ft
import time
import tqdm.notebook as tqdm
import sys
import contextlib
import signal

def timeout_handler(signum, frame):
    raise TimeoutError("Request timed out")

# Create empty lists to store titles and full_text
titles = []
full_texts = []
successful_downloads = 0
failed_downloads = 0

# Get total number of entries
total_entries = len(df_all_clean_mini)

# Loop through each row in df_all_clean_mini with improved progress tracking
for index, row in tqdm.tqdm(df_all_clean_mini.iterrows(), total=total_entries, leave=True):
    with contextlib.redirect_stdout(None):
        url = row['url']
        start_time = time.time()  # Define start_time for each iteration
        
        try:
            # Set timeout alarm
            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(15)  # 15 seconds timeout
            
            # Download and parse article
            article = ft.download(url=url)
            article.download()
            article.parse()
            
            # Clear the alarm
            signal.alarm(0)
            
            # Extract title and full text
            titles.append(article.title)
            full_texts.append(article.text)
            successful_downloads += 1
            
        except Exception as e:
            print(f"Error processing URL: {e}")
            # Add placeholder values for failures
            titles.append(None)
            full_texts.append(None)
            failed_downloads += 1
        
    # Add a small delay to avoid overloading servers
    time.sleep(0.5)
    # Check if this iteration took too long, but continue to next article rather than breaking
    if time.time() - start_time > 15:
        print(f"Timeout reached for URL: {url}")


# Add the new columns to df_all_clean_mini
df_all_clean_mini['title'] = titles
df_all_clean_mini['full_text'] = full_texts

# Print summary statistics
print(f"Download complete: {successful_downloads} successful, {failed_downloads} failed out of {total_entries} articles")

df_all_clean_mini

  0%|          | 0/10 [00:00<?, ?it/s]

Download complete: 9 successful, 1 failed out of 10 articles


Unnamed: 0,event_id,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,...,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,date,url,url_title,EventDesc,title,full_text
0,1243421051,20240513,202405,2024,2024.3644,CHE,BASEL,CHE,,,...,SZ,47.0,8.0,SZ,20250513,https://www.manchestereveningnews.co.uk/news/t...,graham norton shares verdict uks 31627176,Express intent to meet or negotiate,Graham Norton shares verdict on UK's Eurovisio...,Graham Norton shares verdict on UK's Eurovisio...
1,1243421054,20240513,202405,2024,2024.3644,CRMELI,CRIMINAL,,,,...,MX25,24.9086,-107.594,-1703946,20250513,https://www.yahoo.com/news/mexico-vows-hunt-do...,mexico vows hunt down killer 225607029,Demand,Mexico vows to hunt down killer of mayoral can...,Mexican authorities were seeking suspects Mond...
2,1243421055,20240513,202405,2024,2024.3644,CVL,SCIENTIST,,,,...,USCA,34.3667,-118.201,277283,20250513,https://www.independent.co.uk/news/world/ameri...,southern california heat wave rescues b2749565,Demand,Early-season Southern California heat wave res...,The latest headlines from our reporters across...
3,1243421060,20240513,202405,2024,2024.3644,GOV,GOVERNMENT,,,,...,USIL,41.3917,-88.2584,421765,20250513,https://www.nbcchicago.com/news/local/chicago-...,3743837,Use unconventional mass violence,Trump officials offer support to plan aimed at...,A stalemate between President Donald Trump and...
4,1243421062,20240513,202405,2024,2024.3644,JUD,APPEALS COURT,,,,...,USTN,35.7449,-86.7489,TN,20250513,https://www.yahoo.com/news/aclu-tennessee-file...,aclu tennessee files petition challenge 205527040,Provide economic aid,ACLU of Tennessee files petition to challenge ...,"NASHVILLE, Tenn. (WKRN) — The American Civil L..."
5,1243421063,20240513,202405,2024,2024.3644,JUD,LAWYER,,,,...,US,39.828175,-98.5795,US,20250513,https://www.wutc.org/2025-05-12/the-president-...,the president has named a new acting librarian...,"Arrest, detain, or charge with legal action",The President has named a new Acting Librarian...,"Todd Blanche, the Deputy Attorney General of t..."
6,1243421064,20240513,202405,2024,2024.3644,USA,CALIFORNIA,USA,,,...,USCA,36.17,-119.746,CA,20250513,https://www.yahoo.com/news/washington-rein-fas...,washington rein fast drivers speed 230745730,Use unconventional mass violence,Washington to rein in fast drivers with speed ...,Yahoo is using AI to generate takeaways from t...
7,1243421066,20240513,202405,2024,2024.3644,USA,NORTH CAROLINA,USA,,,...,USNC,35.6411,-79.8431,NC,20250513,https://www.wfmynews2.com/article/news/local/2...,83 b33ace78 efb2 45a7 b770 865622ddc17d,Use unconventional mass violence,,
8,1243421069,20240513,202405,2024,2024.3644,USA,UNITED STATES,USA,,,...,MX,23.0,-102.0,MX,20250513,https://www.yahoo.com/news/latin-music-festiva...,latin music festivals scramble amid 223251657,Threaten,Latin Music Festivals Scramble Amid Visa Uncer...,Yahoo is using AI to generate takeaways from t...
9,1243421070,20240513,202405,2024,2024.3644,USA,ATLANTA,USA,,,...,USGA,32.9866,-83.6487,GA,20250513,https://tiftongazette.com/2025/05/12/schools-p...,schools prepare to ban cellphones through eigh...,Impose administrative sanctions,Schools prepare to ban cellphones through eigh...,Schools prepare to ban cellphones through eigh...
