In [11]:
# CELL 0 – MUST RUN FIRST – Fix Python path so src/ is importable
import sys
from pathlib import Path

# Resolve the project root: the parent directory when the kernel was started
# inside notebooks/, otherwise the current working directory itself.
cwd = Path.cwd()
project_root = cwd.parent if cwd.name == "notebooks" else cwd

# Re-running this cell must not stack duplicate entries on sys.path, so any
# existing copies are dropped before the root is put back at the front
# (front position keeps the same import priority as a plain insert(0, ...)).
root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != root_entry]
sys.path.insert(0, root_entry)

print(f"Added to Python path: {project_root}")
print("You can now do: from src.data.loader import load_full_dataset")

Added to Python path: c:\Users\manuz\Desktop\Adm\Projects\GDELT_Sentiment_FX
You can now do: from src.data.loader import load_full_dataset


In [12]:
import requests
import pandas as pd
from io import BytesIO
from zipfile import ZipFile
from src.utils import COUNTRIES, CCYS, COUNTRY_TO_CCY
# Download one day of the GDELT events export, keep the events whose
# Actor1 country maps to a tracked currency, and aggregate tone per currency.
date_str = "2018-01-01"
yyyymmdd = date_str.replace('-', '')
url = f"http://data.gdeltproject.org/events/{yyyymmdd}.export.CSV.zip"
r = requests.get(url, timeout=90)
r.raise_for_status()

# The archive is read from memory; both the archive and the member handle
# are closed when the with-blocks exit.
with ZipFile(BytesIO(r.content)) as z:
    csv_name = z.namelist()[0]
    with z.open(csv_name) as csv_file:
        df = pd.read_csv(
            csv_file,
            sep='\t',
            header=None,
            dtype=str,          # read everything as text; numeric columns are converted below
            low_memory=False
        )

# Column positions of: GLOBALEVENTID, SQLDATE, Actor1CountryCode,
# IsRootEvent, NumMentions, AvgTone - selected by file width.
n_columns = len(df.columns)
if n_columns == 58:
    positions = [0, 1, 7, 25, 31, 34]
elif n_columns >= 61:
    # NOTE(review): this branch reads IsRootEvent from index 26; the GDELT 2.0
    # codebook appears to list IsRootEvent at index 25 (26 being EventCode) -
    # confirm against the codebook before relying on IsRootEvent here.
    positions = [0, 1, 7, 26, 31, 34]
else:
    # Previously this case fell through and failed later with a NameError.
    raise ValueError(
        f"Unexpected GDELT export width: {n_columns} columns in {csv_name} "
        "(expected 58 or >= 61)"
    )

df = df.iloc[:, positions].copy()
df.columns = ['GLOBALEVENTID', 'SQLDATE', 'Actor1CountryCode', 'IsRootEvent', 'NumMentions', 'AvgTone']

# Normalise placeholder values to missing, then convert the numeric columns;
# anything unparseable becomes NaN.
df = df.replace({'---': pd.NA, '': pd.NA})
for col in ['GLOBALEVENTID', 'SQLDATE', 'IsRootEvent', 'NumMentions', 'AvgTone']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna(subset=['SQLDATE', 'Actor1CountryCode', 'AvgTone']).reset_index(drop=True)

# Each filter is followed by .copy() so the column assignments below write to
# an independent frame rather than to a slice of the previous one.
df = df[df['Actor1CountryCode'].isin(COUNTRIES)].copy()
df['event_date'] = pd.to_datetime(df['SQLDATE'].astype(int), format='%Y%m%d').dt.date
df = df[df['event_date'] == pd.to_datetime(date_str).date()].copy()
df['currency'] = df['Actor1CountryCode'].map(COUNTRY_TO_CCY)
# Rows with a missing NumMentions compare as False and are dropped here.
df = df[df['currency'].isin(CCYS) & (df['NumMentions'] >= 1)].copy()

# Per (date, currency): mean tone, sample standard deviation of tone, and the
# number of events with a non-null GLOBALEVENTID.
agg = df.groupby(['event_date', 'currency']).agg(
    avg_tone=('AvgTone', 'mean'),
    tone_dispersion=('AvgTone', 'std'),
    event_count=('GLOBALEVENTID', 'count')
).reset_index()
# 'count' ignores nulls, so a group can end up with zero counted events.
agg = agg[agg['event_count'] >= 1]
agg

Unnamed: 0,event_date,currency,avg_tone,tone_dispersion,event_count
0,2018-01-01,BRL,-0.678061,4.016056,79
1,2018-01-01,CLP,-0.947001,3.524795,88
2,2018-01-01,COP,-4.07336,4.514388,47
3,2018-01-01,EGP,-4.044589,4.013127,391
4,2018-01-01,HUF,-1.930604,2.855313,35
5,2018-01-01,IDR,-2.093377,4.454862,173
6,2018-01-01,INR,-2.047259,4.153561,1055
7,2018-01-01,MXN,-1.736979,3.807214,303
8,2018-01-01,NGN,-1.792278,3.841791,1505
9,2018-01-01,PEN,-1.709077,2.931021,70
