In [1]:
# CELL 0 – MUST RUN FIRST – Fix Python path so src/ is importable
import sys
from pathlib import Path

# Go up one level from notebooks/ to the project root
project_root = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
sys.path.insert(0, str(project_root))

print(f"Added to Python path: {project_root}")
print("You can now do: from src.data.loader import load_full_dataset")

Added to Python path: c:\Users\manuz\Desktop\Adm\Projects\GDELT_Sentiment_FX
You can now do: from src.data.loader import load_full_dataset


In [2]:
# CELL 1 — Imports + pretty plots
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm

px.defaults.template = "plotly_white"
%matplotlib inline
plt.style.use('seaborn-v0_8')

print("All imports ready – let's go!")

All imports ready – let's go!


In [3]:
# CELL 2 — THE ONLY CELL THAT CAN TAKE TIME — NOW BULLETPROOF
from src.data.loader import load_full_dataset
from pathlib import Path

cache_file = Path("data/processed/gdelt_fx_full.parquet")

if cache_file.exists():
    print("Cached dataset found → loading instantly...")
    df = pd.read_parquet(cache_file)
else:
    print("No cache yet → building full GDELT + FX dataset")
    df = load_full_dataset()           # ← automatically saves the parquet when done

print("\nSUCCESS! Dataset ready")
print(f"Rows × Columns: {df.shape}")
print(f"Date range: {df.index.get_level_values(0).min().date()} → {df.index.get_level_values(0).max().date()}")
print(f"Trading days        : {df.shape[0]:,}")
tone_cols = [col for col in df.columns if col.startswith('avg_tone_')]
currencies = sorted([col.replace('avg_tone_', '') for col in tone_cols])
print(f"Unique currencies   : {len(currencies)}")
print("Currencies          :", ", ".join(currencies))
full_days = (df[tone_cols].notna().all(axis=1)).sum()
print(f"Days with all 15 ccys covered: {full_days:,} ({full_days/len(df)*100:.1f}%)")
df.head()

No cache yet → building full GDELT + FX dataset
Building fresh dataset via HTTP (no quota) – it will take time...

STARTING FULL GDELT 2.0 EVENTS DOWNLOAD
Date range : 2018-01-01 → 2025-11-17
Currencies : EUR, GBP, JPY, CHF, AUD, NZD, CAD, NOK, SEK, TRY, ZAR, BRL, INR, MXN, PHP, THB, PLN, HUF, CLP, COP, PEN

Success 2018-01-01 →  21 rows (m≥1, e≥1) | 21 ccy | tone ∈ [-4.07, 0.24]
Success 2018-01-12 →  21 rows (m≥1, e≥1) | 21 ccy | tone ∈ [-4.26, -0.46]
Success 2018-01-08 →  21 rows (m≥1, e≥1) | 21 ccy | tone ∈ [-2.66, -0.05]
Success 2018-01-07 →  21 rows (m≥1, e≥1) | 21 ccy | tone ∈ [-5.63, 0.92]
Success 2018-01-06 →  21 rows (m≥1, e≥1) | 21 ccy | tone ∈ [-4.10, 0.47]
Success 2018-01-02 →  21 rows (m≥1, e≥1) | 21 ccy | tone ∈ [-4.25, 0.08]
Success 2018-01-09 →  21 rows (m≥1, e≥1) | 21 ccy | tone ∈ [-2.86, 0.34]
Success 2018-01-14 →  21 rows (m≥1, e≥1) | 21 ccy | tone ∈ [-5.01, -1.03]
Success 2018-01-13 →  21 rows (m≥1, e≥1) | 21 ccy | tone ∈ [-4.26, -0.13]
Success 2018-01-10 →  21 rows

Unnamed: 0_level_0,avg_tone_aud,avg_tone_brl,avg_tone_cad,avg_tone_chf,avg_tone_clp,avg_tone_cop,avg_tone_eur,avg_tone_gbp,avg_tone_huf,avg_tone_inr,...,MXN,NOK,NZD,PEN,PHP,PLN,SEK,THB,TRY,ZAR
event_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,-1.407439,-0.678061,-1.327471,0.24305,-0.947001,-4.073359,-1.4898,-1.805686,-1.930604,-2.047259,...,-0.007809,-0.001235,0.003628,-0.000852,-0.000881,-0.002612,-5.5e-05,0.021221,-0.001001,0.005439
2018-01-02,-2.043891,-4.249702,-1.145379,0.083159,-1.291108,-2.941575,-1.213645,-1.463323,-2.125329,-3.892303,...,-0.007504,-0.009095,-0.001694,0.00218,0.0,-0.006965,-0.002757,-0.003719,-0.006726,0.003925
2018-01-03,-1.394862,-2.020098,-2.363975,-1.470887,0.138515,-1.310221,-1.429859,-1.574423,0.056039,-3.5891,...,-0.002281,-0.001852,0.001413,-0.002018,-0.002805,0.001072,0.002905,-0.002715,0.003983,-0.006001
2018-01-04,-1.263004,-1.615088,-1.267951,-2.419136,0.325209,-2.912089,-1.551083,-1.486245,-1.717047,-3.530204,...,-0.006887,-0.004808,-0.009196,-0.004802,-0.000362,-0.005139,-0.00593,-0.002939,-0.008023,-0.00597
2018-01-05,-1.298047,-0.752147,-1.764104,-1.066723,0.087185,-1.549256,-1.505575,-1.600543,-1.590686,-3.107791,...,0.000945,-0.002963,-0.002011,0.018633,0.000905,0.003903,0.001664,-0.002327,-0.004778,0.000679


In [4]:
# CELL 3 — Quick fix: create a single next_day_return column (in case you have many _ret columns)
return_cols = [col for col in df.columns if col.endswith('_ret')]
print(f"Found {len(return_cols)} return columns → averaging them")

df = df.copy()
df['next_day_return'] = df[return_cols].mean(axis=1)

Found 0 return columns → averaging them


In [None]:
# CELL 4 — Global EM Sentiment Over Time (2025 FINAL VERSION)

# 1. Extract all avg_tone columns
tone_cols = [col for col in df.columns if col.startswith('avg_tone_')]

# 2. Daily equal-weighted average tone across all 15 EM currencies
daily_global_tone = df[tone_cols].mean(axis=1)

# 3. 21-day and 63-day moving averages (1-month & 3-month)
tone_ma_21 = daily_global_tone.rolling(window=21, min_periods=10).mean()
tone_ma_63 = daily_global_tone.rolling(window=63, min_periods=30).mean()

# 4. Plot — clean, professional, Man AHL style
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=daily_global_tone.index,
    y=daily_global_tone,
    mode='lines',
    line=dict(color='lightgray', width=1),
    name='Daily avg tone',
    hovertemplate='%{y:.2f}<br>%{x|%Y-%m-%d}'
))

fig.add_trace(go.Scatter(
    x=tone_ma_21.index,
    y=tone_ma_21,
    mode='lines',
    line=dict(color='#1f77b4', width=3),
    name='21-day MA (1 month)',
    hovertemplate='21d MA: %{y:.2f}<br>%{x|%Y-%m-%d}'
))

fig.add_trace(go.Scatter(
    x=tone_ma_63.index,
    y=tone_ma_63,
    mode='lines',
    line=dict(color='#ff7f0e', width=3),
    name='63-day MA (3 months)',
    hovertemplate='63d MA: %{y:.2f}<br>%{x|%Y-%m-%d}'
))

# Neutral line + shading for extreme regimes
fig.add_hline(y=0, line_dash="dash", line_color="red", annotation_text=" Neutral")
fig.add_hrect(y0=-4, y1=-10, fillcolor="red", opacity=0.1, line_width=0,
              annotation_text="Extreme negative", annotation_position="top left")
fig.add_hrect(y0=2, y1=5, fillcolor="green", opacity=0.1, line_width=0,
              annotation_text="Extreme positive", annotation_position="bottom left")

fig.update_layout(
    title="<b>21-Day & 63-Day Moving Average of GDELT News Tone<br>15-Currency FX Basket (2018–2025)</b>",
    title_x=0.5,
    xaxis_title="",
    yaxis_title="Average Tone (lower = more negative)",
    template="plotly_white",
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    hovermode="x unified",
    height=600
)

fig.show()

In [6]:
# CELL 5 — TRY/USD: Your star currency (extremely news-sensitive)

# Extract TRY columns from the wide-format df
try_df = pd.DataFrame({
    'avg_tone'        : df['avg_tone_try'],
    'tone_dispersion' : df['tone_dispersion_try'],
    'event_count'     : df['event_count_try'],
    'next_day_return' : df['TRY'] * 100  # in percent for nicer plot
}).dropna()

fig = px.scatter(
    try_df,
    x='avg_tone',
    y='next_day_return',
    size='event_count',
    color='tone_dispersion',
    hover_data={'event_count': True},
    trendline='ols',
    trendline_color_override='red',
    opacity=0.8,
    color_continuous_scale='RdYlGn',
    labels={
        "avg_tone": "GDELT Average Tone (higher = more positive)",
        "next_day_return": "Next-Day Return (%)",
        "tone_dispersion": "Tone Dispersion",
        "event_count": "Event Count"
    },
    title="<b>TRY/USD – Higher GDELT Tone → Stronger Next-Day Appreciation?</b><br>"
          "<sub>15 EM FX basket • No look-ahead • Only same-day events • 2018–2025</sub>"
)

# Beautify
fig.update_layout(
    template="plotly_white",
    height=620,
    title_x=0.5,
    xaxis=dict(range=[-10, 6]),
    yaxis=dict(title="TRY/USD Next-Day Return (%)"),
    coloraxis_colorbar=dict(title="Dispersion")
)

# Add R² and p-value directly on the plot
results = px.get_trendline_results(fig)
fit = results.iloc[0]['px_fit_results']
r2 = getattr(fit, 'rsquared', float('nan'))
p_val = getattr(fit, 'f_pvalue', float('nan'))
if pd.isna(r2) or pd.isna(p_val):
    print("Trendline fit object type:", type(fit))
    print("Available attrs:", [a for a in dir(fit) if not a.startswith('_')])

fig.add_annotation(
    x=0.02, y=0.98,
    xref="paper", yref="paper",
    text=f"R² = {r2:.3f}<br>p-value = {p_val:.2e}<br>n = {len(try_df):,}",
    showarrow=False,
    font_size=14,
    align='left',
    bgcolor="rgba(255,255,255,0.8)",
    bordercolor="black",
    borderwidth=1
)

fig.show()

# Print full regression summary (for your notebook)
X = sm.add_constant(try_df['avg_tone'])
model = sm.OLS(try_df['next_day_return'], X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:        next_day_return   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                   0.01304
Date:                Tue, 18 Nov 2025   Prob (F-statistic):              0.909
Time:                        14:14:06   Log-Likelihood:                -3286.8
No. Observations:                2034   AIC:                             6578.
Df Residuals:                    2032   BIC:                             6589.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1336      0.073      1.820      0.0

In [7]:
# CELL 6 — Correlation heatmap
features = ['avg_tone_try', 'event_count_try', 'tone_dispersion_try', 'TRY']
corr = df[features].corr()

fig = px.imshow(
    corr.round(3),
    text_auto=True,
    color_continuous_scale='RdBu',
    aspect="auto",
    title="Correlation Matrix – Sentiment Features vs Next-Day Return - TRY"
)
fig.show()

In [8]:
# CELL 7 — Naïve strategy: every day long the currency with the most positive tone
tone_cols = [c for c in df.columns if c.startswith('avg_tone_')]
if not tone_cols:
    raise ValueError("No avg_tone_ columns found in df")

# best tone column per date (e.g. 'avg_tone_try')
best_tone_col = df[tone_cols].idxmax(axis=1)            # series of 'avg_tone_<ccy>'
best_ccy = best_tone_col.str.replace('avg_tone_', '')  # series of currency codes like 'try'

# map to actual return column name in df (try uppercase first, then lowercase fallback)
def find_return_col(ccy):
    for cand in (ccy.upper(), ccy.lower()):
        if cand in df.columns:
            return cand
    return None

return_cols = best_ccy.map(find_return_col)
if return_cols.isnull().any():
    missing = sorted(best_ccy[return_cols.isnull()].unique())
    raise KeyError(f"Return column not found for currencies: {missing}")

# get integer column positions and use numpy advanced indexing to pick the correct return per row
col_pos = [df.columns.get_loc(c) for c in return_cols]

row_pos = np.arange(len(df))
selected_values = df.to_numpy()[row_pos, col_pos]
selected_returns = pd.Series(selected_values, index=df.index, name='selected_return')

# cumulative equity
equity = (1 + selected_returns).cumprod()

fig = go.Figure()
fig.add_trace(go.Scatter(x=equity.index, y=equity, mode='lines', name='Naïve Tone Strategy'))
fig.update_layout(
    title="Naïve Strategy: Long the Highest-Tone Currency Every Day",
    yaxis_title="Cumulative Return",
    yaxis_tickformat=".1%"
)
fig.show()

print(f"Naïve strategy final return: {equity.iloc[-1]:.1%}")

Naïve strategy final return: 130.4%


In [9]:
# CELL 8 — Summary table per currency
tone_cols = [c for c in df.columns if c.startswith('avg_tone_')]
if not tone_cols:
    raise ValueError("No avg_tone_ columns found in df")

ccys = [c.replace('avg_tone_', '') for c in tone_cols]
rows = []
for c in ccys:
    # avg tone
    avg_tone_col = f'avg_tone_{c}'
    avg_tone = df[avg_tone_col].mean()

    # total events (case-insensitive check)
    ev_col_upper = f'event_count_{c}'
    ev_col_lower = ev_col_upper.lower()
    if ev_col_upper in df.columns:
        total_events = df[ev_col_upper].sum()
    elif ev_col_lower in df.columns:
        total_events = df[ev_col_lower].sum()
    else:
        total_events = 0

    # tone dispersion
    disp_col_upper = f'tone_dispersion_{c}'
    disp_col_lower = disp_col_upper.lower()
    if disp_col_upper in df.columns:
        tone_disp = df[disp_col_upper].mean()
    elif disp_col_lower in df.columns:
        tone_disp = df[disp_col_lower].mean()
    else:
        tone_disp = float('nan')

    # returns: prefer UPPER (e.g. 'TRY'), then lower, then fallback to 'next_day_return'
    ret_col = None
    for cand in (c.upper(), c.lower()):
        if cand in df.columns:
            ret_col = cand
            break
    if ret_col is not None:
        ret_ser = df[ret_col].dropna()
    else:
        ret_ser = df['next_day_return'].dropna() if 'next_day_return' in df.columns else pd.Series(dtype=float)

    avg_ret = ret_ser.mean() if not ret_ser.empty else float('nan')
    vol = ret_ser.std() if not ret_ser.empty else float('nan')
    days = int(ret_ser.count())

    rows.append([avg_tone, total_events, tone_disp, avg_ret, vol, days])

summary = pd.DataFrame(rows, index=[c.upper() for c in ccys],
                       columns=['Avg_Tone', 'Total_Events', 'Tone_Dispersion', 'Avg_Return', 'Volatility', 'Days'])
summary = summary.sort_values('Total_Events', ascending=False).round(5)
summary

Unnamed: 0,Avg_Tone,Total_Events,Tone_Dispersion,Avg_Return,Volatility,Days
GBP,-1.37578,8500489.0,3.75042,5e-05,0.0055,2034
CAD,-1.26303,4231705.0,3.48591,7e-05,0.00423,2034
AUD,-1.43441,3852559.0,3.81313,0.00013,0.00638,2034
INR,-2.18086,3020587.0,3.92923,0.00018,0.00385,2034
EUR,-1.31419,2219001.0,3.28408,4e-05,0.00455,2034
JPY,-0.65183,1798027.0,3.47896,0.0002,0.00546,2034
TRY,-2.33483,1787759.0,3.53721,0.00126,0.01218,2034
PHP,-1.43809,1388281.0,3.76581,0.00011,0.00464,2034
MXN,-2.15176,1137940.0,3.80661,2e-05,0.00784,2034
NZD,-0.75706,1004400.0,3.55623,0.00014,0.00635,2034
