# 1. Data Exploration

In [1]:
# Step 1: Importing packages
import altair as alt
import polars as pl
import pandas as pd
import pathlib as Path
import numpy as np
from datetime import datetime
from vega_datasets import data
#alt.data_transformers.enable("vegafusion")

In [2]:
Path = "/Users/mariajosereyesramirez/Documents/Autumn quarter 2025/Data Viz/Project/raw data/All_Historical_Data_Separately/Geopolitical Risk Index Daily.csv"
geo_risk = pd.read_csv(Path)
geo_risk.columns = geo_risk.columns.str.strip()
type(geo_risk['GPRD'])

pandas.core.series.Series

In [3]:
geo_risk['DATE']=pd.to_datetime(geo_risk['DATE'], dayfirst=True, errors='coerce')
geo_risk = geo_risk.dropna(subset=['DATE']).set_index('DATE').sort_index()

# Cleaning and GPRD to float
num_gdrp = geo_risk['GPRD'].astype(str).str.strip()

# Cleaning data for symbols and special characters
num_gdrp = num_gdrp.str.replace(r'[^\d,.\-]', '', regex=True)

mask_decimal_coma = num_gdrp.str.contains(',') & ~num_gdrp.str.contains(r'\.')
num_gdrp = num_gdrp.where(~mask_decimal_coma, num_gdrp.str.replace(',', '.', regex=False))
num_gdrp = num_gdrp.str.replace(',', '', regex=False).str.replace(' ', '', regex=False)

geo_risk['GPRD'] = pd.to_numeric(num_gdrp, errors='coerce')

# Converting the other columns
for c in ['GPRD_ACT', 'GPRD_THREAT']:
    if c in geo_risk.columns and geo_risk[c].dtype == 'object':
        t = geo_risk[c].astype(str).str.strip()
        t = t.str.replace(r'[^\d,.\-]', '', regex=True)
        mask = t.str.contains(',') & ~t.str.contains(r'\.')
        t = t.where(~mask, t.str.replace(',', '.', regex=False))
        t = t.str.replace(',', '', regex=False).str.replace(' ', '', regex=False)
        geo_risk[c] = pd.to_numeric(t, errors='coerce')

# Checking data types
print(geo_risk[['GPRD','GPRD_ACT','GPRD_THREAT']].dtypes)
print(geo_risk[['GPRD','GPRD_ACT','GPRD_THREAT']].head(3))


GPRD           float64
GPRD_ACT       float64
GPRD_THREAT    float64
dtype: object
              GPRD  GPRD_ACT  GPRD_THREAT
DATE                                     
1985-01-01  230.04    275.20       153.03
1985-01-02  115.68    146.77        87.44
1985-01-03   97.43    158.94        29.46


In [4]:
measure_cols = [c for c in ['GPRD', 'GPRD_ACT', 'GPRD_THREAT'] if c in geo_risk.columns]
# Time periods
monthly = geo_risk[measure_cols].resample('MS').mean().reset_index().rename(columns={'DATE': 'date'})
annual = geo_risk[measure_cols].resample('Y').mean().reset_index().rename(columns={'DATE': 'year_end'})

# Decades
tmp = geo_risk[measure_cols].copy()
tmp['year'] = geo_risk.index.year
tmp['decade'] = pd.cut(tmp['year'], bins=[1989, 1999, 2009, 2019, 2025], labels=['1989–1999','2000–2009','2010–2019', '2020-2025'])
decadal = (tmp.dropna(subset=['decade'])
           .groupby('decade')[measure_cols]
           .mean()
           .reset_index())

print(decadal)

      decade        GPRD    GPRD_ACT  GPRD_THREAT
0  1989–1999   91.692607   77.853450   101.818469
1  2000–2009  114.240750  135.223567    99.796348
2  2010–2019   93.286295   87.270140    97.574765
3  2020-2025  117.090019  101.883585   135.866083


  annual = geo_risk[measure_cols].resample('Y').mean().reset_index().rename(columns={'DATE': 'year_end'})
  .groupby('decade')[measure_cols]


In [5]:
print(monthly)

          date        GPRD    GPRD_ACT  GPRD_THREAT
0   1985-01-01  102.877742   91.824194   106.769677
1   1985-02-01  116.861071   97.070714   125.718571
2   1985-03-01  124.430968  115.170968   128.587742
3   1985-04-01   88.570000   73.419667    95.778000
4   1985-05-01  101.943226   91.601935   108.894839
..         ...         ...         ...          ...
484 2025-05-01  164.700968  147.016129   202.527419
485 2025-06-01  218.458000  179.304333   281.886667
486 2025-07-01  133.862903  119.761613   155.873226
487 2025-08-01  146.931290  133.121935   166.171935
488 2025-09-01  133.366250  121.438750   151.542500

[489 rows x 4 columns]


In [6]:
alt.Chart(monthly).mark_line().encode(
    x='date:T',
    y='GPRD:Q'   
)

alt.Chart(monthly).mark_line().encode(
        x='Date:T',
        y='GPRD_ACT:Q',
        color='risk_index:Q'
    )

In [None]:
assert {'date', 'GPRD', 'GPRD_ACT', 'GPRD_THREAT'}.issubset(set(monthly.columns)), monthly.columns
indexes = ['GPRD', 'GPRD_ACT', 'GPRD_THREAT']

# ChatGPT "How can I make a graph in Altair that puts the three indexes in a single chart? I'm trying this syntax but is not working"
risk_behavior = (
    alt.Chart(monthly)
    .transform_fold(indexes, as_=['metric', 'value'])
    .mark_line()
    .encode(
        x=alt.X('date:T', title='Date'),
          y=alt.Y('value:Q', title='Index'),
          color=alt.Color('metric:N', title='Serie'),
          tooltip=[
              alt.Tooltip('yearmonth(date):O', title='Month'),
              alt.Tooltip('metric:N', title='Serie'),
              alt.Tooltip('value:Q', title='Value', format='.2f')
          ]
      )
      .properties(title='Monthly Geopolitical Risks Index (1989-2025)')
      .interactive()
)
risk_behavior