## AidData Core Dyadic Thin data exploration & preprocessing

In [2]:
from pathlib import Path
import pandas as pd

# The project root is taken to be two directory levels above the current
# working directory; the raw AidData "thin" dyadic CSV lives beneath it.
PROJECT_ROOT = Path.cwd().parents[1]
csv_path = PROJECT_ROOT.joinpath(
    "data", "raw", "economic", "aiddata-dyadic-thin-raw.csv"
)

# Read the thin extract into a DataFrame
aiddata_df = pd.read_csv(filepath_or_buffer=csv_path)

In [3]:
# Check structure
print(aiddata_df.shape)
print(aiddata_df.columns.tolist())
print(aiddata_df.head())

# Keep only records from 1990 onward
aiddata_df = aiddata_df[aiddata_df['year'] >= 1990]

# Collapse to one row per donor-recipient-year by summing commitments
# (in case there are multiple rows for the same triple).
aid_flows = (aiddata_df
    .groupby(['donor', 'recipient', 'year'])['commitment_amount_usd_constant']
    .sum()
    .reset_index()
    # Rename by column name rather than by position so the result does not
    # depend on the order in which the columns come back from the groupby.
    .rename(columns={'commitment_amount_usd_constant': 'aid_amount'}))


# Check size
print(aid_flows.shape)

# Save processed version (should be much smaller)
output_path = PROJECT_ROOT / "data" / "processed" / "aiddata_bilateral_flows.csv"
# to_csv does not create missing directories; make sure the target folder
# exists so the write does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
aid_flows.to_csv(output_path, index=False)

(1561039, 8)
['aiddata_id', 'aiddata_2_id', 'year', 'donor', 'recipient', 'commitment_amount_usd_constant', 'coalesced_purpose_code', 'coalesced_purpose_name']
   aiddata_id  aiddata_2_id  year                            donor  \
0           1           NaN  2003  African Development Bank (AFDB)   
1           2           NaN  1990  African Development Bank (AFDB)   
2           3           NaN  1991  African Development Bank (AFDB)   
3           4           NaN  1992  African Development Bank (AFDB)   
4           5           NaN  1992  African Development Bank (AFDB)   

       recipient  commitment_amount_usd_constant  coalesced_purpose_code  \
0           Togo                        29589911                   24030   
1        Burundi                         9713596                   31100   
2  Cote d`Ivoire                       148139421                   31120   
3       Cameroon                        24693752                   31120   
4          Gabon                       

In [4]:
# Re-read the processed flows from disk
aid_flows = pd.read_csv(output_path)

# Keep only rows whose year is at most 2024; later values are treated as invalid
aid_flows = aid_flows.loc[aid_flows['year'].le(2024)]

# Replace the file on disk with the filtered table
aid_flows.to_csv(output_path, index=False)

# Report the resulting dimensions, then the latest year remaining
for summary in (aid_flows.shape, aid_flows['year'].max()):
    print(summary)

(82112, 4)
2013
