In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.impute import SimpleImputer

In [37]:
og = pd.read_csv('/Users/gracesaunders/Downloads/World Development Indicators Jan 28 2025 (1)/WDICSV.csv')

In [38]:
og.head(5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,17.488497,18.001597,18.558234,19.043572,19.586457,20.192064,20.828814,21.372164,22.100884,
1,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.RU.ZS,,,,,,,...,6.811504,7.096003,7.406706,7.666648,8.020952,8.403358,8.718306,9.097176,9.473374,
2,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.UR.ZS,,,,,,,...,38.15209,38.488233,38.779953,39.068462,39.445526,39.818645,40.276374,40.687817,41.211606,
3,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,31.871956,33.922276,38.859598,40.223744,43.035073,44.390861,46.282371,48.127211,48.742043,
4,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,17.672943,16.527554,24.627753,25.432092,27.061929,29.154282,31.022083,32.809138,33.760782,


In [39]:
# remove other ODA data
exclude_keywords = r"(?i)\baid\b|development assistance|ODA|primary income|official flows"
non_oda_data = og[~og['Indicator Name'].str.contains(exclude_keywords, na=False)]

target_oda_data = og[og['Indicator Code'] == "DT.ODA.ODAT.KD"]

df_filtered = pd.concat([non_oda_data, target_oda_data])

df = df_filtered.loc[:,['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']]
df = df[df.index >= 72281] # only countries, no regions

df_long = df.melt(
    id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
    var_name="Year",
    value_name="Value"
)

df_pivot = df_long.pivot_table(
    index=["Country Name", "Country Code", "Year"],
    columns="Indicator Name",
    values="Value"
).reset_index()
df_pivot = df_pivot[df_pivot['Net official development assistance received (constant 2021 US$)'].notna()]
df_pivot = df_pivot.sort_values(["Country Code", "Year"])

print("Potential ODA columns remaining:")
print([col for col in df_pivot.columns if 'assistance' in str(col).lower() or 'ODA' in str(col)])


Potential ODA columns remaining:
['Net official development assistance received (constant 2021 US$)']


In [40]:

# Load data
df = df_pivot.copy()

# Define target variable
target = 'Net official development assistance received (constant 2021 US$)'

# Data preprocessing
def preprocess_data(df, target, split_year=2016, lags=1):
    # Sort and create lagged features
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
    df = df.sort_values(['Country Code', 'Year'])
    features_to_lag = [col for col in df.columns if col not in 
                      ['Country Name', 'Country Code', 'Year', target]]
    
    # Create lagged features only for numeric columns
    numeric_cols = df[features_to_lag].select_dtypes(include=np.number).columns.tolist()
    for col in numeric_cols:
        df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
    
    # Filter to numeric features only
    all_features = [col for col in df.columns if col not in 
                   ['Country Name', 'Country Code', 'Year', target]]
    numeric_features = df[all_features].select_dtypes(include=np.number).columns.tolist()

    # Drop rows with missing target
    df = df.dropna(subset=[target])
    
    # Create final dataset
    X = df[numeric_features]
    y = df[target]
    
    # Separate imputation for numeric and categorical
    numeric_imputer = SimpleImputer(strategy='median')
    X_imputed = numeric_imputer.fit_transform(X)
    
    # Maintain column consistency
    X = pd.DataFrame(X_imputed, columns=numeric_features)
    
    df = df.reset_index(drop=True)  # Ensures indices are clean and aligned
    
    # Time-based split
    train_mask = df['Year'] <= split_year
    test_mask = df['Year'] > split_year
    
    return X[train_mask], X[test_mask], y[train_mask], y[test_mask]

# Split data (using 2016 as the last training year)
X_train, X_test, y_train, y_test = preprocess_data(df, target)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Lasso with cross-validation
lasso = LassoCV(cv=5, random_state=42, max_iter=10000)
lasso.fit(X_train_scaled, y_train)

# Evaluate model
y_pred = lasso.predict(X_test_scaled)
print(f'R² Score: {r2_score(y_test, y_pred):.3f}')
print(f'MSE: {mean_squared_error(y_test, y_pred):,.0f}')
print(f'alpha: {lasso.alpha}')

# Get top predictors
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': lasso.coef_,
    'abs_coeff': np.abs(lasso.coef_)
})

top_predictors = feature_importance[feature_importance['coefficient'] != 0]\
    .sort_values('abs_coeff', ascending=False)\
    .drop('abs_coeff', axis=1)

print('\nTop Predictors of Foreign Aid:')
print(top_predictors.head(10))

  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f'{col}_lag{lags}'] = df.groupby('Country Code')[col].shift(lags)
  df[f

IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).