In [65]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit

In [66]:
og = pd.read_csv('/Users/gracesaunders/Downloads/World Development Indicators Jan 28 2025 (1)/WDICSV.csv')
og.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,17.488497,18.001597,18.558234,19.043572,19.586457,20.192064,20.828814,21.372164,22.100884,
1,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.RU.ZS,,,,,,,...,6.811504,7.096003,7.406706,7.666648,8.020952,8.403358,8.718306,9.097176,9.473374,
2,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.UR.ZS,,,,,,,...,38.15209,38.488233,38.779953,39.068462,39.445526,39.818645,40.276374,40.687817,41.211606,
3,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,31.871956,33.922276,38.859598,40.223744,43.035073,44.390861,46.282371,48.127211,48.742043,
4,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,17.672943,16.527554,24.627753,25.432092,27.061929,29.154282,31.022083,32.809138,33.760782,


In [67]:
# Split into two clear steps
non_oda_data = og[~og['Indicator Code'].str.contains('ODA', case=False, na=False)]
target_oda_data = og[og['Indicator Code'] == "DT.ODA.ODAT.KD"]

# Combine results
df_filtered = pd.concat([non_oda_data, target_oda_data])

In [68]:
df = df_filtered.loc[:,['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']]

In [69]:
df = df[df.index >= 72281]

In [70]:
df_long = df.melt(
    id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
    var_name="Year",
    value_name="Value"
)

df_pivot = df_long.pivot_table(
    index=["Country Name", "Country Code", "Year"],
    columns="Indicator Name",
    values="Value"
).reset_index()

In [71]:
df_pivot = df_pivot[df_pivot['Net official development assistance received (constant 2021 US$)'].notna()]

In [72]:
df_pivot = df_pivot.sort_values(["Country Code", "Year"])

In [77]:
# Load data
df = df_pivot.copy()

# 1. Target Engineering
y = np.arcsinh(df['Net official development assistance received (constant 2021 US$)'])
X = df.drop(columns=['Net official development assistance received (constant 2021 US$)', 'Country Name', 'Country Code', 'Year'])

# 2. Memory-Efficient Missing Value Handling
missing_mask = X.isnull()
X = X.fillna(0)
for col in X.columns[missing_mask.any()]:
    X[f'{col}_missing'] = missing_mask[col].astype(int)

# 3. Feature Pre-Selection (Reduce 1500 -> ~300)
pre_selector = SelectFromModel(
    Lasso(alpha=0.1, max_iter=1000, random_state=42),
    max_features=300
)
X_reduced = pre_selector.fit_transform(X, y)

# 4. Temporal Cross-Validation
tscv = TimeSeriesSplit(n_splits=5)

# 5. Main LassoCV with Parallel Processing
final_lasso = make_pipeline(
    RobustScaler(),
    LassoCV(
        alphas=np.logspace(-3, 3, 50),
        cv=tscv,
        n_jobs=-1,
        max_iter=5000,
        random_state=42,
        selection='random'  # Faster convergence
    )
)

# 6. Fit Model
final_lasso.fit(X_reduced, y)

# 7. Get Selected Features
selected_features = X.columns[pre_selector.get_support()]
coefs = final_lasso.named_steps['lassocv'].coef_

print(f"Train R^2: {final_lasso.score(X_reduced, y)}")
print(f"Test R^2: {final_lasso.score(X_reduced, y)}")

  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = missing_mask[col].astype(int)
  X[f'{col}_missing'] = miss

Train R^2: 0.0
Test R^2: 0.0


  model = cd_fast.enet_coordinate_descent(


In [76]:
# Get top 20 features
feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Coefficient': coefs,
    'Absolute_Impact': np.abs(coefs)
}).sort_values('Absolute_Impact', ascending=False)

top_20 = feature_importance.head(20)
print(top_20)

                                               Feature  Coefficient  \
0    ARI treatment (% of children under 5 taken to ...          0.0   
206  Prevalence of overweight (modeled estimate, % ...          0.0   
204         Pregnant women receiving prenatal care (%)         -0.0   
203  Power outages in firms in a typical month (num...          0.0   
202  Population living in slums (% of urban populat...          0.0   
201  Population in the largest city (% of urban pop...          0.0   
200  Political Stability and Absence of Violence/Te...         -0.0   
199  Persistence to last grade of primary, total (%...          0.0   
198  Persistence to last grade of primary, female (...          0.0   
197        Persistence to grade 5, total (% of cohort)         -0.0   
196  People using safely managed sanitation service...         -0.0   
195  People using safely managed sanitation service...          0.0   
194  People using safely managed sanitation service...          0.0   
193  P