In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the raw table from the CSV export that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="data_2.csv")

In [4]:
# Show every column label as a plain Python list (quick schema check).
list(df.columns)

['Unnamed: 0',
 'Country Name',
 'Country Code',
 'Year',
 'Control of Corruption: Percentile Rank',
 'GDP growth (annual %)',
 'Gini index',
 'Net official development assistance received (constant 2021 US$)',
 'Political Stability and Absence of Violence/Terrorism: Percentile Rank',
 'Population, total',
 'Poverty headcount ratio at societal poverty line (% of population)']

In [5]:
# Remove the column pandas named 'Unnamed: 0' on load.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors="ignore")

In [None]:
# Lagged-feature Lasso model of official development assistance (ODA).
# Steps: build per-country lags -> zero-impute with missing flags ->
# chronological train/test split -> standardise -> tune Lasso alpha with
# year-based expanding-window CV -> report RMSE and R^2.

target_col = 'Net official development assistance received (constant 2021 US$)'

# Sort data by country and year for lagging
df['Year'] = df['Year'].astype(int)
df = df.sort_values(['Country Code', 'Year'])

# Create 1-year lagged features for economic indicators
lag_years = 1
features_to_lag = [
    'Control of Corruption: Percentile Rank',
    'GDP growth (annual %)',
    'Gini index',
    'Political Stability and Absence of Violence/Terrorism: Percentile Rank',
    'Population, total',
    'Poverty headcount ratio at societal poverty line (% of population)'
]

# shift() moves values by *rows* within each country, so this is a true
# calendar lag only if every country has one row per consecutive year.
# NOTE(review): confirm the panel has no year gaps.
for col in features_to_lag:
    df[f'{col}_lag{lag_years}'] = df.groupby('Country Code')[col].shift(lag_years)

# Drop original (contemporaneous) features so only lagged values remain
df = df.drop(columns=features_to_lag)

# Separate features from target. arcsinh behaves like a log for large
# magnitudes but is also defined for zero and negative values.
y = np.arcsinh(df[target_col])
X = df.drop(columns=[target_col, 'Country Name', 'Country Code', 'Year']).copy()

# Handle missing values with dummies and zero imputation: each column with
# gaps gets a 0/1 "<col>_missing" flag, then the gaps are filled with 0.
missing_cols = X.columns[X.isnull().any()]
for col in missing_cols:
    X[f'{col}_missing'] = X[col].isnull().astype(int)
    X[col] = X[col].fillna(0)

# Time-based split (using original year column).
# `cutoff` is the first year of the final ~20% of years, so it must belong to
# the test set; with `<=` it leaked into train and left the test set empty
# whenever there were five or fewer distinct years.
sorted_years = sorted(df['Year'].unique())
cutoff = sorted_years[int(0.8 * len(sorted_years))]

# Rows without an observed target cannot be used for fitting or scoring.
has_target = y.notna()
train_mask = (df['Year'] < cutoff) & has_target
test_mask = (df['Year'] >= cutoff) & has_target
assert train_mask.any() and test_mask.any(), "time split produced an empty train or test set"

X_train, X_test = X.loc[train_mask], X.loc[test_mask]
y_train, y_test = y.loc[train_mask], y.loc[test_mask]

# scaling (statistics are learned on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Unregularised estimator object; it is not fitted in this cell.
reg = LinearRegression()

# Hyper-parameter search for the Lasso penalty.
# The rows are ordered by country, not by time, so TimeSeriesSplit is applied
# to the distinct training *years* and each fold is mapped back to row
# positions. Every validation fold is therefore strictly later than the years
# it was fitted on.
train_years = df.loc[train_mask, 'Year'].to_numpy()
year_values = np.unique(train_years)
assert len(year_values) >= 3, "need at least 3 distinct training years for time-series CV"
n_cv_splits = min(5, len(year_values) - 1)
cv_folds = [
    (
        np.flatnonzero(np.isin(train_years, year_values[fit_pos])),
        np.flatnonzero(np.isin(train_years, year_values[val_pos])),
    )
    for fit_pos, val_pos in TimeSeriesSplit(n_splits=n_cv_splits).split(year_values)
]

# NOTE(review): the alpha grid and max_iter are starting values; adjust as needed.
param_grid = {'alpha': np.logspace(-4, 1, 30)}
grid_search = GridSearchCV(
    Lasso(max_iter=10_000),
    param_grid,
    cv=cv_folds,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train_scaled, y_train)

# evaluating best model (refit on the full training set by GridSearchCV)
best_lasso = grid_search.best_estimator_
train_pred = best_lasso.predict(X_train_scaled)
test_pred = best_lasso.predict(X_test_scaled)

print(f"Best alpha: {grid_search.best_params_['alpha']}")
print(f"Train RMSE: {np.sqrt(mean_squared_error(y_train, train_pred)):.4f}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, test_pred)):.4f}")

print(f"Train R^2: {best_lasso.score(X_train_scaled, y_train):.4f}")
print(f"Test R^2: {best_lasso.score(X_test_scaled, y_test):.4f}")