In [1]:
import pandas as pd
import numpy as np
import pandas as pd 
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

# Machine Learning Modelling

In [2]:
# Load the transformed indicator table. The first CSV column is used as the
# index while reading; after ordering the rows by ascending `rank`,
# reset_index() turns that index back into a regular column.
df = pd.read_csv('data/df_transformed.csv', index_col=0)
df = df.sort_values(by='rank', ascending=True).reset_index()

# Drop the "Not classified" entry (per the original author's note it has no
# values for any of the indicators).
df = df.loc[df['Country Name'] != 'Not classified']

In [3]:
# Class balance of the target: number of rows carrying each `cluster` label.
df['cluster'].value_counts()

cluster
1    152
0    113
Name: count, dtype: int64

In [4]:
# Columns that are not model inputs: identifiers, rank columns and the
# `cluster` label that is used as the prediction target further down.
non_features = ['Country Name', 'Country Code', 'mean_rank', 'rank', 'cluster']

# Feature columns: everything else, kept in the DataFrame's own column order.
# (A set difference would return the string column names in an arbitrary order
# that can change between kernel runs, making the imputer input non-reproducible.)
features = [col for col in df.columns if col not in non_features]

# Move the non-feature columns into the index so they are carried along
# untouched while the feature columns are imputed.
df_indexed = df.set_index(non_features)

# KNN Imputer (k=3 means it uses the 3 nearest neighbors)
knn_imputer = KNNImputer(n_neighbors=3)

# Perform imputation only on feature columns.
# NOTE(review): the imputer is fitted on every row, before the train/test split
# performed in a later cell, so test rows influence the values imputed for
# training rows. Consider fitting the imputer on the training split only.
df_updated_features = pd.DataFrame(
    knn_imputer.fit_transform(df_indexed[features]),
    columns=features,
    index=df_indexed.index,  # Keep the index so the assignment below aligns row-by-row
)

# Assign imputed values back (using .loc to avoid SettingWithCopyWarning)
df_indexed.loc[:, features] = df_updated_features

# Reset index to restore the non-feature columns as ordinary columns
df = df_indexed.reset_index()

In [5]:
# Display the DataFrame as left by the imputation cell: the non-feature columns
# come first (restored from the index), followed by the indicator features.
df

Unnamed: 0,Country Name,Country Code,mean_rank,rank,cluster,2022_Access to clean fuels and technologies for cooking (% of population),"2022_Access to clean fuels and technologies for cooking, rural (% of rural population)","2022_Access to clean fuels and technologies for cooking, urban (% of urban population)",2022_Access to electricity (% of population),"2022_Access to electricity, rural (% of rural population)",...,2023_Unemployment with basic education (% of total labor force with basic education),2023_Unemployment with intermediate education (% of total labor force with intermediate education),"2023_Unemployment, total (% of total labor force) (national estimate)",2023_Urban population (% of total population),"2023_Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","2023_Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","2023_Wage and salaried workers, total (% of total employment) (modeled ILO estimate)",2023_Wanted fertility rate (births per woman),2023_Women who were first married by age 15 (% of women ages 20-24),2023_Women who were first married by age 18 (% of women ages 20-24)
0,Singapore,SGP,25.553209,1,1,100.000000,100.000000,100.000000,100.000000,100.000000,...,3.732000,3.719000,3.444000,100.000000,91.244274,82.802156,86.379577,3.308760,0.000000,0.100000
1,Switzerland,CHE,26.884541,2,1,100.000000,100.000000,100.000000,100.000000,100.000000,...,7.640000,3.750000,4.043000,74.202000,85.957597,81.635457,83.649166,3.308760,0.633333,9.566667
2,Cayman Islands,CYM,27.838874,3,1,87.466667,77.566667,94.300000,100.000000,100.000000,...,5.720000,4.820000,4.240000,100.000000,53.169692,55.241580,53.842886,2.033333,2.466667,17.633333
3,"Korea, Rep.",KOR,29.188156,4,1,100.000000,100.000000,100.000000,100.000000,100.000000,...,2.842000,2.490000,2.675000,81.456000,79.710211,74.167398,76.547952,3.308760,4.100000,10.733333
4,Netherlands,NLD,29.406586,5,1,100.000000,100.000000,100.000000,100.000000,100.000000,...,5.739000,3.216000,3.537000,93.179000,87.344277,80.333192,83.640352,3.308760,0.633333,2.566667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,Africa Eastern and Southern,AFE,135.600571,261,0,21.863139,9.261320,41.011132,48.711995,33.747907,...,6.121333,2.883000,3.207333,38.335337,20.164671,29.393837,25.069640,2.033333,1.533333,12.000000
261,Least developed countries: UN classification,LDC,139.357120,262,0,19.988544,9.890226,38.596563,56.826401,45.175976,...,9.167228,8.663381,8.069812,36.376819,17.900014,29.574557,24.722147,2.665092,2.333333,7.633333
262,Pre-demographic dividend,PRE,139.821747,263,0,22.819071,9.877199,41.158037,51.120342,29.453874,...,9.167228,8.663381,8.069812,44.148376,15.205223,25.847024,21.208575,2.665092,2.333333,7.633333
263,Heavily indebted poor countries (HIPC),HPC,141.609746,264,0,16.514066,6.565583,32.251006,48.655393,30.833905,...,9.167228,8.663381,8.069812,38.517364,14.424572,24.326447,19.914647,2.665092,2.333333,7.633333


In [6]:
# Target is the cluster label; predictors are every column that is not listed
# in `non_features` (identifiers, rank columns and the label itself).
y = df['cluster']
X = df.drop(columns=non_features)

# Hold out 20% of the rows for testing. The split is stratified on the label so
# both splits keep the same cluster proportions, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Logistic Regression

In [7]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic regression classifier (default hyperparameters) on the
# standardised training data.
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)

# Accuracy on the held-out split.
preds = lr_model.predict(X_test_scaled)
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, preds):.4f}")

# Coefficient magnitudes serve as a simple importance score: absolute value of
# each coefficient, averaged over the rows of `coef_`.
coefs = lr_model.coef_
feature_importance = np.abs(coefs).mean(axis=0)

# Pair each feature name with its score and order from most to least important.
importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

print("Feature Importance (Average Absolute Coefficient):")
print(importance_df.head(20))


Logistic Regression Accuracy: 0.9245
Feature Importance (Average Absolute Coefficient):
                                               Feature  Importance
3         2022_Access to electricity (% of population)    0.794330
90        2023_Access to electricity (% of population)    0.794330
78   2022_Unemployment with intermediate education ...    0.646726
74       2022_Terms of trade adjustment (constant LCU)    0.643784
65   2022_People using at least basic sanitation se...    0.613565
152  2023_People using at least basic sanitation se...    0.613565
161      2023_Terms of trade adjustment (constant LCU)    0.589425
66   2022_People using safely managed drinking wate...    0.585089
153  2023_People using safely managed drinking wate...    0.585089
88   2023_Access to clean fuels and technologies fo...    0.505115
1    2022_Access to clean fuels and technologies fo...    0.505115
30   2022_Children in employment, wage workers, mal...    0.470867
117  2023_Children in employment, wage wo

### XGBoost Classifier

In [8]:
# Scale features so that each has zero mean and unit variance
# (scaler fitted on the training split only, then applied to the test split).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the XGBoost classifier with a softmax objective sized to the
# number of distinct labels in `y`.
# `use_label_encoder` is deliberately not passed: the recorded run of this cell
# reported 'Parameters: { "use_label_encoder" } are not used.', i.e. the
# installed XGBoost ignores the argument and only emits a warning for it.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y)),
    eval_metric='mlogloss',
    random_state=42
)
xgb_model.fit(X_train_scaled, y_train)

# Evaluate the model's accuracy on the held-out split.
preds = xgb_model.predict(X_test_scaled)
print(f"XGBoost Accuracy: {accuracy_score(y_test, preds):.4f}")

# Extract feature importance (gain) from the trained XGBoost model.
# The booster returns importance keys in the format 'f0', 'f1', etc.
booster = xgb_model.get_booster()
importance_dict = booster.get_score(importance_type='gain')

# Convert the importance dictionary into a DataFrame.
importance_df = pd.DataFrame(list(importance_dict.items()), columns=['FeatureKey', 'Importance'])

# Map the XGBoost feature keys to the original feature names.
# The number after the leading 'f' is used as a positional index into X_train.columns.
importance_df['Feature'] = importance_df['FeatureKey'].apply(lambda x: X_train.columns[int(x[1:])])

# Keep only name and score, sorted by importance in descending order.
# NOTE: the Logistic Regression and Random Forest cells assign to this same
# `importance_df` name, so its contents depend on which cell ran last.
importance_df = importance_df[['Feature', 'Importance']].sort_values(by='Importance', ascending=False)

print("Top 20 Features by Importance (Gain):")
print(importance_df.head(20))


XGBoost Accuracy: 0.9623
Top 20 Features by Importance (Gain):
                                              Feature  Importance
3        2022_Access to electricity (% of population)    7.556928
39  2022_Literacy rate, youth female (% of females...    4.012352
17        2022_Cause of death, by injury (% of total)    2.471730
9   2022_Adjusted savings: education expenditure (...    1.785714
6   2022_Adequacy of social protection and labor p...    1.583333
42  2022_People using safely managed drinking wate...    1.239861
51  2022_Unemployment with basic education (% of t...    1.215049
40  2022_People using at least basic drinking wate...    0.897486
13  2022_Age dependency ratio, young (% of working...    0.735929
15  2022_Births attended by skilled health staff (...    0.705088
16  2022_Cause of death, by communicable diseases ...    0.679423
4   2022_Access to electricity, rural (% of rural ...    0.660878
44  2022_Proportion of people living below 50 perc...    0.628648
19  2022_Chil

Parameters: { "use_label_encoder" } are not used.



In [11]:
# Export the XGBoost gain importances to CSV.
# The table is rebuilt from `xgb_model` instead of being read from the shared
# `importance_df` name: the Logistic Regression and Random Forest cells assign
# to that same name, so if this cell runs after the Random Forest cell (as the
# recorded execution counts In [10] / In [11] indicate) `importance_df` no
# longer holds the XGBoost values and the file would be mislabelled.
xgb_gain = xgb_model.get_booster().get_score(importance_type='gain')
xgb_importance_df = pd.DataFrame(
    {
        # Keys look like 'f<N>'; N is used as a positional index into X_train.columns.
        'Feature': [X_train.columns[int(key[1:])] for key in xgb_gain],
        'Importance': list(xgb_gain.values()),
    }
).sort_values(by='Importance', ascending=False)
xgb_importance_df.to_csv('importance_df_xgboost.csv')

### Random Forest Classifier

In [10]:
# Standardise the features (fit on the training split, apply to the test split).
# As the original author notes, random forests typically don't need scaling;
# it is kept so all three models are trained on identically prepared inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a random forest classifier with a fixed seed for repeatable results.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Accuracy on the held-out split.
preds = rf_model.predict(X_test_scaled)
print(f"RandomForest Accuracy: {accuracy_score(y_test, preds):.4f}")

# `feature_importances_` holds one importance score per input feature, in the
# same order as the columns of X_train.
importances = rf_model.feature_importances_

# Pair each feature name with its score and order from most to least important.
importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print("Top 20 Features by Importance:")
print(importance_df.head(20))


RandomForest Accuracy: 0.9623
Top 20 Features by Importance:
                                               Feature  Importance
3         2022_Access to electricity (% of population)    0.093235
90        2023_Access to electricity (% of population)    0.077955
4    2022_Access to electricity, rural (% of rural ...    0.056678
91   2023_Access to electricity, rural (% of rural ...    0.056096
65   2022_People using at least basic sanitation se...    0.051682
64   2022_People using at least basic drinking wate...    0.044320
151  2023_People using at least basic drinking wate...    0.042990
5    2022_Access to electricity, urban (% of urban ...    0.033528
106          2023_Birth rate, crude (per 1,000 people)    0.029969
88   2023_Access to clean fuels and technologies fo...    0.021219
0    2022_Access to clean fuels and technologies fo...    0.020867
19           2022_Birth rate, crude (per 1,000 people)    0.016937
133  2023_Current health expenditure per capita (cu...    0.015508
2