In [1]:
import pandas as pd
import numpy as np
import pandas as pd 
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Machine Learning Modelling

In [2]:
# Load the transformed indicator table and order the countries by rank.
# The CSV's first column is read as the index and turned back into a regular
# column by reset_index(). The row named "Not classified" is dropped because
# it has no values for any of the indicators.
df = (
    pd.read_csv('data/df_transformed.csv', index_col=0)
    .sort_values(by='rank', ascending=True)
    .reset_index()
    .loc[lambda frame: frame['Country Name'] != 'Not classified']
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Country Name,Country Code,mean_rank,rank,2022_Access to clean fuels and technologies for cooking (% of population),"2022_Access to clean fuels and technologies for cooking, rural (% of rural population)","2022_Access to clean fuels and technologies for cooking, urban (% of urban population)",2022_Access to electricity (% of population),"2022_Access to electricity, rural (% of rural population)","2022_Access to electricity, urban (% of urban population)",...,2023_Unemployment with intermediate education (% of total labor force with intermediate education),"2023_Unemployment, total (% of total labor force) (national estimate)",2023_Urban population (% of total population),"2023_Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","2023_Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","2023_Wage and salaried workers, total (% of total employment) (modeled ILO estimate)",2023_Wanted fertility rate (births per woman),2023_Women who were first married by age 15 (% of women ages 20-24),2023_Women who were first married by age 18 (% of women ages 20-24),cluster
0,Singapore,SGP,25.553209,1,100.0,100.0,100.0,100.0,100.0,100.0,...,3.719,3.444,100.0,91.244274,82.802156,86.379577,,0.0,0.1,1
1,Switzerland,CHE,26.884541,2,100.0,100.0,100.0,100.0,100.0,100.0,...,3.75,4.043,74.202,85.957597,81.635457,83.649166,,,,1
2,Cayman Islands,CYM,27.838874,3,,,,100.0,100.0,100.0,...,4.82,4.24,100.0,,,,,,,1
3,"Korea, Rep.",KOR,29.188156,4,100.0,100.0,100.0,100.0,100.0,100.0,...,2.49,2.675,81.456,79.710211,74.167398,76.547952,,,,1
4,Netherlands,NLD,29.406586,5,100.0,100.0,100.0,100.0,100.0,100.0,...,3.216,3.537,93.179,87.344277,80.333192,83.640352,,,,1


In [4]:
# Columns that identify or label a country; every other column of df is
# treated as a model feature.
non_features = ['Country Name', 'Country Code', 'mean_rank', 'rank', 'cluster']
features = df.drop(columns=non_features).columns

# Mean-impute each feature in place: a missing value is replaced by the
# average of that feature's observed values.
# NOTE(review): the averages are computed over every row of df, i.e. before
# the train/test split done in the modelling cells, so held-out rows
# contribute to the imputed values.
for feature in features:
    observed = df[feature]
    df[feature] = observed.fillna(observed.mean())


In [5]:
# Preview the first five rows again, now that missing feature values are filled.
df.head(n=5)

Unnamed: 0,Country Name,Country Code,mean_rank,rank,2022_Access to clean fuels and technologies for cooking (% of population),"2022_Access to clean fuels and technologies for cooking, rural (% of rural population)","2022_Access to clean fuels and technologies for cooking, urban (% of urban population)",2022_Access to electricity (% of population),"2022_Access to electricity, rural (% of rural population)","2022_Access to electricity, urban (% of urban population)",...,2023_Unemployment with intermediate education (% of total labor force with intermediate education),"2023_Unemployment, total (% of total labor force) (national estimate)",2023_Urban population (% of total population),"2023_Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","2023_Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","2023_Wage and salaried workers, total (% of total employment) (modeled ILO estimate)",2023_Wanted fertility rate (births per woman),2023_Women who were first married by age 15 (% of women ages 20-24),2023_Women who were first married by age 18 (% of women ages 20-24),cluster
0,Singapore,SGP,25.553209,1,100.0,100.0,100.0,100.0,100.0,100.0,...,3.719,3.444,100.0,91.244274,82.802156,86.379577,3.198431,0.0,0.1,1
1,Switzerland,CHE,26.884541,2,100.0,100.0,100.0,100.0,100.0,100.0,...,3.75,4.043,74.202,85.957597,81.635457,83.649166,3.198431,4.682482,20.197143,1
2,Cayman Islands,CYM,27.838874,3,68.495015,59.795465,77.364009,100.0,100.0,100.0,...,4.82,4.24,100.0,57.526174,58.779282,58.248394,3.198431,4.682482,20.197143,1
3,"Korea, Rep.",KOR,29.188156,4,100.0,100.0,100.0,100.0,100.0,100.0,...,2.49,2.675,81.456,79.710211,74.167398,76.547952,3.198431,4.682482,20.197143,1
4,Netherlands,NLD,29.406586,5,100.0,100.0,100.0,100.0,100.0,100.0,...,3.216,3.537,93.179,87.344277,80.333192,83.640352,3.198431,4.682482,20.197143,1


### Logistic Regression

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Feature matrix: every column of df except the identifier/label columns.
# Target: the 'cluster' column.
X = df.drop(columns=non_features)
y = df['cluster']

# Hold out 20% of the rows for testing; the split is stratified by cluster
# and reproducible thanks to the fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise the features. The scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the logistic regression model (default settings).
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)

# Report the accuracy on the held-out rows.
preds = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, preds)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")



Logistic Regression Accuracy: 0.8868


In [7]:
# Coefficients of the fitted logistic regression model.
coefs = lr_model.coef_

# Simple importance score per feature: the absolute coefficients averaged
# over the rows of `coefs` (the original author's "across classes").
feature_importance = np.abs(coefs).mean(axis=0)

# Pair each score with its feature name, most important first.
importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

print("Feature Importance (Average Absolute Coefficient):")
print(importance_df.head(20))

Feature Importance (Average Absolute Coefficient):
                                               Feature  Importance
149  2023_Multidimensional poverty headcount ratio ...    0.724257
62   2022_Multidimensional poverty headcount ratio ...    0.723586
46   2022_Current health expenditure per capita (cu...    0.680960
133  2023_Current health expenditure per capita (cu...    0.680960
153  2023_People using safely managed drinking wate...    0.642474
66   2022_People using safely managed drinking wate...    0.642474
84       2022_Wanted fertility rate (births per woman)    0.616051
171      2023_Wanted fertility rate (births per woman)    0.603338
94   2023_Adequacy of social protection and labor p...    0.571827
7    2022_Adequacy of social protection and labor p...    0.571827
121  2023_Children out of school (% of primary scho...    0.567303
148  2023_Multidimensional poverty headcount ratio ...    0.536462
61   2022_Multidimensional poverty headcount ratio ...    0.536462
71   2022_S

### XGBoost Classifier

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Feature matrix: every column of df except the identifier/label columns
# listed in non_features. Target: the 'cluster' column (multiclass).
X = df.drop(non_features, axis=1)
y = df['cluster']

# Split the data into training and testing sets (stratified by y, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features so that each has zero mean and unit variance.
# The scaler is fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the XGBoost classifier for multiclass prediction.
# 'num_class' is the number of distinct cluster labels found in y.
# NOTE(review): presumably the cluster labels are consecutive integers
# starting at 0, as recent XGBoost releases expect -- confirm against the data.
# `use_label_encoder` is deliberately not passed: the installed XGBoost does
# not use it and only emitted
# 'Parameters: { "use_label_encoder" } are not used.'
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y)),
    eval_metric='mlogloss',
    random_state=42
)
xgb_model.fit(X_train_scaled, y_train)

# Evaluate the model's accuracy on the held-out rows.
preds = xgb_model.predict(X_test_scaled)
print(f"XGBoost Accuracy: {accuracy_score(y_test, preds):.4f}")

# Extract gain-based feature importance from the trained booster.
# The keys have the form 'f0', 'f1', ... because the model was fitted on the
# scaled array rather than on a frame with column names. Per the XGBoost
# documentation, features with zero importance are not included in the
# result, so this table can be shorter than the feature list.
booster = xgb_model.get_booster()
importance_dict = booster.get_score(importance_type='gain')

# Convert the importance dictionary into a DataFrame.
importance_df = pd.DataFrame(list(importance_dict.items()), columns=['FeatureKey', 'Importance'])

# Map each 'f<i>' key back to the i-th column name of X_train.
importance_df['Feature'] = importance_df['FeatureKey'].apply(lambda x: X_train.columns[int(x[1:])])

# Keep only name and score, sorted by importance in descending order.
importance_df = importance_df[['Feature', 'Importance']].sort_values(by='Importance', ascending=False)

print("Top 20 Features by Importance (Gain):")
print(importance_df.head(20))


XGBoost Accuracy: 0.9434
Top 20 Features by Importance (Gain):
                                              Feature  Importance
2        2022_Access to electricity (% of population)   14.425122
42  2022_Multidimensional poverty headcount ratio ...    3.397663
19  2022_Children in employment, study and work (%...    3.043503
41  2022_Multidimensional poverty headcount ratio ...    2.647210
17        2022_Cause of death, by injury (% of total)    2.243744
24       2022_Compensation of employees (current LCU)    1.583333
28  2022_Current health expenditure per capita (cu...    1.582231
39  2022_Literacy rate, youth female (% of females...    1.377763
44  2022_People using at least basic drinking wate...    1.304783
51  2022_Unemployment with basic education (% of t...    1.176013
43  2022_Multilateral debt service (% of public an...    1.039275
21  2022_Children in employment, wage workers (% o...    0.924709
6   2022_Adequacy of social safety net programs (%...    0.877922
36  2022_Lite

Parameters: { "use_label_encoder" } are not used.



In [9]:
# Save the feature-importance table produced by the XGBoost cell.
# NOTE: `importance_df` is reassigned by the Random Forest section further
# down, so run this directly after the XGBoost cell; otherwise a different
# model's importances end up in this file.
xgb_importance_path = 'data/importance_df_xgboost.csv'
importance_df.to_csv(xgb_importance_path)

### Random Forest Classifier

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Feature matrix (identifier/label columns removed) and the cluster target.
X = df.drop(columns=non_features)
y = df['cluster']

# Stratified train/test split: 20% of the rows held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise the features using statistics from the training rows only.
# (Random forests typically don't need scaling; it is kept for consistency
# with the other models.)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a random forest classifier with a fixed seed.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Report the accuracy on the held-out rows.
preds = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, preds)
print(f"RandomForest Accuracy: {rf_accuracy:.4f}")

# One importance score per feature, in the column order of X_train.
importances = rf_model.feature_importances_

# Pair each score with its feature name, most important first.
importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print("Top 20 Features by Importance:")
print(importance_df.head(20))


RandomForest Accuracy: 0.9434
Top 20 Features by Importance:
                                               Feature  Importance
3         2022_Access to electricity (% of population)    0.085782
90        2023_Access to electricity (% of population)    0.071702
91   2023_Access to electricity, rural (% of rural ...    0.066012
4    2022_Access to electricity, rural (% of rural ...    0.043799
5    2022_Access to electricity, urban (% of urban ...    0.039969
151  2023_People using at least basic drinking wate...    0.038712
65   2022_People using at least basic sanitation se...    0.037513
88   2023_Access to clean fuels and technologies fo...    0.029121
83   2022_Wage and salaried workers, total (% of to...    0.026780
0    2022_Access to clean fuels and technologies fo...    0.026577
64   2022_People using at least basic drinking wate...    0.024253
168  2023_Wage and salaried workers, female (% of f...    0.022906
46   2022_Current health expenditure per capita (cu...    0.020709
9