In [1]:
import pandas as pd
import numpy as np
import pandas as pd 
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

# Machine Learning Modelling

### Preparing the data

In [2]:
# Load the transformed dataset, order the rows by rank (best rank first) and
# turn the index read from the first CSV column back into a regular column.
df = (
    pd.read_csv('data/df_transformed.csv', index_col=0)
    .sort_values(by='rank', ascending=True)
    .reset_index()
)
# Drop the "Not classified" entry: it has no values for any of the indicators.
df = df.loc[df['Country Name'] != 'Not classified']

In [3]:
# Number of rows per cluster label (the column later used as the classification target).
df['cluster'].value_counts()

cluster
1    152
0    113
Name: count, dtype: int64

In [4]:
# Columns that identify a row or carry the ranking/label; they are not model inputs.
non_features = ['Country Name', 'Country Code', 'mean_rank', 'rank', 'cluster']

# Feature columns, kept in their original DataFrame order.
# A set difference would make the order depend on string hashing, which can
# change between kernel sessions and makes the imputation input non-reproducible.
features = [col for col in df.columns if col not in non_features]

# Move the non-feature columns into the index so that only features remain as columns.
df_indexed = df.set_index(non_features)

# KNN Imputer (k=3 means it uses the 3 nearest neighbors).
# NOTE(review): the imputer is fit on every row, before the train/test split
# performed later in the notebook, so held-out rows influence the values
# imputed for training rows. Consider fitting it on the training split only.
knn_imputer = KNNImputer(n_neighbors=3)

# Perform imputation only on feature columns.
df_updated_features = pd.DataFrame(
    knn_imputer.fit_transform(df_indexed[features]),
    columns=features,
    index=df_indexed.index,  # Keep the index so rows line up on assignment
)

# Assign imputed values back (using .loc to avoid SettingWithCopyWarning).
df_indexed.loc[:, features] = df_updated_features

# Reset index to restore the original DataFrame structure.
df = df_indexed.reset_index()

# Sanity check: the models fitted below cannot handle missing values.
assert not df[features].isna().any().any(), "missing values remain after KNN imputation"

In [5]:
# Inspect the frame after imputation (pandas truncates the displayed rows and columns).
df

Unnamed: 0,Country Name,Country Code,mean_rank,rank,cluster,2022_Access to clean fuels and technologies for cooking (% of population),"2022_Access to clean fuels and technologies for cooking, rural (% of rural population)","2022_Access to clean fuels and technologies for cooking, urban (% of urban population)",2022_Access to electricity (% of population),"2022_Access to electricity, rural (% of rural population)",...,2023_Unemployment with basic education (% of total labor force with basic education),2023_Unemployment with intermediate education (% of total labor force with intermediate education),"2023_Unemployment, total (% of total labor force) (national estimate)",2023_Urban population (% of total population),"2023_Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","2023_Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","2023_Wage and salaried workers, total (% of total employment) (modeled ILO estimate)",2023_Wanted fertility rate (births per woman),2023_Women who were first married by age 15 (% of women ages 20-24),2023_Women who were first married by age 18 (% of women ages 20-24)
0,Singapore,SGP,25.553209,1,1,100.000000,100.000000,100.000000,100.000000,100.000000,...,3.732000,3.719000,3.444000,100.000000,91.244274,82.802156,86.379577,3.308760,0.000000,0.100000
1,Switzerland,CHE,26.884541,2,1,100.000000,100.000000,100.000000,100.000000,100.000000,...,7.640000,3.750000,4.043000,74.202000,85.957597,81.635457,83.649166,3.308760,0.633333,9.566667
2,Cayman Islands,CYM,27.838874,3,1,100.000000,100.000000,100.000000,100.000000,100.000000,...,5.720000,4.820000,4.240000,100.000000,88.998724,83.690570,86.093519,1.766667,0.300000,9.333333
3,"Korea, Rep.",KOR,29.188156,4,1,100.000000,100.000000,100.000000,100.000000,100.000000,...,2.842000,2.490000,2.675000,81.456000,79.710211,74.167398,76.547952,1.953064,4.100000,10.733333
4,Netherlands,NLD,29.406586,5,1,100.000000,100.000000,100.000000,100.000000,100.000000,...,5.739000,3.216000,3.537000,93.179000,87.344277,80.333192,83.640352,3.308760,0.633333,2.566667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,Africa Eastern and Southern,AFE,135.600571,261,0,21.863139,9.261320,41.011132,48.711995,33.747907,...,2.781333,2.436000,2.730000,38.335337,20.164671,29.393837,25.069640,2.233333,0.933333,8.633333
261,Least developed countries: UN classification,LDC,139.357120,262,0,19.988544,9.890226,38.596563,56.826401,45.175976,...,2.781333,2.436000,2.730000,36.376819,17.900014,29.574557,24.722147,2.233333,0.933333,8.633333
262,Pre-demographic dividend,PRE,139.821747,263,0,22.819071,9.877199,41.158037,51.120342,29.453874,...,2.781333,2.436000,2.730000,44.148376,15.205223,25.847024,21.208575,2.233333,0.933333,8.633333
263,Heavily indebted poor countries (HIPC),HPC,141.609746,264,0,16.514066,6.565583,32.251006,48.655393,30.833905,...,6.308667,3.106333,3.243667,38.517364,14.424572,24.326447,19.914647,2.233333,0.933333,8.633333


### Splitting the data

In [6]:
# Target is the cluster label; the model inputs are every column except the
# identifier, ranking and label columns listed in `non_features`.
y = df['cluster']
X = df.drop(columns=non_features)

# Hold out 20% of the rows for testing, stratified on the label so both splits
# keep the same cluster proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### Logistic Regression

In [7]:
# Standardize the features. The scaler statistics are learned from the training
# split only and then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic regression classifier on the standardized training data.
lr_model = LogisticRegression().fit(X_train_scaled, y_train)

# Accuracy on the held-out split.
preds = lr_model.predict(X_test_scaled)
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, preds):.4f}")

# Use coefficient magnitude as a simple importance score: absolute value of each
# coefficient, averaged over the rows of `coef_`.
coefs = lr_model.coef_
feature_importance = np.abs(coefs).mean(axis=0)

# Pair each feature name with its score, most important first.
importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

print("Feature Importance (Average Absolute Coefficient):")
print(importance_df.head(20))


Logistic Regression Accuracy: 0.9245
Feature Importance (Average Absolute Coefficient):
                                               Feature  Importance
40                2022_Control of Corruption: Estimate    0.911357
118               2023_Control of Corruption: Estimate    0.911357
82   2023_Access to electricity, rural (% of rural ...    0.739885
4    2022_Access to electricity, rural (% of rural ...    0.739885
143  2023_Teenage mothers (% of women ages 15-19 wh...    0.739600
148  2023_Unemployment, total (% of total labor for...    0.690620
3         2022_Access to electricity (% of population)    0.686753
81        2023_Access to electricity (% of population)    0.686753
65   2022_Teenage mothers (% of women ages 15-19 wh...    0.669649
70   2022_Unemployment, total (% of total labor for...    0.653715
138  2023_People using at least basic sanitation se...    0.621057
60   2022_People using at least basic sanitation se...    0.621057
68   2022_Unemployment with basic educati

### Random Forest Classifier

In [8]:
# Standardize the features using training-split statistics.
# (Random forests typically don't need scaling; it is kept here so this model
# is trained on the same inputs as the logistic regression.)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a random forest classifier on the cluster label; the seed is fixed so the
# forest is reproducible.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Accuracy on the held-out split.
preds = rf_model.predict(X_test_scaled)
print(f"RandomForest Accuracy: {accuracy_score(y_test, preds):.4f}")

# `feature_importances_` holds one importance score per input feature.
importances = rf_model.feature_importances_

# Pair each feature name with its score, most important first.
importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

print("Top 20 Features by Importance:")
print(importance_df.head(20))


RandomForest Accuracy: 0.9623
Top 20 Features by Importance:
                                               Feature  Importance
81        2023_Access to electricity (% of population)    0.078667
3         2022_Access to electricity (% of population)    0.045364
82   2023_Access to electricity, rural (% of rural ...    0.036258
5    2022_Access to electricity, urban (% of urban ...    0.036050
19   2022_Births attended by skilled health staff (...    0.034252
4    2022_Access to electricity, rural (% of rural ...    0.032539
97   2023_Births attended by skilled health staff (...    0.030932
127  2023_Literacy rate, adult female (% of females...    0.030061
1    2022_Access to clean fuels and technologies fo...    0.029600
137  2023_People using at least basic drinking wate...    0.024414
49   2022_Literacy rate, adult female (% of females...    0.023191
59   2022_People using at least basic drinking wate...    0.021628
138  2023_People using at least basic sanitation se...    0.020998
1

In [None]:
# Persist the feature-importance table to CSV.
# `importance_df` is assigned by both the logistic-regression and random-forest
# cells; run this right after the random-forest cell so the saved table matches
# the file name.
importance_df.to_csv(path_or_buf='importance_df_randomforest.csv')