In [247]:
import pandas as pd
import numpy as np
import pandas as pd 
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Machine Learning Modelling

In [248]:
# Reading in the csv file 
df = pd.read_csv('data/df_transformed.csv', index_col=0).sort_values(by='rank', ascending=True).reset_index()
# Filtering out the country named "Not classified", as it has no values for any of the indicators
df = df[df['Country Name'] != 'Not classified']
# Filling in missing values for a specific feature with the average for that feature
features = df.drop(['Country Name', 'Country Code', 'mean_rank', 'rank'], axis=1).columns
df[features] = df[features].apply(lambda col: col.fillna(col.mean()), axis=0)


In [250]:
# Creating classes for the countries
# We create 3 classes that are supposed to represent: Higher, Middle, Lower
sections = np.array_split(df['Country Name'],3)
section_dict = {}
for rank, section in enumerate(sections):
    section_dict[rank+1] = section.to_list()

inverted_dict = {}
for key, value in section_dict.items():
    for country in value:
        inverted_dict[country] = key

# Creating a column that assigns a country a specific class based on our multidimensional index
df['category'] = df['Country Name'].map(inverted_dict)

  df['category'] = df['Country Name'].map(inverted_dict)


In [252]:
df.head()

Unnamed: 0,Country Name,Country Code,mean_rank,rank,2022_Access to clean fuels and technologies for cooking (% of population),"2022_Access to clean fuels and technologies for cooking, rural (% of rural population)","2022_Access to clean fuels and technologies for cooking, urban (% of urban population)",2022_Access to electricity (% of population),"2022_Access to electricity, rural (% of rural population)","2022_Access to electricity, urban (% of urban population)",...,2023_Unemployment with intermediate education (% of total labor force with intermediate education),"2023_Unemployment, total (% of total labor force) (national estimate)",2023_Urban population (% of total population),"2023_Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","2023_Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","2023_Wage and salaried workers, total (% of total employment) (modeled ILO estimate)",2023_Wanted fertility rate (births per woman),2023_Women who were first married by age 15 (% of women ages 20-24),2023_Women who were first married by age 18 (% of women ages 20-24),category
0,Singapore,SGP,25.553209,1,100.0,100.0,100.0,100.0,100.0,100.0,...,3.719,3.444,100.0,91.244274,82.802156,86.379577,3.198431,0.0,0.1,1
1,Switzerland,CHE,26.884541,2,100.0,100.0,100.0,100.0,100.0,100.0,...,3.75,4.043,74.202,85.957597,81.635457,83.649166,3.198431,4.682482,20.197143,1
2,Cayman Islands,CYM,27.838874,3,68.495015,59.795465,77.364009,100.0,100.0,100.0,...,4.82,4.24,100.0,57.526174,58.779282,58.248394,3.198431,4.682482,20.197143,1
3,"Korea, Rep.",KOR,29.188156,4,100.0,100.0,100.0,100.0,100.0,100.0,...,2.49,2.675,81.456,79.710211,74.167398,76.547952,3.198431,4.682482,20.197143,1
4,Netherlands,NLD,29.406586,5,100.0,100.0,100.0,100.0,100.0,100.0,...,3.216,3.537,93.179,87.344277,80.333192,83.640352,3.198431,4.682482,20.197143,1


### Logistic Regression

In [253]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Prepare features and target by dropping non-numeric identifier columns
X = df.drop(['Country Name', 'mean_rank', 'rank', 'category', 'Country Code'], axis=1)
y = df['category']

# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features so that each has zero mean and unit variance.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the linear regression model.
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)

preds = lr_model.predict(X_test_scaled)
print(f"accuracy: {accuracy_score(y_test, preds)}")



accuracy: 0.8301886792452831


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [254]:
# Extract coefficients
coefs = lr_model.coef_

# For a simple feature importance, take the average absolute coefficient across classes:
feature_importance = np.mean(np.abs(coefs), axis=0)

# Create a DataFrame to display feature names and their importance
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance
})

# Sort by importance (highest first)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Feature Importance (Average Absolute Coefficient):")
print(importance_df.head(20))

Feature Importance (Average Absolute Coefficient):
                                               Feature  Importance
114  2023_Children in employment, total (% of child...    0.375371
27   2022_Children in employment, total (% of child...    0.375371
76   2022_Unemployment with advanced education (% o...    0.360586
53             2022_International migrant stock, total    0.336985
140            2023_International migrant stock, total    0.336985
158  2023_School enrollment, secondary (gross), gen...    0.331095
71   2022_School enrollment, secondary (gross), gen...    0.331095
91   2023_Access to electricity, rural (% of rural ...    0.327656
4    2022_Access to electricity, rural (% of rural ...    0.327656
139  2023_Expenditure on tertiary education (% of g...    0.323854
52   2022_Expenditure on tertiary education (% of g...    0.323854
153  2023_People using safely managed drinking wate...    0.301674
66   2022_People using safely managed drinking wate...    0.301674
133  2023_C

### XGBoost Classifier

In [255]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Assume df is your DataFrame that includes 'Country Name', 'mean_rank', 'rank', 'category', 'Country Code', etc.
# and that 'category' is your target for multiclass classification with values like 1, 2, 3.
# Prepare features and target by dropping non-numeric identifier columns.
X = df.drop(['Country Name', 'mean_rank', 'rank', 'category', 'Country Code'], axis=1)
y = df['category']

# Adjust the target to be 0-based. For example, convert [1,2,3] -> [0,1,2].
y = y - 1

# Split the data into training and testing sets (stratified by y for balanced classes).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features so that each has zero mean and unit variance.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the XGBoost classifier for multiclass prediction.
# 'num_class' is set to the number of unique classes in y (which should now be 3: 0, 1, 2).
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y)),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)
xgb_model.fit(X_train_scaled, y_train)

# Evaluate the model's accuracy.
preds = xgb_model.predict(X_test_scaled)
print(f"XGBoost Accuracy: {accuracy_score(y_test, preds):.4f}")

# Extract feature importance from the trained XGBoost model.
# The booster returns importance keys in the format 'f0', 'f1', etc.
booster = xgb_model.get_booster()
importance_dict = booster.get_score(importance_type='gain')

# Convert the importance dictionary into a DataFrame.
importance_df = pd.DataFrame(list(importance_dict.items()), columns=['FeatureKey', 'Importance'])

# Map the XGBoost feature keys to the original feature names.
# The keys are in the form 'f0', 'f1', ... where the number corresponds to the index in X_train.columns.
importance_df['Feature'] = importance_df['FeatureKey'].apply(lambda x: X_train.columns[int(x[1:])])

# Rearrange columns and sort by importance in descending order.
importance_df = importance_df[['Feature', 'Importance']].sort_values(by='Importance', ascending=False)

print("Top 20 Features by Importance (Gain):")
print(importance_df.head(20))


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.7547
Top 20 Features by Importance (Gain):
                                              Feature  Importance
4   2022_Access to electricity, rural (% of rural ...    5.045046
54  2022_Multidimensional poverty headcount ratio ...    4.286923
0   2022_Access to clean fuels and technologies fo...    3.540901
3        2022_Access to electricity (% of population)    2.592249
18  2022_Age dependency ratio, young (% of working...    2.562855
6   2022_Adequacy of social insurance programs (% ...    1.851766
27  2022_Children in employment, total (% of child...    1.848853
24  2022_Children in employment, study and work (%...    1.732705
73  2022_Wage and salaried workers, female (% of f...    1.620213
31  2022_Children out of school, male (% of male p...    1.598302
23  2022_Cause of death, by non-communicable disea...    1.388348
13  2022_Adolescents out of school (% of lower sec...    1.274869
12  2022_Adolescent fertility rate (births per 1,0...    1.261766
51  2022_Lite

### Random Forest Classifier

In [256]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Prepare features and target by dropping non-numeric identifier columns.
X = df.drop(['Country Name', 'mean_rank', 'rank', 'category', 'Country Code'], axis=1)
y = df['category']

# Adjust the target to be 0-based (e.g., convert [1,2,3] -> [0,1,2]).
y = y - 1

# Split the data into training and testing sets (stratified by y for balanced classes).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features so that each has zero mean and unit variance.
# (Random forests typically don't need scaling, but we include it for consistency.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the RandomForest classifier for multiclass prediction.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Evaluate the model's accuracy.
preds = rf_model.predict(X_test_scaled)
print(f"RandomForest Accuracy: {accuracy_score(y_test, preds):.4f}")

# Extract feature importance from the trained RandomForest model.
# The attribute `feature_importances_` gives an array of importance scores for each feature.
importances = rf_model.feature_importances_

# Create a DataFrame to display feature names and their importance.
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})

# Sort the DataFrame by importance in descending order.
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Top 20 Features by Importance:")
print(importance_df.head(20))


RandomForest Accuracy: 0.7736
Top 20 Features by Importance:
                                               Feature  Importance
91   2023_Access to electricity, rural (% of rural ...    0.041776
3         2022_Access to electricity (% of population)    0.041305
4    2022_Access to electricity, rural (% of rural ...    0.034906
88   2023_Access to clean fuels and technologies fo...    0.031452
90        2023_Access to electricity (% of population)    0.029671
170  2023_Wage and salaried workers, total (% of to...    0.029497
0    2022_Access to clean fuels and technologies fo...    0.025888
1    2022_Access to clean fuels and technologies fo...    0.023811
2    2022_Access to clean fuels and technologies fo...    0.022635
87   2023_Access to clean fuels and technologies fo...    0.020463
99   2023_Adolescent fertility rate (births per 1,0...    0.018643
97   2023_Adjusted net national income per capita (...    0.016333
152  2023_People using at least basic sanitation se...    0.015482
1