# Imports and Cleaning

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning, UndefinedMetricWarning

# Suppress specific warnings
warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  
from sklearn.linear_model import RidgeClassifier  
from sklearn.linear_model import LassoLars 
from sklearn.naive_bayes import GaussianNB  
from sklearn.svm import SVC  
from sklearn.metrics import classification_report

In [4]:
from termcolor import colored as cl  # text customization
from sklearn.metrics import roc_auc_score  # AUC metric
from sklearn.linear_model import LogisticRegression  # Logistic Regression for classification
from sklearn.linear_model import RidgeClassifier  # Ridge Classifier for classification
from sklearn.naive_bayes import GaussianNB  # Naive Bayes for classification
from sklearn.svm import SVC  # Support Vector Classifier

In [5]:
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler

In [6]:
#for random state (to have it consistent throughout the model)
seed = 9

In [7]:
data = pd.read_csv("../World_Datasets/final_dataset_world.csv")

In [8]:
data

Unnamed: 0,Country,S&P Rating,Country Name,country_code,unemployment_2000,unemployment_2014,unemployment_2015,unemployment_2016,unemployment_2017,unemployment_2018,...,total_reserves_2016,total_reserves_2017,total_reserves_2018,total_reserves_2019,total_reserves_2020,total_reserves_2021,total_reserves_2022,total_reserves_2023,Unnamed: 73,value_counts
0,Australia,AAA,Australia,AUS,6.288000,6.078000,6.055000,5.711000,5.592000,5.300000,...,5.248079e+10,6.565372e+10,5.390953e+10,5.799469e+10,4.254463e+10,5.787750e+10,5.670190e+10,6.170335e+10,3.085167e+10,73.0
1,Canada,AAA,Canada,CAN,6.829000,7.023000,6.945000,7.038000,6.426000,5.837000,...,8.271811e+10,8.667771e+10,8.392560e+10,8.529711e+10,9.042814e+10,1.066151e+11,1.069524e+11,1.175509e+11,5.877545e+10,73.0
2,Denmark,AAA,Denmark,DNK,4.476000,6.925000,6.278000,5.989000,5.833000,5.131000,...,6.421581e+10,7.524445e+10,7.094208e+10,6.683555e+10,7.282335e+10,8.223584e+10,9.607255e+10,1.093708e+11,5.468542e+10,73.0
3,Germany,AAA,Germany,DEU,7.917000,4.981000,4.624000,4.122000,3.746000,3.384000,...,1.840313e+11,1.999831e+11,1.980271e+11,2.240280e+11,2.684086e+11,2.957362e+11,2.939137e+11,3.227001e+11,1.613500e+11,62.0
4,Liechtenstein,AAA,Liechtenstein,LIE,9.640537,9.136085,9.052059,8.903803,8.664479,8.383856,...,2.453835e+09,2.181187e+09,1.908538e+09,1.635890e+09,1.363242e+09,1.090593e+09,8.179449e+08,5.452966e+08,2.726483e+08,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,Sri Lanka,SD,Sri Lanka,LKA,7.740000,4.157000,4.519000,4.243000,4.046000,4.318000,...,6.008199e+09,7.959048e+09,6.920826e+09,7.648305e+09,5.663994e+09,3.136992e+09,2.352744e+09,1.568496e+09,7.842479e+08,71.0
127,Ukraine,SD,Ukraine,UKR,11.707000,9.270000,9.140000,9.350000,9.500000,8.799000,...,1.553726e+10,1.881093e+10,2.081790e+10,2.531700e+10,2.913754e+10,3.096667e+10,2.850593e+10,4.051011e+10,2.025505e+10,67.0
128,Lebanon,D,Lebanon,LBN,8.594000,8.796000,9.270000,9.760000,10.236000,10.741000,...,5.390551e+10,5.541153e+10,5.238061e+10,5.221348e+10,4.244040e+10,3.523922e+10,3.251288e+10,2.167525e+10,1.083763e+10,66.0
129,Puerto Rico,D,Puerto Rico,PRI,10.080000,13.900000,12.000000,11.800000,10.800000,9.200000,...,4.084615e+10,3.630769e+10,3.176923e+10,2.723077e+10,2.269231e+10,1.815385e+10,1.361538e+10,9.076923e+09,4.538462e+09,39.0


In [9]:
#dropping non-relevant columns
data = data.drop(['Unnamed: 73', 'value_counts', 'Country Name', 'country_code'], axis = 1)

In [10]:
print('Number of unique values: ', data['S&P Rating'].nunique())
print(data['S&P Rating'].unique())

Number of unique values:  38
['AAA' 'AA+' 'AA+\xa0' 'AA-' 'AA' 'AA\xa0' 'A+' 'NR' 'A\xa0' 'A' 'AA-\xa0'
 'A-' 'A-\xa0' 'BBB+' 'BBB' 'BBB\xa0' 'BBB+\xa0' 'BBB-' 'BBB-\xa0'
 'BB+\xa0' 'BB+' 'BB\xa0' 'BB' 'BB-\xa0' 'BB-' 'B+' 'B+\xa0' 'SD' 'B\xa0'
 'B-' 'B' 'B-\xa0' 'CCC+\xa0' 'CCC+' 'CCC' 'D' 'D\xa0' 'SD\xa0']


As 38 is too many values for a classification, we will classify between 'Investment Grade' (from AAA to BBB-) and High Yield (from BB+ to D). As the non-rated values belong to none of these categories, we will remove the NR values.

In [11]:
data = data[data['S&P Rating'] != 'NR'] #dropping non-rated

In [12]:
data['target'] = np.where(
    data['S&P Rating'].isin([
        'AAA', 'AA+', 'AA+\xa0', 'AA-', 'AA', 'AA\xa0', 'A+', 'NR', 'A\xa0', 
        'A', 'AA-\xa0', 'A-', 'A-\xa0', 'BBB+', 'BBB', 'BBB\xa0', 'BBB+\xa0', 
        'BBB-', 'BBB-\xa0']),
    0,  # Investment Grade
    1   # High Yield
)


We now have 3 classification possibilities. 

# Base Model

In [13]:
data

Unnamed: 0,Country,S&P Rating,unemployment_2000,unemployment_2014,unemployment_2015,unemployment_2016,unemployment_2017,unemployment_2018,unemployment_2019,unemployment_2020,...,total_reserves_2015,total_reserves_2016,total_reserves_2017,total_reserves_2018,total_reserves_2019,total_reserves_2020,total_reserves_2021,total_reserves_2022,total_reserves_2023,target
0,Australia,AAA,6.288000,6.078000,6.055000,5.711000,5.592000,5.300000,5.159000,6.456000,...,4.540604e+10,5.248079e+10,6.565372e+10,5.390953e+10,5.799469e+10,4.254463e+10,5.787750e+10,5.670190e+10,6.170335e+10,0
1,Canada,AAA,6.829000,7.023000,6.945000,7.038000,6.426000,5.837000,5.690000,9.657000,...,7.975352e+10,8.271811e+10,8.667771e+10,8.392560e+10,8.529711e+10,9.042814e+10,1.066151e+11,1.069524e+11,1.175509e+11,0
2,Denmark,AAA,4.476000,6.925000,6.278000,5.989000,5.833000,5.131000,5.018000,5.637000,...,6.518509e+10,6.421581e+10,7.524445e+10,7.094208e+10,6.683555e+10,7.282335e+10,8.223584e+10,9.607255e+10,1.093708e+11,0
3,Germany,AAA,7.917000,4.981000,4.624000,4.122000,3.746000,3.384000,3.136000,3.856000,...,1.737309e+11,1.840313e+11,1.999831e+11,1.980271e+11,2.240280e+11,2.684086e+11,2.957362e+11,2.939137e+11,3.227001e+11,0
4,Liechtenstein,AAA,9.640537,9.136085,9.052059,8.903803,8.664479,8.383856,8.244777,9.305229,...,2.726483e+09,2.453835e+09,2.181187e+09,1.908538e+09,1.635890e+09,1.363242e+09,1.090593e+09,8.179449e+08,5.452966e+08,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,Sri Lanka,SD,7.740000,4.157000,4.519000,4.243000,4.046000,4.318000,4.670000,5.365000,...,7.302097e+09,6.008199e+09,7.959048e+09,6.920826e+09,7.648305e+09,5.663994e+09,3.136992e+09,2.352744e+09,1.568496e+09,1
127,Ukraine,SD,11.707000,9.270000,9.140000,9.350000,9.500000,8.799000,8.194000,9.475000,...,1.330088e+10,1.553726e+10,1.881093e+10,2.081790e+10,2.531700e+10,2.913754e+10,3.096667e+10,2.850593e+10,4.051011e+10,1
128,Lebanon,D,8.594000,8.796000,9.270000,9.760000,10.236000,10.741000,11.301000,13.235000,...,4.853139e+10,5.390551e+10,5.541153e+10,5.238061e+10,5.221348e+10,4.244040e+10,3.523922e+10,3.251288e+10,2.167525e+10,1
129,Puerto Rico,D,10.080000,13.900000,12.000000,11.800000,10.800000,9.200000,8.300000,8.890000,...,4.538462e+10,4.084615e+10,3.630769e+10,3.176923e+10,2.723077e+10,2.269231e+10,1.815385e+10,1.361538e+10,9.076923e+09,1


In [14]:
data = data.set_index(data['Country'])
data = data.drop('Country', axis = 1)

In [15]:
data.head()

Unnamed: 0_level_0,S&P Rating,unemployment_2000,unemployment_2014,unemployment_2015,unemployment_2016,unemployment_2017,unemployment_2018,unemployment_2019,unemployment_2020,unemployment_2021,...,total_reserves_2015,total_reserves_2016,total_reserves_2017,total_reserves_2018,total_reserves_2019,total_reserves_2020,total_reserves_2021,total_reserves_2022,total_reserves_2023,target
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Australia,AAA,6.288,6.078,6.055,5.711,5.592,5.3,5.159,6.456,5.116,...,45406040000.0,52480790000.0,65653720000.0,53909530000.0,57994690000.0,42544630000.0,57877500000.0,56701900000.0,61703350000.0,0
Canada,AAA,6.829,7.023,6.945,7.038,6.426,5.837,5.69,9.657,7.527,...,79753520000.0,82718110000.0,86677710000.0,83925600000.0,85297110000.0,90428140000.0,106615100000.0,106952400000.0,117550900000.0,0
Denmark,AAA,4.476,6.925,6.278,5.989,5.833,5.131,5.018,5.637,5.043,...,65185090000.0,64215810000.0,75244450000.0,70942080000.0,66835550000.0,72823350000.0,82235840000.0,96072550000.0,109370800000.0,0
Germany,AAA,7.917,4.981,4.624,4.122,3.746,3.384,3.136,3.856,3.638,...,173730900000.0,184031300000.0,199983100000.0,198027100000.0,224028000000.0,268408600000.0,295736200000.0,293913700000.0,322700100000.0,0
Liechtenstein,AAA,9.640537,9.136085,9.052059,8.903803,8.664479,8.383856,8.244777,9.305229,9.015766,...,2726483000.0,2453835000.0,2181187000.0,1908538000.0,1635890000.0,1363242000.0,1090593000.0,817944900.0,545296600.0,0


## Train Test Split 

In [16]:
X = data.drop(['target', 'S&P Rating'], axis = 1)
y = data[['target']]

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=seed)

## Intial Modeling and Results 

In [18]:
# 1. Logistic Regression
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
logistic_test_yhat = logistic.predict(X_test)
logistic_train_yhat = logistic.predict(X_train)

# 2. Ridge Classifier
ridge_classifier = RidgeClassifier(alpha=0.5)
ridge_classifier.fit(X_train, y_train)
ridge_classifier_test_yhat = ridge_classifier.predict(X_test)
ridge_classifier_train_yhat = ridge_classifier.predict(X_train)

# 3. LassoLars (for classification)
lasso_lars = LassoLars(alpha=0.01)
lasso_lars.fit(X_train, y_train)
lasso_lars_test_yhat = lasso_lars.predict(X_test)
lasso_lars_train_yhat = lasso_lars.predict(X_train)

# 4. Gaussian Naive Bayes
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
naive_bayes_test_yhat = naive_bayes.predict(X_test)
naive_bayes_train_yhat = naive_bayes.predict(X_train)

# 5. Support Vector Classifier
svc = SVC()
svc.fit(X_train, y_train)
svc_test_yhat = svc.predict(X_test)
svc_train_yhat = svc.predict(X_train)

In [19]:
# Initialize models
logistic = LogisticRegression()
ridge_classifier = RidgeClassifier()
naive_bayes = GaussianNB()
svc = SVC(probability=True)  # Enable probability estimates for AUC calculation
lasso_lars = LassoLars(alpha=0.01)  # Lasso Lars for classification

print(cl('AUC SCORE:', attrs=['bold']))
print('-------------------------------------------------------------------------------')

# 1. Logistic Regression
logistic.fit(X_train, y_train)  # Ensure the model is fitted
logistic_train_auc = roc_auc_score(y_train, logistic.predict_proba(X_train)[:, 1])
logistic_test_auc = roc_auc_score(y_test, logistic.predict_proba(X_test)[:, 1])
print(cl('Logistic model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(logistic_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(logistic_test_auc), attrs=['bold']))

print('-------------------------------------------------------------------------------')

# 2. Ridge Classifier
ridge_classifier.fit(X_train, y_train)  # Ensure the model is fitted
ridge_classifier_train_auc = roc_auc_score(y_train, ridge_classifier.decision_function(X_train))
ridge_classifier_test_auc = roc_auc_score(y_test, ridge_classifier.decision_function(X_test))
print(cl('Ridge model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(ridge_classifier_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(ridge_classifier_test_auc), attrs=['bold']))

print('-------------------------------------------------------------------------------')

# 3. LassoLars (for classification)
lasso_lars.fit(X_train, y_train)  # Ensure the model is fitted
# Calculate AUC scores for Lasso Lars
lasso_lars_train_predictions = lasso_lars.predict(X_train)
lasso_lars_test_predictions = lasso_lars.predict(X_test)
lasso_lars_train_auc = roc_auc_score(y_train, lasso_lars_train_predictions)
lasso_lars_test_auc = roc_auc_score(y_test, lasso_lars_test_predictions)
print(cl('LassoLars model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(lasso_lars_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(lasso_lars_test_auc), attrs=['bold']))

print('-------------------------------------------------------------------------------')

# 4. Gaussian Naive Bayes
naive_bayes.fit(X_train, y_train)  # Ensure the model is fitted
naive_bayes_train_auc = roc_auc_score(y_train, naive_bayes.predict_proba(X_train)[:, 1])
naive_bayes_test_auc = roc_auc_score(y_test, naive_bayes.predict_proba(X_test)[:, 1])
print(cl('Naive Bayes model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(naive_bayes_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(naive_bayes_test_auc), attrs=['bold']))

print('-------------------------------------------------------------------------------')

# 5. Support Vector Classifier
svc.fit(X_train, y_train)  # Ensure the model is fitted
svc_train_auc = roc_auc_score(y_train, svc.predict_proba(X_train)[:, 1])  # Use predict_proba for AUC
svc_test_auc = roc_auc_score(y_test, svc.predict_proba(X_test)[:, 1])
print(cl('SVC model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(svc_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(svc_test_auc), attrs=['bold']))

print('-------------------------------------------------------------------------------')


[1mAUC SCORE:[0m
-------------------------------------------------------------------------------
[1mLogistic model:[0m
[1mTrain - AUC score is 0.9422[0m
[1mTest - AUC score is 0.8339[0m
-------------------------------------------------------------------------------
[1mRidge model:[0m
[1mTrain - AUC score is 1.0000[0m
[1mTest - AUC score is 0.5878[0m
-------------------------------------------------------------------------------
[1mLassoLars model:[0m
[1mTrain - AUC score is 0.9380[0m
[1mTest - AUC score is 0.7179[0m
-------------------------------------------------------------------------------
[1mNaive Bayes model:[0m
[1mTrain - AUC score is 0.7854[0m
[1mTest - AUC score is 0.6779[0m
-------------------------------------------------------------------------------
[1mSVC model:[0m
[1mTrain - AUC score is 0.8084[0m
[1mTest - AUC score is 0.6850[0m
-------------------------------------------------------------------------------


In [20]:
print(cl('Logistic Model: Other Metrics:', attrs=['bold']))
# 1. Logistic Regression
logistic.fit(X_train, y_train)
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test, logistic_test_yhat))

[1mLogistic Model: Other Metrics:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.68      0.93      0.78        29
           1       0.82      0.41      0.55        22

    accuracy                           0.71        51
   macro avg       0.75      0.67      0.66        51
weighted avg       0.74      0.71      0.68        51



In [21]:
print(cl('Lasso Lars Classifier: Other Metrics:', attrs=['bold']))
# 2. Ridge Classifier
ridge_classifier.fit(X_train, y_train)
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test, ridge_classifier_test_yhat))

[1mLasso Lars Classifier: Other Metrics:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.72      0.62      0.67        29
           1       0.58      0.68      0.62        22

    accuracy                           0.65        51
   macro avg       0.65      0.65      0.65        51
weighted avg       0.66      0.65      0.65        51



In [22]:
# 3. Lasso Lars Classifier (as this is originally designed for regressions, we have a few changes to make to adapt it to binary classification)
lasso_lars.fit(X_train, y_train)
lasso_lars_test_yhat = lasso_lars.predict(X_test) # Get continuous predictions
lasso_lars_test_labels = (lasso_lars_test_yhat >= 0.5).astype(int) # Convert continuous predictions to class labels using a threshold (0.5)

# Print the classification report
print(cl('Lasso Lars Classifier:', attrs=['bold']))
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test, lasso_lars_test_labels))


[1mLasso Lars Classifier:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.76      0.66      0.70        29
           1       0.62      0.73      0.67        22

    accuracy                           0.69        51
   macro avg       0.69      0.69      0.69        51
weighted avg       0.70      0.69      0.69        51



In [23]:
print(cl('Gaussian Naive Bayes: Other Metrics:', attrs=['bold']))
# 4. Gaussian Naive Bayes
naive_bayes.fit(X_train, y_train)
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test, naive_bayes_test_yhat))

[1mGaussian Naive Bayes: Other Metrics:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.79      0.52      0.62        29
           1       0.56      0.82      0.67        22

    accuracy                           0.65        51
   macro avg       0.68      0.67      0.65        51
weighted avg       0.69      0.65      0.64        51



In [24]:
print(cl('Support Vector Classifier: Other Metrics:', attrs=['bold']))
# 5. Support Vector Classifier
svc.fit(X_train, y_train)
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test,svc_test_yhat))

[1mSupport Vector Classifier: Other Metrics:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.80      0.28      0.41        29
           1       0.49      0.91      0.63        22

    accuracy                           0.55        51
   macro avg       0.64      0.59      0.52        51
weighted avg       0.67      0.55      0.51        51



In [25]:
# AUC scores from all models
model_names = ['Logistic Regression', 'Ridge Classifier', 'Lasso Lars', 'Naive Bayes', 'SVC']
train_auc_scores = [logistic_train_auc, ridge_classifier_train_auc, lasso_lars_train_auc, naive_bayes_train_auc, svc_train_auc]
test_auc_scores = [logistic_test_auc, ridge_classifier_test_auc, lasso_lars_test_auc, naive_bayes_test_auc, svc_test_auc]

# Create a DataFrame for visualization
auc_data = pd.DataFrame({
    'Model': model_names * 2,
    'Dataset': ['Train'] * len(model_names) + ['Test'] * len(model_names),
    'AUC Score': train_auc_scores + test_auc_scores})

# Create the bar plot with the plotly_dark template
fig = px.bar(
    auc_data, 
    x='Model', 
    y='AUC Score', 
    color='Dataset', 
    barmode='group',
    title='Comparison of AUC Scores Across Models',
    labels={'AUC Score': 'AUC Score', 'Model': 'Classification Model'},
    template='plotly_dark'
)

# Customize layout
fig.update_layout(
    title_font_size=20,
    xaxis_title='Model',
    yaxis_title='AUC Score',
    legend_title='Dataset',
)

# Display the plot
fig.show()


# Improved Modeling

## Feature Engineering 

Creation of relevant ratios recarding country ratings. 

In [26]:
df = data.copy()

In [27]:
df.head()

Unnamed: 0_level_0,S&P Rating,unemployment_2000,unemployment_2014,unemployment_2015,unemployment_2016,unemployment_2017,unemployment_2018,unemployment_2019,unemployment_2020,unemployment_2021,...,total_reserves_2015,total_reserves_2016,total_reserves_2017,total_reserves_2018,total_reserves_2019,total_reserves_2020,total_reserves_2021,total_reserves_2022,total_reserves_2023,target
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Australia,AAA,6.288,6.078,6.055,5.711,5.592,5.3,5.159,6.456,5.116,...,45406040000.0,52480790000.0,65653720000.0,53909530000.0,57994690000.0,42544630000.0,57877500000.0,56701900000.0,61703350000.0,0
Canada,AAA,6.829,7.023,6.945,7.038,6.426,5.837,5.69,9.657,7.527,...,79753520000.0,82718110000.0,86677710000.0,83925600000.0,85297110000.0,90428140000.0,106615100000.0,106952400000.0,117550900000.0,0
Denmark,AAA,4.476,6.925,6.278,5.989,5.833,5.131,5.018,5.637,5.043,...,65185090000.0,64215810000.0,75244450000.0,70942080000.0,66835550000.0,72823350000.0,82235840000.0,96072550000.0,109370800000.0,0
Germany,AAA,7.917,4.981,4.624,4.122,3.746,3.384,3.136,3.856,3.638,...,173730900000.0,184031300000.0,199983100000.0,198027100000.0,224028000000.0,268408600000.0,295736200000.0,293913700000.0,322700100000.0,0
Liechtenstein,AAA,9.640537,9.136085,9.052059,8.903803,8.664479,8.383856,8.244777,9.305229,9.015766,...,2726483000.0,2453835000.0,2181187000.0,1908538000.0,1635890000.0,1363242000.0,1090593000.0,817944900.0,545296600.0,0


In [28]:
df.columns

Index(['S&P Rating', 'unemployment_2000', 'unemployment_2014',
       'unemployment_2015', 'unemployment_2016', 'unemployment_2017',
       'unemployment_2018', 'unemployment_2019', 'unemployment_2020',
       'unemployment_2021', 'unemployment_2022', 'unemployment_2023',
       'current_account_balance_1990', 'current_account_balance_2000',
       'current_account_balance_2014', 'current_account_balance_2015',
       'current_account_balance_2016', 'current_account_balance_2017',
       'current_account_balance_2018', 'current_account_balance_2019',
       'current_account_balance_2020', 'current_account_balance_2021',
       'current_account_balance_2022', 'current_account_balance_2023',
       'exchange_rate_usd_1990', 'exchange_rate_usd_2000',
       'exchange_rate_usd_2014', 'exchange_rate_usd_2015',
       'exchange_rate_usd_2016', 'exchange_rate_usd_2017',
       'exchange_rate_usd_2018', 'exchange_rate_usd_2019',
       'exchange_rate_usd_2020', 'exchange_rate_usd_2021',
      

#### Current Account Balance to GDP Ratio

In [29]:
# years for which we have data
years = [1990, 2000, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

for year in years:
    cab_col = f'current_account_balance_{year}'
    gdp_col = f'gdp_{year}'
    ratio_col = f'cab_to_gdp_{year}'
    
    # Ensure the columns exist before creating the ratio
    if cab_col in df.columns and gdp_col in df.columns:
        df[ratio_col] = df[cab_col] / df[gdp_col]

#### Total reserves to GDP Ratio

In [30]:
for year in years:
    reserves_col = f'total_reserves_{year}'
    gdp_col = f'gdp_{year}'
    ratio_col = f'reserves_to_gdp_{year}'
    
    if reserves_col in df.columns and gdp_col in df.columns:
        df[ratio_col] = df[reserves_col] / df[gdp_col]

#### GDP Growth Rate 

In [31]:
#only the years for which we can calculate a contiguous change
growth_years = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

for i in range(1, len(growth_years)):
    previous = growth_years[i - 1]
    current = growth_years[i]
    gdp_prev = f'gdp_{previous}'
    gdp_curr = f'gdp_{current}'
    growth_col = f'gdp_growth_{previous}_{current}'
    
    if gdp_prev in df.columns and gdp_curr in df.columns:
        df[growth_col] = (df[gdp_curr] - df[gdp_prev]) / df[gdp_prev]

### Inflation to Unemployment Ratio

In [32]:
for year in years:
    inflation_col = f'inflation_{year}'
    unemployment_col = f'unemployment_{year}'
    ratio_col = f'inflation_to_unemployment_{year}'
    
    if inflation_col in df.columns and unemployment_col in df.columns:
        df[ratio_col] = df[inflation_col] / df[unemployment_col]

In [33]:
df

Unnamed: 0_level_0,S&P Rating,unemployment_2000,unemployment_2014,unemployment_2015,unemployment_2016,unemployment_2017,unemployment_2018,unemployment_2019,unemployment_2020,unemployment_2021,...,inflation_to_unemployment_2014,inflation_to_unemployment_2015,inflation_to_unemployment_2016,inflation_to_unemployment_2017,inflation_to_unemployment_2018,inflation_to_unemployment_2019,inflation_to_unemployment_2020,inflation_to_unemployment_2021,inflation_to_unemployment_2022,inflation_to_unemployment_2023
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Australia,AAA,6.288000,6.078000,6.055000,5.711000,5.592000,5.300000,5.159000,6.456000,5.116000,...,4.093325e-01,2.491109e-01,2.236020e-01,3.484706e-01,3.606417e-01,3.122248e-01,1.311812e-01,5.597948e-01,1.781707e+00,1.526320e+00
Canada,AAA,6.829000,7.023000,6.945000,7.038000,6.426000,5.837000,5.690000,9.657000,7.527000,...,2.714845e-01,1.620218e-01,2.030065e-01,2.485036e-01,3.885944e-01,3.425780e-01,7.424662e-02,4.510686e-01,1.288409e+00,7.228851e-01
Denmark,AAA,4.476000,6.925000,6.278000,5.989000,5.833000,5.131000,5.018000,5.637000,5.043000,...,8.144701e-02,7.200289e-02,4.174320e-02,1.966625e-01,1.585674e-01,1.510824e-01,7.463402e-02,3.674490e-01,1.735807e+00,6.427807e-01
Germany,AAA,7.917000,4.981000,4.624000,4.122000,3.746000,3.384000,3.136000,3.856000,3.638000,...,1.820506e-01,1.112513e-01,1.192982e-01,4.029618e-01,5.118702e-01,4.609884e-01,3.757208e-02,8.429540e-01,2.192209e+00,1.952853e+00
Liechtenstein,AAA,9.640537,9.136085,9.052059,8.903803,8.664479,8.383856,8.244777,9.305229,9.015766,...,6.863893e+08,6.626408e+08,6.430527e+08,6.293473e+08,6.178920e+08,5.952459e+08,4.981093e+08,4.838605e+08,4.968830e+08,4.791054e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sri Lanka,SD,7.740000,4.157000,4.519000,4.243000,4.046000,4.318000,4.670000,5.365000,5.258000,...,7.647347e-01,8.338942e-01,9.330399e-01,1.904137e+00,4.944506e-01,7.555447e-01,1.147054e+00,1.334116e+00,7.854834e+00,2.600814e+00
Ukraine,SD,11.707000,9.270000,9.140000,9.350000,9.500000,8.799000,8.194000,9.475000,9.834000,...,1.302250e+00,5.328213e+00,1.487990e+00,1.519823e+00,1.244671e+00,9.624991e-01,2.883897e-01,9.521191e-01,2.399446e+00,1.838311e+00
Lebanon,D,8.594000,8.796000,9.270000,9.760000,10.236000,10.741000,11.301000,13.235000,12.621000,...,2.108463e-01,-4.044385e-01,-8.026226e-02,4.221720e-01,5.657750e-01,2.659401e-01,6.412114e+00,1.226179e+01,1.476037e+01,1.913892e+01
Puerto Rico,D,10.080000,13.900000,12.000000,11.800000,10.800000,9.200000,8.300000,8.890000,7.900000,...,7.509685e+09,8.320513e+09,8.076923e+09,8.404558e+09,9.372910e+09,9.842447e+09,8.678723e+09,9.191821e+09,1.134615e+10,1.065724e+10


## Correlations maps

In [34]:
df = df.drop('S&P Rating', axis = 1)

In [35]:
# Variables to compare
variables_to_compare = [
    'unemployment_2000', 'unemployment_2014', 'unemployment_2015', 'unemployment_2016',
    'unemployment_2017', 'unemployment_2018', 'unemployment_2019', 'unemployment_2020',
    'unemployment_2021', 'unemployment_2022', 'unemployment_2023'
]

# Compute the correlation matrix for 'target' and the selected variables
columns_to_use = ['target'] + variables_to_compare
df_corr_matrix = df[columns_to_use].corr()

# Take the absolute values of the correlation matrix
df_corr_matrix_abs = df_corr_matrix.abs()

fig = px.imshow(df_corr_matrix_abs, 
                color_continuous_scale='RdBu_r', 
                title="Correlation Heatmap: Target vs Unemployment Variables", 
                template='plotly_dark', 
                width=1000, 
                height=600)  

fig.show()


In [36]:
# Variables to compare
variables_to_compare = ['current_account_balance_1990', 'current_account_balance_2000',
       'current_account_balance_2014', 'current_account_balance_2015',
       'current_account_balance_2016', 'current_account_balance_2017',
       'current_account_balance_2018', 'current_account_balance_2019',
       'current_account_balance_2020', 'current_account_balance_2021',
       'current_account_balance_2022', 'current_account_balance_2023'
]

# Compute the correlation matrix for 'target' and the selected variables
columns_to_use = ['target'] + variables_to_compare
df_corr_matrix = df[columns_to_use].corr()

# Take the absolute values of the correlation matrix
df_corr_matrix_abs = df_corr_matrix.abs()

fig = px.imshow(df_corr_matrix_abs, 
                color_continuous_scale='RdBu_r', 
                title="Correlation Heatmap: Target vs Current Account Balance Variables", 
                template='plotly_dark', 
                width=1000, 
                height=600) 

fig.show()


In [37]:
# Variables to compare
variables_to_compare = ['exchange_rate_usd_1990', 'exchange_rate_usd_2000',
       'exchange_rate_usd_2014', 'exchange_rate_usd_2015',
       'exchange_rate_usd_2016', 'exchange_rate_usd_2017',
       'exchange_rate_usd_2018', 'exchange_rate_usd_2019',
       'exchange_rate_usd_2020', 'exchange_rate_usd_2021',
       'exchange_rate_usd_2022', 'exchange_rate_usd_2023',
]

# Compute the correlation matrix for 'target' and the selected variables
columns_to_use = ['target'] + variables_to_compare
df_corr_matrix = df[columns_to_use].corr()

# Take the absolute values of the correlation matrix
df_corr_matrix_abs = df_corr_matrix.abs()

fig = px.imshow(df_corr_matrix_abs, 
                color_continuous_scale='RdBu_r', 
                title="Correlation Heatmap: Target vs Exchange Rate Variables", 
                template='plotly_dark', 
                width=1000,  
                height=600)  

fig.show()


In [38]:
# Variables to compare
variables_to_compare = ['inflation_1990', 'inflation_2000', 'inflation_2014', 'inflation_2015',
       'inflation_2016', 'inflation_2017', 'inflation_2018', 'inflation_2019',
       'inflation_2020', 'inflation_2021', 'inflation_2022', 'inflation_2023'
]

# Compute the correlation matrix for 'target' and the selected variables
columns_to_use = ['target'] + variables_to_compare
df_corr_matrix = df[columns_to_use].corr()

# Take the absolute values of the correlation matrix
df_corr_matrix_abs = df_corr_matrix.abs()

fig = px.imshow(df_corr_matrix_abs, 
                color_continuous_scale='RdBu_r', 
                title="Correlation Heatmap: Target vs Inflation Variables", 
                template='plotly_dark', 
                width=1000,  
                height=600)  

fig.show()


In [39]:
# Variables to compare
variables_to_compare = ['gdp_1990',
       'gdp_2000', 'gdp_2014', 'gdp_2015', 'gdp_2016', 'gdp_2017', 'gdp_2018',
       'gdp_2019', 'gdp_2020', 'gdp_2021', 'gdp_2022', 'gdp_2023'
]

# Compute the correlation matrix for 'target' and the selected variables
columns_to_use = ['target'] + variables_to_compare
df_corr_matrix = df[columns_to_use].corr()

# Take the absolute values of the correlation matrix
df_corr_matrix_abs = df_corr_matrix.abs()

fig = px.imshow(df_corr_matrix_abs, 
                color_continuous_scale='RdBu_r', 
                title="Correlation Heatmap: Target vs GDP Variables", 
                template='plotly_dark', 
                width=1000, 
                height=600) 

fig.show()


In [40]:
# Variables to compare
variables_to_compare = ['total_reserves_1990', 'total_reserves_2000', 'total_reserves_2014',
       'total_reserves_2015', 'total_reserves_2016', 'total_reserves_2017',
       'total_reserves_2018', 'total_reserves_2019', 'total_reserves_2020',
       'total_reserves_2021', 'total_reserves_2022', 'total_reserves_2023'
]

# Compute the correlation matrix for 'target' and the selected variables
columns_to_use = ['target'] + variables_to_compare
df_corr_matrix = df[columns_to_use].corr()

# Take the absolute values of the correlation matrix
df_corr_matrix_abs = df_corr_matrix.abs()

fig = px.imshow(df_corr_matrix_abs, 
                color_continuous_scale='RdBu_r', 
                title="Correlation Heatmap: Target vs Total Reserves Variables", 
                template='plotly_dark', 
                width=1000,  
                height=600)  

fig.show()


In [41]:
# Variables to compare
variables_to_compare = [ 'inflation_2023', 'gdp_1990', 'gdp_2023', 'unemployment_2023', 'current_account_balance_2023',
                        'total_reserves_1990', 'total_reserves_2023']

# Compute the correlation matrix for 'target' and the selected variables
columns_to_use = ['target'] + variables_to_compare
df_corr_matrix = df[columns_to_use].corr()

# Take the absolute values of the correlation matrix
df_corr_matrix_abs = df_corr_matrix.abs()

fig = px.imshow(df_corr_matrix_abs, 
                color_continuous_scale='RdBu_r', 
                title="Correlation Heatmap: Target vs 2023 (and 1990) Variables", 
                template='plotly_dark', 
                width=1000,  
                height=600)  

fig.show()


In [42]:
# Variables to compare
variables_to_compare = [ 'cab_to_gdp_1990', 'cab_to_gdp_2000', 'cab_to_gdp_2014', 'cab_to_gdp_2015', 'cab_to_gdp_2016', 
                        'cab_to_gdp_2017', 'cab_to_gdp_2018', 'cab_to_gdp_2019', 'cab_to_gdp_2020', 'cab_to_gdp_2021', 
                        'cab_to_gdp_2022', 'cab_to_gdp_2023']

# Compute the correlation matrix for 'target' and the selected variables
columns_to_use = ['target'] + variables_to_compare
df_corr_matrix = df[columns_to_use].corr()

# Take the absolute values of the correlation matrix
df_corr_matrix_abs = df_corr_matrix.abs()

fig = px.imshow(df_corr_matrix_abs, 
                color_continuous_scale='RdBu_r', 
                title="Correlation Heatmap: Target vs Current Account Balance to GDP Ratio", 
                template='plotly_dark', 
                width=1000,  
                height=600)  

fig.show()


In [43]:
# Variables to compare
variables_to_compare = [ 'reserves_to_gdp_1990', 'reserves_to_gdp_2000', 'reserves_to_gdp_2014', 'reserves_to_gdp_2015', 
                        'reserves_to_gdp_2016', 'reserves_to_gdp_2017', 'reserves_to_gdp_2018', 'reserves_to_gdp_2019', 
                        'reserves_to_gdp_2020', 'reserves_to_gdp_2021', 'reserves_to_gdp_2022', 'reserves_to_gdp_2023']

# Compute the correlation matrix for 'target' and the selected variables
columns_to_use = ['target'] + variables_to_compare
df_corr_matrix = df[columns_to_use].corr()

# Take the absolute values of the correlation matrix
df_corr_matrix_abs = df_corr_matrix.abs()

fig = px.imshow(df_corr_matrix_abs, 
                color_continuous_scale='RdBu_r', 
                title="Correlation Heatmap: Target vs Reserves to GDP Ratio", 
                template='plotly_dark', 
                width=1000,  
                height=600)  

fig.show()


In [44]:
# Variables to compare
variables_to_compare = [ 'gdp_growth_2014_2015', 'gdp_growth_2015_2016', 'gdp_growth_2016_2017', 'gdp_growth_2017_2018', 
                        'gdp_growth_2018_2019', 'gdp_growth_2019_2020', 'gdp_growth_2020_2021', 'gdp_growth_2021_2022', 
                        'gdp_growth_2022_2023']

# Compute the correlation matrix for 'target' and the selected variables
columns_to_use = ['target'] + variables_to_compare
df_corr_matrix = df[columns_to_use].corr()

# Take the absolute values of the correlation matrix
df_corr_matrix_abs = df_corr_matrix.abs()


fig = px.imshow(df_corr_matrix_abs, 
                color_continuous_scale='RdBu_r', 
                title="Correlation Heatmap: Target vs GDP Growth", 
                template='plotly_dark', 
                width=1000,  
                height=600)  

fig.show()


In [45]:
# Variables to compare
variables_to_compare = [ 'inflation_to_unemployment_2000', 'inflation_to_unemployment_2014', 'inflation_to_unemployment_2015', 
                        'inflation_to_unemployment_2016', 'inflation_to_unemployment_2017', 'inflation_to_unemployment_2018', 
                        'inflation_to_unemployment_2019', 'inflation_to_unemployment_2020', 'inflation_to_unemployment_2021', 
                        'inflation_to_unemployment_2022', 'inflation_to_unemployment_2023']

# Compute the correlation matrix for 'target' and the selected variables
columns_to_use = ['target'] + variables_to_compare
df_corr_matrix = df[columns_to_use].corr()

# Take the absolute values of the correlation matrix
df_corr_matrix_abs = df_corr_matrix.abs()

fig = px.imshow(df_corr_matrix_abs, 
                color_continuous_scale='RdBu_r', 
                title="Correlation Heatmap: Target vs Inflation to Unemployment Ratio", 
                template='plotly_dark', 
                width=1000, 
                height=600)  

fig.show()


In [46]:
column_list = df.columns.tolist()
print(column_list)

['unemployment_2000', 'unemployment_2014', 'unemployment_2015', 'unemployment_2016', 'unemployment_2017', 'unemployment_2018', 'unemployment_2019', 'unemployment_2020', 'unemployment_2021', 'unemployment_2022', 'unemployment_2023', 'current_account_balance_1990', 'current_account_balance_2000', 'current_account_balance_2014', 'current_account_balance_2015', 'current_account_balance_2016', 'current_account_balance_2017', 'current_account_balance_2018', 'current_account_balance_2019', 'current_account_balance_2020', 'current_account_balance_2021', 'current_account_balance_2022', 'current_account_balance_2023', 'exchange_rate_usd_1990', 'exchange_rate_usd_2000', 'exchange_rate_usd_2014', 'exchange_rate_usd_2015', 'exchange_rate_usd_2016', 'exchange_rate_usd_2017', 'exchange_rate_usd_2018', 'exchange_rate_usd_2019', 'exchange_rate_usd_2020', 'exchange_rate_usd_2021', 'exchange_rate_usd_2022', 'exchange_rate_usd_2023', 'gdp_1990', 'gdp_2000', 'gdp_2014', 'gdp_2015', 'gdp_2016', 'gdp_2017', 

## Feature Selection 

Our goal is to remove redundant and irrelevant variables from our dataframe to avoid having too much noise and use too much computational power for nothing. 
As we have many values with a correlation with the target below 0.3, we will put a low threshold for irrelevance. 

In [47]:
irrelevance_threshold = 0.05
redundancy_threshold = 0.95

In [48]:
# Remove irrelevant variables
correlations_with_target = df.corr()['target'].abs()  # Absolute correlations with the target
relevant_vars = correlations_with_target[correlations_with_target >= irrelevance_threshold].index.tolist()

# Filter the dataframe to include only relevant variables
filtered_df = df[relevant_vars]

In [49]:
# Remove redundant variables (highly correlated with each other, but not with the target)
correlation_matrix = filtered_df.corr().abs()  # Compute absolute correlation matrix
np.fill_diagonal(correlation_matrix.values, 0)  # Fill diagonal with 0 to ignore self-correlation

# Keep track of variables to drop
variables_to_drop = set()

# Iterate through variables to find redundancies
for col in correlation_matrix.columns:
    if col not in variables_to_drop:
        # Find variables highly correlated with the current variable
        redundant_vars = correlation_matrix[col][correlation_matrix[col] > redundancy_threshold].index.tolist()
        # For redundant variables, keep the one most correlated with the target
        for redundant_var in redundant_vars:
            if redundant_var != col:
                # Compare correlation with target
                if correlations_with_target[col] >= correlations_with_target[redundant_var]:
                    variables_to_drop.add(redundant_var)  # Drop the redundant variable
                else:
                    variables_to_drop.add(col)

# Final list of variables after removing irrelevant and redundant variables
final_vars = [var for var in filtered_df.columns if var not in variables_to_drop]

# Create the final dataframe
feature_selection_df = df[final_vars].copy()
feature_selection_df['target'] = df['target']


In [50]:
feature_selection_df

Unnamed: 0_level_0,unemployment_2000,unemployment_2019,exchange_rate_usd_1990,exchange_rate_usd_2022,gdp_1990,gdp_2014,total_reserves_1990,total_reserves_2000,total_reserves_2021,target,...,reserves_to_gdp_2015,gdp_growth_2014_2015,gdp_growth_2015_2016,gdp_growth_2016_2017,gdp_growth_2017_2018,gdp_growth_2018_2019,gdp_growth_2019_2020,gdp_growth_2020_2021,gdp_growth_2021_2022,inflation_to_unemployment_2023
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Australia,6.288000,5.159000,1.281057e+00,1.441664e+00,3.114267e+11,1.468265e+12,1.931874e+10,1.882155e+10,5.787750e+10,0,...,0.033602,-0.079665,-0.106904,0.098394,0.077118,-0.024573,-0.046176,0.171875,0.086156,1.526320e+00
Canada,6.829000,5.690000,1.166774e+00,1.301555e+00,5.960756e+11,1.805750e+12,2.352952e+10,3.242727e+10,1.066151e+11,0,...,0.051239,-0.138026,-0.018319,0.079366,0.046120,0.010662,-0.050490,0.212472,0.076719,7.228851e-01
Denmark,4.476000,5.018000,6.188558e+00,7.076152e+00,1.382177e+11,3.528326e+11,1.122584e+10,1.569595e+10,8.223584e+10,0,...,0.216017,-0.144753,0.034541,0.062235,0.071417,-0.027842,0.029616,0.148320,-0.015752,6.427807e-01
Germany,7.917000,3.136000,1.615733e+00,1.481802e+12,1.778162e+12,3.965801e+12,1.045473e+11,8.749687e+10,2.957362e+11,0,...,0.050746,-0.136727,0.033362,0.063686,0.076776,-0.023396,-0.004313,0.103589,-0.042477,1.952853e+00
Liechtenstein,9.640537,8.244777,1.840786e+08,2.368620e+10,1.421509e+09,6.657527e+09,3.544428e+09,3.271780e+09,1.090593e+09,0,...,0.434949,-0.058432,-0.004979,0.037998,0.033720,-0.038274,-0.004754,0.203643,-0.045248,4.791054e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sri Lanka,7.740000,4.670000,4.006292e+01,3.226327e+02,8.032551e+09,8.252854e+10,4.470305e+08,1.131355e+09,3.136992e+09,1,...,0.085765,0.031655,0.033724,0.072308,0.001246,-0.057981,-0.052920,0.051065,-0.163239,2.600814e+00
Ukraine,11.707000,8.194000,1.952346e-02,3.234230e+01,8.139356e+10,1.335039e+11,7.386012e+08,1.477202e+09,3.096667e+10,1,...,0.146114,-0.318140,0.025540,0.200680,0.167727,0.175657,0.017771,0.275500,-0.189103,1.838311e+00
Lebanon,8.594000,11.301000,6.950892e+02,1.507500e+03,2.838485e+09,4.809521e+10,4.210389e+09,8.474638e+09,3.523922e+10,1,...,0.972001,0.038135,0.024394,0.036764,0.035337,-0.060027,-0.385495,-0.270565,-0.092492,1.913892e+01
Puerto Rico,10.080000,8.300000,1.591404e+10,2.815561e+10,3.060392e+10,1.020000e+11,5.900000e+10,5.446154e+10,1.815385e+10,1,...,0.440627,0.009804,0.009709,-0.009615,-0.019417,0.039604,-0.019048,0.029126,0.075472,1.065724e+10


In [51]:
# putting the target at the beginning of the dataframe to make it easier to see
columns_ordered = ['target'] + [col for col in feature_selection_df.columns if col != 'target']
feature_selection_df = feature_selection_df[columns_ordered]

# Compute the correlation matrix for all variables in feature_selection_df
corr_matrix = feature_selection_df.corr()

# Take the absolute values of the correlation matrix (optional, for easier interpretation)
corr_matrix_abs = corr_matrix.abs()

# Create a heatmap using plotly express
fig = px.imshow(
    corr_matrix_abs, 
    color_continuous_scale='RdBu_r', 
    title="Correlation Heatmap: All Variables (Target at Beginning)", 
    template='plotly_dark', 
    labels=dict(color="Correlation"),  # Add a label for the color bar
    width=1000, 
    height=800
)

# Display the figure
fig.show()


With this new correlation matric we can see that there are no very low correlations with the target, nor is there very high correlations between other variables.  

## Oversampling and Subsampling

In [52]:
#getting the percentage of the values of 0 and 1
100*df.groupby(['target'])['target'].agg(['count'])/df.shape[0]

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,50.393701
1,49.606299


The data is has almost exactly the same number of each category, there is therefore no need for oversampling or subsampling. 

## Train test Split 

We are doing Train, Test Split within the preprocessing steps to allow to fit the scaler only onto the training data, to avoid leaking from the test set. We have replaced the null values in the dataset using chatGPT, but if we were to use an algorithm, we would also have had to do this step before. 

In [53]:
## Train Test Split 
X_improved = feature_selection_df.drop(['target'], axis = 1)
y_improved = feature_selection_df[['target']]
X_train_new,X_test_new,y_train_new,y_test_new = train_test_split(X_improved,y_improved,test_size=0.4,random_state=seed)

## Scaling

Scaling is important as we have values from very different magnitudes. Furthermore, we are using min-max scaling as the standard scaler was giving warning due to this important magnitude. 

In [54]:
# Fit the scaler on X_train
scaler = MinMaxScaler()
scaler.fit(X_train_new)

# Scale X_train and X_test separately
X_train_scaled = pd.DataFrame(scaler.transform(X_train_new), columns=X_train_new.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_new), columns=X_test_new.columns)


As we scaled the data on the training dataset, we will not be able to use cross-validation. 

# Modeling and Results 

In [55]:
# Initialize models
logistic = LogisticRegression()
ridge_classifier = RidgeClassifier()
lasso_lars = LassoLars(alpha=0.01)
naive_bayes = GaussianNB()
svc = SVC(probability=True)  # Enable probability estimates for AUC calculation

print(cl('AUC SCORE:', attrs=['bold']))
print('-------------------------------------------------------------------------------')

# 1. Logistic Regression
logistic.fit(X_train_scaled, y_train_new)
logistic_train_auc_new = roc_auc_score(y_train_new, logistic.predict_proba(X_train_scaled)[:, 1])
logistic_test_auc_new = roc_auc_score(y_test_new, logistic.predict_proba(X_test_scaled)[:, 1])
print(cl('Logistic model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(logistic_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(logistic_test_auc), attrs=['bold']))

print('-------------------------------------------------------------------------------')

# 2. Ridge Classifier
ridge_classifier.fit(X_train_scaled, y_train_new)
ridge_classifier_train_auc_new = roc_auc_score(y_train_new, ridge_classifier.decision_function(X_train_scaled))
ridge_classifier_test_auc_new = roc_auc_score(y_test_new, ridge_classifier.decision_function(X_test_scaled))
print(cl('Ridge model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(ridge_classifier_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(ridge_classifier_test_auc), attrs=['bold']))

print('-------------------------------------------------------------------------------')

# 3. Lasso Lars Classifier
lasso_lars.fit(X_train_scaled, y_train_new)
lasso_lars_train_pred_new = (lasso_lars.predict(X_train_scaled) > 0.5).astype(int)  # Threshold at 0.5
lasso_lars_test_pred_new = (lasso_lars.predict(X_test_scaled) > 0.5).astype(int)    # Threshold at 0.5

lasso_lars_train_auc_new = roc_auc_score(y_train_new, lasso_lars.predict(X_train_scaled))
lasso_lars_test_auc_new = roc_auc_score(y_test_new, lasso_lars.predict(X_test_scaled))
print(cl('Lasso Lars model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(lasso_lars_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(lasso_lars_test_auc), attrs=['bold']))

print('-------------------------------------------------------------------------------')

# 4. Gaussian Naive Bayes
naive_bayes.fit(X_train_scaled, y_train_new)
naive_bayes_train_auc_new = roc_auc_score(y_train_new, naive_bayes.predict_proba(X_train_scaled)[:, 1])
naive_bayes_test_auc_new = roc_auc_score(y_test_new, naive_bayes.predict_proba(X_test_scaled)[:, 1])
print(cl('Naive Bayes model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(naive_bayes_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(naive_bayes_test_auc), attrs=['bold']))

print('-------------------------------------------------------------------------------')

# 5. Support Vector Classifier
svc.fit(X_train_scaled, y_train_new)
svc_train_auc_new = roc_auc_score(y_train_new, svc.predict_proba(X_train_scaled)[:, 1])
svc_test_auc_new = roc_auc_score(y_test_new, svc.predict_proba(X_test_scaled)[:, 1])
print(cl('SVC model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(svc_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(svc_test_auc), attrs=['bold']))


[1mAUC SCORE:[0m
-------------------------------------------------------------------------------
[1mLogistic model:[0m
[1mTrain - AUC score is 0.9422[0m
[1mTest - AUC score is 0.8339[0m
-------------------------------------------------------------------------------
[1mRidge model:[0m
[1mTrain - AUC score is 1.0000[0m
[1mTest - AUC score is 0.5878[0m
-------------------------------------------------------------------------------
[1mLasso Lars model:[0m
[1mTrain - AUC score is 0.9380[0m
[1mTest - AUC score is 0.7179[0m
-------------------------------------------------------------------------------
[1mNaive Bayes model:[0m
[1mTrain - AUC score is 0.7854[0m
[1mTest - AUC score is 0.6779[0m
-------------------------------------------------------------------------------
[1mSVC model:[0m
[1mTrain - AUC score is 0.8084[0m
[1mTest - AUC score is 0.6850[0m


In [56]:
print(cl('Logistic Model: Other Metrics:', attrs=['bold']))
# 1. Logistic Regression
logistic.fit(X_train_scaled, y_train_new)
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test_new, logistic_test_yhat))

[1mLogistic Model: Other Metrics:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.68      0.93      0.78        29
           1       0.82      0.41      0.55        22

    accuracy                           0.71        51
   macro avg       0.75      0.67      0.66        51
weighted avg       0.74      0.71      0.68        51



In [57]:
print(cl('Lasso Lars Classifier: Other Metrics:', attrs=['bold']))
# 2. Ridge Classifier
ridge_classifier.fit(X_train_scaled, y_train_new)
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test_new, ridge_classifier_test_yhat))

[1mLasso Lars Classifier: Other Metrics:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.72      0.62      0.67        29
           1       0.58      0.68      0.62        22

    accuracy                           0.65        51
   macro avg       0.65      0.65      0.65        51
weighted avg       0.66      0.65      0.65        51



In [58]:
# 3. Lasso Lars Classifier (as this is originally designed for regressions, we have a few changes to make to adapt it to binary classification)
lasso_lars.fit(X_train_scaled, y_train_new)
lasso_lars_test_yhat = lasso_lars.predict(X_test_scaled) # Get continuous predictions
lasso_lars_test_labels = (lasso_lars_test_yhat >= 0.5).astype(int) # Convert continuous predictions to class labels using a threshold (0.5)

# Print the classification report
print(cl('Lasso Lars Classifier:', attrs=['bold']))
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test_new, lasso_lars_test_labels))


[1mLasso Lars Classifier:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.86      0.62      0.72        29
           1       0.63      0.86      0.73        22

    accuracy                           0.73        51
   macro avg       0.75      0.74      0.73        51
weighted avg       0.76      0.73      0.72        51



In [59]:
print(cl('Gaussian Naive Bayes: Other Metrics:', attrs=['bold']))
# 4. Gaussian Naive Bayes
naive_bayes.fit(X_train_scaled, y_train_new)
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test_new, naive_bayes_test_yhat))

[1mGaussian Naive Bayes: Other Metrics:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.79      0.52      0.62        29
           1       0.56      0.82      0.67        22

    accuracy                           0.65        51
   macro avg       0.68      0.67      0.65        51
weighted avg       0.69      0.65      0.64        51



In [60]:
print(cl('Support Vector Classifier: Other Metrics:', attrs=['bold']))
# 5. Support Vector Classifier
svc.fit(X_train_scaled, y_train_new)
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test_new,svc_test_yhat))

[1mSupport Vector Classifier: Other Metrics:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.80      0.28      0.41        29
           1       0.49      0.91      0.63        22

    accuracy                           0.55        51
   macro avg       0.64      0.59      0.52        51
weighted avg       0.67      0.55      0.51        51



In [70]:
# AUC scores from all models
model_names = ['Logistic Regression', 'Ridge Classifier', 'Lasso Lars', 'Naive Bayes', 'SVC']
train_auc_scores = [logistic_train_auc, ridge_classifier_train_auc, lasso_lars_train_auc, naive_bayes_train_auc, svc_train_auc]
test_auc_scores = [logistic_test_auc, ridge_classifier_test_auc, lasso_lars_test_auc, naive_bayes_test_auc, svc_test_auc]
train_auc_scores_new = [logistic_train_auc_new, ridge_classifier_train_auc_new, lasso_lars_train_auc_new, naive_bayes_train_auc_new, svc_train_auc_new]
test_auc_scores_new = [logistic_test_auc_new, ridge_classifier_test_auc_new, lasso_lars_test_auc_new, naive_bayes_test_auc_new, svc_test_auc_new]

# Create a DataFrame for visualization
auc_data = pd.DataFrame({
    'Model': model_names * 4,
    'Dataset': ['Train'] * len(model_names) + ['Test'] * len(model_names) + ['Train New'] * len(model_names) + ['Test New'] * len(model_names),
    'AUC Score': train_auc_scores + test_auc_scores + train_auc_scores_new + test_auc_scores_new})

# Create the bar plot with the plotly_dark template
fig = px.bar(
    auc_data, 
    x='Model', 
    y='AUC Score', 
    color='Dataset', 
    barmode='group',
    title='Comparison of AUC Scores Across Models (before and after improvements)',
    labels={'AUC Score': 'AUC Score', 'Model': 'Classification Model'},
    template='plotly_dark'
)

# Customize layout
fig.update_layout(
    title_font_size=20,
    xaxis_title='Model',
    yaxis_title='AUC Score',
    legend_title='Dataset',
)

# Display the plot
fig.show()


Although the train value did not always improve, the test AUC waas always better after teh improvements. The changes also allowed to correct the overfitting present in the Ridge Classifier. 

With these new udpated models, SVC performs best regarding the AUC score. 