# Base: Imports and Cleaning

In [119]:
import pandas as pd
import numpy as np
import re

In [120]:
import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning, UndefinedMetricWarning

# Suppress specific warnings
warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [121]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  
from sklearn.linear_model import RidgeClassifier  
from sklearn.naive_bayes import GaussianNB  
from sklearn.svm import SVC  

In [122]:
from termcolor import colored as cl  # text customization
from sklearn.metrics import roc_auc_score  # AUC metric
from sklearn.linear_model import LogisticRegression  # Logistic Regression for classification
from sklearn.linear_model import RidgeClassifier  # Ridge Classifier for classification
from sklearn.naive_bayes import GaussianNB  # Naive Bayes for classification
from sklearn.svm import SVC  # Support Vector Classifier
from sklearn.preprocessing import label_binarize #as we have multi-class classification

In [123]:
#for random state (to have it consistent throughout the model)
seed = 9

In [124]:
data = pd.read_csv("../World_Datasets/final_dataset_world.csv")

In [125]:
data

Unnamed: 0,Country,S&P Rating,Country Name,country_code,unemployment_2000,unemployment_2014,unemployment_2015,unemployment_2016,unemployment_2017,unemployment_2018,...,total_reserves_2016,total_reserves_2017,total_reserves_2018,total_reserves_2019,total_reserves_2020,total_reserves_2021,total_reserves_2022,total_reserves_2023,Unnamed: 73,value_counts
0,Australia,AAA,Australia,AUS,6.288000,6.078000,6.055000,5.711000,5.592000,5.300000,...,5.248079e+10,6.565372e+10,5.390953e+10,5.799469e+10,4.254463e+10,5.787750e+10,5.670190e+10,6.170335e+10,3.085167e+10,73.0
1,Canada,AAA,Canada,CAN,6.829000,7.023000,6.945000,7.038000,6.426000,5.837000,...,8.271811e+10,8.667771e+10,8.392560e+10,8.529711e+10,9.042814e+10,1.066151e+11,1.069524e+11,1.175509e+11,5.877545e+10,73.0
2,Denmark,AAA,Denmark,DNK,4.476000,6.925000,6.278000,5.989000,5.833000,5.131000,...,6.421581e+10,7.524445e+10,7.094208e+10,6.683555e+10,7.282335e+10,8.223584e+10,9.607255e+10,1.093708e+11,5.468542e+10,73.0
3,Germany,AAA,Germany,DEU,7.917000,4.981000,4.624000,4.122000,3.746000,3.384000,...,1.840313e+11,1.999831e+11,1.980271e+11,2.240280e+11,2.684086e+11,2.957362e+11,2.939137e+11,3.227001e+11,1.613500e+11,62.0
4,Liechtenstein,AAA,Liechtenstein,LIE,9.640537,9.136085,9.052059,8.903803,8.664479,8.383856,...,2.453835e+09,2.181187e+09,1.908538e+09,1.635890e+09,1.363242e+09,1.090593e+09,8.179449e+08,5.452966e+08,2.726483e+08,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,Sri Lanka,SD,Sri Lanka,LKA,7.740000,4.157000,4.519000,4.243000,4.046000,4.318000,...,6.008199e+09,7.959048e+09,6.920826e+09,7.648305e+09,5.663994e+09,3.136992e+09,2.352744e+09,1.568496e+09,7.842479e+08,71.0
127,Ukraine,SD,Ukraine,UKR,11.707000,9.270000,9.140000,9.350000,9.500000,8.799000,...,1.553726e+10,1.881093e+10,2.081790e+10,2.531700e+10,2.913754e+10,3.096667e+10,2.850593e+10,4.051011e+10,2.025505e+10,67.0
128,Lebanon,D,Lebanon,LBN,8.594000,8.796000,9.270000,9.760000,10.236000,10.741000,...,5.390551e+10,5.541153e+10,5.238061e+10,5.221348e+10,4.244040e+10,3.523922e+10,3.251288e+10,2.167525e+10,1.083763e+10,66.0
129,Puerto Rico,D,Puerto Rico,PRI,10.080000,13.900000,12.000000,11.800000,10.800000,9.200000,...,4.084615e+10,3.630769e+10,3.176923e+10,2.723077e+10,2.269231e+10,1.815385e+10,1.361538e+10,9.076923e+09,4.538462e+09,39.0


In [126]:
#dropping non-relevant columns
data = data.drop(['Unnamed: 73', 'value_counts'], axis = 1)

In [127]:
print('Number of unique values: ', data['S&P Rating'].nunique())
print(data['S&P Rating'].unique())

Number of unique values:  38
['AAA' 'AA+' 'AA+\xa0' 'AA-' 'AA' 'AA\xa0' 'A+' 'NR' 'A\xa0' 'A' 'AA-\xa0'
 'A-' 'A-\xa0' 'BBB+' 'BBB' 'BBB\xa0' 'BBB+\xa0' 'BBB-' 'BBB-\xa0'
 'BB+\xa0' 'BB+' 'BB\xa0' 'BB' 'BB-\xa0' 'BB-' 'B+' 'B+\xa0' 'SD' 'B\xa0'
 'B-' 'B' 'B-\xa0' 'CCC+\xa0' 'CCC+' 'CCC' 'D' 'D\xa0' 'SD\xa0']


As 38 is too many values for a classification, we will regroup first, and drop the non-rated 'NR'.

In [128]:
data = data[data['S&P Rating'] != 'NR'] #dropping non-rated

In [129]:
data['target'] = np.where(data['S&P Rating'].str.match(r'.*A.*', na=False), 0, 
                          np.where(data['S&P Rating'].str.match(r'.*B.*', na=False), 1, 2))

We now have 3 classification possibilities. 

# Base Model

In [130]:
data

Unnamed: 0,Country,S&P Rating,Country Name,country_code,unemployment_2000,unemployment_2014,unemployment_2015,unemployment_2016,unemployment_2017,unemployment_2018,...,total_reserves_2015,total_reserves_2016,total_reserves_2017,total_reserves_2018,total_reserves_2019,total_reserves_2020,total_reserves_2021,total_reserves_2022,total_reserves_2023,target
0,Australia,AAA,Australia,AUS,6.288000,6.078000,6.055000,5.711000,5.592000,5.300000,...,4.540604e+10,5.248079e+10,6.565372e+10,5.390953e+10,5.799469e+10,4.254463e+10,5.787750e+10,5.670190e+10,6.170335e+10,0
1,Canada,AAA,Canada,CAN,6.829000,7.023000,6.945000,7.038000,6.426000,5.837000,...,7.975352e+10,8.271811e+10,8.667771e+10,8.392560e+10,8.529711e+10,9.042814e+10,1.066151e+11,1.069524e+11,1.175509e+11,0
2,Denmark,AAA,Denmark,DNK,4.476000,6.925000,6.278000,5.989000,5.833000,5.131000,...,6.518509e+10,6.421581e+10,7.524445e+10,7.094208e+10,6.683555e+10,7.282335e+10,8.223584e+10,9.607255e+10,1.093708e+11,0
3,Germany,AAA,Germany,DEU,7.917000,4.981000,4.624000,4.122000,3.746000,3.384000,...,1.737309e+11,1.840313e+11,1.999831e+11,1.980271e+11,2.240280e+11,2.684086e+11,2.957362e+11,2.939137e+11,3.227001e+11,0
4,Liechtenstein,AAA,Liechtenstein,LIE,9.640537,9.136085,9.052059,8.903803,8.664479,8.383856,...,2.726483e+09,2.453835e+09,2.181187e+09,1.908538e+09,1.635890e+09,1.363242e+09,1.090593e+09,8.179449e+08,5.452966e+08,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,Sri Lanka,SD,Sri Lanka,LKA,7.740000,4.157000,4.519000,4.243000,4.046000,4.318000,...,7.302097e+09,6.008199e+09,7.959048e+09,6.920826e+09,7.648305e+09,5.663994e+09,3.136992e+09,2.352744e+09,1.568496e+09,2
127,Ukraine,SD,Ukraine,UKR,11.707000,9.270000,9.140000,9.350000,9.500000,8.799000,...,1.330088e+10,1.553726e+10,1.881093e+10,2.081790e+10,2.531700e+10,2.913754e+10,3.096667e+10,2.850593e+10,4.051011e+10,2
128,Lebanon,D,Lebanon,LBN,8.594000,8.796000,9.270000,9.760000,10.236000,10.741000,...,4.853139e+10,5.390551e+10,5.541153e+10,5.238061e+10,5.221348e+10,4.244040e+10,3.523922e+10,3.251288e+10,2.167525e+10,2
129,Puerto Rico,D,Puerto Rico,PRI,10.080000,13.900000,12.000000,11.800000,10.800000,9.200000,...,4.538462e+10,4.084615e+10,3.630769e+10,3.176923e+10,2.723077e+10,2.269231e+10,1.815385e+10,1.361538e+10,9.076923e+09,2


In [131]:
data = data.set_index(data['Country'])
data = data.drop(['Country', 'Country Name', 'country_code'], axis = 1)

In [132]:
data.head()

Unnamed: 0_level_0,S&P Rating,unemployment_2000,unemployment_2014,unemployment_2015,unemployment_2016,unemployment_2017,unemployment_2018,unemployment_2019,unemployment_2020,unemployment_2021,...,total_reserves_2015,total_reserves_2016,total_reserves_2017,total_reserves_2018,total_reserves_2019,total_reserves_2020,total_reserves_2021,total_reserves_2022,total_reserves_2023,target
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Australia,AAA,6.288,6.078,6.055,5.711,5.592,5.3,5.159,6.456,5.116,...,45406040000.0,52480790000.0,65653720000.0,53909530000.0,57994690000.0,42544630000.0,57877500000.0,56701900000.0,61703350000.0,0
Canada,AAA,6.829,7.023,6.945,7.038,6.426,5.837,5.69,9.657,7.527,...,79753520000.0,82718110000.0,86677710000.0,83925600000.0,85297110000.0,90428140000.0,106615100000.0,106952400000.0,117550900000.0,0
Denmark,AAA,4.476,6.925,6.278,5.989,5.833,5.131,5.018,5.637,5.043,...,65185090000.0,64215810000.0,75244450000.0,70942080000.0,66835550000.0,72823350000.0,82235840000.0,96072550000.0,109370800000.0,0
Germany,AAA,7.917,4.981,4.624,4.122,3.746,3.384,3.136,3.856,3.638,...,173730900000.0,184031300000.0,199983100000.0,198027100000.0,224028000000.0,268408600000.0,295736200000.0,293913700000.0,322700100000.0,0
Liechtenstein,AAA,9.640537,9.136085,9.052059,8.903803,8.664479,8.383856,8.244777,9.305229,9.015766,...,2726483000.0,2453835000.0,2181187000.0,1908538000.0,1635890000.0,1363242000.0,1090593000.0,817944900.0,545296600.0,0


## Train Test Split 

In [133]:
X = data.drop(['target', 'S&P Rating'], axis = 1)
y = data[['target']]

In [134]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=seed)

## Intial Modeling and Results 

In [135]:
# 1. Logistic Regression
logistic = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=seed)
logistic.fit(X_train, y_train)
logistic_test_yhat = logistic.predict(X_test)
logistic_train_yhat = logistic.predict(X_train)


# 2. Ridge Classifier
ridge_classifier = RidgeClassifier(alpha = 0.5, random_state=seed)
ridge_classifier.fit(X_train, y_train)
ridge_classifier_test_yhat = ridge_classifier.predict(X_test)
ridge_classifier_train_yhat = ridge_classifier.predict(X_train)

# 3. Gaussian Naive Bayes
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
naive_bayes_test_yhat = naive_bayes.predict(X_test)
naive_bayes_train_yhat = naive_bayes.predict(X_train)

# 4. Support Vector Classifier
svc = SVC(probability = True, random_state=seed)
svc.fit(X_train, y_train)
svc_test_yhat = svc.predict(X_test)
svc_train_yhat = svc.predict(X_train)

In [147]:
# Binarize the output for multi-class AUC
y_train_binarized = label_binarize(y_train, classes=[0, 1, 2])  # Adjust class labels as needed
y_test_binarized = label_binarize(y_test, classes=[0, 1, 2])

print(cl('AUC SCORE (Multi-Class):', attrs=['bold']))
print('-------------------------------------------------------------------------------')

# 1. Logistic Regression
logistic.fit(X_train, y_train)
logistic_train_auc = roc_auc_score(y_train_binarized, logistic.predict_proba(X_train), multi_class='ovr')
logistic_test_auc = roc_auc_score(y_test_binarized, logistic.predict_proba(X_test), multi_class='ovr')
print(cl('Logistic model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(logistic_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(logistic_test_auc), attrs=['bold']))

print('-------------------------------------------------------------------------------')

# 2. Ridge Classifier
ridge_classifier.fit(X_train, y_train)
ridge_train_scores = ridge_classifier.decision_function(X_train)  # Multiclass decision scores
ridge_test_scores = ridge_classifier.decision_function(X_test)
ridge_classifier_train_auc = roc_auc_score(y_train_binarized, ridge_train_scores, multi_class='ovr')
ridge_classifier_test_auc = roc_auc_score(y_test_binarized, ridge_test_scores, multi_class='ovr')
print(cl('Ridge model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(ridge_classifier_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(ridge_classifier_test_auc), attrs=['bold']))

print('-------------------------------------------------------------------------------')

# 3. Gaussian Naive Bayes
naive_bayes.fit(X_train, y_train)
naive_bayes_train_auc = roc_auc_score(y_train_binarized, naive_bayes.predict_proba(X_train), multi_class='ovr')
naive_bayes_test_auc = roc_auc_score(y_test_binarized, naive_bayes.predict_proba(X_test), multi_class='ovr')
print(cl('Naive Bayes model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(naive_bayes_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(naive_bayes_test_auc), attrs=['bold']))

print('-------------------------------------------------------------------------------')

# 4. Support Vector Classifier
svc.fit(X_train, y_train)
svc_train_auc = roc_auc_score(y_train_binarized, svc.decision_function(X_train), multi_class='ovr')  # Use decision_function
svc_test_auc = roc_auc_score(y_test_binarized, svc.decision_function(X_test), multi_class='ovr')
print(cl('SVC model:', attrs=['bold']))
print(cl('Train - AUC score is {:.4f}'.format(svc_train_auc), attrs=['bold']))
print(cl('Test - AUC score is {:.4f}'.format(svc_test_auc), attrs=['bold']))



[1mAUC SCORE (Multi-Class):[0m
-------------------------------------------------------------------------------
[1mLogistic model:[0m
[1mTrain - AUC score is 0.9148[0m
[1mTest - AUC score is 0.8102[0m
-------------------------------------------------------------------------------
[1mRidge model:[0m
[1mTrain - AUC score is 1.0000[0m
[1mTest - AUC score is 0.4185[0m
-------------------------------------------------------------------------------
[1mNaive Bayes model:[0m
[1mTrain - AUC score is 0.6967[0m
[1mTest - AUC score is 0.7161[0m
-------------------------------------------------------------------------------
[1mSVC model:[0m
[1mTrain - AUC score is 0.7028[0m
[1mTest - AUC score is 0.5403[0m


In [155]:
print(cl('Logistic Model: Other Metrics:', attrs=['bold']))
# 1. Logistic Regression
logistic.fit(X_train, y_train)
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test, logistic_test_yhat))

[1mLogistic Model: Other Metrics:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.70      0.61      0.65        23
           1       0.75      0.67      0.71        27
           2       0.14      1.00      0.25         1

    accuracy                           0.65        51
   macro avg       0.53      0.76      0.54        51
weighted avg       0.72      0.65      0.67        51



In [None]:
print(cl('Ridge Classifier: Other Metrics:', attrs=['bold']))
# 2. Ridge Classifier
ridge_classifier.fit(X_train, y_train)
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test, ridge_classifier_test_yhat))

[1mRidge Classifier: Other Metrics:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.62      0.35      0.44        23
           1       0.63      0.44      0.52        27
           2       0.00      0.00      0.00         1

    accuracy                           0.39        51
   macro avg       0.42      0.26      0.32        51
weighted avg       0.61      0.39      0.48        51



In [158]:
print(cl('Gaussian Naive Bayes: Other Metrics:', attrs=['bold']))
# 3. Gaussian Naive Bayes
naive_bayes.fit(X_train, y_train)
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test, naive_bayes_test_yhat))

[1mGaussian Naive Bayes: Other Metrics:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.83      0.22      0.34        23
           1       0.57      0.85      0.69        27
           2       0.20      1.00      0.33         1

    accuracy                           0.57        51
   macro avg       0.54      0.69      0.45        51
weighted avg       0.68      0.57      0.53        51



In [159]:
print(cl('Support Vector Classifier: Other Metrics:', attrs=['bold']))
# 4. Support Vector Classifier
svc.fit(X_train, y_train)
print('     ')
print(cl('Classification Report (Test):'))
print(classification_report(y_test,svc_test_yhat))

[1mSupport Vector Classifier: Other Metrics:[0m
     
Classification Report (Test):[0m
              precision    recall  f1-score   support

           0       0.80      0.17      0.29        23
           1       0.57      0.96      0.71        27
           2       0.00      0.00      0.00         1

    accuracy                           0.59        51
   macro avg       0.46      0.38      0.33        51
weighted avg       0.66      0.59      0.51        51

