In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('Cleansed Set B Corporate Rating.csv')

In [5]:
#brief look into the data
df.head()

Unnamed: 0,Bond Rating,Cash,Earnings Before Interest,Gross Profit (Loss),Retained Earnings,EBTI Margin (Revenue),Dividends per Share - Pay Date - Calendar,Total Stockholders Equity,Total Market Value (Fiscal Years),Total Revenue,Financing Activities - Net Cash Flow,Net Cash Flow,Debt to Equity Ratio,Return on Asset,Interest Coverage,Current Ratio,Return on Equity,Quick Ratio,Risk Rating
0,B+,994.0,6585.0,13139.0,-13121.0,0.154396,0.2,2021.0,37405.5843,42650.0,-315.0,2765.0,8.013855,0.150442,0.138952,0.078187,6.048491,0.038898,High Risk
1,A-,7.604,1228.6,1228.6,1857.924,0.35187,2.2975,4519.102,7553.105,3491.632,-178.881,920.746,0.670756,0.085835,0.16356,0.038974,0.227401,0.001124,Low Risk
2,A+,4063.0,4475.0,12088.0,17821.0,0.22102,0.88,21639.0,67891.7357,20247.0,-2742.0,933.0,0.157493,0.108419,0.03352,0.413236,0.551689,0.25037,Low Risk
3,BB+,466.829,134.767,308.828,1759.236,0.20412,0.6838,2045.311,2667.9542,660.235,-45.802,87.781,0.0,0.060959,0.020762,3.72435,0.149625,2.821401,High Risk
4,A,1420.4,4848.7,15091.5,9940.4,0.083594,0.9,14551.8,31072.734,58003.2,-1235.0,2137.8,0.53959,0.090796,0.067915,0.045822,1.014459,0.045822,Low Risk


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc,
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

In [10]:
custom_mapping = {
    'In Default': 5,
    'Highest Risk': 4,
    'High Risk': 3,
    'Medium Risk': 2,
    'Low Risk': 1,
    'Lowest Risk': 0
}

X = df.drop(columns=['Risk Rating', 'Bond Rating'])
y = df['Risk Rating'].map(custom_mapping)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
# Model 1: Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)

# Model 2: Logistic Regression
logreg_model = LogisticRegression(solver='liblinear', random_state=42)
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

# Model 3: Decision Tree
dectree_model = DecisionTreeClassifier(random_state=42)
dectree_model.fit(X_train, y_train)
dectree_pred = dectree_model.predict(X_test)

# Model 4: XGBoost
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)



In [12]:
pred1 = rf.predict_proba(X_test)
pred2 = logreg_model.predict_proba(X_test)
pred3 = dectree_model.predict_proba(X_test)
pred4 = xgb_model.predict_proba(X_test)


#Plot AUC-ROC
false_positive_rate_1, true_positive_rate_1, thresholds_1 = roc_curve(y_test, pred1[:,1])
roc_auc_1 = auc(false_positive_rate_1, true_positive_rate_1)

false_positive_rate_2, true_positive_rate_2, thresholds_2 = roc_curve(y_test, pred2[:,1])
roc_auc_2 = auc(false_positive_rate_2, true_positive_rate_2)

false_positive_rate_4, true_positive_rate_4, thresholds_4 = roc_curve(y_test, pred4[:,1])
roc_auc_4 = auc(false_positive_rate_4, true_positive_rate_4)

plt.figure(figsize=(7,7))
plt.title('AUC-ROC of Benchmark Models')
plt.plot(false_positive_rate_1, true_positive_rate_1, 'b', label = 'Random Forest'  % roc_auc_1)
plt.plot(false_positive_rate_2, true_positive_rate_2, 'y', label = 'Logit Model'  % roc_auc_2)
plt.plot(false_positive_rate_4, true_positive_rate_4, 'orange', label = 'XGBoost'  % roc_auc_4)

plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

ValueError: multiclass format is not supported