In [2]:
import pandas as pd
import pickle as pkl
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

In [3]:
ratings = pd.read_csv("Cleansed Set B Corporate Rating.csv")

df = pd.DataFrame(ratings)

x = ratings.drop(columns=['Risk Rating', 'Bond Rating'])
y = ratings['Risk Rating']

custom_mapping = {
    'In Default': 5,
    'Highest Risk': 4,
    'High Risk': 3,
    'Medium Risk': 2,
    'Low Risk': 1,
    'Lowest Risk': 0
}

ros = RandomOverSampler()

x_resampled, y_resampled = ros.fit_resample(x, y)
ratings_resampled = pd.concat([pd.DataFrame(x_resampled), pd.DataFrame(y_resampled, columns=['Risk Rating'])], axis=1)
ratings_resampled['Risk Rating'] = ratings_resampled['Risk Rating'].map(custom_mapping)
#print(ratings_resampled)

# Import the ClassificationExperiment
from pycaret.classification import ClassificationExperiment

# Create an experiment instance
exp = ClassificationExperiment()

# Initialize the experiment
exp.setup(ratings_resampled, target='Risk Rating', session_id=123)

# Compare models
best_model = exp.compare_models()
XGBoost_model = exp.create_model('xgboost')
# View results
print(best_model)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Risk Rating
2,Target type,Multiclass
3,Original data shape,"(7938, 18)"
4,Transformed data shape,"(7938, 18)"
5,Transformed train set shape,"(5556, 18)"
6,Transformed test set shape,"(2382, 18)"
7,Numeric features,17
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9676,0.9983,0.9676,0.968,0.9676,0.9611,0.9612,0.052
lightgbm,Light Gradient Boosting Machine,0.9597,0.9972,0.9597,0.9601,0.9596,0.9516,0.9517,0.486
xgboost,Extreme Gradient Boosting,0.9591,0.9971,0.9591,0.9598,0.959,0.951,0.9511,0.174
rf,Random Forest Classifier,0.9581,0.9976,0.9581,0.9584,0.9579,0.9497,0.9498,0.111
dt,Decision Tree Classifier,0.9363,0.9618,0.9363,0.9363,0.9357,0.9235,0.9238,0.01
gbc,Gradient Boosting Classifier,0.9228,0.0,0.9228,0.9234,0.9224,0.9073,0.9076,1.089
knn,K Neighbors Classifier,0.8769,0.9743,0.8769,0.8792,0.8738,0.8523,0.854,0.157
lda,Linear Discriminant Analysis,0.6785,0.0,0.6785,0.6979,0.6821,0.6142,0.6166,0.005
ridge,Ridge Classifier,0.6386,0.0,0.6386,0.6189,0.614,0.5663,0.574,0.007
lr,Logistic Regression,0.6233,0.0,0.6233,0.6359,0.6021,0.548,0.5584,0.417


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9568,0.9967,0.9568,0.957,0.9568,0.9482,0.9482
1,0.9604,0.9982,0.9604,0.96,0.96,0.9525,0.9526
2,0.9712,0.9987,0.9712,0.9725,0.9713,0.9655,0.9657
3,0.9388,0.9963,0.9388,0.9398,0.9384,0.9266,0.927
4,0.964,0.9967,0.964,0.9645,0.9638,0.9568,0.957
5,0.9604,0.9971,0.9604,0.9624,0.9605,0.9525,0.9529
6,0.9658,0.9975,0.9658,0.9664,0.9655,0.9589,0.9592
7,0.9586,0.996,0.9586,0.9592,0.9587,0.9503,0.9504
8,0.9586,0.9981,0.9586,0.9595,0.9585,0.9503,0.9505
9,0.9568,0.9956,0.9568,0.9564,0.9565,0.9481,0.9481


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=123, verbose=0,
                     warm_start=False)


In [4]:
# to export data to pickle file
#ratings_resampled.to_pickle('df.pkl')