In [22]:
# load necessary packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier

In [18]:
# Read the cleaned dataset and build the train/test split used by every model below.
CLEAN_DATAFILE = "BDP_CLEAN.csv"

# Target column, plus the identification-only columns that must not be used as predictors.
y_variable = "RTG_SP_LT_LC_ISSUER_CREDIT"
drop_columns = ['Ticker', 'Rating Date', 'Fiscal Year']

# Load the file and strip the identifier columns in one step.
df = pd.read_csv(CLEAN_DATAFILE).drop(columns=drop_columns)

# Target vector.
y = df[y_variable]

# Feature matrix: everything except the target. get_dummies one-hot encodes the
# categorical feature columns (the original author notes there are two of them).
x = pd.get_dummies(df.drop(columns=[y_variable]))

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=20,
)

This project is a classification problem where we want to predict the column "RTG_SP_LT_LC_ISSUER_CREDIT".

One interesting application is to use a model to predict an issuer's credit rating and then compare that prediction with the rating assigned by a credit rating agency. This could expose opportunities where the assigned credit rating is low but our model predicts a high one: the issuer's bonds would be seen as high-risk, high-reward, while the model suggests that the actual risk is lower than the assigned rating implies.

# Tree based models

### NOTE: If the optimal parameters have already been printed for any of the models, DO NOT RUN THAT GRID SEARCH AGAIN. Each search takes a very long time to run. Instead, add another cell below and create a new model object with the optimal parameters plugged in.

## Random Forest

In [None]:
# Find the best random forest hyperparameters with a cross-validated grid search.
kFold = 5
param_grid = {'n_estimators': np.arange(200, 900, 100),
              # 'auto' is intentionally left out: for classifiers it was just an
              # alias of 'sqrt' (so it only duplicated a third of the fits) and
              # newer scikit-learn releases no longer accept it.
              'max_features': ['sqrt', 'log2'],
              'max_depth': np.arange(2, 30, 3)}

# random_state makes the selected parameters reproducible across runs (same seed
# as the train/test split); n_jobs=-1 spreads the fits over all CPU cores because
# this search is expensive.
forest_grid = GridSearchCV(RandomForestClassifier(random_state=20),
                           param_grid, cv=kFold, n_jobs=-1)

# search using the training data only; the test set stays untouched
forest_grid.fit(x_train, y_train)
best_n = forest_grid.best_params_['n_estimators']
best_f = forest_grid.best_params_['max_features']
best_d = forest_grid.best_params_['max_depth']

# n_estimators and max_depth are integers, so print them with %d rather than %f
print("Best n estimators:   %d" % best_n)
print("Best max features:   %s" % best_f)
print("Best max depth:      %d" % best_d)

In [None]:
# use the best parameters to test accuracy on test set
# look through GridSearchCV results and make some graphs
# TODO: make graphs using the results, make confusion matrix
# TODO: check the accuracy of the model on the test set

In [None]:
# fit a random forest classifier using the best parameters
# I want to graph and view what the top predictors are
# TODO: graph the top predictors

## AdaBoost

In [None]:
# Find the best AdaBoost hyperparameters with a cross-validated grid search.
kFold = 5
param_grid = {'n_estimators': np.arange(200, 900, 100),
              'learning_rate': np.arange(0.25, 1.25, 0.25)}

# random_state makes the selected parameters reproducible across runs (same seed
# as the train/test split); n_jobs=-1 spreads the fits over all CPU cores because
# this search is expensive.
adaboost_grid = GridSearchCV(AdaBoostClassifier(random_state=20),
                             param_grid, cv=kFold, n_jobs=-1)

# search using the training data only; the test set stays untouched
adaboost_grid.fit(x_train, y_train)
best_n = adaboost_grid.best_params_['n_estimators']
best_l = adaboost_grid.best_params_['learning_rate']

# n_estimators is an integer, so print it with %d rather than %f
print("Best n estimators:    %d" % best_n)
print("Best learning rate:   %f" % best_l)

In [None]:
# TODO: make graphs using the results, make confusion matrix
# TODO: check the accuracy of the model on the test set

## Gradient Boosting

In [None]:
# Find the best gradient boosting hyperparameters with a cross-validated grid search.
kFold = 5
param_grid = {'n_estimators': np.arange(200, 900, 100),
              'learning_rate': np.arange(0.25, 1.25, 0.25),
              'max_depth': np.arange(5, 35, 5)}

# random_state makes the selected parameters reproducible across runs (same seed
# as the train/test split); n_jobs=-1 spreads the fits over all CPU cores because
# this search is expensive.
gradient_grid = GridSearchCV(GradientBoostingClassifier(random_state=20),
                             param_grid, cv=kFold, n_jobs=-1)

# search using the training data only; the test set stays untouched
gradient_grid.fit(x_train, y_train)
best_n = gradient_grid.best_params_['n_estimators']
best_l = gradient_grid.best_params_['learning_rate']
best_d = gradient_grid.best_params_['max_depth']

# n_estimators and max_depth are integers, so print them with %d rather than %f
print("Best n estimators:    %d" % best_n)
print("Best learning rate:   %f" % best_l)
print("Best max depth:       %d" % best_d)

In [None]:
# TODO: make graphs using the results, make confusion matrix
# TODO: check the accuracy of the model on the test set