In [2]:
# load necessary packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier

In [3]:
# --- Data loading and train/test split -------------------------------------
# Names used to read the cleaned dataset and to separate features from target.
CLEAN_DATAFILE = "BDP_CLEAN.csv"
y_variable = "RTG_SP_LT_LC_ISSUER_CREDIT"

# Per the original notes, these columns only identify a row and are not meant
# to be predicted on, so they are removed right after the file is read.
drop_columns = ['Ticker', 'Rating Date', 'Fiscal Year']
df = pd.read_csv(CLEAN_DATAFILE).drop(columns=drop_columns)

# Target is the rating column; the features are every other remaining column.
# The original notes mention two categorical feature columns (TODO confirm
# against the CSV); pd.get_dummies one-hot encodes them.
y = df[y_variable]
x = pd.get_dummies(df.drop(columns=[y_variable]))

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=20,
)

### I am using this file to run other classification models in parallel with the models running in models.ipynb, because a single notebook cannot run multiple code cells at once.

In [5]:
# AdaBoost: grid-search the number of estimators and the learning rate with
# k-fold cross-validation on the training split, then report the best pair.

# Fixed seed so the search gives the same result after a kernel restart;
# 20 matches the random_state passed to train_test_split when the data was split.
ADABOOST_SEED = 20

# find the best parameters first
kFold = 5
param_grid = {'n_estimators': np.arange(200, 800, 100),      # 200, 300, ..., 700
              'learning_rate': np.arange(0.25, 1.25, 0.25)}  # 0.25, 0.5, 0.75, 1.0

# n_jobs=-1 spreads the 6 x 4 = 24 candidates (times kFold fits each) across
# all available CPU cores instead of fitting them one after another.
adaboost_grid = GridSearchCV(
    AdaBoostClassifier(random_state=ADABOOST_SEED),
    param_grid,
    cv=kFold,
    n_jobs=-1,
)

# search on the training data only; the test split is not touched here
adaboost_grid.fit(x_train, y_train)
best_n = adaboost_grid.best_params_['n_estimators']
best_l = adaboost_grid.best_params_['learning_rate']

print("Best n estimators:    %f" % best_n)
print("Best learning rate:   %f" % best_l)



Best n estimators:    200.000000
Best learning rate:   0.250000


In [9]:
# Evaluate the fitted grid search on the held-out test split and report the
# resulting score.
adaboost_score = adaboost_grid.score(X=x_test, y=y_test)
print("Accuracy of AdaBoost on test set: %f" % (adaboost_score,))

Accuracy of AdaBoost on test set: 0.115449


In [10]:
# Display the raw cross-validation bookkeeping from the grid search: a dict of
# arrays (fit/score timings, mean test scores, ...) with one entry per
# parameter combination tried.
adaboost_grid.cv_results_



{'mean_fit_time': array([ 5.30855756,  7.97637382, 10.55586929, 13.19462719, 15.8273417 ,
        18.47492461,  5.26933699,  7.90738482, 10.56902876, 13.17866931,
        15.85375681, 18.47049475,  5.26585355,  7.90462942, 10.51354742,
        13.15230699, 15.79740324, 18.43070908,  5.27188425,  7.90415726,
        10.52476482, 13.16258807, 15.80681472, 18.42431593]),
 'mean_score_time': array([0.09678984, 0.14191236, 0.1877419 , 0.23458762, 0.27994242,
        0.32697878, 0.09488301, 0.14073081, 0.18692408, 0.23406048,
        0.27937417, 0.32469907, 0.09395733, 0.14001832, 0.18686867,
        0.23262496, 0.27991014, 0.32338586, 0.09347868, 0.14003272,
        0.18518262, 0.23198638, 0.28086324, 0.3245091 ]),
 'mean_test_score': array([0.15239822, 0.14654211, 0.13720022, 0.13148355, 0.12841606,
        0.12576687, 0.14054657, 0.132599  , 0.12757948, 0.12716118,
        0.12674289, 0.12506972, 0.13873397, 0.14138316, 0.13984941,
        0.1366425 , 0.13650307, 0.13552705, 0.13775795, 0

In [11]:
# Predicted labels for the held-out test features, from the fitted grid search.
y_pred = adaboost_grid.predict(X=x_test)