# 1 Imports and Global Settings

In [1]:
# importing libraries
from platform import python_version
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# import sklearn functions
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, SCORERS
from sklearn.model_selection import \
    cross_validate, cross_val_score, cross_val_predict, GridSearchCV, \
    StratifiedKFold, train_test_split, StratifiedShuffleSplit, TimeSeriesSplit

# apply plt style; matplotlib >= 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8-*', so fall back to the new name when the old one is not available
whitegrid_style = 'seaborn-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-v0_8-whitegrid'
plt.style.use(whitegrid_style)

# force jupyter to output all statements
InteractiveShell.ast_node_interactivity = 'all'

# printing the versions of python, numpy, pandas and sklearn
# (python_version is already imported at the top of this cell)
print('Python version: ' + python_version())
print('Numpy version: ' + np.__version__)
print('Pandas version: ' + pd.__version__)
print('SKlearn version: ' + sklearn.__version__)

Python version: 3.6.10
Numpy version: 1.18.1
Pandas version: 1.0.3
SKlearn version: 0.22.1


# 2 Preparing Data

## 2.1 Loading Files

In [2]:
# read the cleaned data set exported from data_preparation.ipynb
# and tag the frame with a name used later in printed messages
data = pd.read_csv('data/cleaned_data.csv')
data.name = 'data'

# read the reduced data set that holds only the most important features
data_top_features = pd.read_csv('data/data_top_features.csv')
data_top_features.name = 'data_top_features'

## 2.2 Indexing, Sorting and Inspecting Data Frames

In [3]:
# ordered rating scale for splticrm, from lowest ('CCC or lower') to highest ('AAA')
splticrm_categories = pd.CategoricalDtype(categories=[
    'CCC or lower', 'CCC+', 'B-', 'B', 'B+', 'BB-', 'BB', 'BB+', 'BBB-', 'BBB', 'BBB+',
    'A-', 'A', 'A+', 'AA-', 'AA', 'AA+', 'AAA'], ordered=True)

# columns that form the (company, date) multi-index of both data frames
index_columns = ['PERMNO', 'date']

for df in [data, data_top_features]:

    # set and sort the multi-index in place: `df` is only an alias for the
    # module-level frame, so rebinding it would not update `data` /
    # `data_top_features`. The guard makes the cell safe to re-run, because
    # calling set_index a second time would raise a KeyError.
    if list(df.index.names) != index_columns:
        df.set_index(index_columns, inplace=True)
        df.sort_index(inplace=True)

    # transfer ordered categories to splticrm
    df["splticrm"] = df["splticrm"].astype(splticrm_categories)

    # print the dimensions of the data frame; companies are counted directly
    # on the PERMNO index level instead of materialising every index tuple
    n_companies = df.index.get_level_values('PERMNO').nunique()
    print('There are ' + str(n_companies) + \
      ' companies and ' + str(df.shape[1]) + ' variables in "' + df.name + '".')

    # ensuring that there are no null values left in the data frame
    if (not df.isnull().values.any()):
        print('The data frame "' + df.name + '" has 0 missing values.')

There are 906 companies and 189 variables in "data".
The data frame "data" has 0 missing values.
There are 301 companies and 66 variables in "data_top_features".
The data frame "data_top_features" has 0 missing values.


In [4]:
# number of observations flagged as belonging to the S&P 500
SP500_total = data[data['in_sp500'] == 1].shape[0]

# the same count expressed as a percentage of all observations
SP500_percentage = round((SP500_total / data.shape[0]) * 100, 2)

print(f'S&P 500 observations make up {SP500_percentage} % of total observations.')
print(f'In total, there are {SP500_total} observations of S&P500 companies.')

S&P 500 observations make up 37.81 % of total observations.
In total, there are 16639 observations of S&P500 companies.


## 2.3 Determining the Naive Classifier (Benchmark)

In [5]:
# (rating, count) pairs ordered from the most to the least frequent rating
rating_counts = Counter(data.splticrm).most_common()
total_observations = data.shape[0]

# absolute and relative frequencies in that same order
absolute_frequencies = [count for rating, count in rating_counts]
relative_frequencies = [(af / total_observations) for af in absolute_frequencies]
for nr in [5, 10]:
    cumulative_share = round(sum(relative_frequencies[:nr]) * 100, 2)
    print(f'The {nr} most frequent ratings make up {cumulative_share} % of all ratings.')

# the naive classifier always predicts the single most frequent rating
naive_classifier, most_frequent_response = rating_counts[0]
print('The naive classifier would always predict rating ' + naive_classifier + '.')

# the benchmark is the accuracy that this naive classifier achieves
benchmark = round(most_frequent_response / total_observations * 100, 2)
print(f'The share of correct predictions (benchmark) to beat is {benchmark} %.')

The 5 most frequent ratings make up 54.78 % of all ratings.
The 10 most frequent ratings make up 90.84 % of all ratings.
The naive classifier would always predict rating BBB.
The share of correct predictions (benchmark) to beat is 14.11 %.


# 3 Training

## 3.1 Splitting Training and Testing Data

In [8]:
# restrict the data to observations flagged as S&P 500
SP500 = data.loc[data['in_sp500'] == 1]

# predictors (everything except the rating) and response (the rating)
SP500_X = SP500.drop(columns=['splticrm'])
SP500_y = SP500['splticrm']

# shuffled 80/20 split, stratified on the rating, with a fixed seed
SP500_X_train, SP500_X_test, SP500_y_train, SP500_y_test = train_test_split(
    SP500_X, SP500_y,
    test_size=0.2, shuffle=True, stratify=SP500_y, random_state=21)

## 3.2 Pipeline

In [9]:
# stratified 5-fold splitter used for cross-validation further below
k_fold = StratifiedKFold(n_splits=5)

# one-step pipeline: a random forest that also computes its out-of-bag score
random_forest_pipe = Pipeline(steps=[
    ('rf', RandomForestClassifier(oob_score=True, random_state=21)),
])

# fit the pipeline on the training data (trailing ';' suppresses the repr)
random_forest_pipe.fit(SP500_X_train, SP500_y_train);

## 3.3 Metrics

In [10]:
# the out-of-bag score gives a first idea of how this model generalizes
print(f'The oob score is {round(random_forest_pipe["rf"].oob_score_, 2)}.\n')

# cross-validated multi-class metrics, computed on folds of the training set
multi_class_metrics = ['precision_macro', 'recall_macro', 'accuracy']
cv_scores = cross_validate(
    estimator=random_forest_pipe,
    X=SP500_X_train,
    y=SP500_y_train,
    scoring=multi_class_metrics,
    cv=k_fold,
    n_jobs=-1,
);

# report every entry of the result dict (timings and scores), one value per fold
for metric_name, metric_values in cv_scores.items():
    rounded_values = [round(value, 3) for value in metric_values.tolist()]
    print(f'The {metric_name}s are: {rounded_values}')

# mean and standard deviation of the accuracy across the folds
cv_mean = cv_scores["test_accuracy"].mean()
cv_std = cv_scores["test_accuracy"].std()
print(f'\nThe mean accuracy is {cv_mean:.3f} +/- {cv_std:.3f}')

The oob score is 0.97.



The fit_times are: [29.551, 29.629, 29.562, 29.478, 18.178]
The score_times are: [0.475, 0.465, 0.451, 0.457, 0.258]
The test_precision_macros are: [0.908, 0.975, 0.951, 0.963, 0.959]
The test_recall_macros are: [0.917, 0.976, 0.93, 0.943, 0.949]
The test_accuracys are: [0.969, 0.968, 0.967, 0.964, 0.971]

The mean accuracy is 0.968 +/- 0.003


## 3.4 Parameter Tuning with Grid Search

In [12]:
# set parameters to do gridsearchcv over
max_depth = np.array([10, 30, 50, 70])
class_weight = ['balanced', None]

# Minimum number of samples required to split any internal node.
# An integer value must be >= 2: RandomForestClassifier rejects
# min_samples_split=1, so every grid point using it would fail to fit.
min_samples_split = np.array([2, 5, 10])

# The minimum number of samples required to be at a leaf/terminal node
min_samples_leaf = np.array([1, 2, 5])

# define parameter grid; the 'rf__' prefix routes each parameter to the
# 'rf' step of the pipeline
param_grid = {'rf__max_depth': max_depth,
              'rf__class_weight': class_weight,
              'rf__min_samples_split': min_samples_split,
              'rf__min_samples_leaf': min_samples_leaf
             }

# exhaustive search over param_grid, scored by accuracy on the k_fold splits
random_forest_grid_search = GridSearchCV(random_forest_pipe, param_grid=param_grid,
                                         scoring='accuracy', cv=k_fold, n_jobs=-1)

In [13]:
# run the grid search on the training data
random_forest_grid_search.fit(X=SP500_X_train, y=SP500_y_train)

In [14]:
# turn the grid-search results into a data frame and persist it as .csv
rf_GridSearchCV_cv_results_ = pd.DataFrame.from_dict(random_forest_grid_search.cv_results_)
rf_GridSearchCV_cv_results_.to_csv(
    path_or_buf='gridsearch_outputs/rf_GridSearchCV_cv_results_.csv')

# 4 Final Model

## 4.1 Train Model

In [16]:
# create the final pipeline with explicitly fixed random forest hyperparameters
pipe_final = Pipeline([('rf', RandomForestClassifier(random_state=21, oob_score=True,
    class_weight=None, max_depth=30, min_samples_leaf=1, min_samples_split=2))])

# stratified 5-fold splitter. random_state is deliberately not passed:
# without shuffle=True it has no effect, and newer scikit-learn releases
# raise a ValueError for that combination.
k_fold = StratifiedKFold(n_splits=5)

# fit on the training data; assigning to `_` keeps the estimator repr out of the output
_ = pipe_final.fit(SP500_X_train, SP500_y_train);

# looking at oob score
print('The oob score is ' + str(round(pipe_final['rf'].oob_score_, 2)) + '.\n')

The oob score is 0.97.



## 4.2 Test Set Prediction

In [17]:
# predict the ratings of the held-out test observations
SP500_y_test_pred = pipe_final.predict(SP500_X_test)

# share of correctly predicted test ratings, in percent
test_accuracy_percent = round(accuracy_score(SP500_y_test, SP500_y_test_pred) * 100, 3)
print(f'The prediction accuracy of our final model is {test_accuracy_percent} %')

The prediction accuracy of our final model is 97.175 %


In [18]:
# collect a [true rating, predicted rating] pair for every misclassified test
# observation; both sequences are converted to lists once instead of on
# every loop iteration
true_ratings = SP500_y_test.tolist()
predicted_ratings = SP500_y_test_pred.tolist()
prediction_accuracies = [
    [true_rating, predicted_rating]
    for true_rating, predicted_rating in zip(true_ratings, predicted_ratings)
    if true_rating != predicted_rating]

# get an ordered list for all the ratings and the position of each rating in it
rating_categories = list(splticrm_categories.categories)
rating_position = {rating: position for position, rating in enumerate(rating_categories)}

# checking how far off the false predictions are, measured in rating categories
# (a distance of 0 cannot occur here because only mismatches were collected)
off_by_one, off_by_two, off_more = 0, 0, 0
for true_rating, predicted_rating in prediction_accuracies:
    distance = abs(rating_position[true_rating] - rating_position[predicted_rating])
    if distance < 2:
        off_by_one += 1
    elif distance < 3:
        off_by_two += 1
    else:
        off_more += 1

# get relative values; guard against a division by zero when every test
# observation was predicted correctly
n_false_predictions = len(prediction_accuracies)
if n_false_predictions == 0:
    off_by_one_relative = off_by_two_relative = off_more_relative = 0.0
    print('There are no false predictions on the test set.')
else:
    off_by_one_relative = round(off_by_one / n_false_predictions * 100, 2)
    off_by_two_relative = round(off_by_two / n_false_predictions * 100, 2)
    off_more_relative = round(off_more / n_false_predictions * 100, 2)

    print(str(off_by_one_relative) + ' % of the false predictions are only one category off.')
    print(str(off_by_two_relative) + ' % of the false predictions are two categories off.')
    print(str(off_more_relative) + ' % of the false predictions are more than two categories off.')

88.3 % of the false predictions are only one category off.
11.7 % of the false predictions are two categories off.
0.0 % of the false predictions are more than two categories off.
