# Part II B - CF969-7-SP-CO
## Big Data for Computational Finance

### a linear regression approach with Ridge (or L1) and Lasso (or L2) regularisation to predict whether a firm is in an investment grade or not

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Loading the dataset into a pandas DataFrame
df = pd.read_csv('MLF_GP1_CreditScore.csv')

# Separate features and target variable
# Features are the independent variables that we will use to predict the target variable.
features = df.drop(['InvGrd', 'Rating'], axis=1)
target = df['Rating']

# Encode target variable
# This is necessary because the target variable is a categorical variable.
le = LabelEncoder()
target_encoded = le.fit_transform(target)

# Split the dataset into training and test sets
# This is done so that we can train the model on the training set and then evaluate the model on the test set.
# The test set is used to measure the accuracy of the model on unseen data.
X_train, X_test, y_train, y_test = train_test_split(features, target_encoded, test_size=0.2, random_state=42)

# Scale the features
# This is done to normalize the features so that they are on a similar scale.
# This is important for the linear regression model to work properly.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a linear regression model with Ridge regularization
# Ridge regularization is a technique that helps to prevent overfitting.
# Overfitting is a problem that occurs when the model learns the training data too well and is not able to generalize to new data.
ridge = Ridge(alpha=8.0)
ridge.fit(X_train, y_train)

# Predict target variable for test set and convert to categories
# We need to convert this to a binary classification.
y_pred_ridge = le.inverse_transform(np.round(ridge.predict(X_test)).astype(int))
y_test = le.inverse_transform(y_test)

# Generate binary classification list for predicted and actual target variable
Good = ['Aaa', 'Aa1', 'Aa2', 'Aa3', 'A1', 'A2', 'A3', 'Baa1', 'Baa2', 'Baa3']
y_pred_ridge_grade = list(map(lambda x: 1 if x in Good else 0, y_pred_ridge))
y_test_grade = list(map(lambda x: 1 if x in Good else 0, y_test))

# Fit a linear regression model with Lasso regularization
lasso = Lasso(alpha=0.02)
lasso.fit(X_train, y_train)

# Predict target variable for test set and convert to categories
y_pred_lasso = le.inverse_transform(np.round(lasso.predict(X_test)).astype(int))

# Generate binary classification list for predicted target variable
y_pred_lasso_grade = list(map(lambda x: 1 if x in Good else 0, y_pred_lasso))

# Evaluate the performance of the models on the test data
# The accuracy score is a measure of how well the model predicts the target variable.
accuracy_ridge = accuracy_score(y_test_grade, y_pred_ridge_grade)
accuracy_lasso = accuracy_score(y_test_grade, y_pred_lasso_grade)

# Print the accuracies
print('Percentage Ridge accuracy:', 100*accuracy_ridge)
print('Percentage Lasso accuracy:', 100*accuracy_lasso)


Percentage Ridge accuracy: 26.176470588235297
Percentage Lasso accuracy: 25.588235294117645


### a logistic regression approach with Ridge (or L1) and Lasso (or L2) regularisation to predict whether a firm is in an investment grade or not

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the credit score dataset into a pandas DataFrame
credit_df = pd.read_csv('MLF_GP1_CreditScore.csv')

# Drop the 'InvGrd' and 'Rating' columns from the input features
X = credit_df.drop(['InvGrd', 'Rating'], axis=1)

# Encode the target variable 'Rating' using LabelEncoder
le = LabelEncoder()
y = le.fit_transform(credit_df['Rating'])

# Split the dataset into training and test sets with 80-20 split ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# Scale the input features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit a logistic regression model with Ridge regularization to the training data
ridge = LogisticRegression(penalty='l1', solver='saga', max_iter=10000, C=0.1, random_state=42)
ridge.fit(X_train, y_train)

# Make predictions on the test data using the trained Ridge logistic regression model
y_pred_ridge = ridge.predict(X_test)

# Inverse transform the predicted and actual target variables from encoded to original values
y_pred_ridge = le.inverse_transform(y_pred_ridge)
y_test = le.inverse_transform(y_test)

# Map the target variable values to binary values (1 for investment grade and 0 for non-investment grade)
yes_labels = ['Aaa', 'Aa1', 'Aa2', 'Aa3', 'A1', 'A2', 'A3', 'Baa1', 'Baa2', 'Baa3']
y_pred_ridge_grade = list(map(lambda x: 1 if x in yes_labels else 0, y_pred_ridge))
y_test_grade = list(map(lambda x: 1 if x in yes_labels else 0, y_test))

# Fit a logistic regression model with Lasso regularization to the training data
lasso = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=10000, C=1.0, random_state=42)
lasso.fit(X_train, y_train)

# Make predictions on the test data using the trained Lasso logistic regression model
y_pred_lasso = lasso.predict(X_test)

# Inverse transform the predicted target variables from encoded to original values
y_pred_lasso = le.inverse_transform(y_pred_lasso)

# Map the target variable values to binary values (1 for investment grade and 0 for non-investment grade)
y_pred_lasso_grade = list(map(lambda x: 1 if x in yes_labels else 0, y_pred_lasso))

# Evaluate the performance of the models on the test data using accuracy score metric
accuracy_ridge = accuracy_score(y_test_grade, y_pred_ridge_grade)
accuracy_lasso = accuracy_score(y_test_grade, y_pred_lasso_grade)

# Print the accuracies
print('Percentage Ridge accuracy:', 100*accuracy_ridge)
print('Percentage Lasso accuracy:', 100*accuracy_lasso)


Percentage Ridge accuracy: 80.58823529411765
Percentage Lasso accuracy: 80.0


### A Neural Networks based approach to classify the firm’s rating into one of the rating categories and predict if it is in an investment grade

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import tensorflow as tf
import numpy as np

# Load the dataset into a pandas DataFrame
df = pd.read_csv('MLF_GP1_CreditScore.csv')

# Encode the rating column as categorical labels
le = LabelEncoder()
df['Rating_Encoded'] = le.fit_transform(df['Rating'])

# Split the dataset into training and test sets
X = df.drop(['InvGrd', 'Rating_Encoded','Rating'], axis=1)
y_rating = df['Rating_Encoded']
y_investment_grade = df['InvGrd']
X_train, X_test, y_train_rating, y_test_rating, y_train_grade, y_test_grade = train_test_split(X, y_rating, y_investment_grade, test_size=0.2, random_state=42)

# Scale the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define a neural network model for the rating classification task
model_rating = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(len(le.classes_), activation='softmax')
])

# Compile the model with categorical cross-entropy loss and Adam optimizer
model_rating.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model on the training data
model_rating.fit(X_train, y_train_rating, validation_data=(X_test, y_test_rating), epochs=50, batch_size=32)

# Predict the ratings for the test data
y_pred_rating = model_rating.predict(X_test)
print(y_pred_rating)

y_pred_rating = le.inverse_transform(y_pred_rating.argmax(axis=1))
y_test_rating = le.inverse_transform(y_test_rating)

#print(y_pred_rating)

yes=['Aaa', 'Aa1', 'Aa2', 'Aa3', 'A1', 'A2', 'A3', 'Baa1', 'Baa2', 'Baa3']
y_pred_rating = list(map(lambda x: 1 if x in yes else 0, y_pred_rating))
y_test_rating = list(map(lambda x: 1 if x in yes else 0, y_test_rating))

#y_pred_rating = y_pred_rating.map(lambda x: 1 if x in yes else 0)
#y_test_rating = y_test_rating.map(lambda x: 1 if x in yes else 0)

# Define a neural network model for the investment grade classification task
model_grade = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model with binary cross-entropy loss and Adam optimizer
model_grade.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model on the training data
model_grade.fit(X_train, y_train_grade, validation_data=(X_test, y_test_grade), epochs=50, batch_size=32)

# Predict the investment grade for the test data
y_pred_grade = model_grade.predict(X_test)
y_pred_grade = (y_pred_grade > 0.5).astype(int)

y_pred_rating=np.array(y_pred_rating)
y_test_rating=np.array(y_test_rating)

# Evaluate the performance of the models on the test data
accuracy_for_rating = accuracy_score(y_test_rating, y_pred_rating)
accuracy_for_grade = accuracy_score(y_test_grade, y_pred_grade)


print('percentage Rating accuracy:', accuracy_for_rating)
print(" I also calculated model accuracy when we take y as grade")
print('percentage Grade accuracy:', accuracy_for_grade)
