In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import GMM

# PART 1

1. Update the GMM model that we discussed in class by incorporating the δ term into the instrumental-variable moment expressions.

In [9]:
# Read the Part 1 dataset directly from the course repository
input_table = pd.read_csv('https://raw.githubusercontent.com/mn42899/predictive_modelling/refs/heads/main/midterm_partone.csv')

# Column groups used below: regressors (X) and instruments (Z)
regressor_cols = ["Inventory Turnover", "Operating Profit", "Interaction Effect"]
instrument_cols = ["Current Ratio", "Quick Ratio", "Debt Asset Ratio"]

# Extract Y, X, and Z as plain NumPy arrays
y_vals  = np.array(input_table["Stock Change"])
x_vals  = np.array(input_table[regressor_cols])
iv_vals = np.array(input_table[instrument_cols])

# Define the biased GMM class with delta
class BiasedGMM(GMM):
    """GMM estimator for a four-parameter linear model with delta-shifted moments.

    The model is ``endog = p0 + p1*exog[:, 0] + p2*exog[:, 1] + p3*exog[:, 2]``.
    Six moment conditions are built from the residuals: the residual itself,
    the residual times ``exog[:, 1]`` and ``exog[:, 2]``, and the residual
    times each of the three instrument columns. ``delta`` is subtracted from
    every moment condition.

    Parameters
    ----------
    endog : array-like
        Dependent variable.
    exog : array-like
        Regressors; ``momcond`` reads columns 0, 1 and 2.
    instrument : array-like
        Instruments; ``momcond`` reads columns 0, 1 and 2.
    delta : float or array-like of length 6
        Offset subtracted from the moment conditions. It is multiplied by
        ``np.ones(6)``, so a scalar shifts all six moments equally and a
        length-6 array shifts each moment individually.
    k_moms : int, optional
        Number of moment conditions returned by ``momcond`` (default 6).
    k_params : int, optional
        Number of parameters unpacked by ``momcond`` (default 4).

    Notes
    -----
    NOTE(review): the assignment text asks for the delta term in the
    instrumental-variable moment expressions, but a scalar ``delta`` is
    applied to all six moments here. Passing
    ``delta=np.array([0, 0, 0, d, d, d])`` restricts the shift to the three
    instrument moments -- confirm which form is intended.

    NOTE(review): ``exog[:, 0]`` enters the residual but has no moment
    condition of its own; presumably it is the endogenous regressor that the
    instruments stand in for -- confirm.
    """

    def __init__(self, endog, exog, instrument, delta, k_moms=6, k_params=4):
        # momcond() returns six moment columns and unpacks four parameters.
        # If k_moms / k_params are not passed, statsmodels' GMM falls back to
        # the column counts of `instrument` and `exog`, which do not match
        # this model and distort results that depend on the moment count
        # (e.g. the degrees of freedom of the J-test).
        super().__init__(endog, exog, instrument, k_moms=k_moms, k_params=k_params)
        self.delta = delta

    def momcond(self, params):
        """Return the (nobs, 6) array of per-observation moment conditions.

        Parameters
        ----------
        params : sequence of 4 floats
            Intercept followed by the three slope coefficients.
        """
        p0, p1, p2, p3 = params
        endog = self.endog
        exog = self.exog
        inst = self.instrument

        # One offset per moment condition (scalar delta is repeated six times)
        bias = self.delta * np.ones(6)
        residuals = endog - (p0 + p1 * exog[:, 0] + p2 * exog[:, 1] + p3 * exog[:, 2])

        # Moment conditions, each shifted by its bias term
        error0 = residuals - bias[0]
        error1 = residuals * exog[:, 1] - bias[1]
        error2 = residuals * exog[:, 2] - bias[2]
        error3 = residuals * inst[:, 0] - bias[3]
        error4 = residuals * inst[:, 1] - bias[4]
        error5 = residuals * inst[:, 2] - bias[5]

        return np.column_stack((error0, error1, error2, error3, error4, error5))

# Bias level applied to the moment conditions (adjust delta as necessary)
delta = 0.05

# Starting values for the optimizer: one entry per model parameter
initial_params = np.array([0.1] * 4)

# 6x6 identity passed as `inv_weights`, sized to match the six moment conditions
initial_weights = np.eye(6)

# Build the estimator, then fit it from the starting values and weights above
model = BiasedGMM(endog=y_vals, exog=x_vals, instrument=iv_vals, delta=delta)
res = model.fit(initial_params, inv_weights=initial_weights)

# Report the point estimates and the objective value
print("Estimated Parameters:", res.params)
print("Objective Function Value:", res.q)

# Full GMM summary table
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.001501
         Iterations: 11
         Function evaluations: 14
         Gradient evaluations: 14
Optimization terminated successfully.
         Current function value: 0.088572
         Iterations: 10
         Function evaluations: 15
         Gradient evaluations: 15
Optimization terminated successfully.
         Current function value: 0.070496
         Iterations: 10
         Function evaluations: 13
         Gradient evaluations: 13
Optimization terminated successfully.
         Current function value: 0.038870
         Iterations: 8
         Function evaluations: 11
         Gradient evaluations: 11
Optimization terminated successfully.
         Current function value: 0.073935
         Iterations: 8
         Function evaluations: 12
         Gradient evaluations: 12
Optimization terminated successfully.
         Current function value: 0.096513
         Iterations: 6
         Function evaluations: 9
      

# PART 2

1. Divide the dataset equally into training (50%) and test (50%) sets. Use the training set to fit a logistic regression model in which the credit rating is the dependent variable. Apply the model to the test set, and report the confusion matrix, recall, precision, and F1 score values.

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Fetch the Part 2 dataset
url = 'https://raw.githubusercontent.com/mn42899/predictive_modelling/refs/heads/main/midterm_parttwo.csv'
data = pd.read_csv(url)

# Encode the target as binary: 1 for a 'Positive' rating, 0 for anything else
data['Credit Rating'] = [1 if rating == 'Positive' else 0 for rating in data['Credit Rating']]

# Target column, plus one-hot encoded predictors (first level of each category dropped)
y = data['Credit Rating']
predictors = data.drop(columns='Credit Rating')
X = pd.get_dummies(predictors, drop_first=True)

# 50/50 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Fit the logistic regression on the training half
model = LogisticRegression().fit(X_train, y_train)

# Class predictions for the held-out half
y_pred = model.predict(X_test)

# Evaluate the predictions against the true test labels
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Confusion Matrix:
 [[   0  577]
 [   0 3464]]
Precision: 0.8572135609997525
Recall: 1.0
F1 Score: 0.9231179213857429


2. Suppose that the bank decides to make the credit approval process stricter, so that only 15% of the applications are granted. Calculate the threshold value for the prediction probability such that only 15% of the test set would have their applications approved. Then, update your confusion matrix, recall, precision, and F1 score values.

In [7]:
import numpy as np

# Score each test application: take column 1 of predict_proba instead of hard labels
class_probabilities = model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# Cut-off at the 85th percentile of the scores, so only about the top 15% are approved
threshold = np.percentile(y_proba, 85)

# Approve every application scoring at or above the cut-off
approved_mask = y_proba >= threshold
y_pred_new = approved_mask.astype(int)

# Re-evaluate against the true test labels using the thresholded predictions
conf_matrix_new = confusion_matrix(y_test, y_pred_new)
precision_new = precision_score(y_test, y_pred_new)
recall_new = recall_score(y_test, y_pred_new)
f1_new = f1_score(y_test, y_pred_new)

print("\nNew Confusion Matrix (15% threshold):\n", conf_matrix_new)
print("New Precision:", precision_new)
print("New Recall:", recall_new)
print("New F1 Score:", f1_new)


New Confusion Matrix (15% threshold):
 [[ 495   82]
 [2938  526]]
New Precision: 0.8651315789473685
New Recall: 0.15184757505773672
New F1 Score: 0.2583497053045187
