## Part One

In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, precision_recall_curve
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
from scipy import stats

In [102]:
from statsmodels.sandbox.regression.gmm import IV2SLS 

In [103]:
from statsmodels.sandbox.regression.gmm import GMM

In [104]:
# Load the Part One data set and preview the first rows.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# run on another machine until it is replaced by a path relative to a
# configurable data directory.
input_table = pd.read_csv('/Users/shubhangimallik/Downloads/midterm_partone.csv')
input_table.head()

Unnamed: 0,Constant,Stock Change,Inventory Turnover,Operating Profit,Interaction Effect,Current Ratio,Quick Ratio,Debt Asset Ratio
0,1,0.870332,1.795946,0.115846,0.208053,1.672527,0.255171,0.473317
1,1,-0.047347,1.395501,0.436967,0.609788,1.637261,0.221763,0.489967
2,1,0.001176,1.664563,0.541016,0.900555,1.640619,0.189141,0.374269
3,1,-0.9012,1.605738,0.539399,0.866133,1.436221,0.131944,0.224399
4,1,-0.176353,1.591451,0.539938,0.859285,1.43314,0.183095,0.213446


In [105]:
# First stage of the manual 2SLS: regress the endogenous regressor
# ("Inventory Turnover") on the constant and the three instruments, then store
# the fitted values as a new column for use in the second stage.
first_stage_cols = ["Constant", "Current Ratio", "Quick Ratio", "Debt Asset Ratio"]
first_stage_design = input_table[first_stage_cols]

model_iv = sm.OLS(input_table["Inventory Turnover"], first_stage_design).fit()
endog_predict = model_iv.predict(first_stage_design)
input_table["Endogenous Param"] = endog_predict

In [106]:
# Second stage of the manual 2SLS: regress "Stock Change" on the first-stage
# fitted values ("Endogenous Param") plus the remaining regressors.
# NOTE(review): standard errors from a hand-rolled second-stage OLS do not
# account for the first-stage estimation; treat the reported std err / t / p
# values with caution (a dedicated IV estimator would correct them).
second_stage_cols = ["Constant", "Endogenous Param", "Operating Profit", "Interaction Effect"]
model_2sls = sm.OLS(input_table["Stock Change"], input_table[second_stage_cols]).fit()
model_2sls.summary()

0,1,2,3
Dep. Variable:,Stock Change,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.013
Method:,Least Squares,F-statistic:,8.53
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,1.27e-05
Time:,13:45:37,Log-Likelihood:,-1186.5
No. Observations:,1696,AIC:,2381.0
Df Residuals:,1692,BIC:,2403.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Constant,-0.0176,0.020,-0.896,0.370,-0.056,0.021
Endogenous Param,0.0011,0.001,1.827,0.068,-7.76e-05,0.002
Operating Profit,-0.1201,0.028,-4.319,0.000,-0.175,-0.066
Interaction Effect,0.0014,0.000,3.621,0.000,0.001,0.002

0,1,2,3
Omnibus:,368.832,Durbin-Watson:,2.243
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3433.92
Skew:,0.742,Prob(JB):,0.0
Kurtosis:,9.811,Cond. No.,109.0


In [107]:
# Plain NumPy copies of the response, regressors and instruments for the GMM
# classes below. Column 0 of x_vals is "Inventory Turnover".
regressor_cols = ["Inventory Turnover", "Operating Profit", "Interaction Effect"]
instrument_cols = ["Current Ratio", "Quick Ratio", "Debt Asset Ratio"]

y_vals = np.array(input_table["Stock Change"])
x_vals = np.array(input_table[regressor_cols])
iv_vals = np.array(input_table[instrument_cols])

class gmm(GMM):
    """GMM estimator for ``y = p0 + p1*x0 + p2*x1 + p3*x2 + e``.

    The residual is interacted with a constant, with exog columns 1 and 2,
    and with the three instrument columns. Exog column 0 is not used as a
    moment weight.
    """

    def momcond(self, params):
        """Return the per-observation moment contributions.

        Parameters
        ----------
        params : sequence of 4 floats
            ``(p0, p1, p2, p3)``: intercept and the three slope coefficients.

        Returns
        -------
        numpy.ndarray
            One column per moment condition (six in total), in the order:
            constant, exog[:, 1], exog[:, 2], inst[:, 0], inst[:, 1], inst[:, 2].
        """
        p0, p1, p2, p3 = params
        exog = self.exog
        inst = self.instrument

        # Model residual, computed once and reused for every moment.
        resid = self.endog - p0 - p1 * exog[:, 0] - p2 * exog[:, 1] - p3 * exog[:, 2]

        weights = (exog[:, 1], exog[:, 2], inst[:, 0], inst[:, 1], inst[:, 2])
        return np.column_stack([resid] + [resid * w for w in weights])


# Starting values for (p0, p1, p2, p3); 6 moments vs. 4 parameters.
beta0 = np.full(4, 0.1)
gmm_baseline = gmm(endog=y_vals, exog=x_vals, instrument=iv_vals, k_moms=6, k_params=4)
res = gmm_baseline.fit(beta0)

res.summary()


Optimization terminated successfully.
         Current function value: 0.000046
         Iterations: 8
         Function evaluations: 12
         Gradient evaluations: 12
Optimization terminated successfully.
         Current function value: 0.000373
         Iterations: 7
         Function evaluations: 13
         Gradient evaluations: 13
Optimization terminated successfully.
         Current function value: 0.000372
         Iterations: 5
         Function evaluations: 9
         Gradient evaluations: 9
Optimization terminated successfully.
         Current function value: 0.000372
         Iterations: 5
         Function evaluations: 11
         Gradient evaluations: 11
Optimization terminated successfully.
         Current function value: 0.000372
         Iterations: 0
         Function evaluations: 1
         Gradient evaluations: 1


0,1,2,3
Dep. Variable:,y,Hansen J:,0.6317
Model:,gmm,Prob (Hansen J):,0.729
Method:,GMM,,
Date:,"Wed, 08 Nov 2023",,
Time:,13:45:37,,
No. Observations:,1696,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
p 0,-0.0200,0.021,-0.964,0.335,-0.061,0.021
p 1,0.0011,0.001,1.843,0.065,-6.89e-05,0.002
p 2,-0.1071,0.032,-3.370,0.001,-0.169,-0.045
p 3,0.0011,0.000,2.760,0.006,0.000,0.002


In [108]:
class gmm_with_delta(GMM):
    """GMM estimator with an extra shift parameter ``delta`` in the residual.

    The residual is ``y - p0 - p1*x0 - p2*x1 - p3*x2 - delta`` and is
    interacted with the same six weights as the baseline model (constant,
    exog columns 1 and 2, and the three instrument columns).

    NOTE(review): ``p0`` and ``delta`` enter every moment condition only
    through their sum ``p0 + delta``, so the two are not separately
    identified by these moments. Confirm where delta is meant to enter.
    """

    def __init__(self, endog, exog, instrument, k_moms=1, k_params=1):
        # Pure pass-through to the base class; only the defaults differ.
        super().__init__(endog, exog, instrument, k_moms, k_params)

    def momcond(self, params):
        """Return the per-observation moment contributions.

        Parameters
        ----------
        params : sequence of 5 floats
            ``(p0, p1, p2, p3, delta)``.

        Returns
        -------
        numpy.ndarray
            One column per moment condition (six in total), in the order:
            constant, exog[:, 1], exog[:, 2], inst[:, 0], inst[:, 1], inst[:, 2].
        """
        p0, p1, p2, p3, delta = params
        exog = self.exog
        inst = self.instrument

        # Residual of the linear model, then shifted by delta; computed once.
        linear_resid = self.endog - p0 - p1 * exog[:, 0] - p2 * exog[:, 1] - p3 * exog[:, 2]
        shifted = linear_resid - delta

        weights = (exog[:, 1], exog[:, 2], inst[:, 0], inst[:, 1], inst[:, 2])
        return np.column_stack([shifted] + [shifted * w for w in weights])


In [109]:

# Starting values for (p0, p1, p2, p3, delta); 6 moments vs. 5 parameters.
initial_params = np.full(5, 0.1)

gmm_model = gmm_with_delta(endog=y_vals, exog=x_vals, instrument=iv_vals, k_moms=6, k_params=5)
results = gmm_model.fit(initial_params)
results.summary()


Optimization terminated successfully.
         Current function value: 0.000046
         Iterations: 10
         Function evaluations: 13
         Gradient evaluations: 13
Optimization terminated successfully.
         Current function value: 0.000373
         Iterations: 7
         Function evaluations: 12
         Gradient evaluations: 12
Optimization terminated successfully.
         Current function value: 0.000372
         Iterations: 6
         Function evaluations: 10
         Gradient evaluations: 10
Optimization terminated successfully.
         Current function value: 0.000372
         Iterations: 1
         Function evaluations: 3
         Gradient evaluations: 3


0,1,2,3
Dep. Variable:,y,Hansen J:,0.6317
Model:,gmm_with_delta,Prob (Hansen J):,0.427
Method:,GMM,,
Date:,"Wed, 08 Nov 2023",,
Time:,13:45:38,,
No. Observations:,1696,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
p 0,-0.0100,1.5e+06,-6.68e-09,1.000,-2.94e+06,2.94e+06
p 1,0.0011,0.001,1.843,0.065,-6.9e-05,0.002
p 2,-0.1071,0.032,-3.370,0.001,-0.169,-0.045
p 3,0.0011,0.000,2.760,0.006,0.000,0.002
p 4,-0.0100,1.5e+06,-6.68e-09,1.000,-2.94e+06,2.94e+06


In [110]:
# Significance test for delta (H0: delta = 0).
#
# The decision and the reported p-value now come from the same Wald test on
# the single restriction delta = 0. Previously the decision compared a ratio
# against a Student-t critical value while the message printed the Wald
# p-value, so the two could disagree. The degrees-of-freedom variable named
# `df` is no longer needed, which also avoids reusing that name for a
# DataFrame later in the notebook.
DELTA_INDEX = 4  # position of delta in (p0, p1, p2, p3, delta)
alpha = 0.05

coefficients = results.params
delta_coefficient = coefficients[DELTA_INDEX]
delta_std_err = results.bse[DELTA_INDEX]
# Kept for inspection: coefficient divided by its standard error.
t_statistic = delta_coefficient / delta_std_err

# Restriction vector selecting delta only.
restriction = np.zeros(len(coefficients))
restriction[DELTA_INDEX] = 1.0
# wald_test may return the p-value as a 0-d or (1, 1) array depending on the
# statsmodels version; squeeze it to a plain float for printing/comparison.
p_value = float(np.squeeze(results.wald_test(r_matrix=restriction).pvalue))

if p_value < alpha:
    print(f"Delta coefficient (p4) is statistically significant (p-value = {p_value}), reject the null hypothesis.")
else:
    print(f"Delta coefficient (p4) is not statistically significant (p-value = {p_value}), fail to reject the null hypothesis.")


Delta coefficient (p4) is not statistically significant (p-value = 0.9999999946731644), fail to reject the null hypothesis.




Based on the analysis of the GMM summary table and the test statistics of coefficients, we can make the following assessment regarding the industry expert's claim about the δ term:

For the GMM model with delta (GMM with delta):

The test for the delta coefficient (p4) resulted in a p-value of approximately 1.000 (0.99999999), matching the P>|z| value reported for p 4 in the GMM summary table.
The null hypothesis was not rejected, indicating that the delta coefficient is not statistically significant in this model.
This suggests that, in the context of the GMM model with delta, there is no strong statistical evidence to support the industry expert's claim that the δ term has a significant effect on the model.

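The results indicate that, in this particular analysis, the industry expert's claim is not statistically justified. Note, however, that the estimated standard error of δ (about 1.5e+06) is enormous and identical to that of the intercept p0, which suggests that δ is not separately identified from the constant term in this specification; the test therefore has essentially no power, and the conclusion should be read with that caveat.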

## Part Two

In [111]:
# Load the Part Two (credit rating) data set and preview the first rows.
# NOTE(review): this is an absolute, user-specific path; replace it with a
# path relative to a configurable data directory so the notebook is portable.
df = pd.read_csv('/Users/shubhangimallik/Downloads/midterm_parttwo.csv')
df.head()

Unnamed: 0,Years of Education after High School,Requested Credit Amount,Number of Dependents,Monthly Income,Monthly Expense,Marital Status,Credit Rating
0,1,Low,No dependent,Very low,Very low,Married,Positive
1,2,Low,No dependent,Very low,Very low,Single,Positive
2,1,Low,No dependent,Very low,Very low,Single,Positive
3,3,Low,No dependent,Very low,Very low,Married,Positive
4,3,Low,No dependent,Very low,Very low,Single,Negative


In [112]:
# One-hot encode the categorical predictors, dropping the first level of each
# to avoid redundant dummy columns.
# NOTE(review): this rebinds `df`, so re-running the cell without reloading
# the data fails because the original columns no longer exist.
categorical_cols = [
    'Requested Credit Amount',
    'Marital Status',
    'Number of Dependents',
    'Monthly Income',
    'Monthly Expense',
]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [113]:
# Separate the target from the features, then hold out half of the rows for
# testing (fixed seed for reproducibility).
y = df['Credit Rating']
X = df.drop(columns=['Credit Rating'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

In [114]:
# Fit a logistic regression with scikit-learn's default settings on the
# training split.
model = LogisticRegression()
model.fit(X_train, y_train)

In [115]:
# Baseline evaluation on the held-out split using the model's default
# decision rule.
#
# `y_pred` was never assigned anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel; it is now computed here. The metric functions
# are already imported in the top import cell, so the duplicate import that
# used to live in this cell is not needed.
y_pred = model.predict(X_test)

recall = recall_score(y_test, y_pred, pos_label='Positive')
precision = precision_score(y_test, y_pred, pos_label='Positive')
f1 = f1_score(y_test, y_pred, pos_label='Positive')

print(f"Recall: {recall:.2f}")
print(f"Precision: {precision:.2f}")
print(f"F1 Score: {f1:.2f}")


Recall: 1.00
Precision: 0.86
F1 Score: 0.92


In [116]:
# Confusion matrix of true vs. predicted labels on the test split
# (scikit-learn convention: rows are true classes, columns are predicted
# classes; presumably ordered Negative, Positive — verify against the labels).
confusion = confusion_matrix(y_test, y_pred)

In [117]:
# Show the baseline confusion matrix under a heading.
print("Confusion Matrix:", confusion, sep="\n")

Confusion Matrix:
[[   0  577]
 [   0 3464]]


In [118]:
# Inspect the feature names produced by the one-hot encoding step.
X_test.columns

Index(['Years of Education after High School', 'Requested Credit Amount_Low',
       'Requested Credit Amount_Medium', 'Marital Status_Not specified',
       'Marital Status_Single', 'Number of Dependents_More than 2',
       'Number of Dependents_No dependent', 'Monthly Income_Low',
       'Monthly Income_Moderate', 'Monthly Income_Very High',
       'Monthly Income_Very low', 'Monthly Expense_Low',
       'Monthly Expense_Moderate', 'Monthly Expense_Very high',
       'Monthly Expense_Very low'],
      dtype='object')

In [119]:
# Predicted probability of the class in column 1 of predict_proba for every
# test row. Defined here because this cell runs before the threshold cell
# that originally introduced `y_pred_prob`; without it a fresh-kernel run
# fails with a NameError.
y_pred_prob = model.predict_proba(X_test)[:, 1]
print(len(y_pred_prob))

4041


In [120]:
# Fraction of applicants to approve; the cutoff is the score at the matching
# rank of the ascending-sorted predicted probabilities.
approval_threshold = 0.15
y_pred_prob = model.predict_proba(X_test)[:, 1]

cutoff_rank = int((1 - approval_threshold) * len(y_pred_prob))
threshold_value = np.sort(y_pred_prob)[cutoff_rank]

In [121]:
# Probability cutoff corresponding to the chosen approval fraction.
print(threshold_value)

0.8875163812479165


In [122]:

# Re-classify the test rows with the stricter probability cutoff and
# re-evaluate.
#
# `y_pred_new_threshold` was never assigned anywhere in the notebook, so this
# cell raised a NameError on a fresh kernel. It is derived here as a 0/1
# vector: 1 when the predicted probability is at or above the cutoff. The
# comparison is inclusive (>=) so rows tied with the cutoff score are kept.
y_pred_new_threshold = (y_pred_prob >= threshold_value).astype(int)
y_pred_new_threshold_mapped = ['Negative' if pred == 0 else 'Positive' for pred in y_pred_new_threshold]

confusion_new_threshold = confusion_matrix(y_test, y_pred_new_threshold_mapped)
recall_new_threshold = recall_score(y_test, y_pred_new_threshold_mapped, pos_label='Positive')
precision_new_threshold = precision_score(y_test, y_pred_new_threshold_mapped, pos_label='Positive')
f1_new_threshold = f1_score(y_test, y_pred_new_threshold_mapped, pos_label='Positive')

print("\nConfusion Matrix with New Threshold:")
print(confusion_new_threshold)
print(f"Recall with New Threshold: {recall_new_threshold:.2f}")
print(f"Precision with New Threshold: {precision_new_threshold:.2f}")
print(f"F1 Score with New Threshold: {f1_new_threshold:.2f}")



Confusion Matrix with New Threshold:
[[ 495   82]
 [2936  528]]
Recall with New Threshold: 0.15
Precision with New Threshold: 0.87
F1 Score with New Threshold: 0.26
