In [28]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import linearmodels.iv.model as lm
from scipy import stats

In [29]:
def firstStageRegress(IVs, endo):
    X_stage1 = sm.add_constant(df[IVs])
    y_stage1 = df[endo]  # Endogenous variable

    # Fit the first-stage regression to find the predicted values of p
    results_stage1 = sm.OLS(y_stage1, X_stage1).fit()

    # Predict the values of educ using the first-stage regression model
    predictor = results_stage1.predict(X_stage1)

    return results_stage1, predictor  # Return p_hat

def secondStageRegress(predictor, endo, exo):
    df["phat"] = predictor
    
    # Stage 2: Use the predicted values of p
    X_stage2 = sm.add_constant(df[["phat"] + exo])
    y_stage2 = df[endo]  

    # Fit the second-stage regression model
    results_stage2 = sm.OLS(y_stage2, X_stage2).fit()
    
    return results_stage2

In [30]:
df = pd.read_csv("Data-GP1-1(updated).csv")
df = df.drop('Fri', axis=1)
df = df.drop('Sat', axis=1)
df = df.drop('Sun', axis=1)
df

Unnamed: 0,Mon,Tue,Wed,Thu,Date,Month,Year,Stormy,Mixed,p,q,Rainy,Cold,Wind
0,1,0,0,0,2,12,91,1,0,-0.430783,8.994421,1,0,2.995732
1,0,1,0,0,3,12,91,1,0,0.000000,7.707063,0,0,2.995732
2,0,0,1,0,4,12,91,0,1,0.072321,8.350194,1,1,2.813411
3,0,0,0,1,5,12,91,1,0,0.247139,8.656955,0,1,3.036554
4,0,0,0,0,6,12,91,1,0,0.664327,7.844241,0,1,3.036554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1,0,0,0,4,5,92,0,0,-0.798508,8.610683,0,0,2.862201
107,0,1,0,0,5,5,92,0,1,-0.087011,7.162397,0,0,2.908721
108,0,0,1,0,6,5,92,0,1,0.184922,7.362010,0,0,2.862201
109,0,0,0,1,7,5,92,0,1,0.223143,8.764053,0,0,2.813411


---



In [35]:
# Declare Variables

Exo = ["Mon", "Tue","Wed","Thu"]
IV = ["Stormy","Mixed"]
Endo = ["p"]
Predict = ["q"]


---

# Original Regression

In [32]:
X_og = sm.add_constant(df[Endo + Exo])
Y_og = df[Predict]

results_og = sm.OLS(Y_og, X_og).fit()

print(results_og.summary())


                            OLS Regression Results                            
Dep. Variable:                      q   R-squared:                       0.220
Model:                            OLS   Adj. R-squared:                  0.183
Method:                 Least Squares   F-statistic:                     5.940
Date:                Mon, 18 Sep 2023   Prob (F-statistic):           7.08e-05
Time:                        20:00:59   Log-Likelihood:                -110.00
No. Observations:                 111   AIC:                             232.0
Df Residuals:                     105   BIC:                             248.3
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          8.6069      0.143     60.170      0.0

---

# 2 Step Least Square Regression

In [33]:
# First Stage Regression
result1, predictor = firstStageRegress(IV, Endo)
print(result1.summary())

# Second Stage Regression
result2 = secondStageRegress(predictor, Predict, Exo)
print(result2.summary())

                            OLS Regression Results                            
Dep. Variable:                      p   R-squared:                       0.227
Model:                            OLS   Adj. R-squared:                  0.212
Method:                 Least Squares   F-statistic:                     15.83
Date:                Mon, 18 Sep 2023   Prob (F-statistic):           9.32e-07
Time:                        20:01:01   Log-Likelihood:                -35.891
No. Observations:                 111   AIC:                             77.78
Df Residuals:                     108   BIC:                             85.91
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.3919      0.051     -7.756      0.0

---

# Hausman Test and Sargan Test

In [34]:
 mlr2 = lm.IV2SLS(dependent=df[Predict], 
                 exog=df[Exo], 
                 endog=df[Endo], 
                 instruments=df[IV]).fit(cov_type="homoskedastic", debiased=True)
    
print(mlr2.wu_hausman(),"\n")
print(mlr2.sargan)

Wu-Hausman test of exogeneity
H0: All endogenous variables are exogenous
Statistic: 29.1437
P-value: 0.0000
Distributed: F(1,105) 

Sargan's test of overidentification
H0: The model is not overidentified.
Statistic: 3.1727
P-value: 0.0749
Distributed: chi2(1)
