In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import linearmodels.iv.model as lm
from scipy import stats
from itertools import combinations

In [3]:
def firstStageRegress(variables, endo):
    """First Stage Regression"""
    
    X_stage1 = sm.add_constant(df[variables])
    y_stage1 = df[endo]

    # Fit the regression to find the predicted values
    results_stage1 = sm.OLS(y_stage1, X_stage1).fit()
    predictor = results_stage1.predict(X_stage1)

    return results_stage1, predictor 

def secondStageRegress(variables, predict):  
    """Second Stage Regression"""
    
    X_stage2 = sm.add_constant(df[variables])
    y_stage2 = df[predict]  

    # Fit the regression model
    results_stage2 = sm.OLS(y_stage2, X_stage2).fit()
    
    return results_stage2

In [4]:
# Read dataset
df = pd.read_csv("Data-GP1-1(updated).csv")
df

Unnamed: 0,Mon,Tue,Wed,Thu,Date,Month,Year,Stormy,Mixed,p,q,Rainy,Cold,Wind
0,1,0,0,0,2,12,91,1,0,-0.430783,8.994421,1,0,2.995732
1,0,1,0,0,3,12,91,1,0,0.000000,7.707063,0,0,2.995732
2,0,0,1,0,4,12,91,0,1,0.072321,8.350194,1,1,2.813411
3,0,0,0,1,5,12,91,1,0,0.247139,8.656955,0,1,3.036554
4,0,0,0,0,6,12,91,1,0,0.664327,7.844241,0,1,3.036554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1,0,0,0,4,5,92,0,0,-0.798508,8.610683,0,0,2.862201
107,0,1,0,0,5,5,92,0,1,-0.087011,7.162397,0,0,2.908721
108,0,0,1,0,6,5,92,0,1,0.184922,7.362010,0,0,2.862201
109,0,0,0,1,7,5,92,0,1,0.223143,8.764053,0,0,2.813411


In [5]:
# Generate all possible combinations of variables

combinations_list = []
variables = [col for col in df.columns if col not in ["q", "p"]] # Retrieving only the columns of interest

# Permutate and append combinations into a list
for r in range(1, len(variables) + 1):
    combinations_list.extend(combinations(variables, r))

# Print total number of combinations made
print(len(combinations_list))

4095


In [6]:
# Weak Instrument Test, where significance level is deemed at 0.05%

sign_IVs = [] # IVs 
sign_phat = []
sign_result = []

for combination in combinations_list:
    # Setting IV Variables
    IV = list(combination)
    
    # Setting Exogenous Variable
    Exo = ["Mon","Tue","Wed","Thu"]
    
    # First Stage Regression
    result, p_hat = firstStageRegress(IV + Exo, "p")
    
    # Check that:
    # - the F Statistics p value is significant
    # - the IV variables are not present in the Exogenous Variables
    # - there are more than one IV variable
    if result.f_pvalue < 0.05 and all(iv not in Exo for iv in IV) and len(IV) != 1:  # Check if all p-values are less than 0.05
        # Append the corresponding IV variable into a list if true
        sign_IVs.append(IV)
        # Append the corresponding estimated coefficient into a list if true
        sign_phat.append(p_hat)
        # Append the corresponding regression result into a list if true
        sign_result.append(result)

# For combinations that fulfilled the conditionals, print out their IVs and corresponding F-statistics p value 
for idx, IV in enumerate(sign_IVs):
    print(f"{idx + 1}. IV:{IV}, Prob (F-stat):{sign_result[idx].f_pvalue}")


1. IV:['Date', 'Stormy'], Prob (F-stat):0.0007685111179881521
2. IV:['Date', 'Cold'], Prob (F-stat):0.031971181025790954
3. IV:['Date', 'Wind'], Prob (F-stat):0.00020143786735149507
4. IV:['Month', 'Year'], Prob (F-stat):0.044780095341582775
5. IV:['Month', 'Stormy'], Prob (F-stat):0.0005497216954061974
6. IV:['Month', 'Wind'], Prob (F-stat):0.00015725489861969494
7. IV:['Year', 'Stormy'], Prob (F-stat):0.0002769901540010904
8. IV:['Year', 'Cold'], Prob (F-stat):0.04228873264902102
9. IV:['Year', 'Wind'], Prob (F-stat):8.98728923144987e-05
10. IV:['Stormy', 'Mixed'], Prob (F-stat):4.34850312546488e-05
11. IV:['Stormy', 'Rainy'], Prob (F-stat):0.0019177587524664896
12. IV:['Stormy', 'Cold'], Prob (F-stat):0.0011745959177015097
13. IV:['Stormy', 'Wind'], Prob (F-stat):0.0002272844034769734
14. IV:['Mixed', 'Wind'], Prob (F-stat):0.00036753279579365537
15. IV:['Rainy', 'Wind'], Prob (F-stat):0.0005174663785250746
16. IV:['Cold', 'Wind'], Prob (F-stat):0.00042319842910023403
17. IV:['Date'

In [7]:
# Wu-Hausman Test and Sargan Test, where significance level is deemed at 0.05%

sign_IVs_2 = []
feasible_combi = []

# Iterate through the combinations of IVs identified from Weak Instrument Test
for idx, iv_combination in enumerate(sign_IVs):
    # Perform 2 Step Least Square Regression
    mlr2 = lm.IV2SLS(dependent=df[["q"]], 
                     exog=df[["Mon","Tue","Wed","Thu"]], 
                     endog=df[["p"]], 
                     instruments=df[list(iv_combination)]).fit(cov_type="homoskedastic", debiased=True)
    
    # Check that:
    # - p-value of Wu-Hausman test is significant
    # - p-value of Sargan test is insignificant
    if mlr2.wu_hausman().pval < 0.05 and mlr2.sargan.pval > 0.05:
        # Append the corresponding IV variable into a list if true
        sign_IVs_2.append(iv_combination)

# For combinations that fulfilled the conditionals, append them into a list
for idx, combination in enumerate(sign_IVs_2):
    IV = combination
    Exo = ["Mon","Tue","Wed","Thu"]
    
    # Perform First Stage Regression
    result, p_hat = firstStageRegress(IV + Exo, "p")
    
    feasible_combi.append({
        "iv": IV,
        "result": result,
        "p_hat": p_hat
    })
    
# Print out the combinations in ascending order based on the p-values of the F-statistics test
sorted_combi = sorted(feasible_combi, key=lambda x: x['result'].f_pvalue)    
for idx, combination in enumerate(sorted_combi):
    print(f"{idx+1}. IVs: {combination['iv']}, Prob (F-Stat):{combination['result'].f_pvalue}")

1. IVs: ['Stormy', 'Mixed'], Prob (F-Stat):4.34850312546488e-05
2. IVs: ['Stormy', 'Mixed', 'Cold'], Prob (F-Stat):7.948800276007862e-05
3. IVs: ['Month', 'Mixed', 'Wind'], Prob (F-Stat):0.0002673127556289193
4. IVs: ['Mixed', 'Wind'], Prob (F-Stat):0.00036753279579365537
5. IVs: ['Month', 'Rainy', 'Wind'], Prob (F-Stat):0.0003761679237628031
6. IVs: ['Rainy', 'Wind'], Prob (F-Stat):0.0005174663785250746
7. IVs: ['Month', 'Mixed', 'Rainy', 'Wind'], Prob (F-Stat):0.0006012934239275447
8. IVs: ['Mixed', 'Rainy', 'Wind'], Prob (F-Stat):0.0008551629525235174
9. IVs: ['Stormy', 'Cold'], Prob (F-Stat):0.0011745959177015097


Since it doesn't make sense for "Cold" and "Month" to be IVs, we leave out combinations that consist these variables, allowing us to settle on 4 combinations to test.