# STGE Specification Search with GNS DGP - KB Tests

The purpose of this template is to implement and analyze an updated STGE strategy based on the Koley-Bera tests from SEA 2024 for a GNS DGP with varying values for $\gamma$, $\rho$ and $\lambda$ when the model is estimated without WX, i.e., a classic regression. This includes spatial Durbin models ($\lambda = 0$), SLX error models ($\rho = 0$), standard SLX regression ($\rho = \lambda = 0$), and for $\gamma = 0$, the standard spatial error ($\rho = 0$), spatial lag ($\lambda = 0$) and standard regression models ($\rho = \lambda = 0$).

The true DGP is GNS, i.e., using dgp_gns. Model estimation is OLS, so no WX taken into account.

This design is revised from the original one to read in any spatial layout specified as a shapefile and associated spatial weights matrix.

# Modules

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import geopandas as gpd
import numpy as np
import time
import spreg
import libpysal
from openpyxl import Workbook
from openpyxl.styles import Font
from openpyxl.formatting.rule import CellIsRule

In [None]:
# Report the version of each key package used in this notebook
for label, module in (
    ("pandas ", pd),
    ("geopandas ", gpd),
    ("numpy ", np),
    ("spreg ", spreg),
    ("libpysal ", libpysal),
):
    print(label, module.__version__)

# Specify Data and Weights

### 20x20 square grid - queen contiguity - n=400

In [None]:
#infileshp = "./data_master/twentwengrid.shp"
#infilew = "./data_master/grid400_q.gal"
#layout = "20x20"

### 40x40 square grid - queen contiguity - n=1600

In [None]:
#infileshp = "./data_master/fourty40grid.shp"
#infilew = "./data_master/fourty40_q.gal"
#layout = "40x40"

### US Counties - queen contiguity - n=3085

In [None]:
# Active layout: US counties with queen contiguity weights
# (the other layout cells in this section are commented out).
infileshp = "./data_master/uscounty_nodata.shp"   # shapefile, read with geopandas below
infilew = "./data_master/uscounty_q.gal"          # spatial weights file, read with libpysal below
layout = "US_counties"                            # label that ends up in the output file name

### Brazilian municipios - queen contiguity - n=5568

In [None]:
#infileshp = "./data_master/Brazil_nodata.shp"
#infilew = "./data_master/Braz_muni_q.gal"
#layout = "BRA_muni"

## Read in Data and Weights

In [None]:
# Read the spatial layout and report its dimensions and column names
dfs = gpd.read_file(infileshp)

print(dfs.shape)
print(list(dfs))

# Read the spatial weights; close the file handle explicitly so it is
# released even if reading fails
wfile = libpysal.io.open(infilew)
try:
    w = wfile.read()
finally:
    wfile.close()
w.transform = 'r'   # 'r' is the libpysal code for row-standardized weights
n = w.n             # number of observations, taken from the weights
print(n)

## Forward Specification Logic with KB Tests

Revised logic, removed branch after all robust tests significant

In [None]:
def fw_spec_KB(plmtests, reps, p_value=0.01):
    """
    Forward specification: Evaluate results from LM-tests and their robust versions from spreg.OLS,
    based on the new Koley-Bera tests for spatial Durbin

    Decision rule applied to each replication:

    * joint spatial Durbin test not significant            -> OLS
    * joint test significant, both robust tests significant -> SDM
    * joint test significant, only robust lag significant   -> LAG
    * joint test significant, only robust WX significant    -> SLX
    * joint test significant, neither robust test is        -> OLS

    A test counts as significant when its p-value is strictly below
    ``p_value``; the joint test counts as not significant only when its
    p-value is strictly above ``p_value``.

    Arguments:
    ----------
    plmtests : reps x 3 matrix (array-like) with p-values from LM tests in OLS,
               columns in the order p_RLM_wx, p_RLM_durlag, p_LM_spdurbin.
               Only the first ``reps`` rows are used.

    reps     : number of replications, used as the denominator of the frequencies

    p_value  : significance threshold

    Returns:
    ----------
    modfreq: array of length 4 with the frequency of the suggested DGP according
             to the forward (specific to general) specification search,
             in the order OLS, LAG, SLX, SDM

    Raises:
    ----------
    ValueError : if plmtests is not a two-dimensional array with three columns
    IndexError : if plmtests has fewer than ``reps`` rows
    """
    pvals = np.asarray(plmtests, dtype=float)
    if pvals.ndim != 2 or pvals.shape[1] != 3:
        raise ValueError("plmtests must be a reps x 3 matrix of p-values")
    if pvals.shape[0] < reps:
        raise IndexError("plmtests has fewer rows than the number of replications")

    pvals = pvals[:reps]
    p_rlwx, p_rdury, p_spdur = pvals[:, 0], pvals[:, 1], pvals[:, 2]

    # first check following KB(2024) - joint test on SDM
    # (written as the negation of "not significant" so that the boundary case
    # p_spdur == p_value is treated as significant)
    joint_sig = ~(p_spdur > p_value)
    wx_sig = p_rlwx < p_value
    lag_sig = p_rdury < p_value

    sdm = joint_sig & wx_sig & lag_sig
    lag = joint_sig & lag_sig & ~wx_sig    # only robust lag
    slx = joint_sig & wx_sig & ~lag_sig    # only robust WX
    # OLS collects both the non-significant joint test and the case where the
    # joint test is significant but neither robust test is
    ols = ~(sdm | lag | slx)

    modcount = np.array([ols.sum(), lag.sum(), slx.sum(), sdm.sum()], dtype=float)
    modfreq = modcount / reps
    return modfreq
    

## Model Parameters

In [None]:
# --- replication control ---
rndseed = 123456789        # overall random seed
reps = 1000                # number of replications
pvalue = 0.01              # p-value threshold (alternative: 0.05)

# --- data generating process ---
errp = 'sar'               # error process (alternative: 'ma')
errdist = 'normal'         # error distribution (alternative: 'lognormal')
invmethod = 'power_exp'    # inverse method (alternative: 'true_inv')
b1 = [1,1]                 # coefficients passed to make_xb (alternative: [1, 1, 1, 1])
kx = len(b1) - 1           # number of explanatory variables

# --- parameter grids ---
rho_values = [0, 0.1, 0.3, 0.5, 0.7, 0.9]
lam_values = [0, 0.1, 0.3, 0.5, 0.7, 0.9]
gam_values = [0, -0.5, 0.5]

# --- result labels ---
diagtests = ['prwx','prdlag','pspdur']
models = ['OLS','LAG','SLX','SPDUR']
k = len(diagtests)


def _nested_store(leaf_names):
    """Return a gamma -> rho -> lambda -> name dictionary with a new empty list per leaf."""
    return {
        gam: {
            rho: {lam: {name: [] for name in leaf_names} for lam in lam_values}
            for rho in rho_values
        }
        for gam in gam_values
    }


# Nested dictionaries with the structure Result[gam][rho][lam][label].
# Assumes only a single set of simulations is run, otherwise they need to be
# initialized again for each run.
Results1 = _nested_store(diagtests)   # p-values for the tests
Modselect = _nested_store(models)     # selected model

## RHS

X has variance 12, matched with variance 6 for error process, gives approximate R2 of 0.66

In [None]:
# total number of draws: one per observation and explanatory variable
nk = n * kx
# overall X variance of 12, split evenly over the kx variables
var1 = 12.0 / kx
# generator seeded for X only; the simulation loop reseeds its own generator
rng = np.random.default_rng(seed=rndseed)
xx = spreg.dgp.make_x(rng, nk, mu=[0], varu=[var1], method="uniform")
# with several variables the draws are rearranged into an n by kx matrix
x1 = np.reshape(xx, (n, kx)) if kx > 1 else xx
xb1 = spreg.dgp.make_xb(x1, b1)
wx1 = spreg.dgp.make_wx(x1, w)   # default first order

## Print Settings

In [None]:
# Echo the run configuration so that it is stored with the notebook output
print("SETTINGS - STGE Search with K-B tests and GNS DGP - OLS Estimation")
for label, value in (
    ("Layout: ", infileshp),
    ("Weights: ", infilew),
    ("n: ", n),
    ("k: ", kx),
    ("Error Process: ", errp),
    ("Error Distribution: ", errdist),
    ("Replications: ", reps),
    ("p-value: ", pvalue),
    ("Inverse Method: ", invmethod),
):
    print(label, value)
print("--------------------------------------")

## Simulation Loop

In [None]:
t0 = time.time()

# error variance matched to the X variance for a target R2 of 0.66
if errdist == 'normal':
    vv = 6.0
elif errdist == 'lognormal':
    vv = 1.1
else:
    # stop here: continuing would fail later on an undefined vv, or silently
    # reuse the value left over from an earlier run of this cell
    raise ValueError(f"Error distribution not recognized: {errdist!r}")

for gam in gam_values:
    # create a list with multiple gamma values (all same) when more than one x
    if kx > 1:
        g1 = (np.ones(kx) * gam).tolist()
    else:
        g1 = gam

    wxg1 = spreg.dgp.make_wxg(wx1, g1)
    for rho in rho_values:
        for lam in lam_values:
            # only combinations with rho + lam < 1 are simulated; skip the others
            # one by one so the result does not depend on lam_values being sorted
            if not (rho + lam < 1):
                continue

            print(g1, rho, lam)
            rng = np.random.default_rng(seed=rndseed)  # reset for simulations
            # fresh lists for this combination, so re-running the cell does not
            # append to the p-values of an earlier run
            draws = {diag: [] for diag in diagtests}
            for _ in range(reps):
                # error distribution as parameter
                u = spreg.dgp.make_error(rng, n, mu=0, varu=vv, method=errdist)
                # DGP is GNS; the inverse method reported in the settings is
                # passed on explicitly
                y1 = spreg.dgp_gns(u, xb1, wxg1, w, rho, lam, model=errp,
                                   imethod=invmethod)
                # estimate OLS
                model_ols_1 = spreg.OLS(y1, x1, w=w, slx_lags=0, spat_diag=True)
                # p-values in the same order as diagtests
                pvals = [model_ols_1.rlm_wx[1], model_ols_1.rlm_durlag[1],
                         model_ols_1.lm_spdurbin[1]]
                for diag, pval in zip(diagtests, pvals):
                    draws[diag].append(pval)

            Results1[gam][rho][lam] = draws

t1 = time.time()
print("time in minutes: ",(t1-t0)/60.0)

## Dictionary with Selection Frequencies

At this point, Results1 is a nested dictionary keyed by gamma, rho, lambda and test name, holding the p-values from each replication. Before we carry out forward specification, the p-values for each parameter combination must be turned into a reps x 3 array to pass to the forward specification logic.

In [None]:
# Fill Modselect with the selection frequency of every model for each
# (gamma, rho, lambda) combination
for gam in gam_values:
    for rho in rho_values:
        for lam in lam_values:
            selected = Modselect[gam][rho][lam]
            if rho + lam < 1:
                print(gam,rho,lam)
                # one list of p-values per test, transposed to replications x tests
                pcols = [Results1[gam][rho][lam][name] for name in diagtests]
                plmtests = np.array(pcols).transpose()
                test1 = fw_spec_KB(plmtests, reps, p_value=pvalue)
                for name, freq in zip(models, test1):
                    selected[name] = freq
            else:
                # combination was not simulated
                for name in models:
                    selected[name] = 0.0

In [None]:
# Print, for each gamma and model, the matrix of selection frequencies.
# Rows follow rho_values in reverse order (last rho on top), columns follow
# lam_values; combinations with rho + lam >= 1 are shown as nan.
lenr = len(rho_values)
lenl = len(lam_values)
for gam in gam_values:
    print("GAMMA: ",gam)
    print("------------")
    for mod in models:
        modsel = np.full((lenr, lenl), np.nan)
        for r, rho in enumerate(rho_values):
            rr = lenr - 1 - r   # flipped row position
            for c, lam in enumerate(lam_values):
                if rho + lam < 1:
                    modsel[rr, c] = Modselect[gam][rho][lam][mod]

        print("Selection Frequency for",mod)
        print(modsel)

In [None]:
# Same matrices as printed in the previous cell, but saved in a dictionary:
# data_models[gam][mod] is a len(rho_values) x len(lam_values) array.
# Rows follow rho_values in reverse order (last rho on top), columns follow
# lam_values; combinations with rho + lam >= 1 are stored as nan.

lenr = len(rho_values)
lenl = len(lam_values)
data_models = {}
for gam in gam_values:
    data_models[gam] = {}
    for mod in models:
        modsel = np.full((lenr, lenl), np.nan)
        for r, rho in enumerate(rho_values):
            rr = lenr - 1 - r   # flipped row position
            for c, lam in enumerate(lam_values):
                if rho + lam < 1:
                    modsel[rr, c] = Modselect[gam][rho][lam][mod]

        data_models[gam][mod] = modsel

In [None]:
# Convert the dictionary of matrices into one dataframe per model and store the
# dataframes in a dictionary keyed by model name
results_models = {}
# the matrices in data_models store rho in reverse order, so row 0 is the last rho
last_row = len(rho_values) - 1
for mod in models:
    # long format: one record per gamma and simulated (rho, lambda) combination
    records = [
        {'gamma': gam, 'rho': rho, 'lambda': lam,
         'value': data_models[gam][mod][last_row - i, j]}
        for gam in data_models
        for i, rho in enumerate(rho_values)
        for j, lam in enumerate(lam_values)
        if rho + lam < 1
    ]
    df = pd.DataFrame(records)
    # Pivot to one row per (lambda, rho) and one column per gamma
    pivot_df = df.pivot_table(index=['lambda', 'rho'], columns='gamma', values='value', aggfunc='first')
    # gamma columns in the order in which the gammas were stored
    pivot_df = pivot_df.reset_index()[['rho', 'lambda', *data_models]]
    # Row order: first the rows with lambda = 0, then the rows with rho = 0
    # (and lambda != 0), then all remaining combinations; the lambda-major
    # sorting of the pivot is kept within each group
    no_lambda = (pivot_df['lambda'] == 0).to_numpy()
    no_rho = (pivot_df['rho'] == 0).to_numpy() & ~no_lambda
    both = ~(no_lambda | no_rho)
    order = np.concatenate([np.flatnonzero(mask) for mask in (no_lambda, no_rho, both)])
    results_models[mod] = pivot_df.iloc[order]

In [None]:
# Save in Excel format: all model tables are stacked on one sheet, each
# preceded by a row with the model name, and the gamma columns are colored by
# value range

with pd.ExcelWriter(f'003_STGE_KB_SDM_{layout}_{errp}_{errdist}_p_{pvalue}.xlsx', engine='openpyxl') as writer:

    ws = writer.book.create_sheet(title='Sheet1')

    startrow = 0    # Initial row to start writing the dataframe
    endrow = None   # last sheet row holding data; stays None when nothing is written
    for name, df in results_models.items():

        # Write the dataframe name
        ws.cell(row=startrow + 1, column=1).value = f"{name}"

        # Write the dataframe content (header row directly below the name)
        df.round(3).to_excel(writer, sheet_name='Sheet1', startrow=startrow + 1, index=False)
        # Last data row of this dataframe, then leave one blank row before the next name
        endrow = startrow + 2 + len(df)
        startrow += len(df) + 3

    if endrow is not None:
        # Conditional formatting for columns 3, 4, and 5 by value range.
        # 'between' includes both of its bounds, so each lower bound is nudged
        # just above the previous band's upper limit; the written values are
        # rounded to 3 decimals, which makes the band effectively
        # "lower < value <= upper".
        red_font = Font(color='FF0000')
        wine_font = Font(color='722F37')
        blue_font = Font(color='0000FF')
        green_font = Font(color='00FF00')
        bands = [
            ('greaterThan', ['0.95'], red_font),          # Red: value > 0.95
            ('between', ['0.9001', '0.95'], wine_font),   # Wine: 0.9 < value <= 0.95
            ('between', ['0.7501', '0.9'], blue_font),    # Blue: 0.75 < value <= 0.9
            ('between', ['0.5001', '0.75'], green_font),  # Green: 0.5 < value <= 0.75
        ]
        columns = ['C', 'D', 'E']  # Corresponding to Excel columns 3, 4, and 5
        for col in columns:
            for operator, formula, font in bands:
                ws.conditional_formatting.add(
                    f'{col}3:{col}{endrow}',
                    CellIsRule(operator=operator, formula=formula, stopIfTrue=True, font=font))