# Hybrid GETS Specification Search with GNS DGP

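Implements a hybrid general-to-specific (GETS) strategy that starts with the estimation of a spatial Durbin model (SDM) specification. Based on the significance of $\rho$ and/or $\gamma$, this is followed either by a common factor test and an AK test for remaining error autocorrelation in the SDM, or by the estimation of an alternative OLS/SLX/Lag model, followed by a test for error autocorrelation (LM-Error for OLS and SLX, AK for Lag).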

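The true DGP is GNS, i.e., the data are generated using dgp_gns. All alternative specifications are included as special cases, obtained by setting $\rho$, $\lambda$ and/or $\gamma$ to zero.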

# Modules

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import geopandas as gpd
import numpy as np
import time
import spreg
import libpysal
from openpyxl import Workbook
from openpyxl.styles import Font
from openpyxl.formatting.rule import CellIsRule

In [None]:
# Report the version of each main package used in this notebook
for pkg_label, pkg in (
    ("pandas ", pd),
    ("geopandas ", gpd),
    ("numpy ", np),
    ("spreg ", spreg),
    ("libpysal ", libpysal),
):
    print(pkg_label, pkg.__version__)

# Specify Data and Weights

### 20x20 square grid - queen contiguity - n=400

In [None]:
#infileshp = "./data_master/twentwengrid.shp"
#infilew = "./data_master/grid400_q.gal"
#layout = "20x20"

### 40x40 square grid - queen contiguity - n=1600

In [None]:
#infileshp = "./data_master/fourty40grid.shp"
#infilew = "./data_master/fourty40_q.gal"
#layout = "40x40"

### US Counties - queen contiguity - n=3085

In [None]:
# Active layout. To switch layouts, comment out these three lines and
# uncomment the matching lines in one of the other layout cells.
infileshp = "./data_master/uscounty_nodata.shp"   # shape file, read with geopandas
infilew = "./data_master/uscounty_q.gal"          # weights file, read with libpysal
layout = "US_counties"                            # label used in the Excel output file name

### Brazilian municipios - queen contiguity - n=5568

In [None]:
#infileshp = "./data_master/Brazil_nodata.shp"
#infilew = "./data_master/Braz_muni_q.gal"
#layout = "BRA_muni"

## Read in Data and Weights

In [None]:
# Read the layout and report its dimensions and column names
dfs = gpd.read_file(infileshp)

print(dfs.shape)
print(list(dfs))

# Read the spatial weights; close the file handle explicitly instead of
# leaving it open for the remainder of the session
wfile = libpysal.io.open(infilew)
try:
    w = wfile.read()
finally:
    wfile.close()
w.transform = 'r'   # row-standardized weights
n = w.n             # number of observations, taken from the weights
print(n)

## Hybrid Specification Logic

In [None]:
def hybrid_sdm(y,x, w, wlags= 2, p_value=0.01):
    """
    Hybrid specification search that starts from a Spatial Durbin model (SDM).

    The SDM is estimated first. The significance of its spatial coefficients,
    combined with specification tests for error autocorrelation, determines
    which model is suggested.

    Arguments:
    ----------
    y: vector of dependent variable
    x: matrix of independent variables
    w: spatial weights matrix
    wlags: number of spatial lags to use in S2SLS
    p_value: significance threshold

    Returns:
    ----------
    result: the suggested DGP according to the specification search
    paths:  the decision point
            1 = common factor hypothesis in SDM = SEM
            2 = AK test in SDM = GNS
            3 = no error in SDM = SDM
            4 = OLS with error = SEM
            5 = OLS without error = OLS
            6 = SLX with error = SLXEr
            7 = SLX without error = SLX
            8 = lag with error = SAREr
            9 = lag no error = SAR
    """
    alpha = p_value
    nvars = x.shape[1]

    # Starting point: lag model with spatially lagged x (SDM)
    sdm = spreg.GM_Lag(y, x, w=w, slx_lags=1, w_lags=wlags, hard_bound=True)

    # Second column of z_stat for the spatial parameters, i.e. everything after
    # the constant and the x coefficients; rho is the last entry
    spatial_p = np.array(sdm.z_stat)[1 + nvars:, 1]
    rho_pos = len(spatial_p) - 1

    # Even the least significant spatial parameter is significant: SEM, GNS or SDM
    if spatial_p.max() < alpha:
        # spatial common factor hypothesis not rejected
        if sdm.cfh_test[1] >= alpha:
            return ('SEM', 1)
        # otherwise check for remaining error autocorrelation
        ak_sdm = spreg.AKtest(sdm, w, case='gen')
        if ak_sdm.p < alpha:
            return ('GNS', 2)
        return ('SDM', 3)

    # No spatial parameter is significant: OLS or SEM, decided by LM-Error
    if spatial_p.min() >= alpha:
        ols = spreg.OLS(y, x, w=w, spat_diag=True)
        lm_ols = spreg.LMtests(ols, w)
        if lm_ols.lme[1] < alpha:
            return ('SEM', 4)
        return ('OLS', 5)

    # Mixed case: the least significant spatial parameter decides the branch
    if spatial_p.argmax() == rho_pos:
        # rho not significant: SLX model, with or without error autocorrelation
        slx = spreg.OLS(y, x, w=w, slx_lags=1, spat_diag=True)
        lm_slx = spreg.LMtests(slx, w)
        if lm_slx.lme[1] < alpha:
            return ('SLXEr', 6)
        return ('SLX', 7)

    # gamma not significant: lag model, with or without error autocorrelation
    lag = spreg.GM_Lag(y, x, w=w, slx_lags=0, w_lags=wlags, hard_bound=True)
    ak_lag = spreg.AKtest(lag, w, case='gen')
    if ak_lag.p < alpha:
        return ('SAREr', 8)
    return ('SAR', 9)

## Model Parameters

In [None]:
# ---- Simulation design ----
# overall random seed (alternative: 123)
rndseed = 123456789
# number of replications
reps = 1000
# error process (alternative: 'ma')
errp = 'sar'
# beta and gamma (alternative: [1, 1, 1, 1])
b1 = [1, 1]
# number of explanatory variables
kx = len(b1) - 1
# rho range and lambda range
rho_values = [0, 0.1, 0.3, 0.5, 0.7, 0.9]
lam_values = [0, 0.1, 0.3, 0.5, 0.7, 0.9]
# gamma range
gam_values = [0.0, -0.5, 0.5]
# inverse method - alternative is 'true_inv'
invmethod = 'power_exp'
# p-value (alternative: 0.05)
pvalue = 0.01
# error distribution (alternative: 'lognormal')
errdist = 'normal'

# ---- Result containers ----
# result parameter labels
models = ['OLS', 'SEM', 'SAR', 'SLX', 'SDM', 'SAREr', 'SLXEr', 'GNS']


def _design_grid(new_cell):
    """Return a nested dict grid[gam][rho][lam] holding a fresh new_cell() in every position."""
    grid = {}
    for g in gam_values:
        grid[g] = {}
        for r in rho_values:
            grid[g][r] = {l: new_cell() for l in lam_values}
    return grid


# Modselect[gam][rho][lam][model]: one 0/1 entry per replication for each model label
Modselect = _design_grid(lambda: {label: np.zeros(reps, dtype=int) for label in models})

# Modpaths[gam][rho][lam]: path code per replication
Modpaths = _design_grid(lambda: np.zeros(reps, dtype=int))

# Modexcept[gam][rho][lam]: number of model exceptions
Modexcept = _design_grid(lambda: 0)

## RHS

X has variance 12 which, combined with a variance of 6 for the error process, gives an approximate $R^2$ of 0.66.

In [None]:
# Generator used only for drawing X, seeded with the overall seed
rng = np.random.default_rng(seed=rndseed)

# Total variance of 12 split evenly over the kx explanatory variables
var1 = 12.0 / kx
# All n*kx values are drawn in a single call
nk = n * kx
xx = spreg.dgp.make_x(rng, nk, mu=[0], varu=[var1], method="uniform")

# With more than one variable, arrange the draws as an n by kx matrix
x1 = np.reshape(xx, (n, kx)) if kx > 1 else xx

xb1 = spreg.dgp.make_xb(x1, b1)
wx1 = spreg.dgp.make_wx(x1, w)   # default first order

## Print Settings

In [None]:
# Echo the simulation settings, one "label value" line per setting
print("SETTINGS - GETS Hybrid Search with GNS DGP")
for setting_label, setting_value in (
    ("Layout: ", infileshp),
    ("Weights: ", infilew),
    ("n: ", n),
    ("k: ", kx),
    ("Error Process: ", errp),
    ("Error Distribution: ", errdist),
    ("Replications: ", reps),
    ("p-value: ", pvalue),
    ("Inverse Method: ", invmethod),
):
    print(setting_label, setting_value)
print("--------------------------------------")

## Simulation Loop

In [None]:
t0 = time.time()

# Error variance that gives the target R2 of 0.66 for each error distribution
if errdist == 'normal':
    vv = 6.0
elif errdist == 'lognormal':
    vv = 1.1
else:
    # Stop right away: without a variance every replication below would fail
    raise ValueError(f"Error distribution not recognized: {errdist!r}")

# Ceiling on the number of failed replications per parameter combination
max_failures = 10*reps

for gam in gam_values:
    # one gamma per explanatory variable (all the same) when more than one x
    if kx > 1:
        g1 = (np.ones(kx)*gam).tolist()
    else:
        g1 = gam

    wxg1 = spreg.dgp.make_wxg(wx1,g1)
    for rho in rho_values:
        for lam in lam_values:
            # Only combinations with rho + lambda < 1 are simulated. Skipping
            # (rather than leaving the loop) does not depend on lam_values
            # being sorted and matches the condition used when reporting.
            if not (rho + lam < 1):
                continue

            print(g1,rho,lam)
            rng = np.random.default_rng(seed=rndseed)   # reset for simulations
            i = 0
            # A while loop, not a for loop: a failed replication is redrawn, so
            # i only advances when a replication succeeds
            while i < reps:
                try:
                    u = spreg.dgp.make_error(rng,n,mu=0,varu=vv,method=errdist)   # errdist as parameter
                    # DGP is GNS
                    y1 = spreg.dgp_gns(u,xb1,wxg1,w,rho,lam, model= errp)
                    # Run backward specification
                    model_suggested,paths = hybrid_sdm(y1,x1, w, wlags= 2, p_value=pvalue)
                except Exception:
                    # Failed draw or estimation (presumably the hard_bound=True
                    # check in hybrid_sdm - TODO confirm). Exception, not a bare
                    # except, so that Ctrl-C can still interrupt the run.
                    Modexcept[gam][rho][lam] += 1
                    if Modexcept[gam][rho][lam] > max_failures:
                        print("Gamma: ",gam," Rho: ",rho," Lambda: ",lam)
                        print("Iterations stopped after ",10*reps," additional tries")
                        print("i = ",i)
                        # Replications i..reps-1 keep their initial zeros
                        break
                else:
                    # Record the result outside the try block, so that a
                    # bookkeeping error is not counted as an estimation failure
                    Modselect[gam][rho][lam][model_suggested][i] = 1
                    Modpaths[gam][rho][lam][i] = paths
                    i += 1


t1 = time.time()
print("time in minutes: ",(t1-t0)/60.0)

## Exceptions

In [None]:
# Number of model exceptions for every simulated (rho, lambda) pair, by gamma
for gam in gam_values:
    print("Gam: ",gam)
    for rho in rho_values:
        for lam in lam_values:
            # combinations with rho + lambda >= 1 are not reported
            if not (rho + lam < 1):
                continue
            n_failed = Modexcept[gam][rho][lam]
            print("Rho: ",rho," Lambda: ",lam," Exceptions: ",n_failed)

## Search Paths

In [None]:
# Frequency of each decision path for every reported (rho, lambda) pair, by gamma
for gam in gam_values:
    print("Gam: ",gam)
    for rho in rho_values:
        for lam in lam_values:
            if not (rho + lam < 1):
                continue
            # distinct path codes among the replications and how often each occurs
            path_codes, path_counts = np.unique(Modpaths[gam][rho][lam], return_counts=True)
            # position j holds the count for path code j; codes that never occur stay 0
            pfreq = np.zeros(10, dtype=int)
            pfreq[path_codes] = path_counts
            print(" Rho: ",rho," Lam: ",lam," Path Counts: ",pfreq)

## Results

In [None]:
lenr = len(rho_values)
lenl = len(lam_values)
for gam in gam_values:
    print("GAMMA: ",gam)
    print("------------")
    for mod in models:
        # One table per model: columns follow lam_values, rows follow
        # rho_values in reverse order (first rho value in the last row)
        modsel = np.zeros((lenr,lenl))
        for r, rho in enumerate(rho_values):
            rr = lenr - 1 - r
            for c, lam in enumerate(lam_values):
                if rho + lam < 1:
                    # share of replications in which this model was selected
                    modsel[rr,c] = Modselect[gam][rho][lam][mod].sum() / reps
                else:
                    # combination not reported
                    modsel[rr,c] = np.nan
        print("Selection Frequency for",mod)
        print(modsel)

### Export to excel file

In [None]:
models_ordered = ['OLS','SAR','SEM','SLX','SAREr','SDM','SLXEr','GNS']
results_models={}


def _table_block(lam, rho):
    """Return the block of the output table that a (lambda, rho) row belongs to."""
    if lam == 0:
        return 0    # rows with lambda = 0 come first
    if rho == 0:
        return 1    # then rows with rho = 0 and lambda > 0
    return 2        # then all remaining rows


# One table per model: a row for each reported (rho, lambda) pair and a column
# for each gamma value, holding the selection frequency
for mod in models_ordered:
    data = [
        {'gamma': gam, 'rho': rho, 'lambda': lam,
         'value': Modselect[gam][rho][lam][mod].sum() / reps}
        for gam in gam_values
        for rho in rho_values
        for lam in lam_values
        if rho + lam < 1
    ]

    # Create DataFrame
    df = pd.DataFrame(data)
    # Pivot the DataFrame to get the desired shape (rows sorted by lambda, then rho)
    pivot_df = df.pivot_table(index=['lambda', 'rho'], columns='gamma', values='value', aggfunc='first')
    # Gamma columns follow gam_values instead of a hard-coded list
    pivot_df = pivot_df.reset_index()[['rho', 'lambda', *gam_values]]
    # Row order is derived from the data instead of a hard-coded position list:
    # the three blocks in turn, each keeping the pivot order (sorted is stable).
    # For the default design this reproduces the original order
    # [0,1,2,3,4,5,6,11,15,18,20,7,8,9,10,12,13,14,16,17,19].
    blocks = [_table_block(lam, rho) for lam, rho in zip(pivot_df['lambda'], pivot_df['rho'])]
    row_order = sorted(range(len(blocks)), key=blocks.__getitem__)
    # Save the DataFrame in the dictionary, following that order
    results_models[mod] = pivot_df.iloc[row_order]

In [None]:
#Save in Excel format

# All model tables are stacked in one sheet: a row with the model name, the
# table header, the table rows, and one empty row before the next model
with pd.ExcelWriter(f'004_GETS_SDM_{layout}_{errp}_{errdist}_p_{pvalue}.xlsx', engine='openpyxl') as writer:

    ws = writer.book.create_sheet(title='Sheet1')

    startrow = 0  # Initial row to start writing the dataframe
    endrow = 0    # Last sheet row written so far; stays 0 when there is nothing to write
    for name, df in results_models.items():

        # Write the dataframe name
        ws.cell(row=startrow + 1, column=1).value = f"{name}"

        # Write the dataframe content
        df.round(3).to_excel(writer, sheet_name='Sheet1', startrow=startrow + 1, index=False)
        # Update the startrow for the next dataframe
        endrow = startrow + 2 + len(df)
        startrow += len(df) + 3

    # Colour bands, checked in this order with stopIfTrue. 'between' includes
    # both limits, so each lower limit sits just above the boundary (values are
    # rounded to 3 decimals) to make the bands match the stated ranges.
    colour_rules = (
        ('greaterThan', ['0.95'], Font(color='FF0000')),         # Red: value > 0.95
        ('between', ['0.9001', '0.95'], Font(color='722F37')),   # Wine: 0.9 < value <= 0.95
        ('between', ['0.7501', '0.9'], Font(color='0000FF')),    # Blue: 0.75 < value <= 0.9
        ('between', ['0.5001', '0.75'], Font(color='00FF00')),   # Green: 0.5 < value <= 0.75
    )

    # Apply conditional formatting to columns 3, 4, and 5, only when at least
    # one table was written
    if endrow >= 3:
        for col in ['C', 'D', 'E']:  # Corresponding to Excel columns 3, 4, and 5
            cell_range = f'{col}3:{col}{endrow}'
            for operator, formula, font in colour_rules:
                ws.conditional_formatting.add(
                    cell_range,
                    CellIsRule(operator=operator, formula=formula, stopIfTrue=True, font=font))
