## Checking distinct plans
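This notebook checks how many distinct plans our ensembles generated for each of Texas, North Carolina, and Utah. Two plans are treated as identical when every recorded score in their rows matches, so the duplicate counts below are based on the saved scores rather than on the district assignments themselves. We assume the repeated plans arise during the proposal step of the ReCom chain, when GerryChain merges two adjacent districts and then happens to split the merged region back into the same two districts. For more information, please see: https://arxiv.org/abs/1911.05725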

In [2]:
import pandas as pd

In [3]:
# Count repeated rows in the saved output of the TX-SEN12 run.
state_name = "texas"
state_abbr = "TX"
election_name = "SEN12"
datadir = "./" + state_abbr + "output/"

max_steps = 100000
step_size = 10000

# Step counts used as filename suffixes: step_size, 2*step_size, ..., up to max_steps.
ts = list(range(step_size, max_steps + 1, step_size))

# Read every checkpoint file first and concatenate once. Growing the frame
# inside the loop re-copies all previously loaded rows on each iteration, and
# seeding it with an empty frame triggers pandas' empty-entry concat warning.
frames = [
    pd.read_csv(datadir + state_name + election_name + "_data" + str(t) + ".csv", delimiter=',')
    for t in ts
]
df = pd.concat(frames, ignore_index=True)

# A row counts as a repeat when all of its values match an earlier row.
no_dups = df.drop_duplicates()
print(len(no_dups))
# Subtract from the number of rows actually loaded rather than a hard-coded
# total, so the count stays correct if the files hold more or fewer rows.
print("There are " + str(len(df) - len(no_dups)) + " duplicate plans from " + state_abbr + "-" + election_name)

99687
There are 313 duplicate plans from TX-SEN12


In [4]:
# Count repeated rows in the saved output of the NC-SEN16 run.
state_name = "northcarolina"
state_abbr = "NC"
election_name = "SEN16"
datadir = "./" + state_abbr + "output/"

max_steps = 100000
step_size = 10000

# Step counts used as filename suffixes: step_size, 2*step_size, ..., up to max_steps.
ts = list(range(step_size, max_steps + 1, step_size))

# Read every checkpoint file first and concatenate once. Growing the frame
# inside the loop re-copies all previously loaded rows on each iteration, and
# seeding it with an empty frame triggers pandas' empty-entry concat warning.
frames = [
    pd.read_csv(datadir + state_name + election_name + "_data" + str(t) + ".csv", delimiter=',')
    for t in ts
]
df = pd.concat(frames, ignore_index=True)

# A row counts as a repeat when all of its values match an earlier row.
no_dups = df.drop_duplicates()
print(len(no_dups))
# Subtract from the number of rows actually loaded rather than a hard-coded
# total, so the count stays correct if the files hold more or fewer rows.
print("There are " + str(len(df) - len(no_dups)) + " duplicate plans from " + state_abbr + "-" + election_name)

99687
There are 313 duplicate plans from NC-SEN16


In [9]:
# Count repeated rows in the saved output of the UT-SEN16 run.
# `df` is left in scope for the inspection cells that follow.
state_name = "utah"
state_abbr = "UT"
election_name = "SEN16"
datadir = "./" + state_abbr + "output/"

max_steps = 100000
step_size = 10000

# Step counts used as filename suffixes: step_size, 2*step_size, ..., up to max_steps.
ts = list(range(step_size, max_steps + 1, step_size))

# Read every checkpoint file first and concatenate once. Growing the frame
# inside the loop re-copies all previously loaded rows on each iteration, and
# seeding it with an empty frame triggers pandas' empty-entry concat warning.
frames = [
    pd.read_csv(datadir + state_name + election_name + "_data" + str(t) + ".csv", delimiter=',')
    for t in ts
]
df = pd.concat(frames, ignore_index=True)

# A row counts as a repeat when all of its values match an earlier row.
no_dups = df.drop_duplicates()
print(len(no_dups))
# Subtract from the number of rows actually loaded rather than a hard-coded
# total, so the count stays correct if the files hold more or fewer rows.
print("There are " + str(len(df) - len(no_dups)) + " duplicate plans from " + state_abbr + "-" + election_name)

99863
There are 137 duplicate plans from UT-SEN16


In [10]:
# Boolean mask over the most recently loaded frame: True for each row that
# repeats an earlier row (the first occurrence stays False).
dups = df.duplicated(keep="first")

In [11]:
# Show the last ten rows that belong to any repeated group; keep=False marks
# every member of a group, so originals appear alongside their copies.
df.loc[df.duplicated(keep=False)].tail(10)

Unnamed: 0,seats,mm,pg,vs,eg,ce
97328,4,-0.003024,0.006048,"(0.603359375817096, 0.7097142614528935, 0.7202...",0.068948,367
97329,4,-0.003024,0.006048,"(0.603359375817096, 0.7097142614528935, 0.7202...",0.068948,367
98352,4,0.006269,0.012539,"(0.7443172638857475, 0.7041018239988872, 0.634...",0.068948,366
98354,4,0.006269,0.012539,"(0.7443172638857475, 0.7041018239988872, 0.634...",0.068948,366
98637,4,0.041231,0.082463,"(0.8193826640438713, 0.7135708033641588, 0.518...",0.068948,278
98638,4,0.041231,0.082463,"(0.8193826640438713, 0.7135708033641588, 0.518...",0.068948,278
99188,4,0.000456,0.000912,"(0.7714200650469009, 0.6586807089466523, 0.687...",0.068948,229
99189,4,0.000456,0.000912,"(0.7714200650469009, 0.6586807089466523, 0.687...",0.068948,229
99681,4,0.008398,0.016796,"(0.834515080445955, 0.6925085186587433, 0.5793...",0.068948,305
99682,4,0.008398,0.016796,"(0.834515080445955, 0.6925085186587433, 0.5793...",0.068948,305
