# Vote distribution by machine in Winterville 

For each contest, randomly partition votes onto 7 machines. Condition on the number of ballots cast per machine. 

Test statistic is $\max_m |R_{im} - R_i|$ (the maximum is taken over machines $m$ within contest $i$), where $R_{im}$ is the fraction of Republican votes cast on machine $m$ in contest $i$, and $R_i$ is the overall fraction of Republican votes in contest $i$. 

Test for contests separately, and use Fisher's combining function for an overall test.

Compare results for the original data with results if D and R were swapped on machine 3.

In [1]:
%matplotlib inline
import math
import numpy as np
import scipy as sp
import scipy.optimize
from scipy.stats import hypergeom, binom, norm, chi2
from scipy import special
from cryptorandom.cryptorandom import SHA256
from cryptorandom import sample
import matplotlib.pyplot as plt
import pandas as pd
import csv

# Seed string for the pseudo-random number generator, fixed so the permutation
# results below can be reproduced.
seed = '2018CV313418 3463593937'  # case caption number [space] 10 rolls of 10-sided dice
# One SHA256 generator (from cryptorandom) is created here and passed to every
# permutation test below; its state carries over from cell to cell, so the
# numbers reported depend on running the cells once, in order, from a fresh kernel.
prng = SHA256(seed)

In [2]:
# Load the vote counts: one row per (Contest, Machine, Candidate), with the
# candidate's Party code and the number of Votes. The path is relative to the
# notebook's working directory.
votes = pd.read_csv('../../Data/winterville.csv')
votes.head()

Unnamed: 0,Contest,Machine,Candidate,Party,Votes
0,Governor,0,B. KEMP (R),R,40
1,Governor,0,S. ABRAMS (D),D,73
2,Governor,0,T. METZ (L),L,4
3,Governor,0,Write-in,W,0
4,Governor,1,B. KEMP (R),R,51


In [3]:
# Names of the statewide contests to test, spelled exactly as in votes["Contest"]
statewide_contests = np.array([
    "Governor",
    "Lt Governor",
    "Secretary of State",
    "Attorney General",
    "Commissioner of Agriculture",
    "Commissioner of Insurance",
    "State School Superintendent",
    "Commissioner of Labor",
    "PSC Eaton",
    "PSC Pridemore",
])

In [4]:
# Number of voters per machine taken from poll tape summary
num_voters_per_machine = [117, 135, 131, 133, 135, 144, 135]  # double-checked
# Slice boundaries with a leading 0: machine m is assigned positions
# cum_voters_per_machine[m]:cum_voters_per_machine[m+1] of a ballot list
cum_voters_per_machine = np.insert(np.cumsum(num_voters_per_machine), 0, 0)
num_votes = np.sum(num_voters_per_machine)

# Sanity check: no contest on any machine may have more votes than the number
# of voters reported for that machine.
for m, n_voters in enumerate(num_voters_per_machine):
    # Sum only the Votes column; aggregating every column would also "sum" the
    # text columns (Candidate, Party), which is wasteful and version-dependent.
    votes_per_contest = votes.loc[votes["Machine"] == m].groupby("Contest")["Votes"].sum()
    assert np.all(votes_per_contest <= n_voters), \
        "machine {}: some contest has more votes than the {} voters reported".format(m, n_voters)

In [5]:
# Find winning party within the precinct in each statewide contest.
# Prints: contest, total D votes, total R votes, and the party with more votes.
mask_D = votes['Party']=="D"
mask_R = votes['Party']=="R"
for c in statewide_contests:
    mask_c = votes["Contest"] == c
    D_votes = votes.loc[mask_c & mask_D, 'Votes'].sum()
    R_votes = votes.loc[mask_c & mask_R, 'Votes'].sum()
    # Report an exact tie explicitly instead of silently labelling it 'REP'
    if D_votes > R_votes:
        winner = 'DEM'
    elif R_votes > D_votes:
        winner = 'REP'
    else:
        winner = 'TIE'
    print(c, D_votes, R_votes, '\t', winner)

Governor 505 400 	 DEM
Lt Governor 479 393 	 DEM
Secretary of State 511 365 	 DEM
Attorney General 509 390 	 DEM
Commissioner of Agriculture 475 423 	 DEM
Commissioner of Insurance 482 382 	 DEM
State School Superintendent 492 405 	 DEM
Commissioner of Labor 494 402 	 DEM
PSC Eaton 494 367 	 DEM
PSC Pridemore 487 374 	 DEM


In [6]:
def get_repub_fraction(df):
    """
    Fraction of the votes in each (Contest, Machine) cell that went to party "R".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Contest", "Machine", "Party" and "Votes".

    Returns
    -------
    pandas.Series
        One value per row of `df` whose Party is "R" (in the order those rows
        appear in `df`): that row's Votes divided by the total Votes of all
        parties for the same Contest and Machine. Indexed 0..k-1.
    """
    keys = ["Contest", "Machine"]

    # Republican rows, keeping only what the merge needs
    repub = (df.loc[df["Party"] == "R", keys + ["Votes"]]
               .rename(columns={"Votes": "R_votes"}))

    # Total votes per (Contest, Machine). Only the Votes column is summed:
    # aggregating the whole frame would also try to sum the text columns.
    valid_votes = (df.groupby(keys, as_index=False)["Votes"].sum()
                     .rename(columns={"Votes": "Tot_votes"}))

    combined = pd.merge(repub, valid_votes, on=keys)
    return combined["R_votes"]/combined["Tot_votes"]

In [7]:
def permute_votes_across_machines(vote_df, reps, prng=np.random):
    """
    Permutation test of the largest per-machine Republican vote share in one contest.

    The contest's ballots are represented as a list of 1s (vote for R), 0s (vote
    for any other party) and NaNs (undervotes: voters on the poll tapes with no
    vote in this contest). The list is shuffled `reps` times; after each shuffle
    consecutive slices of sizes `num_voters_per_machine` are treated as the
    machines, so the number of ballots per machine is held fixed.

    Relies on the module-level `num_votes`, `num_voters_per_machine`,
    `cum_voters_per_machine` and on `get_repub_fraction`.

    Parameters
    ----------
    vote_df : pandas.DataFrame
        Votes dataframe, filtered to contain only one contest, with columns
        "Contest", "Machine", "Party" and "Votes".
    reps : int
        Number of random permutations.
    prng : object
        Anything with an in-place `shuffle(array)` method
        (default: the `numpy.random` module).

    Returns
    -------
    dict
        "statistic": the largest observed fraction of R votes on any machine.
        "pvalue": (1 + number of permutations whose largest per-machine R
        fraction is at least as far from the overall R fraction as the
        observed one) / (reps + 1).
        Note that the quantity compared is |max_m R_m - R|, i.e. the largest
        machine share centered at the overall share, not max_m |R_m - R|.

    Raises
    ------
    ValueError
        If the contest has no valid votes, or more votes than `num_votes`.
    """

    # Votes for Republican, Democrat/other, and undervotes.
    # Sum only the Votes column and look the R total up by label, so the result
    # is a plain scalar (0 if there is no R candidate).
    votes_by_party = vote_df.groupby("Party")["Votes"].sum()
    r_votes = int(votes_by_party.get("R", 0))
    d_votes = int(votes_by_party.sum()) - r_votes
    u_votes = int(num_votes) - r_votes - d_votes
    if u_votes < 0:
        # A negative count would silently produce a ballot list shorter than
        # num_votes and therefore wrong machine slices.
        raise ValueError("contest has {} votes but only {} voters were reported"
                         .format(r_votes + d_votes, int(num_votes)))
    if r_votes + d_votes == 0:
        raise ValueError("contest has no valid votes")
    overall_r_proportion = r_votes/(r_votes + d_votes)

    # test statistic = largest % votes for R on a machine
    votes_for_r = get_repub_fraction(vote_df)
    max_votes_for_r = np.max(votes_for_r)

    # Randomly assign r_votes 1s, d_votes 0s, and u_votes np.nans
    vote_list = np.array([1]*r_votes + [0]*d_votes + [np.nan]*u_votes)
    perm_distr = np.zeros(reps)
    n_machines = len(num_voters_per_machine)

    for rep in range(reps):
        prng.shuffle(vote_list)

        # Find fraction of votes for R on each machine; nanmean ignores the
        # undervotes, so the denominator is the machine's valid votes.
        votes_for_r_perm = np.zeros(n_machines)
        for i in range(n_machines):
            votes_for_r_perm[i] = np.nanmean(vote_list[cum_voters_per_machine[i]:cum_voters_per_machine[i+1]])
        perm_distr[rep] = np.max(votes_for_r_perm)

    # Center the statistic at the expected fraction of R votes
    perm_distr_norm = perm_distr - overall_r_proportion
    statistic_norm = max_votes_for_r - overall_r_proportion

    return {"statistic":max_votes_for_r,
            "pvalue":(1+np.sum(np.abs(perm_distr_norm) >= np.abs(statistic_norm)))/(reps+1)
           }

In [8]:
# Permutation test for each statewide contest on the data as reported
reps = 10000   # random permutations per contest
ps = {}        # contest name -> permutation P-value
for c in statewide_contests:
    res = permute_votes_across_machines(votes.loc[votes["Contest"] == c], reps=reps, prng=prng)
    ps[c] = res['pvalue']
    print(c, "\n   statistic =", res["statistic"], "\n   P-value =", res["pvalue"])

# Fisher's combining function: -2 * sum of log P-values, referred to a
# chi-square distribution with 2 * (number of contests) degrees of freedom.
# NOTE(review): that reference distribution presumes the per-contest P-values
# are independent -- confirm this is acceptable for contests on the same ballots.
fisher_chi = -2*np.sum([math.log(p) for p in ps.values()])
print('Combined:\n   ', fisher_chi, chi2.sf(fisher_chi, df=2*len(statewide_contests)))

Governor 
   statistic = 0.5190839694656488 
   P-value = 0.11398860113988601
Lt Governor 
   statistic = 0.5645161290322581 
   P-value = 0.0245975402459754
Secretary of State 
   statistic = 0.5116279069767442 
   P-value = 0.0184981501849815
Attorney General 
   statistic = 0.515625 
   P-value = 0.1506849315068493
Commissioner of Agriculture 
   statistic = 0.5813953488372093 
   P-value = 0.025997400259974
Commissioner of Insurance 
   statistic = 0.5348837209302325 
   P-value = 0.030496950304969503
State School Superintendent 
   statistic = 0.5419847328244275 
   P-value = 0.09669033096690331
Commissioner of Labor 
   statistic = 0.5736434108527132 
   P-value = 0.007899210078992101
PSC Eaton 
   statistic = 0.5114503816793893 
   P-value = 0.0456954304569543
PSC Pridemore 
   statistic = 0.5267175572519084 
   P-value = 0.025297470252974703
Combined:
    65.67868786714891 9.094420735646933e-07


# What if D and R vote totals were flipped on Machine 3?

In [9]:
# Copy of the data in which, on the machine labelled 3, rows labelled "R"
# become "D" and vice versa (vote counts are untouched).
votes_flipped = votes.copy()

# Both masks are built from the ORIGINAL party labels before anything is
# reassigned; otherwise the second assignment would undo the first.
on_machine_3 = votes_flipped["Machine"] == 3
originally_R = on_machine_3 & (votes["Party"] == "R")
originally_D = on_machine_3 & (votes["Party"] == "D")

votes_flipped.loc[originally_R, 'Party'] = "D"
votes_flipped.loc[originally_D, 'Party'] = "R"
votes_flipped.head(20)

Unnamed: 0,Contest,Machine,Candidate,Party,Votes
0,Governor,0,B. KEMP (R),R,40
1,Governor,0,S. ABRAMS (D),D,73
2,Governor,0,T. METZ (L),L,4
3,Governor,0,Write-in,W,0
4,Governor,1,B. KEMP (R),R,51
5,Governor,1,S. ABRAMS (D),D,79
6,Governor,1,T. METZ (L),L,3
7,Governor,1,Write-in,W,0
8,Governor,2,B. KEMP (R),R,60
9,Governor,2,S. ABRAMS (D),D,67


In [10]:
# Same per-contest permutation tests, now on the data with D and R swapped on
# machine 3. The shared `prng` continues from wherever the previous tests left
# it, so these numbers depend on the earlier cells having been run exactly once.
ps_flipped = {}   # contest name -> permutation P-value for the flipped data
for c in statewide_contests:
    vote_df2 = votes_flipped.loc[votes_flipped["Contest"] == c]
    res = permute_votes_across_machines(vote_df2, reps=reps, prng=prng)
    ps_flipped[c] = res['pvalue']
    print(c, "\n   statistic =", res["statistic"], "\n   P-value =", res["pvalue"])

# Fisher's combining function: -2 * sum of log P-values, referred to a
# chi-square distribution with 2 * (number of contests) degrees of freedom.
fisher_chi = -2*np.sum([math.log(p) for p in ps_flipped.values()])
print('Combined:\n   ', fisher_chi, chi2.sf(fisher_chi, df=2*len(statewide_contests)))

Governor 
   statistic = 0.48148148148148145 
   P-value = 0.46425357464253575
Lt Governor 
   statistic = 0.4728682170542636 
   P-value = 0.7945205479452054
Secretary of State 
   statistic = 0.4496124031007752 
   P-value = 0.44955504449555045
Attorney General 
   statistic = 0.484375 
   P-value = 0.5433456654334566
Commissioner of Agriculture 
   statistic = 0.49230769230769234 
   P-value = 0.7339266073392661
Commissioner of Insurance 
   statistic = 0.4645669291338583 
   P-value = 0.6042395760423958
State School Superintendent 
   statistic = 0.48031496062992124 
   P-value = 0.8065193480651934
Commissioner of Labor 
   statistic = 0.46875 
   P-value = 0.7967203279672033
PSC Eaton 
   statistic = 0.4732824427480916 
   P-value = 0.27987201279872015
PSC Pridemore 
   statistic = 0.4307692307692308 
   P-value = 0.9387061293870613
Combined:
    9.997865529313279 0.9682106300793477


In [11]:
# Version information: record the Python, OS and package versions used to
# produce the outputs above. Requires the `version_information` IPython extension.
# NOTE(review): `permute` is requested here but is not imported anywhere in the
# visible cells -- confirm whether it is still needed.
%load_ext version_information
%version_information scipy, numpy, csv, pandas, matplotlib, notebook, cryptorandom, permute

Loading extensions from ~/.ipython/extensions is deprecated. We recommend managing extensions like any other Python packages, in site-packages.




Software,Version
Python,3.6.7 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
IPython,7.2.0
OS,Darwin 18.2.0 x86_64 i386 64bit
scipy,1.1.0
numpy,1.15.4
csv,1.0
pandas,0.23.1
matplotlib,3.0.2
notebook,5.7.4
cryptorandom,0.2
