# How to anonymize a CVR file

Goal: automatically anonymize a CVR file to meet CRS limit on less than 10 ballots in a style

Understand limits of what can be accomplished via row-level aggregation
  * If a single contest has less than 10 votes, a different approach is needed
  * If the issue is cross-cutting districts, combine with ballots from others as necessary
  * Come up with a way to generate test cases

Entropy of each aggregation shouldn't be too low.
Or perhaps I just want to make sure that for each candidate who got at least 20% of the vote, there are e.g. at least 3 votes for them in the aggregate? So it isn't clear how ALL the people voted, even if you know you're one or two of them?


In [192]:
import doctest
doctest.testmod()

**********************************************************************
File "__main__", line 5, in __main__.pull_absent
Failed example:
    pull_absent("2,2,1,47,2-1-47,VBM,P1,D1,0,1,,,1,1".split(','))
Expected:
    'P1111111'
Got:
    'P1110011'
**********************************************************************
1 items had failures:
   1 of   1 in __main__.pull_absent
***Test Failed*** 1 failures.


TestResults(failed=1, attempted=1)

In [12]:
import csv

In [94]:
from collections import Counter

In [194]:
row = "2,2,1,47,2-1-47,VBM,P1,D1,0,1,,,1,1".split(',')

In [197]:
row[[6,7]]

TypeError: list indices must be integers or slices, not list

In [193]:
def pull_absent(row, headerlen=8, stylecols=[6]):
    """Convert a clean Dominion database row into a string which includes
    the ballot style and, for each column representing a choice, an x if a vote
    was allowed (a 0 or 1), or a space if the contest didn't appear on the ballot (zero-length string).
    >>> pull_absent("2,2,1,47,2-1-47,VBM,P1,D1,0,1,,,1,1".split(','))
    'P1110011'
    """

    return ''.join([row[stylecols][:3]] + [["1", "0"][vote == ''] for vote in row[headerlen:]])

In [56]:
cvrfile = '/srv/voting/audit/ca/inyo/CVR_Export_20201123164142-clean.csv'

In [140]:
data = open(cvrfile)
cvr = csv.reader(data)
version = next(cvr)
contests = next(cvr)
choices = next(cvr)
headers = next(cvr)

In [156]:
choicecount = len(choices) - 8

In [141]:
freq = Counter(pull_absent(row) for row in cvr)

In [142]:
freq

Counter({'1131111111111111111111111100000000001110000000001111111000000000011111111111111111111111100': 967,
         '1091111111111111111000000011111000000000000000000000000000000000011111111111111111111111100': 921,
         '1051111111111111111000000011111000000001111111110000000000000000011111111111111111111111111': 904,
         '1021111111111111111000000011111000000000000000000000000000000000011111111111111111111111100': 895,
         '1081111111111111111000000011111000000000000000000000000000000011111111111111111111111111100': 874,
         '1171111111111111111000000000000111110000000000000000000111111100011111111111111111111111100': 852,
         '1011111111111111111000000011111000000000000000000000000000000011111111111111111111111111100': 656,
         '1031111111111111111000000011111000000000000000000000000000000000011111111111111111111111100': 632,
         '1061111111111111111000000011111000000001111111110000000000000000011111111111111111111111111': 484,
         '110111111

In [179]:
uniqkeys = sorted(freq.keys())
columns = [''.join(column) for column in zip(*uniqkeys)]

In [180]:
columns

['11111111111111111111',
 '00000000011111111112',
 '12345678901234567890',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '00000000000010100000',
 '00000000000010100000',
 '00000000000010100000',
 '00000000000010100000',
 '00000000000010100000',
 '00000000000010100000',
 '00000000000010100000',
 '11111111111101000000',
 '11111111111101000000',
 '11111111111101000000',
 '11111111111101000000',
 '11111111111101000000',
 '00000000000000001110',
 '00000000000000001110',
 '00000000000000001110',
 '00000000000000001110',
 '00000000000000001110',
 '00000000000111110000',
 '00000000000111110000',
 '00000000000111110000',
 '00001110000000000000',


In [181]:
uniqcols = Counter(columns)

In [182]:
uniqcols

Counter({'11111111111111111111': 41,
         '00001110000000000000': 11,
         '00000000000010100000': 7,
         '00000000000011000000': 7,
         '00000000000000001000': 7,
         '11111111111101000000': 5,
         '00000000000000001110': 5,
         '00000000000111110000': 3,
         '10000001001000000000': 3,
         '00000000011111111112': 1,
         '12345678901234567890': 1})

In [177]:
columns

['11111111111111111111',
 '00100100101100211111',
 '56132617042389095874',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '00000000000100001000',
 '00000000000100001000',
 '00000000000100001000',
 '00000000000100001000',
 '00000000000100001000',
 '00000000000100001000',
 '00000000000100001000',
 '11111011111011000001',
 '11111011111011000001',
 '11111011111011000001',
 '11111011111011000001',
 '11111011111011000001',
 '00000000000000010110',
 '00000000000000010110',
 '00000000000000010110',
 '00000000000000010110',
 '00000000000000010110',
 '00000100001100001001',
 '00000100001100001001',
 '00000100001100001001',
 '11000001000000000000',


In [160]:
freq.keys()

dict_keys(['1051111111111111111000000011111000000001111111110000000000000000011111111111111111111111111', '1061111111111111111000000011111000000001111111110000000000000000011111111111111111111111111', '1111111111111111111000000011111000000000000000000000000000000011111111111111111111111111100', '1031111111111111111000000011111000000000000000000000000000000000011111111111111111111111100', '1021111111111111111000000011111000000000000000000000000000000000011111111111111111111111100', '1161111111111111111000000000000000001110000000000000000000000000011111111111111111111111100', '1011111111111111111000000011111000000000000000000000000000000011111111111111111111111111100', '1071111111111111111000000011111000000001111111110000000000000000011111111111111111111111111', '1101111111111111111000000011111000000000000000000000000000000000011111111111111111111111100', '1041111111111111111000000011111000000000000000000000000000000000011111111111111111111111100', '11211111111111111110000000111110000011

In [169]:
z = zip(*l)

In [171]:
''.join(next(z))

'00100100101100211111'

In [175]:
[''.join(column) for column in zip(*l)]

['11111111111111111111',
 '00100100101100211111',
 '56132617042389095874',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '11111111111111111111',
 '00000000000100001000',
 '00000000000100001000',
 '00000000000100001000',
 '00000000000100001000',
 '00000000000100001000',
 '00000000000100001000',
 '00000000000100001000',
 '11111011111011000001',
 '11111011111011000001',
 '11111011111011000001',
 '11111011111011000001',
 '11111011111011000001',
 '00000000000000010110',
 '00000000000000010110',
 '00000000000000010110',
 '00000000000000010110',
 '00000000000000010110',
 '00000100001100001001',
 '00000100001100001001',
 '00000100001100001001',
 '11000001000000000000',


In [174]:
len(''.join(next(z)))

20

In [167]:
list(''.join(zip(*l)))

TypeError: sequence item 0: expected str instance, tuple found

In [164]:
list(map(list, zip(*l)))

[['1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1'],
 ['0',
  '0',
  '1',
  '0',
  '0',
  '1',
  '0',
  '0',
  '1',
  '0',
  '1',
  '1',
  '0',
  '0',
  '2',
  '1',
  '1',
  '1',
  '1',
  '1'],
 ['5',
  '6',
  '1',
  '3',
  '2',
  '6',
  '1',
  '7',
  '0',
  '4',
  '2',
  '3',
  '8',
  '9',
  '0',
  '9',
  '5',
  '8',
  '7',
  '4'],
 ['1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1'],
 ['1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1'],
 ['1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1'],
 ['1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1',
  '1'],
 ['1',
  '1',

In [163]:
l = freq.keys()
''.join(list(map(list, zip(*l))))

TypeError: sequence item 0: expected str instance, list found

In [157]:
pairs = [[Counter() for _ in range(choicecount)] for _ in range(choicecount)]

In [158]:
for k in freq:
    for i in range(choicecount):
        for j in range(choicecount):
            pairs[i][j][k[i+3] + k[j+3]] += 1

In [159]:
pairs[0]

[Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'11': 20}),
 Counter({'10': 18, '11': 2}),
 Counter({'10': 18, '11': 2}),
 Counter({'10': 18, '11': 2}),
 Counter({'10': 18, '11': 2}),
 Counter({'10': 18, '11': 2}),
 Counter({'10': 18, '11': 2}),
 Counter({'10': 18, '11': 2}),
 Counter({'11': 13, '10': 7}),
 Counter({'11': 13, '10': 7}),
 Counter({'11': 13, '10': 7}),
 Counter({'11': 13, '10': 7}),
 Counter({'11': 13, '10': 7}),
 Counter({'10': 17, '11': 3}),
 Counter({'10': 17, '11': 3}),
 Counter({'10': 17, '11': 3}),
 Counter({'10': 17, '11': 3}),
 Counter({'10': 17, '11': 3}),
 Counter({'10': 15, '11': 5}),
 Counter({'10': 15, '11': 5}),
 Counter({'10': 15, '11': 5}),
 Counter({'10': 17, '11': 3}

Look for deterministic pairs, and drop one

Noodling

In [127]:
row = next(cvr)

In [130]:
pull_absent(row)

'105 (105-3)1111111111111111000000011111000000001111111110000000000000000011111111111111111111111111'

In [123]:
row[6] + ["1", "0"][vote == ''] for vote in row[8:]

SyntaxError: invalid syntax (974994107.py, line 1)

In [110]:
r = next(cvr)

In [111]:
headers[:8]

['CvrNumber',
 'TabulatorNum',
 'BatchId',
 'RecordId',
 'ImprintedId',
 'CountingGroup',
 'PrecinctPortion',
 'BallotType']

In [93]:
absent

'1111111111111111000000011111000000001111111110000000000000000011111111111111111111111111'

In [88]:
[1,2][False]

1

In [89]:
[1,2][True]

2

In [85]:
absent[:20]

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True]

In [83]:
len(absent)

88

In [5]:
import pandas as pd

In [11]:
df = pd.read_csv(cvr, header=0, names=range(200), skiprows=4)

ParserError: Too many columns specified: expected 200 and found 96

In [10]:
df

Unnamed: 0,1,2,1.1,48,2-1-48,Vote by Mail,105 (105-3),3 (3),1.2,0,...,0.34,1.14,1.15,0.35,1.16,0.36,1.17,0.37,0.38,0.39
0,2,2,1,47,2-1-47,Vote by Mail,105 (105-3),3 (3),0,1,...,1,0,0,1,0,1,0,1,1.0,0.0
1,3,2,1,29,2-1-29,Vote by Mail,105 (105-3),3 (3),1,0,...,1,0,0,1,1,0,1,0,1.0,0.0
2,4,2,1,15,2-1-15,Vote by Mail,105 (105-3),3 (3),0,1,...,1,0,0,1,0,1,0,1,0.0,1.0
3,5,2,1,14,2-1-14,Vote by Mail,105 (105-3),3 (3),1,0,...,1,0,0,1,0,1,0,1,1.0,0.0
4,6,2,1,96,2-1-96,Vote by Mail,105 (105-3),3 (3),1,0,...,0,1,0,1,0,1,1,0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9618,9620,2,78,3,2-78-3,Vote by Mail,105 (105-3),3 (3),0,1,...,0,0,0,0,0,0,0,0,0.0,0.0
9619,9621,2,78,2,2-78-2,Vote by Mail,105 (105-3),3 (3),0,0,...,1,0,1,0,0,1,1,0,1.0,0.0
9620,9622,2,79,1,2-79-1,Vote by Mail,102 (102-2),2 (2),1,0,...,0,1,0,1,0,1,0,1,,
9621,9623,2,79,2,2-79-2,Vote by Mail,102 (102-2),2 (2),1,0,...,0,1,1,0,1,0,0,1,,
