In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl

Dataset of cast vote records (CVRs) from ranked-choice elections:

https://dataverse.harvard.edu/dataverse/rcv_cvrs

In [9]:
class VotingData():
    """
    Holds the ranked ballots of one election and tabulates results from them.

    The ballots are read from a CSV that has one column per rank, named
    'rank1' ... 'rank<num_ranks>'.  The markers 'skipped' and 'overvote', as
    well as blank cells, are treated as an empty rank (''); a candidate that
    is repeated on a ballot is kept only at its first position; the remaining
    choices are shifted left so that every '' sits at the right.  Ballots
    with no remaining choice are discarded and identical ballots are merged.

    Attributes:
        filepath: the path (or file-like object) handed to `pd.read_csv`.
        num_ranks: number of rank columns on each ballot.
        ballots: DataFrame with the rank columns plus 'count' (number of
            identical ballots), sorted by 'count' in descending order.
        candidates: list of the distinct non-empty labels found in `ballots`.
    """
    def __init__(self, filepath, num_ranks):
        """
        input: filepath is the CSV of cast vote records.
            num_ranks is the number of rank columns to read from it.
        """
        self.filepath = filepath
        self.num_ranks = num_ranks
        ranks = self._rank_columns()

        # Blank cells are parsed as NaN and groupby drops rows whose key is
        # NaN, which would silently discard those ballots.  Treat a blank the
        # same way as an explicit 'skipped' marker instead.
        df = pd.read_csv(filepath, usecols=ranks).fillna('')
        print("read csv")

        # Collapse identical raw ballots first so the per-ballot clean-up
        # below runs once per distinct ballot, not once per voter.
        df = df.groupby(ranks, as_index=False).size().rename(columns={'size': 'count'})
        df = df.replace(['skipped', 'overvote'], '')
        print("replaced null votes")
        df = self._normalize_ballots(df)
        print("left adjusted")

        self.ballots = df.sort_values('count', ascending=False)
        self.candidates = [x for x in pd.unique(df[ranks].to_numpy().ravel()) if x != '']
        print("done init")

    def _rank_columns(self):
        """Returns the rank column names ['rank1', ..., 'rank<num_ranks>']."""
        return ['rank' + str(i) for i in range(1, self.num_ranks + 1)]

    @staticmethod
    def _left_justify(ranking, removed=None):
        """
        input: ranking is the list of choices on one ballot, in rank order.
            removed is a candidate to strike from the ballot (optional).

        output: a list of the same length holding the first occurrence of each
            non-'' choice other than removed, in order, padded with '' on the
            right.
        """
        kept = []
        for cand in ranking:
            if cand != '' and cand != removed and cand not in kept:
                kept.append(cand)
        return kept + [''] * (len(ranking) - len(kept))

    def _normalize_ballots(self, df, removed=None):
        """
        input: df is a ballot table with the rank columns and a 'count' column.
            removed is a candidate to eliminate from every ballot (optional).

        output: a new table in which every ballot is left-justified (see
            _left_justify), identical ballots are merged by summing 'count',
            and ballots with an empty first rank are dropped.  df is not
            modified.
        """
        ranks = self._rank_columns()
        cleaned = [self._left_justify(ranking, removed)
                   for ranking in df[ranks].to_numpy().tolist()]
        out = pd.DataFrame(cleaned, columns=ranks, index=df.index)
        out['count'] = df['count']
        out = out.groupby(ranks, as_index=False).sum()
        return out.loc[out['rank1'] != '']

    def plurality_winner(self):
        """
        output: a DataFrame indexed by candidate with the number ('count') and
            share ('percent', one decimal) of first-rank votes, sorted by
            'count' in descending order.
        """
        df = (self.ballots[['rank1', 'count']]
              .groupby('rank1').sum()
              .sort_values('count', ascending=False))
        num_votes = df['count'].sum()
        df['percent'] = (100 * df['count'] / num_votes).round(1)
        return df

    def instant_runoff_winner(self):
        """
        Runs instant-runoff rounds until one candidate holds more than 50% of
        the first-rank votes of the ballots still in play.

        In each round the candidate with the fewest first-rank votes is removed
        from every ballot and the ballots are left-justified again; ballots
        with no choice left are dropped.

        output: a DataFrame indexed by candidate with the columns 'count <r>'
            and 'percent <r>' for every round r, sorted by 'count 1' in
            descending order.  Candidates without first-rank votes in a round
            have NaN in that round's columns.
        """
        df = self.ballots
        result = pd.DataFrame({}, index=self.candidates)
        rnd = 1
        while True:
            curr_rnd = (df[['rank1', 'count']]
                        .groupby('rank1').sum()
                        .sort_values('count', ascending=False))
            if curr_rnd.empty:
                # No ballots left to count (empty election, or every ballot
                # exhausted): stop instead of failing on idxmin() below.
                if 'count 1' in result.columns:
                    result = result.sort_values('count 1', ascending=False)
                return result

            count_col = 'count ' + str(rnd)
            percent_col = 'percent ' + str(rnd)
            num_votes = curr_rnd['count'].sum()
            curr_rnd['percent'] = (100 * curr_rnd['count'] / num_votes).round(1)
            curr_rnd = curr_rnd.rename(columns={'count': count_col, 'percent': percent_col})

            # append curr_rnd to results
            result = pd.concat([result, curr_rnd], axis=1)

            if curr_rnd[percent_col].max() > 50:
                return result.sort_values('count 1', ascending=False)

            loser = curr_rnd[count_col].idxmin()
            df = self._normalize_ballots(df, removed=loser)
            rnd += 1

    def STV_winners(self, n):
        """
        input: n is the number of candidates being elected.

        output: The results of the single transferable vote algorithm.

        Not implemented yet; currently returns None.
        """
        pass

    def pairwise_winners(self):
        """
        returns a matrix showing the one on one results of row versus column.

        Entry (r, c) is the percentage (one decimal) of ballots ranking r above
        c among the ballots that rank both r and c.  Only the choices before
        the first '' on a ballot are considered, so a ranked candidate scores
        nothing against a candidate the ballot leaves unranked.  Pairs that
        never meet (including the diagonal) are NaN.
        """
        ranks = self._rank_columns()
        position = {cand: k for k, cand in enumerate(self.candidates)}
        # Accumulate in a plain array: updating a DataFrame cell through .loc
        # for every pair of every ballot is far slower.
        wins = np.zeros((len(self.candidates), len(self.candidates)), dtype=np.int64)

        rankings = self.ballots[ranks].to_numpy().tolist()
        counts = self.ballots['count'].tolist()
        for ranking, count in zip(rankings, counts):
            ranked = []
            for cand in ranking:
                if cand == '':
                    break
                ranked.append(cand)
            for i, upper in enumerate(ranked):
                for lower in ranked[i + 1:]:
                    wins[position[upper], position[lower]] += count

        res = pd.DataFrame(wins, columns=self.candidates, index=self.candidates)
        return (100 * res / (res + res.T)).round(1)
        

In [10]:
%%time
# Load the CSV named below, reading 5 rank columns, into a VotingData object.
# NOTE(review): the output saved with this cell ends in KeyboardInterrupt after
# "replaced null votes", so the run was stopped before the constructor finished;
# re-run to completion before executing the cells that use ny_voting_data.
ny_voting_data = VotingData("NewYorkCity_06222021_DEMMayorCitywide.csv", 5)

read csv
replaced null votes


KeyboardInterrupt: 

In [4]:
%%time
# Display the head-to-head matrix returned by VotingData.pairwise_winners().
# NOTE(review): the saved output is a NameError because ny_voting_data did not
# exist when this cell last ran; it must be executed after the cell that
# creates ny_voting_data.
ny_voting_data.pairwise_winners()

NameError: name 'ny_voting_data' is not defined

In [61]:
%%time
# import matplotlib as mpl

balls = ny_voting_data.ballots
cands = ny_voting_data.candidates
res = pd.DataFrame(0,columns = cands, index=cands)
votes = pd.DataFrame(0,columns = cands, index=cands)


def _count_pairwise_scores(row, n):
    for i in range(n-1):
        if row[i] == '':
            break
        for j in range(i+1,n):
            if row[j] == '':
                break
            res.loc[row[i],row[j]] += row['count']
    
balls.apply(_count_pairwise_scores, axis=1, args = (5,))
res = round(100*res/(res + res.T),1)

res.style.highlight_between(left=50.1, right=100, color="#19b543")
# res = res.background_gradient(cmap=cm)

CPU times: user 1min 4s, sys: 83.9 ms, total: 1min 4s
Wall time: 1min 19s


Unnamed: 0,Aaron S. Foldenauer,Andrew Yang,Art Chang,Dianne Morales,Joycelyn Taylor,Kathryn A. Garcia,Scott M. Stringer,Shaun Donovan,Isaac Wright Jr.,writein,Eric L. Adams,Maya D. Wiley,Raymond J. McGuire,Paperboy Love Prince
Aaron S. Foldenauer,,24.9,43.2,46.2,53.6,31.0,40.5,34.4,56.7,63.3,23.9,35.1,49.6,58.7
Andrew Yang,75.1,,70.7,58.7,79.4,40.4,54.5,52.9,65.8,82.9,34.1,36.8,57.7,74.4
Art Chang,56.8,29.3,,40.0,67.5,35.1,44.2,52.1,67.0,72.9,32.5,17.7,44.1,63.9
Dianne Morales,53.8,41.3,60.0,,72.2,37.7,46.5,53.0,65.9,79.0,28.3,18.3,49.1,75.2
Joycelyn Taylor,46.4,20.6,32.5,27.8,,22.3,31.0,29.2,45.9,69.8,10.8,15.1,32.0,52.0
Kathryn A. Garcia,69.0,59.6,64.9,62.3,77.7,,68.7,77.1,75.2,83.8,49.1,44.7,71.3,78.5
Scott M. Stringer,59.5,45.5,55.8,53.5,69.0,31.3,,58.7,68.6,77.5,30.4,30.7,53.0,73.5
Shaun Donovan,65.6,47.1,47.9,47.0,70.8,22.9,41.3,,58.1,80.0,24.0,26.7,47.7,69.0
Isaac Wright Jr.,43.3,34.2,33.0,34.1,54.1,24.8,31.4,41.9,,64.8,10.6,18.4,30.1,50.7
writein,36.7,17.1,27.1,21.0,30.2,16.2,22.5,20.0,35.2,,13.7,13.0,20.2,30.6
