In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.manifold import MDS 
import glob
import math
import re
from Clustering_Functions import *
from itertools import combinations
import itertools
import os

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
def Borda_vector(ballot, num_cands , borda_style='pes', start = 0):
    """
    Returns the Borda vector of the given (simple or generalized) ballot.
        
    Args:
        ballot : a simple or generalized ballot (tuple of integers or of sets of integers). 
        num_cands : the number of candidates. 
        borda_style : choice of {'pes', 'avg'}
        start : the lowest score awarded; for example, set start=1 if you want a full ballot to award {1,2,...,num_cands} points.
     
    Returns:
        the Borda vector (np.array) of the given generalized ballot.                
    """

    # append set of missing candidates to end of ballot
    ballot = list(ballot)
    missing_cands = set(range(1,num_cands+1))
    for c in ballot:
        S = c if type(c) == set else {c}
        for x in S:
            missing_cands.discard(x)
    if len(missing_cands) > 0:
        ballot.append(missing_cands)
    # compute Borda vector
    score_queue = list(range(start, start+num_cands))
    to_return = [0 for _ in range(num_cands)]
    for c in ballot:
        S = c if type(c) == set else {c}
        scores = [score_queue.pop() for _ in range(len(S))]
        points = np.mean(scores) if borda_style == 'avg' else min(scores)
        for x in S:
            to_return[x-1] = points

    return np.array(to_return)

In [3]:
def Reverse_Borda(proxy):
    """ 
    Returns the generalized ballot corresponding to the given Borda proxy vector.
    Returns a simple ballot if possible, otherwise a generalized ballot
    Works with any borda_style convention ('pes' or 'avg') 
    """
    num_cands = len(proxy)

    proxy = list(proxy)
    to_return = []
    cands_placed = []
    while len(cands_placed) < num_cands:
        S = [x for x in range(1,num_cands+1) if proxy[x-1]==np.max(proxy)] # best-scoring candidates
        cands_placed.extend(S)
        to_return.append(set(S))
        for x in S:
            proxy[x-1] = -1

    # return a simple ballot if possible
    if all(len(c)==1 for c in to_return[:-1]):
        return tuple([list(c)[0] for c in to_return if len(c)==1])
    else:
        return tuple(to_return)

In [4]:
def HH_proxy(ballot,num_cands):
    """
    Returns the head-to-head proxy vector of the given (simple or generalized) ballot.
        
    This is a vector with one entry for each pair of candidates ordered in the natural way; namely {(1,2),(1,3),...,(1,n),(2,3),...}. 
    The entries lie in {-1,0,1} depending on whether the lower-indexed candidate {looses, ties, wins} the head-to-head comparison. 

    Args:
        ballot: a simple or generalized ballot (tuple of integers or of sets of integers).
    
    Returns:
        The head-to-head proxy vector (np.array)
    """
    # append set of missing candidates to end of ballot
    ballot = list(ballot)
    missing_cands = set(range(1,num_cands+1))
    for c in ballot:
        S = c if type(c) == set else {c}
        for x in S:
            missing_cands.discard(x)
    if len(missing_cands) > 0:
        ballot.append(missing_cands)

    M = np.full([num_cands,num_cands], np.nan)

    # first place the zeros for ties and build the unpacked ballot
    unpacked_ballot = []
    for c in ballot:
        S = c if type(c) == set else {c}
        if len(S)>1:
            for x,y in combinations(S,2):
                M[x-1,y-1] = 0
                M[y-1,x-1] = 0
        unpacked_ballot.extend(S)

    # now place the -1 and 1 entries
    for x,y in combinations(unpacked_ballot,2):
        if M[x-1,y-1] != 0:
            M[x-1,y-1] = 1
            M[y-1,x-1] = -1

    # flatten the matrix into a vector
    to_return = []
    for x,y in combinations(range(num_cands),2):
        to_return.append(M[x,y])
    return np.array(to_return)

In [5]:
def Reverse_HH(proxy):
    """ 
    Returns the (simple or generalized) ballot corresponding to the given HH proxy vector,
    or None if the proxy is inconsistent.
    Any positive entry (not just +1) is interpreted as a win for the lower-indexed candidate, and any negative entry a loss,
    while a zero entry indicates a tie.
    Returns a simple ballot if possible, otherwise a generalized ballot.
    """
    # determine the number of candidates
    proxy = list(proxy)
    A = np.sqrt(1+8*len(proxy))
    if not A.is_integer():
        raise ValueError("Invalid proxy vector")
    num_cands = int((1+A)/2)
    
    cand_pairs = list(combinations(range(1,num_cands+1),2))
    ballot = [{num_cands}] # initialize ballot: bullet vote for last candidate

    # We'll work through cand_pairs (i,j) in reverse order.  
    # For 5 candidates, the order is (4,5) | (3,5), (3,4) | (2,5), (2,4), (2,3) | (1,5), (1,4), (1,3), (1,2)
    # which breaks into groups for i = 4,3,2,1
    # for each group, we add i to the top of the ballot, and then use the rest of the group's information to reposition i correctly
    # (or return None if the rest of the group has inconsisent information).
    for i in range(num_cands-1,0,-1):
        group_indices = [x for x in range(len(cand_pairs)) if cand_pairs[x][0]==i]
        left_of_i = [cand_pairs[x][1] for x in group_indices if proxy[x]<0]
        right_of_i = [cand_pairs[x][1] for x in group_indices if proxy[x]>0]
        match_i = [cand_pairs[x][1] for x in group_indices if proxy[x] == 0]
        ballot_map = [] # has one entry {-1,0,+1} for each set in the ballot, indicating whether the set should be left, right, or containing i.
        for c in ballot:
            S = c if type(c) == set else {c}
            if all(x in left_of_i for x in S):
                ballot_map.append(-1)
            elif all(x in right_of_i for x in S):
                ballot_map.append(1)
            elif all(x in match_i for x in S):
                ballot_map.append(0)
            else:
                return None # inconsistent proxy
            
        zero_indices = [x for x in range(len(ballot_map)) if ballot_map[x]==0]
        if (ballot_map != sorted(ballot_map)) or (len(zero_indices)>1):
            return None # inconsistent proxy
        
        if len(zero_indices)==0:
            insertion_index = len(ballot_map) if all(val <= 0 for val in ballot_map) else min([x for x in range(len(ballot_map)) if ballot_map[x] >=0])
            ballot.insert(insertion_index,{i})
            #print(f'inserting {i} at {insertion_index} to get {ballot}')    
        else:
            insertion_index = zero_indices[0]
            ballot[insertion_index] = ballot[insertion_index].union({i})
            #print(f'adding {i} to {insertion_index} to get {ballot}')
    # return a simple ballot if possible
    if all(len(c)==1 for c in ballot[:-1]):
        return tuple([list(c)[0] for c in ballot if len(c)==1])
    else:
        return tuple(ballot)

In [6]:
b = (4, {2,3}, 1, {5,6,7})
p = HH_proxy(b,10)
print(Reverse_HH(HH_proxy(b,10)))
print(Reverse_Borda(Borda_vector(b,10)))

({4}, {2, 3}, {1}, {5, 6, 7}, {8, 9, 10})
({4}, {2, 3}, {1}, {5, 6, 7}, {8, 9, 10})


In [7]:
b = (4, 6, 3)
p = HH_proxy(b,10)
print(Reverse_HH(HH_proxy(b,10)))
print(Reverse_Borda(Borda_vector(b,10)))

(4, 6, 3)
(4, 6, 3)


In [9]:
filepath = '../ballot-clustering old versions'
dfK = pd.read_pickle(f'{filepath}/results_2025.pkl') # heuristic results
dfIP = pd.read_pickle(f'{filepath}/IP_results_v2.pkl') # IP results

In [10]:
# create `df` dataframe to hold IP and heuristic results.
d_proxy = {'bordaa': 'BA', 'bordap': 'BP', 'hh': 'HH'}
df = pd.DataFrame(columns=['filename', 'num_cands', 'method', 'proxy_type', 'technique', 
                           'centers', 'proxies_of_centers', 'center_proxy_match_good','clustering'])

# Add the IP results to the dataframe

for index in dfIP.index:
    filename = f"{dfIP['election'][index]}.csv" 
    num_cands = dfIP['num_candidates'][index]
        
    proxies_of_centers = dfIP['centroids'][index]
    clustering = dfIP['ballots'][index]
    method = dfIP['method'][index]
    proxy_type = d_proxy[dfIP['proxy'][index]]
    technique = f"{method}_{proxy_type}"

    if proxy_type == 'HH':
        centers = {i: Reverse_HH(proxy) for i, proxy in proxies_of_centers.items()}
        center_proxy_match = all(centers[i] is not None for i in centers)
    else:
        centers = {i: Reverse_Borda(proxy) for i, proxy in proxies_of_centers.items()}
        borda_style = 'pes' if proxy_type == 'BP' else 'avg'
        center_proxy_match = all(all(Borda_vector(centers[i],num_cands, borda_style) == np.array(proxy)) for i, proxy in proxies_of_centers.items())

    t = df.shape[0]
    df.loc[t] = [filename, num_cands, method, proxy_type, technique,
                  centers, proxies_of_centers, center_proxy_match, clustering]

In [11]:
# add the heuristic results to the dataframe

D = {'meanBA': ('Lloyd', 'BA'), 'meanBC': ('Lloyd', 'BP'), 'meanH': ('Lloyd', 'HH'), 
     'medoBA': ('PAM', 'BA'), 'medoBC': ('PAM', 'BP'), 'medoH': ('PAM', 'HH'), 'slate': ('slate', None)}

for index in dfK.index:
    filename = dfK['filename'][index] 
    num_cands = dfK['num_cands'][index]
    clustering = dfK['clustering'][index]
    m = dfK['method'][index]
    if m == 'slate_weak':
        continue
    method, proxy_type = D[m]

    technique = method if method == 'slate' else f"{method}_{proxy_type}"

    if method == 'Lloyd':
        proxies_of_centers_as_array = dfK['centers'][index]
        proxies_of_centers = {0: proxies_of_centers_as_array[0], 1: proxies_of_centers_as_array[1]}
        if proxy_type == 'HH':
            centers = {i: Reverse_HH(proxy) for i, proxy in proxies_of_centers.items()}
            center_proxy_match = all(centers[i] is not None for i in centers)
        else:
            centers = {i: Reverse_Borda(proxy) for i, proxy in proxies_of_centers.items()}
            borda_style = 'pes' if proxy_type == 'BP' else 'avg'
            center_proxy_match = all(all(Borda_vector(centers[i],num_cands, borda_style) == np.array(proxy)) for i, proxy in proxies_of_centers.items())
    elif method == 'PAM':
        centers = dfK['centers'][index]
        if proxy_type == 'HH':
            proxies_of_centers = {i: HH_proxy(centers[i], num_cands) for i, center in centers.items()}
            center_proxy_match = True
        else:
            borda_style = 'pes' if proxy_type == 'BP' else 'avg'
            proxies_of_centers = {i: Borda_vector(center,num_cands, borda_style) for i, center in centers.items()}
            center_proxy_match = True
    else:
        centers = dfK['centers'][index]
        proxies_of_centers = None
        center_proxy_match = None
    t = df.shape[0]
    df.loc[t] = [filename, num_cands, method, proxy_type, technique, centers, proxies_of_centers, center_proxy_match, clustering]

In [12]:
df.to_pickle(f'{filepath}/merged_data_with_generalized_ballots.pkl')

In [13]:
# Group by 'technique' and 'center_proxy_match_good' and calculate the size of each group
grouped = df.groupby(['technique', 'center_proxy_match_good']).size().unstack(fill_value=0)

# Calculate the portion for each value of 'center_proxy_match_good' within each 'technique'
portion = grouped.div(grouped.sum(axis=1), axis=0)

print(portion)

center_proxy_match_good     False     True 
technique                                  
Lloyd_BA                 1.000000  0.000000
Lloyd_BP                 1.000000  0.000000
Lloyd_HH                 0.029907  0.970093
PAM_BA                   0.000000  1.000000
PAM_BP                   0.000000  1.000000
PAM_HH                   0.000000  1.000000
continuous_BA            0.944513  0.055487
continuous_BP            0.504692  0.495308
continuous_HH            0.336416  0.663584
continuous_rest_BA       0.000000  1.000000
continuous_rest_BP       0.000000  1.000000
continuous_rest_HH       0.000000  1.000000
discrete_BA              0.000000  1.000000
discrete_BP              0.000000  1.000000
discrete_HH              0.000000  1.000000


In [41]:
# The Pentland Hills election that's studied in the paper
full_filename = 'scot-elex/7_cands/edinburgh_2017_ward2.csv'
filename = os.path.basename(full_filename)
num_cands, election, cand_names, ward = csv_parse(full_filename)
parties = party_abrevs(cand_names)
dfPH = df[df['filename'] == filename].copy()

In [34]:
# print Pentland Hills centers
PH_technique_to_centers = {technique:None for technique in dfPH['technique'].unique()}
for index in dfPH.index:
    technique = dfPH['technique'][index]
    centers = dfPH['centers'][index]
    method = dfPH['method'][index]
    match = dfPH['center_proxy_match_good'][index]
    if centers == None:
        centers = {0: None, 1: None}
    PH_technique_to_centers[technique] = centers
    if not method in ['slate', 'discrete']:
        print(f"{technique}({match}): {centers[0]}, {centers[1]}")

continuous_BA(False): ({1, 6}, {4}, {2, 3, 5, 7}), ({3}, {5}, {7}, {2, 4}, {1, 6})
continuous_BP(False): ({1, 6}, {2, 3, 4, 5, 7}), ({3, 5}, {7}, {4}, {1, 2, 6})
continuous_HH(False): (1, 6), None
continuous_rest_BA(True): (1,), (3,)
continuous_rest_BP(True): (1, 6), (3, 5, 7, 4)
continuous_rest_HH(True): (1, 6, 4, 2, 7, 3, 5), (3, 5, 7, 4, 2, 6, 1)
Lloyd_BP(False): (1, 6, 4, 2, 7, 3, 5), (3, 5, 4, 7, 2, 1, 6)
Lloyd_BA(False): (3, 5, 4, 7, 2, 1, 6), (1, 6, 4, 2, 7, 3, 5)
Lloyd_HH(True): (3, 5, 4, 7, 2, 1, 6), (1, 6, 4, 2, 7, 3, 5)
PAM_BP(True): (3, 5, 7, 4), (1, 6)
PAM_BA(True): (3, 5, 7), (1, 6)
PAM_HH(True): (1, 6), (3, 5, 4)


In [45]:
# find the HH and BP centers of the WHOLE ELECTION
BP_center_proxy, _ = Centroid_and_Medoid(election, num_cands, proxy = 'Borda', borda_style='pes', metric = 'Euclidean')
HH_center_proxy, _ = Centroid_and_Medoid(election, num_cands, proxy = 'HH', metric = 'Euclidean')
BP_center = Reverse_Borda(BP_center_proxy)
HH_center = Reverse_HH(HH_center_proxy)

In [46]:
print(BP_center_proxy)
print(HH_center_proxy)
print(BP_center)
print(HH_center)

[2.73972603 1.59195758 2.0508175  2.37322139 1.75033142 2.4475475
 1.37931949]
[ 0.22182943  0.12443659  0.0597437   0.14997791  0.15068493  0.21961997
 -0.0670791  -0.19125055 -0.03066726 -0.17286787  0.04569156 -0.06734423
  0.17207247 -0.09032258  0.16111357  0.11330093 -0.01732214  0.21175431
 -0.11798498  0.11736633  0.18912947]
(1, 6, 4, 3, 5, 2, 7)
(1, 6, 4, 3, 5, 2, 7)


In [39]:
# determine the portion of elections in which the Lloyd_BP and Lloyd_HH centers agree
good_count = 0
bad_count = 0
inconsistent_count = 0

for index in df.index:
    technique = df['technique'][index]
    if technique != 'Lloyd_BP':
        continue
    BP_centers = df['centers'][index]
    filename = df['filename'][index]
    row_index = df[(df['filename'] == filename) & (df['technique'] == 'Lloyd_HH')].index[0]
    HH_centers = df['centers'][row_index]
    if df['center_proxy_match_good'][row_index] == False:
        inconsistent_count += 1
    elif set(BP_centers) == set(HH_centers):
        good_count += 1
    else:
        bad_count += 1

In [40]:
# number of elections in which the Lloyd_BP and Lloyd_HH centers (agree, disagree, have inconsistent HH centers)
good_count, bad_count, inconsistent_count

(1038, 0, 32)

In [63]:
def same_gen_ballots(b1,b2):
    """
    Returns True if the two generalized ballots are the same, False otherwise.
    """
    b1 = list(b1)
    b2 = list(b2)
    if len(b1) != len(b2):
        return False
    for i in range(len(b1)):
        S1 = b1[i] if type(b1[i]) == set else {b1[i]}
        S2 = b2[i] if type(b2[i]) == set else {b2[i]}
        if S1 != S2:
            return False
    return True

In [67]:
# Re-do the above calculation for the clustering that places all ballots into a single cluster.
# That is, determine the portion of elections for which the Lloyd_BP and Lloyd_HH centers of the WHOLE ELECTION
# (agree, disagree, have inconsisetent HH centers)

# We also count up the times that the Lloyd_BP and Lloyd_HH centers include ties.

good_count = 0
bad_count = 0
inconsistent_count = 0

BP_ties = 0
HH_ties = 0

full_filename_list = glob.glob("scot-elex/**/*.csv")
for full_filename in full_filename_list:
    filename = os.path.basename(full_filename)
    num_cands, election, cand_names, location = csv_parse(full_filename)
    BP_center_proxy, _ = Centroid_and_Medoid(election, num_cands, proxy = 'Borda', borda_style='pes', metric = 'Euclidean')
    HH_center_proxy, _ = Centroid_and_Medoid(election, num_cands, proxy = 'HH', metric = 'Euclidean')
    BP_center = Reverse_Borda(BP_center_proxy)
    HH_center = Reverse_HH(HH_center_proxy)
    #print(full_filename)
    if HH_center == None:
        inconsistent_count += 1
    elif same_gen_ballots(BP_center, HH_center):
        good_count += 1
    else:
        bad_count += 1
    if type(BP_center[0])==set:
        BP_ties += 1
    if HH_center != None and type(HH_center[0])==set:
        HH_ties += 1

In [68]:
good_count, bad_count, inconsistent_count

(540, 487, 43)

In [69]:
BP_ties, HH_ties

(0, 2)