In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from Clustering_Functions import *
from itertools import combinations
import os
import pickle
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Load the precomputed clustering results that the rest of the notebook compares.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling — only load pickles from a trusted source.
dfK = pd.read_pickle('clustering_results.pkl') # heuristic results
# IP solver results, one record per (election, model).
dfIP = pd.read_json("6_cand_all_solutions.json", orient="records")
# NOTE(review): dfIP_PH is not referenced again in the visible part of this notebook — confirm it is still needed.
dfIP_PH = pd.read_json("pentland_hills_all_models.json", orient="records")

In [3]:
# Show the column names of the heuristic (dfK) and IP (dfIP) result tables.
print(list(dfK.columns))
print(list(dfIP.columns))

['filename', 'num_cands', 'parties', 'method', 'sil', 'cal', 'dav', 'centers', 'bloc_size']
['n_candidates', 'election_name', 'model', 'candidates', 'optimum_value', 'centroid_set']


In [4]:
# List the distinct model codes (dfIP) and method codes (dfK).
# These codes are the keys of the D_IP / D_K lookup tables used when `df` is built.
print(dfIP['model'].unique())
print(dfK['method'].unique())

['continuous_bordaP' 'continuous_hh' 'continuous_rest_bordaP'
 'continuous_rest_hh' 'discrete_HH' 'discrete_bordaP']
['meanBC' 'meanBA' 'meanH' 'medoBC' 'medoBA' 'medoH' 'slate']


In [3]:
# create `df` dataframe to hold IP and heuristic results.
# Rows are collected in a plain list and converted to a DataFrame once at the end;
# appending with `df.loc[t] = ...` inside the loop re-allocates the frame on every row.

df_columns = ['filename', 'num_cands', 'method', 'proxy', 'technique',
              'centers', 'proxies_of_centers']
records = []

# add the IP results to df.
# Maps the IP model code to (method, proxy).
D_IP = {
 'continuous_bordaP': ('coords','Borda'),
 'continuous_hh':('coords','HH'),
 'continuous_rest_bordaP':('all','Borda'),
 'continuous_rest_hh':('all','HH'),
 'discrete_HH': ('cast','HH'),
 'discrete_bordaP': ('cast','Borda')}

for index in dfIP.index:
    method, proxy = D_IP[dfIP['model'][index]]
    filename = f"{dfIP['election_name'][index]}.csv"
    num_cands = dfIP['n_candidates'][index]
    proxies_of_centers = dfIP['centroid_set'][index]
    technique = f"{method}_{proxy}"
    if technique == 'cast_HH': # fix that this model uses +- 1/2 for components instead of +-1
        proxies_of_centers = [[2*x for x in center_proxy] for center_proxy in proxies_of_centers]
    # IP solutions only provide the proxies of the centers; the centers themselves are filled in later.
    records.append(dict(zip(df_columns,
                            [filename, num_cands, method, proxy, technique, None, proxies_of_centers])))

# add the heuristic results to df.
# Only elections that have IP results are considered (same order of first appearance).
filename_list = pd.Series([record['filename'] for record in records], dtype=object).unique()
# Maps the heuristic method code to (method, proxy).
D_K = {'meanBC': ('Lloyd','Borda'), 'meanH':('Lloyd','HH'), 'medoBC':('PAM','Borda'), 'medoH':('PAM','HH')}
for filename in filename_list:
    for method_code, (method, proxy) in D_K.items():
        dfK_sub = dfK[(dfK['filename']==filename) & (dfK['method']==method_code)]
        if dfK_sub.shape[0] == 0:
            continue
        num_cands = dfK_sub['num_cands'].values[0]
        technique = f"{method}_{proxy}"
        centers = dfK_sub['centers'].values[0]
        centers = [centers[0], centers[1]] # ensure its a list rather than dictionary
        if method == 'PAM':
            row = [filename, num_cands, method, proxy, technique, centers, None]
        else: # For Lloyd, centers live in proxy space
            row = [filename, num_cands, method, proxy, technique, None, centers]
        records.append(dict(zip(df_columns, row)))

df = pd.DataFrame(records, columns=df_columns)

In [4]:
# Sanity check: compare the number of distinct elections with len(df)//10.
# NOTE(review): the divisor 10 presumably is the number of techniques per election at this
# point (6 IP models in D_IP + 4 heuristics in D_K) — confirm, and update it if techniques are added.
filename_list = df['filename'].unique()
len(filename_list), len(df)//10

(205, 205)

In [None]:
# add 'kmedians' models to df for both Borda and HH proxies.

# Drop kMedians rows left over from an earlier run of this cell, so that re-running it
# replaces those rows instead of appending duplicates. On a fresh kernel this is a no-op.
df = df[df['method'] != 'kMedians'].reset_index(drop=True)

for count, filename in enumerate(filename_list, start=1):
    # NOTE(review): the directory is hard-coded to 6_cands, while other cells build it from num_cands.
    full_filename = f"../ballot-clustering/scot-elex/6_cands/{filename}"
    print(count, filename)
    num_cands, election, cand_names, ward = csv_parse(full_filename)
    for proxy in ['Borda', 'HH']:
        # kmedians returns its centers in proxy space, so they go in 'proxies_of_centers'.
        _, centers = kmedians(election, proxy = proxy, return_centroids=True)
        technique = f'kMedians_{proxy}'
        df.loc[len(df)] = [filename, num_cands, 'kMedians', proxy, technique, None, centers]

In [8]:
# Distinct technique labels ("<method>_<proxy>") currently present in df.
# NOTE(review): despite its name, method_list holds values of the 'technique' column, not 'method'.
method_list = list(df['technique'].unique())
method_list

['coords_Borda',
 'coords_HH',
 'all_Borda',
 'all_HH',
 'cast_HH',
 'cast_Borda',
 'Lloyd_Borda',
 'Lloyd_HH',
 'PAM_Borda',
 'PAM_HH',
 'kMedians_Borda',
 'kMedians_HH']

In [5]:
# Functions that reverse Borda and HH vectors, and sometimes yield generalized ballots.

def Borda_proxy(ballot, num_cands , borda_style='pes', start = 0):
    """
    Returns the Borda vector of the given (simple or generalized) ballot.

    Candidates that do not appear on the ballot are treated as tied in last place.

    Args:
        ballot : a simple or generalized ballot (tuple of integers or of sets of integers).
            Tie groups may be given as set or frozenset; empty groups are ignored.
        num_cands : the number of candidates (candidates are numbered 1..num_cands).
        borda_style : choice of {'pes', 'avg'}. With 'avg', tied candidates each receive the
            mean of the scores of the positions they occupy; with any other value
            (the 'pes' behavior) they each receive the minimum of those scores.
        start : the lowest score awarded; for example, set start=1 if you want a full ballot to award {1,2,...,num_cands} points.

    Returns:
        the Borda vector (np.array) of the given generalized ballot; entry x-1 is the score of candidate x.
    """
    # Normalize every ballot position to a (non-empty) set of tied candidates.
    groups = [set(c) if isinstance(c, (set, frozenset)) else {c} for c in ballot]
    groups = [group for group in groups if group]

    # append set of missing candidates to end of ballot
    ranked = set().union(*groups)
    missing_cands = set(range(1, num_cands+1)) - ranked
    if missing_cands:
        groups.append(missing_cands)

    # compute Borda vector: scores are handed out from the highest down
    score_queue = list(range(start, start+num_cands))
    to_return = [0 for _ in range(num_cands)]
    for group in groups:
        scores = [score_queue.pop() for _ in group]
        points = np.mean(scores) if borda_style == 'avg' else min(scores)
        for x in group:
            to_return[x-1] = points

    return np.array(to_return)

def Reverse_Borda(proxy):
    """
    Returns the generalized ballot corresponding to the given Borda proxy vector.

    Candidates are grouped by score from highest to lowest; equal scores form a tie group.
    Returns a simple ballot if possible, otherwise a generalized ballot.
    Works with either borda_style convention ('pes' or 'avg') and with any score range
    (including negative scores).

    Args:
        proxy : sequence of scores, where entry x-1 is the score of candidate x.

    Returns:
        A tuple of integers if every group except possibly the last is a single candidate
        (a tied last group is dropped, i.e. those candidates are left unranked);
        otherwise a tuple of sets of integers.

    Raises:
        ValueError : if the scores cannot be ordered (e.g. the vector contains NaN).
    """
    scores = list(proxy)
    remaining = set(range(1, len(scores)+1)) # candidates not yet placed on the ballot
    to_return = []
    while remaining:
        best = max(scores[x-1] for x in remaining)
        S = {x for x in remaining if scores[x-1] == best} # best-scoring candidates
        if not S:
            # only happens when `best` is NaN; without this guard the loop would never end
            raise ValueError("Invalid proxy vector")
        to_return.append(S)
        remaining -= S

    # return a simple ballot if possible
    if all(len(c)==1 for c in to_return[:-1]):
        return tuple([list(c)[0] for c in to_return if len(c)==1])
    else:
        return tuple(to_return)
    
def HH_proxy(ballot,num_cands):
    """
    Returns the head-to-head proxy vector of the given (simple or generalized) ballot.

    The vector has one entry per pair of candidates, in the natural order
    (1,2),(1,3),...,(1,n),(2,3),... . Each entry is -1, 0 or 1 according to whether the
    lower-indexed candidate loses, ties, or wins the head-to-head comparison.
    Candidates missing from the ballot are treated as tied in last place.

    Args:
        ballot: a simple or generalized ballot (tuple of integers or of sets of integers).
        num_cands: the number of candidates (candidates are numbered 1..num_cands).

    Returns:
        The head-to-head proxy vector (np.array)
    """
    # view every ballot position as a set of tied candidates
    groups = [pos if type(pos) is set else {pos} for pos in ballot]

    # candidates that never appear form one final tie group
    unranked = set(range(1, num_cands+1))
    for group in groups:
        unranked.difference_update(group)
    if unranked:
        groups.append(unranked)

    pairwise = np.full([num_cands, num_cands], np.nan)

    # record the ties (zeros) and flatten the ballot into a single ranking order
    flat_order = []
    for group in groups:
        for a, b in combinations(group, 2):
            pairwise[a-1, b-1] = 0
            pairwise[b-1, a-1] = 0
        flat_order.extend(group)

    # every pair that is not a tie: the earlier candidate beats the later one
    for a, b in combinations(flat_order, 2):
        if pairwise[a-1, b-1] != 0:
            pairwise[a-1, b-1] = 1
            pairwise[b-1, a-1] = -1

    # read off the upper triangle, row by row
    return np.array([pairwise[r, c] for r, c in combinations(range(num_cands), 2)])

def Reverse_HH(proxy):
    """ 
    Returns the (simple or generalized) ballot corresponding to the given HH proxy vector,
    or None if the proxy is inconsistent.
    Any positive entry (not just +1) is interpreted as a win for the lower-indexed candidate, and any negative entry a loss,
    while a zero entry indicates a tie.
    Returns a simple ballot if possible, otherwise a generalized ballot.

    Args:
        proxy: sequence with one entry per candidate pair, ordered (1,2),(1,3),...,(1,n),(2,3),...

    Returns:
        A tuple of integers (simple ballot; a tied last group is dropped), a tuple of sets of
        integers (generalized ballot), or None if no ballot is consistent with the entries.

    Raises:
        ValueError: if len(proxy) is not of the form n*(n-1)/2 for an integer n.
    """
    # determine the number of candidates by solving len(proxy) = n*(n-1)/2 for n
    proxy = list(proxy)
    A = np.sqrt(1+8*len(proxy))
    if not A.is_integer():
        raise ValueError("Invalid proxy vector")
    num_cands = int((1+A)/2)
    
    # cand_pairs[x] is the candidate pair that proxy[x] refers to
    cand_pairs = list(combinations(range(1,num_cands+1),2))
    ballot = [{num_cands}] # initialize ballot: bullet vote for last candidate

    # We'll work through the candidates i in reverse order, using the pairs (i,j) with j > i.
    # For 5 candidates, the groups are (4,5) | (3,4), (3,5) | (2,3), (2,4), (2,5) | (1,2), (1,3), (1,4), (1,5)
    # for i = 4,3,2,1
    # for each group, we use the group's information to decide where i belongs relative to the sets already on the ballot
    # (or return None if the group has inconsistent information).
    for i in range(num_cands-1,0,-1):
        group_indices = [x for x in range(len(cand_pairs)) if cand_pairs[x][0]==i]
        left_of_i = [cand_pairs[x][1] for x in group_indices if proxy[x]<0] # candidates that beat i
        right_of_i = [cand_pairs[x][1] for x in group_indices if proxy[x]>0] # candidates that i beats
        match_i = [cand_pairs[x][1] for x in group_indices if proxy[x] == 0] # candidates tied with i
        ballot_map = [] # has one entry {-1,0,+1} for each set in the ballot, indicating whether the set should be left, right, or containing i.
        for c in ballot:
            S = c if type(c) == set else {c}
            if all(x in left_of_i for x in S):
                ballot_map.append(-1)
            elif all(x in right_of_i for x in S):
                ballot_map.append(1)
            elif all(x in match_i for x in S):
                ballot_map.append(0)
            else:
                return None # inconsistent proxy: i relates differently to members of one tie group
            
        # a consistent map reads -1,...,-1,[0],+1,...,+1 with at most one zero
        zero_indices = [x for x in range(len(ballot_map)) if ballot_map[x]==0]
        if (ballot_map != sorted(ballot_map)) or (len(zero_indices)>1):
            return None # inconsistent proxy
        
        if len(zero_indices)==0:
            # i forms its own group: place it just before the first set it beats, or at the end if it beats none
            insertion_index = len(ballot_map) if all(val <= 0 for val in ballot_map) else min([x for x in range(len(ballot_map)) if ballot_map[x] >=0])
            ballot.insert(insertion_index,{i})
        else:
            # i ties with exactly one existing set: merge it into that set
            insertion_index = zero_indices[0]
            ballot[insertion_index] = ballot[insertion_index].union({i})
    # return a simple ballot if possible
    if all(len(c)==1 for c in ballot[:-1]):
        return tuple([list(c)[0] for c in ballot if len(c)==1])
    else:
        return tuple(ballot)
    
def is_simple(ballot):
    """
    Returns True if the given ballot is simple, False otherwise.

    A ballot is simple when every position holds a single integer candidate
    (a Python int or a numpy integer such as np.int64) rather than a set of tied candidates.
    An empty ballot counts as simple.
    """
    return all(isinstance(c, (int, np.integer)) and not isinstance(c, bool) for c in ballot)

In [6]:
# Find the centers whenever only the proxies of the centers are given
reverse_by_proxy = {'Borda': Reverse_Borda, 'HH': Reverse_HH}

centers_list = []
for index in df.index:
    centers = df['centers'][index]
    # `is None` (identity) is used instead of `== None`; a float NaN is also treated as "missing".
    centers_missing = centers is None or (isinstance(centers, float) and centers != centers)
    if centers_missing:
        proxy_type = df['proxy'][index]
        if proxy_type not in reverse_by_proxy:
            # previously an unknown proxy type silently reused the centers of the previous row
            raise ValueError(f"Unknown proxy type {proxy_type!r} in row {index}")
        reverse = reverse_by_proxy[proxy_type]
        proxies_of_centers = df['proxies_of_centers'][index]
        # Reverse_HH yields None for a proxy that matches no ballot, so entries may be None
        centers = [reverse(center_proxy) for center_proxy in proxies_of_centers]
    centers_list.append(centers)

df['centers'] = centers_list

In [7]:
# find their proxies whenever only centers are given
proxy_fn_by_type = {'Borda': Borda_proxy, 'HH': HH_proxy}

proxies_of_centers_list = []
for index in df.index:
    proxies_of_centers = df['proxies_of_centers'][index]
    # `is None` (identity) is used instead of `== None`, which is only safe when the stored value
    # does not overload `==` (a numpy array would compare elementwise). A float NaN also counts as "missing".
    proxies_missing = proxies_of_centers is None or (
        isinstance(proxies_of_centers, float) and proxies_of_centers != proxies_of_centers)
    if proxies_missing:
        proxy_type = df['proxy'][index]
        if proxy_type not in proxy_fn_by_type:
            # previously an unknown proxy type silently reused the proxies of the previous row
            raise ValueError(f"Unknown proxy type {proxy_type!r} in row {index}")
        to_proxy = proxy_fn_by_type[proxy_type]
        num_cands = df['num_cands'][index]
        proxies_of_centers = [to_proxy(center, num_cands=num_cands) for center in df['centers'][index]]
    proxies_of_centers_list.append(proxies_of_centers)
df['proxies_of_centers'] = proxies_of_centers_list

In [8]:
# Display the combined table after the 'centers' and 'proxies_of_centers' columns have been filled in.
df

Unnamed: 0,filename,num_cands,method,proxy,technique,centers,proxies_of_centers
0,aberdeen_2012_ward11.csv,6,coords,Borda,coords_Borda,"[(6, 1), ({4, 5}, {1, 2, 3, 6})]","[[3.0, 0.0, 0.0, 0.0, 0.0, 5.0], [0.0, 0.0, 0...."
1,aberdeen_2012_ward11.csv,6,coords,HH,coords_HH,"[(6, 1), ({4, 5}, {1, 2, 3, 6})]","[[1.0, 1.0, 1.0, 1.0, -1.0, 0.0, 0.0, 0.0, -1...."
2,aberdeen_2012_ward11.csv,6,all,Borda,all_Borda,"[(4, 5), (6, 1)]","[[0.0, 0.0, 0.0, 5.0, 4.0, 0.0], [4.0, 0.0, 0...."
3,aberdeen_2012_ward11.csv,6,all,HH,all_HH,"[(6, 1), (4, 5)]","[[1.0, 1.0, 1.0, 1.0, -1.0, 0.0, 0.0, 0.0, -1...."
4,aberdeen_2012_ward11.csv,6,cast,HH,cast_HH,"[(4, 5), (6, 1)]","[[0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -1.0, -1.0, ..."
...,...,...,...,...,...,...,...
2046,west_lothian_2017_ward9.csv,6,PAM,HH,PAM_HH,"[(1, 4), (3,)]","[[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, -1.0, 0.0, 0.0..."
2047,west_lothian_2022_ward1.csv,6,Lloyd,Borda,Lloyd_Borda,"[(4, 1, 5, 2, 6, 3), (5, 2, 3, 1, 6, 4)]","[[3.196907541811297, 1.6118649416219626, 0.262..."
2048,west_lothian_2022_ward1.csv,6,Lloyd,HH,Lloyd_HH,"[(4, 1, 5, 2, 3, 6), (5, 3, 2, 1, 6, 4)]","[[0.24813316739265662, 0.37741132545115036, -0..."
2049,west_lothian_2022_ward1.csv,6,PAM,Borda,PAM_Borda,"[(3, 5, 2), (4, 1, 5, 2)]","[[0, 3, 5, 0, 4, 0], [4, 2, 0, 5, 3, 0]]"


In [9]:
# For 'all' and 'cast' methods: verify that the centers are simple ballots that match the proxies 
# Nothing is printed when every center passes.

for index in df.index:
    method = df['method'][index]
    if method not in ['all', 'cast']:
        continue
    centers = df['centers'][index]
    proxies_of_centers = df['proxies_of_centers'][index]
    num_cands = df['num_cands'][index]
    proxy_type = df['proxy'][index]

    for i in [0,1]:
        ballot = centers[i]
        proxy_of_center = proxies_of_centers[i]
        if ballot is None:
            # Reverse_HH returns None for an inconsistent proxy; report it instead of crashing in is_simple
            print(f"{num_cands},{method},{proxy_type}: {list(proxy_of_center)} does not correspond to any ballot")
            continue
        if not is_simple(ballot):
            print(f"{num_cands},{method},{proxy_type}: {ballot} is not simple")
        # recompute the proxy from the ballot and compare it with the stored one
        if proxy_type == 'Borda':
            correct_proxy = Borda_proxy(ballot, num_cands, borda_style='pes')
        else:
            correct_proxy = HH_proxy(ballot, num_cands)
            
        if list(proxy_of_center) != list(correct_proxy):
            print(f"{num_cands},{method},{proxy_type}: {ballot} -> {list(correct_proxy)} != {list(proxy_of_center)}")

In [15]:
# (for 'coords' method): classify every center proxy, per proxy type, as pulling back to
# a simple ballot, a generalized ballot, or no ballot at all (None).
# For the simple and generalized categories, also check whether the stored proxy is exactly
# the proxy of the ballot it pulls back to ("match") or not ("unmatch").

simple_match_counts, simple_unmatch_counts, gen_match_counts, gen_unmatch_counts, none_counts = (
    dict.fromkeys(['Borda', 'HH'], 0) for _ in range(5))

for index in df.index:
    method = df['method'][index]
    if method == 'coords':
        centers = df['centers'][index]
        proxies_of_centers = df['proxies_of_centers'][index]
        num_cands = df['num_cands'][index]
        proxy_type = df['proxy'][index]

        # counter to increment, keyed by (proxy matches, ballot is simple)
        buckets = {
            (True, True): simple_match_counts,
            (True, False): gen_match_counts,
            (False, True): simple_unmatch_counts,
            (False, False): gen_unmatch_counts,
        }

        for i in range(2):
            ballot = centers[i]
            if ballot is None:
                none_counts[proxy_type] += 1
            else:
                is_this_simple = is_simple(ballot)
                proxy_of_center = proxies_of_centers[i]
                correct_proxy = (Borda_proxy(ballot, num_cands, borda_style='pes')
                                 if proxy_type == 'Borda'
                                 else HH_proxy(ballot, num_cands))
                proxies_agree = list(proxy_of_center) == list(correct_proxy)
                buckets[(proxies_agree, is_this_simple)][proxy_type] += 1

print("SUMMARY for 'coords' method:")
print(f'simple match counts:\t {simple_match_counts}')
print(f'simple unmatch counts:\t {simple_unmatch_counts}') 
print(f'gen match counts:\t {gen_match_counts}')
print(f'gen unmatch counts:\t {gen_unmatch_counts}')
print(f'none counts:\t\t {none_counts}')

SUMMARY for 'coords' method:
simple match counts:	 {'Borda': 134, 'HH': 258}
simple unmatch counts:	 {'Borda': 62, 'HH': 0}
gen match counts:	 {'Borda': 169, 'HH': 59}
gen unmatch counts:	 {'Borda': 45, 'HH': 0}
none counts:		 {'Borda': 0, 'HH': 95}


INTERPRETATIONS:

$95$ of the $205\cdot 2= 410$ HH proxy vectors are inconsistent (have loops like A>B>C>A).  Of the rest, $258$ pull back to simple ballots and $59$ to generalized ballots.  For a sanity check, we verified that these are uniquely determined by the (simple or generalized) ballot to which they pull back.

About half ($214$) of the Borda proxies have ties (multiple occurrences of the same number, other than in last place), so they correspond to generalized (rather than simple) ballots.  About a quarter of the Borda proxies aren't valid proxies of actual ballots, since component-wise medians don't necessarily follow the rules.

In [19]:
# Do same for 'kMedians' method: per proxy type, count the center proxies that pull back to
# simple ballots, generalized ballots, or no ballot (None), and whether the stored proxy
# equals the proxy recomputed from that ballot.

simple_match_counts = dict.fromkeys(['Borda', 'HH'], 0)
simple_unmatch_counts = dict.fromkeys(['Borda', 'HH'], 0)
gen_match_counts = dict.fromkeys(['Borda', 'HH'], 0)
gen_unmatch_counts = dict.fromkeys(['Borda', 'HH'], 0)
none_counts = dict.fromkeys(['Borda', 'HH'], 0)

for index in df.index:
    method = df['method'][index]
    if method != 'kMedians':
        continue
    centers = df['centers'][index]
    proxies_of_centers = df['proxies_of_centers'][index]
    num_cands = df['num_cands'][index]
    proxy_type = df['proxy'][index]

    for i in (0, 1):
        ballot = centers[i]
        if ballot is None:
            # the stored proxy does not pull back to any ballot
            none_counts[proxy_type] += 1
            continue

        is_this_simple = is_simple(ballot)
        proxy_of_center = proxies_of_centers[i]
        if proxy_type == 'Borda':
            correct_proxy = Borda_proxy(ballot, num_cands, borda_style='pes')
        else:
            correct_proxy = HH_proxy(ballot, num_cands)

        # choose the counter to bump, then bump it once
        if list(proxy_of_center) == list(correct_proxy):
            target = simple_match_counts if is_this_simple else gen_match_counts
        else:
            target = simple_unmatch_counts if is_this_simple else gen_unmatch_counts
        target[proxy_type] += 1
print("SUMMARY for 'kMedians' method:")
print(f'simple match counts:\t {simple_match_counts}')
print(f'simple unmatch counts:\t {simple_unmatch_counts}') 
print(f'gen match counts:\t {gen_match_counts}')
print(f'gen unmatch counts:\t {gen_unmatch_counts}')
print(f'none counts:\t\t {none_counts}')

SUMMARY for 'kMedians' method:
simple match counts:	 {'Borda': 407, 'HH': 405}
simple unmatch counts:	 {'Borda': 2, 'HH': 0}
gen match counts:	 {'Borda': 0, 'HH': 4}
gen unmatch counts:	 {'Borda': 1, 'HH': 0}
none counts:		 {'Borda': 0, 'HH': 1}


In [15]:
# For the 'all' method: report every election (per proxy type) whose centers
# include a ballot that was not actually cast in that election.

elections_with_uncast_centers = []
for index in df.index:
    method = df['method'][index]
    proxy = df['proxy'][index]
    if method == 'all':
        centers = df['centers'][index]
        proxies_of_centers = df['proxies_of_centers'][index]
        num_cands = df['num_cands'][index]
        filename = df['filename'][index]
        full_filename = f"../ballot-clustering/scot-elex/{num_cands}_cands/{filename}"
        num_cands, election, cand_names, ward = csv_parse(full_filename)
        # the cast ballots are the keys of `election`
        cast_ballots = election.keys()
        if centers[0] not in cast_ballots or centers[1] not in cast_ballots:
            print(f"{filename}({proxy}) has centers {centers} that don't correspond to actual ballots")
            print(proxies_of_centers)
            elections_with_uncast_centers.append((filename, proxy, centers, proxies_of_centers))
print(len(elections_with_uncast_centers), "elections with uncast centers")

0 elections with uncast centers


INTERPRETATION:

The 'all' method always produces cast centers, so the method isn't really more general, at least among 6-candidate elections.

But for more candidates, there are examples where $()$ is a center.

In [16]:
# How frequently do 'cast' centers match 'all' centers?  
# Centers are compared as unordered sets, so the order of the two centers does not matter.

good_counts = {proxy:0 for proxy in ['Borda', 'HH']}
bad_counts = {proxy:0 for proxy in ['Borda', 'HH']}

for index in df.index:
    if df['method'][index] != 'all':
        continue
    filename = df['filename'][index]
    rest_centers = df['centers'][index]
    proxy = df['proxy'][index]
    # the 'cast' row for the same election and proxy type
    row_index = df[(df['filename'] == filename) & (df['technique'] == f'cast_{proxy}')].index[0]
    discrete_centers = df['centers'][row_index]
    if set(rest_centers) != set(discrete_centers):
        bad_counts[proxy] += 1
    else:
        good_counts[proxy] += 1

# portion of the elections for which the 'all' centers match the 'cast' centers
# (NaN for a proxy type that has no 'all' rows, instead of a ZeroDivisionError)
{proxy_type: (good_counts[proxy_type]/(good_counts[proxy_type]+bad_counts[proxy_type])
              if good_counts[proxy_type]+bad_counts[proxy_type] else float('nan'))
 for proxy_type in good_counts}

{'Borda': 1.0, 'HH': 1.0}

INTERPRETATION:

The 'cast' centers always match the 'all' centers, even though there are presumably tied solutions that the two models could have resolved differently.

In [10]:
# Compute the score (summed L^1 distance of the ballots to the nearest center) for all the methods and all proxies.
# and rebuild all of the clusterings from the centers.
score_list = []
clusters_list = []
# Each election file is parsed once and reused for all of its technique rows.
# NOTE(review): this assumes Clusters_from_centers does not modify `election` in place — confirm.
parsed_elections = {}
for index in df.index:
    method = df['method'][index]
    proxy = df['proxy'][index]
    filename = df['filename'][index]
    num_cands = df['num_cands'][index]
    full_filename = f"../ballot-clustering/scot-elex/{num_cands}_cands/{filename}"
    if full_filename not in parsed_elections:
        parsed_elections[full_filename] = csv_parse(full_filename)
    num_cands, election, cand_names, ward = parsed_elections[full_filename]

    # the centers are passed in proxy space (see centers_live_in_proxy_space below)
    centers = df['proxies_of_centers'][index]
    order = 2 if method == 'Lloyd' else 1 # Lloyd rows use order 2; every other method uses order 1

    score, clustering = Clusters_from_centers(election, centers, proxy, order=order, 
                                              centers_live_in_proxy_space=True)
    score_list.append(score)
    clusters_list.append(clustering)
df['score'] = score_list
df['clustering'] = clusters_list

In [None]:
# Sanity check: every score computed here should agree (within 1e-6) with the optimum value
# reported in the IP solver dataframe dfIP.
D_IP_reverse = {pair: model_code for model_code, pair in D_IP.items()}

IP_techniques = ['coords_Borda', 'coords_HH', 'all_Borda', 'all_HH', 'cast_HH', 'cast_Borda']
Good_counts = dict.fromkeys(IP_techniques, 0)
Bad_counts = dict.fromkeys(IP_techniques, 0)

for index in df.index:
    method = df['method'][index]
    proxy = df['proxy'][index]
    filename = df['filename'][index]
    technique = df['technique'][index]
    if method in ['all', 'cast', 'coords']:
        # locate the dfIP row for this election (filename without ".csv") and model
        same_election = dfIP['election_name'] == filename[:-4]
        same_model = dfIP['model'] == D_IP_reverse[(method, proxy)]
        row_index = dfIP[same_election & same_model].index[0]
        IP_score = dfIP['optimum_value'][row_index]
        if technique == 'cast_Borda' or method in ['all', 'coords']:
            IP_score = IP_score * (1/2) # divide by 2 to match our score definition as half the L^1 distance
        computed_score = df['score'][index]
        if abs(IP_score - computed_score) <= 1e-6:
            Good_counts[technique] += 1
        else:
            print(f"Score mismatch for {filename}, ({technique}): ratio ={IP_score/computed_score}")
            Bad_counts[technique] += 1

if not any(Bad_counts.values()):
    print("All scores match!")

All scores match!


In [21]:
# Look at one election in detail
filename = 'aberdeen_2012_ward11.csv'
# Take num_cands from this election's own rows rather than from whatever value an earlier cell's loop left behind.
num_cands = df.loc[df['filename'] == filename, 'num_cands'].iloc[0]
full_filename = f"../ballot-clustering/scot-elex/{num_cands}_cands/{filename}"
num_cands, election, cand_names, ward = csv_parse(full_filename)
# every technique for this election except the two Lloyd rows
test = df[(df['filename'] == filename) & (df['method'] != 'Lloyd')][['technique', 'centers', 'proxies_of_centers', 'score']]
test

Unnamed: 0,technique,centers,proxies_of_centers,score
0,coords_Borda,"[(6, 1), ({4, 5}, {1, 2, 3, 6})]","[[3.0, 0.0, 0.0, 0.0, 0.0, 5.0], [0.0, 0.0, 0....",16414.5
1,coords_HH,"[(6, 1), ({4, 5}, {1, 2, 3, 6})]","[[1.0, 1.0, 1.0, 1.0, -1.0, 0.0, 0.0, 0.0, -1....",16817.0
2,all_Borda,"[(4, 5), (6, 1)]","[[0.0, 0.0, 0.0, 5.0, 4.0, 0.0], [4.0, 0.0, 0....",16617.0
3,all_HH,"[(6, 1), (4, 5)]","[[1.0, 1.0, 1.0, 1.0, -1.0, 0.0, 0.0, 0.0, -1....",16843.0
4,cast_HH,"[(4, 5), (6, 1)]","[[0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -1.0, -1.0, ...",16843.0
5,cast_Borda,"[(4, 5), (6, 1)]","[[0.0, 0.0, 0.0, 5.0, 4.0, 0.0], [4.0, 0.0, 0....",16617.0
1233,PAM_Borda,"[(6, 1), (4, 5)]","[[4, 0, 0, 0, 0, 5], [0, 0, 0, 5, 4, 0]]",16617.0
1234,PAM_HH,"[(6, 1), (4, 5)]","[[1.0, 1.0, 1.0, 1.0, -1.0, 0.0, 0.0, 0.0, -1....",16843.0
2051,kMedians_Borda,"[(6, 1), (4, 5)]","[[4.0, 0.0, 0.0, 0.0, 0.0, 5.0], [0.0, 0.0, 0....",16617.0
2052,kMedians_HH,"[(5, 4), (6,)]","[[0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -1.0, -1.0, ...",16924.0


In [31]:
# Look up the two center ballots (6,1) and (4,5) in `election`.
# NOTE(review): presumably `election` maps each cast ballot to its count, so a failed lookup means the ballot was never cast — confirm.
election[(6,1)], election[(4,5)] # check that both were cast

(187, 45)

In [17]:
# For every election and proxy type, compare the 'cast' centers with the PAM centers.
# When the two sets of centers differ, record whether the scores tie or which method scores lower.
match_count, unmatch_count, tie_count, PAM_win_count, cast_win_count = (
    dict.fromkeys(['Borda', 'HH'], 0) for _ in range(5))

row_index = 0
for index in df.index:
    if df['method'][index] == 'cast':
        discrete_centers = df['centers'][index]
        discrete_score = df['score'][index]
        filename = df['filename'][index]
        proxy_type = df['proxy'][index]
        # the PAM row for the same election and proxy type
        pam_rows = df[(df['filename'] == filename) & (df['technique'] == f'PAM_{proxy_type}')]
        row_index = pam_rows.index[0]
        PAM_score = df['score'][row_index]
        PAM_centers = df['centers'][row_index]

        if set(PAM_centers) == set(discrete_centers):
            match_count[proxy_type] += 1
        else:
            unmatch_count[proxy_type] += 1
            # lower score is better
            if discrete_score < PAM_score:
                cast_win_count[proxy_type] += 1
            elif discrete_score == PAM_score:
                tie_count[proxy_type] += 1
                print("TIE: ",filename, proxy_type)
            else:
                PAM_win_count[proxy_type] += 1
                print(proxy_type, discrete_score, PAM_score, discrete_centers, PAM_centers)

print(f'match: {match_count}')
print(f'unmatch but tie: {tie_count}')
print(f'PAM win: {PAM_win_count}')
print(f'"cast" win: {cast_win_count}')

TIE:  east_dunbartonshire_2017_ward5.csv HH
match: {'Borda': 189, 'HH': 194}
unmatch but tie: {'Borda': 0, 'HH': 1}
PAM win: {'Borda': 0, 'HH': 0}
"cast" win: {'Borda': 16, 'HH': 10}


In [16]:
# Look at the one election in which PAM and 'cast' tie but don't match
filename = 'east_dunbartonshire_2017_ward5.csv'
# Take num_cands from this election's own rows rather than from whatever value an earlier cell's loop left behind.
num_cands = df.loc[df['filename'] == filename, 'num_cands'].iloc[0]
full_filename = f"../ballot-clustering/scot-elex/{num_cands}_cands/{filename}"
num_cands, election, cand_names, ward = csv_parse(full_filename)
test = df[(df['filename'] == filename) & (df['technique'].isin(['cast_HH', 'PAM_HH']))][['technique', 'centers', 'score', 'clustering']]
test

Unnamed: 0,technique,centers,score,clustering
370,cast_HH,"[(4, 5, 1), (3, 6, 2)]",19880.0,"{0: {(1,): 43.0, (1, 2): 5.0, (1, 2, 4): 2.0, ..."
1478,PAM_HH,"[(2, 3, 6), (4, 5, 1)]",19880.0,"{0: {(1, 2): 5.0, (1, 2, 3): 3.0, (1, 2, 3, 6)..."


In [None]:
# Compare the 'cast' and PAM clusterings of the election selected in `test`.
# The rows are picked by technique, not by hard-coded row labels, which depend on the order in which df was built.
C1 = test.loc[test['technique'] == 'cast_HH', 'clustering'].iloc[0]
C2 = test.loc[test['technique'] == 'PAM_HH', 'clustering'].iloc[0]
Clustering_closeness(election, C1, C2)

INTERPRETATION:

PAM centers match 'cast' centers over $90\%$ of the time.  When they don't match, there's only one election (with HH) for which the reason is that they find different tied optima (the centers are slightly different and result in slightly different clusterings).  For the rest, it's because 'cast' beats PAM.

In [23]:
# Per proxy type, count the elections in which the continuous ('coords') score
# ties with, beats (is lower than), or loses to the 'cast' score.
tie_counts, continuous_win_counts, discrete_win_counts = (
    dict.fromkeys(['Borda', 'HH'], 0) for _ in range(3))

for index in df.index:
    if df['method'][index] == 'coords':
        filename = df['filename'][index]
        proxy_type = df['proxy'][index]
        continuous_score = df['score'][index]
        # the 'cast' row for the same election and proxy type
        cast_rows = df[(df['filename'] == filename) & (df['technique'] == f'cast_{proxy_type}')]
        row_index = cast_rows.index[0]
        discrete_score = df['score'][row_index]

        if continuous_score < discrete_score:
            continuous_win_counts[proxy_type] += 1
        elif continuous_score == discrete_score:
            tie_counts[proxy_type] += 1
        else:
            discrete_win_counts[proxy_type] += 1

tie_counts, continuous_win_counts, discrete_win_counts

({'Borda': 22, 'HH': 78}, {'Borda': 183, 'HH': 128}, {'Borda': 0, 'HH': 0})

In [24]:
# Per proxy type, count the elections in which the continuous ('coords') score
# ties with, beats (is lower than), or loses to the kMedians score.
# Ties are split by whether the two methods ended up with the same center proxies.
tie_with_same_proxies_counts = dict.fromkeys(['Borda', 'HH'], 0)
tie_with_different_proxies_counts = dict.fromkeys(['Borda', 'HH'], 0)
continuous_win_counts = dict.fromkeys(['Borda', 'HH'], 0)
kMedians_win_counts = dict.fromkeys(['Borda', 'HH'], 0)

for index in df.index:
    if df['method'][index] == 'coords':
        filename = df['filename'][index]
        proxy_type = df['proxy'][index]
        continuous_score = df['score'][index]
        # the kMedians row for the same election and proxy type
        kmed_rows = df[(df['filename'] == filename) & (df['technique'] == f'kMedians_{proxy_type}')]
        row_index = kmed_rows.index[0]
        kmed_score = df['score'][row_index]

        if continuous_score < kmed_score:
            continuous_win_counts[proxy_type] += 1
        elif continuous_score == kmed_score:
            # compare the center proxies as unordered sets of tuples
            continuous_proxies = set(map(tuple, df['proxies_of_centers'][index]))
            kmed_proxies = set(map(tuple, df['proxies_of_centers'][row_index]))
            if continuous_proxies == kmed_proxies:
                tie_with_same_proxies_counts[proxy_type] += 1
            else:
                tie_with_different_proxies_counts[proxy_type] += 1
        else:
            kMedians_win_counts[proxy_type] += 1
print(f'tie with same proxies: {tie_with_same_proxies_counts}')
print(f'tie with different proxies: {tie_with_different_proxies_counts}') 
print(f'continuous win: {continuous_win_counts}')
print(f'kMedians win: {kMedians_win_counts}')

tie with same proxies: {'Borda': 5, 'HH': 15}
tie with different proxies: {'Borda': 0, 'HH': 0}
continuous win: {'Borda': 200, 'HH': 191}
kMedians win: {'Borda': 0, 'HH': 0}


In [44]:
# measure clustering closeness of the cast vs continuous methods
closeness = {proxy:[] for proxy in ['Borda', 'HH']}
errors = 0

for index in df.index:
    if df['method'][index] != 'coords':
        continue
    filename = df['filename'][index]
    proxy_type = df['proxy'][index]
    C_continuous = df['clustering'][index]
    # the 'cast' row for the same election and proxy type
    row_index = df[(df['filename'] == filename) & (df['technique'] == f'cast_{proxy_type}')].index[0]
    C_discrete = df['clustering'][row_index]

    num_cands = df['num_cands'][index]
    full_filename = f"../ballot-clustering/scot-elex/{num_cands}_cands/{filename}"
    _, election, __, ____ = csv_parse(full_filename)
    try:
        closeness[proxy_type].append(Clustering_closeness(election, C_continuous, C_discrete))
    except Exception as exc:
        # Keep going, but report which election failed and why.
        # (This used to print the stale global `proxy` instead of this row's proxy_type.)
        errors += 1
        print(proxy_type, filename, repr(exc))

In [37]:
# Number of elections for which Clustering_closeness raised in the loop above.
errors

0

In [45]:
# Mean of the collected closeness values, per proxy type.
{proxy: np.mean(closeness[proxy]) for proxy in closeness.keys()}

{'Borda': 0.03241625811392328, 'HH': 0.02020563301256821}