# Do multi-round matches

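We use a greedy algorithm: in each round we form the best available groups first, and then we ban every pair of users that has already been grouped together, so that later rounds cannot repeat a pairing.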

In [29]:
import pandas as pd

import numpy as np
import pandas as pd
from tqdm import tqdm

from scipy.cluster.hierarchy import linkage
import hcluster   # requires dedupe-hcluster
from paper_reviewer_matcher import (
    preprocess, compute_affinity
)

from group_matching import compute_conflicts, generate_pod_numbers

# Load the registered users. The CSV is first turned into a list of record
# dicts and then rebuilt as a DataFrame with missing values replaced by ''.
users = pd.read_csv('data/mindmatch_example.csv').to_dict(orient='records')
n_users = len(users)
print('Number of registered users: {}'.format(n_users))

users_df = pd.DataFrame(users).fillna('')

# Map of user id to that user's details.
users_dict = {
    row['user_id']: dict(row)
    for _, row in users_df.iterrows()
}

# Both sides of the affinity computation are the same preprocessed abstracts.
persons_1 = [preprocess(abstract) for abstract in users_df['abstracts']]
persons_2 = [preprocess(abstract) for abstract in users_df['abstracts']]

A = compute_affinity(
    persons_1,
    persons_2,
    n_components=30,
    min_df=2,
    max_df=0.8,
    weighting='tfidf',
    projection='svd',
)

# Overwrite the affinity of every conflicting pair with -1.
cois_list = compute_conflicts(users_df)
for i, j in cois_list:
    A[i, j] = -1

# Negate the affinities; the result is what the cells below feed to the
# clustering as a distance matrix. Conflict entries (-1 above) become 1 here.
A_cluster = -A
# NOTE(review): with conflicts marked as -1 this replacement presumably never
# matches; it looks like a leftover from marking conflicts with -1000 — confirm.
A_cluster[A_cluster == 1000] = 1
# A0 is another name for the same array object as A_cluster (not a copy).
A0 = A_cluster

Number of registered users: 1162


1162it [03:01,  6.41it/s]


In [31]:
def measure_goodness(A_cluster, cluster_assignments):
    """Return the mean within-cluster distance for every cluster label.

    Args:
        A_cluster: square 2-D array of pairwise distances.
        cluster_assignments: 1-D integer array giving the label of each row
            (and column) of ``A_cluster``.

    Returns:
        A list with one entry per label, from the smallest to the largest
        label present (inclusive, in increasing order). Each entry is the
        mean of the square sub-matrix of ``A_cluster`` restricted to the
        members of that label; the diagonal (self) entries are part of
        that mean.
    """
    first_label = cluster_assignments.min()
    last_label = cluster_assignments.max()

    def _mean_within(label):
        # Select the member rows first, then the member columns.
        members = cluster_assignments == label
        member_rows = A_cluster[members, :]
        return member_rows[:, members].mean()

    return [_mean_within(label) for label in range(first_label, last_label + 1)]

In [32]:
# We apply the alternative bottom-up method suggested here
# https://github.com/jmonlong/Hippocamplus/blob/master/content/post/2018-06-09-ClusterEqualSize.Rmd
from scipy.spatial.distance import squareform

def get_distance_vector(B):
    """Convert a square matrix into a condensed distance vector.

    The matrix is symmetrised by averaging it with its transpose, its
    diagonal is multiplied by zero, and the result is passed to
    ``squareform`` to obtain the condensed (upper-triangle) form.

    Args:
        B: square 2-D array of pairwise values; it is not modified.

    Returns:
        1-D array produced by ``squareform`` from the symmetrised,
        zero-diagonal matrix.
    """
    n_points = B.shape[0]
    symmetric = (B + B.T) / 2.0
    # 1 everywhere except 0 on the diagonal.
    off_diagonal = np.ones_like(symmetric) - np.eye(n_points)
    return squareform(symmetric * off_diagonal)

def agglomerate(A, group_size):
    """Greedily split the items of a distance matrix into near-equal groups.

    The items are divided into ``ceil(n / group_size)`` groups: first the
    groups of ``group_size`` members, then (if ``n`` is not a multiple of
    ``group_size``) the groups of ``group_size - 1`` members. Groups are
    built one at a time. For each group, average-linkage clustering is run
    on the items that are still unassigned, the earliest merge that holds at
    least the required number of items is located, and the first items
    reached when expanding that merge breadth-first are assigned to the
    group.

    Args:
        A: square 2-D NumPy array of pairwise distances; it is not modified.
        group_size: size of the large groups.

    Returns:
        1-D integer array of length ``A.shape[0]`` with the group label
        (0, 1, 2, ... in order of creation) of every item.

    Raises:
        AssertionError: if the items cannot be split into groups of
            ``group_size`` and ``group_size - 1`` members (for example when
            there are far fewer items than ``group_size``).
    """
    n_items = A.shape[0]
    ngroups = int(np.ceil(n_items / group_size))
    nsmallgroups = ngroups * group_size - n_items
    nbiggroups = ngroups - nsmallgroups
    # NaN marks an item that has not been assigned to a group yet.
    labels = np.ones(n_items) * np.nan

    group_sizes = [group_size] * nbiggroups + [group_size - 1] * nsmallgroups
    assert n_items == sum(group_sizes), (
        'cannot split {} items into groups of size {} or {}'.format(
            n_items, group_size, group_size - 1))

    for j, gs in enumerate(tqdm(group_sizes)):
        # Positions (in the full matrix) of the items still unassigned.
        remaining = np.where(np.isnan(labels))[0]
        n_remaining = remaining.size

        if n_remaining == gs:
            # Final group: it takes everything that is left. Clustering would
            # select exactly these items anyway, and skipping it avoids
            # calling linkage on the empty distance vector of a single
            # leftover item (e.g. group_size=2 with an odd number of items).
            labels[remaining] = j
            continue

        B = A[remaining, :][:, remaining]
        z = linkage(get_distance_vector(B),
                    method='average',
                    metric='euclidean')

        # The last column of z is the number of items in each merged cluster;
        # take the earliest merge that is big enough for this group.
        minpos = np.where(z[:, -1] >= gs)[0].min()

        # Breadth-first expansion of that merge down to its leaves. Ids below
        # n_remaining are original items; id n_remaining + r is the cluster
        # created by row r of z.
        queue = [int(z[minpos, 0]), int(z[minpos, 1])]
        leaves = []
        pos = 0
        while pos < len(queue):
            node = queue[pos]
            if node >= n_remaining:
                row = node - n_remaining
                queue.append(int(z[row, 0]))
                queue.append(int(z[row, 1]))
            else:
                leaves.append(node)
            pos += 1

        assert len(leaves) >= gs

        # Keep the first gs leaves and map them back to their positions in
        # the full matrix, i.e. prior to subsetting.
        members = remaining[leaves[:gs]]
        labels[members] = j

    return labels.astype(int)

In [33]:
# Run four matching rounds on a working copy of the distance matrix.
# From the second round on, previous match sets are banned first.
A = A0.copy()

for round_idx in range(4):
    if round_idx > 0:
        # Ban previous match sets: every pair that shared a group in the
        # previous round gets distance 1.
        for group in range(labels.max() + 1):
            members = np.where(labels == group)[0]
            A[np.ix_(members, members)] = 1

    # Number of entries currently equal to 1.
    print((A == 1).sum())

    labels = agglomerate(A, 5)

    # Goodness is always measured against the original matrix A0, not the
    # working copy that carries the bans.
    goodnesses = np.array(measure_goodness(A0, labels))
    print([goodnesses.mean(), np.std(goodnesses)])

270


100%|█████████████████████████████████████████████████████████████████████████████████████| 233/233 [00:01<00:00, 158.01it/s]


[0.20721110253896619, 0.05313283490597109]
6064


100%|█████████████████████████████████████████████████████████████████████████████████████| 233/233 [00:01<00:00, 166.21it/s]


[0.23253462354067633, 0.05266612920988387]
10692


100%|█████████████████████████████████████████████████████████████████████████████████████| 233/233 [00:01<00:00, 165.58it/s]


[0.2426361171008488, 0.057776939786365375]
15310


100%|█████████████████████████████████████████████████████████████████████████████████████| 233/233 [00:01<00:00, 170.24it/s]


[0.251461187332032, 0.0577577764271906]
