In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4,5'
from collections import Counter
import faiss
import tqdm
import numpy as np
import pandas as pd
import subprocess
from src import utils
from src.reranking import Diffusion, explore_exploit
# from experiments.submit_retrieval import predict_landmark_id

ROOT = '../'

# index_dirs = ['../experiments/v2clean_only/feats_index19_ms_L2_ep4_freqthresh-3_loss-arcface_verifythresh-30/']
# test_dirs = ['../experiments/v2clean_only/feats_test19_ms_L2_ep4_freqthresh-3_loss-arcface_verifythresh-30/']
index_dirs = ['../experiments/v2clean/feats_index19_ms_L2_ep4_freqthresh-3_loss-arcface_verifythresh-30/']
test_dirs = ['../experiments/v2clean/feats_test19_ms_L2_ep4_freqthresh-3_loss-arcface_verifythresh-30/']

weights = [1.0]

ids_index, feats_index = utils.prepare_ids_and_feats(index_dirs, weights, normalize=True)
ids_test, feats_test = utils.prepare_ids_and_feats(test_dirs, weights, normalize=True)

co = faiss.GpuMultipleClonerOptions()
co.shard = True

vres = []
for _ in range(2):
    res = faiss.StandardGpuResources()
    vres.append(res)

In [2]:
retrieval_solution = pd.read_csv('../input/v2_1/retrieval_solution_v2.1.csv')
retrieval_solution = retrieval_solution.sort_values('id')
assert (retrieval_solution['id'] == ids_test).all()
(retrieval_solution['images'] != 'None').sum()

1129

In [3]:
print('build index...')
cpu_index = faiss.IndexFlatIP(feats_index.shape[1])
gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
gpu_index.add(feats_index)
sims, topk_idx = gpu_index.search(x=feats_test, k=100)
print('query search done.')

subm = pd.DataFrame(ids_test, columns=['id'])
subm['images'] = np.apply_along_axis(' '.join, axis=1, arr=ids_index[topk_idx])

setting = 'k-NN search'
output_name = f'../output/{setting}.csv.gz'
subm[['id', 'images']].to_csv(output_name, compression='gzip', index=False)
print('saved to ' + output_name)

cmd = f'kaggle c submit -c landmark-retrieval-2019 -f {output_name} -m "" '
print(cmd)
subprocess.run(cmd, shell=True)

build index...
query search done.
saved to ../output/k-NN search.csv.gz
kaggle c submit -c landmark-retrieval-2019 -f ../output/k-NN search.csv.gz -m "" 


CompletedProcess(args='kaggle c submit -c landmark-retrieval-2019 -f ../output/k-NN search.csv.gz -m "" ', returncode=1)

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
from collections import Counter
import faiss
import tqdm
import numpy as np
import pandas as pd
import subprocess
from src import utils
from src.reranking import Diffusion, explore_exploit
# from experiments.submit_retrieval import predict_landmark_id

ROOT = '../'

index_dirs = ['../experiments/v2clean/feats_index19_ms_L2_ep4_freqthresh-3_loss-arcface_verifythresh-30/']
test_dirs = ['../experiments/v2clean/feats_test19_ms_L2_ep4_freqthresh-3_loss-arcface_verifythresh-30/']
train_dirs = ['../experiments/v2clean/feats_train_ms_L2_ep4_freqthresh-3_loss-arcface_verifythresh-30/']

weights = [1.0]

ids_index, feats_index = utils.prepare_ids_and_feats(index_dirs, weights, normalize=True)
ids_test, feats_test = utils.prepare_ids_and_feats(test_dirs, weights, normalize=True)
ids_train, feats_train = utils.prepare_ids_and_feats(train_dirs, weights, normalize=True)

co = faiss.GpuMultipleClonerOptions()
co.shard = True

vres = []
for _ in range(2):
    res = faiss.StandardGpuResources()
    vres.append(res)

In [3]:
print('build index...')
cpu_index = faiss.IndexFlatL2(feats_index.shape[1])
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
gpu_index.add(feats_index)
dists, topk_idx = gpu_index.search(x=feats_test, k=100)
print('query search done.')

build index...
query search done.


In [None]:
def predict_landmark_id(ids_query, feats_query, ids_train, feats_train, landmark_dict, topk=3):
    co = faiss.GpuMultipleClonerOptions()
    co.shard = True

    vres = []
    for _ in range(2):
        res = faiss.StandardGpuResources()
        vres.append(res)
    
    print('build index...')
    cpu_index = faiss.IndexFlatIP(feats_train.shape[1])
    gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
    gpu_index.add(feats_train)
    sims, topk_idx = gpu_index.search(x=feats_query, k=topk)
    print('query search done.')

    df = pd.DataFrame(ids_query, columns=['id'])
    df['images'] = np.apply_along_axis(' '.join, axis=1, arr=ids_train[topk_idx])

    rows = []
    for imidx, (_, r) in tqdm.tqdm(enumerate(df.iterrows()), total=len(df)):
        image_ids = [name.split('/')[-1] for name in r.images.split(' ')]
        counter = Counter()
        for i, image_id in enumerate(image_ids[:topk]):
            landmark_id = landmark_dict[image_id]

            counter[landmark_id] += sims[imidx, i]

        landmark_id, score = counter.most_common(1)[0]
        rows.append({
            'id': r['id'],
            'landmarks': f'{landmark_id} {score:.9f}',
        })

    pred = pd.DataFrame(rows).set_index('id')
    pred['landmark_id'], pred['score'] = list(
        zip(*pred['landmarks'].apply(lambda x: str(x).split(' '))))
    pred['score'] = pred['score'].astype(np.float32)

    return pred

train19_csv = pd.read_pickle(ROOT + 'input/train.pkl')[['id', 'landmark_id']]
landmark_dict = train19_csv.set_index('id').sort_index().to_dict()['landmark_id']

train19_clean = pd.read_csv('../input/clean/train19_cleaned_verifythresh30_freqthresh3.csv')
clean_ids_train = np.concatenate(train19_clean['images'].str.split(' '))

isin_clean = np.isin(ids_train, clean_ids_train)
ids_train = ids_train[isin_clean]
feats_train = feats_train[isin_clean]

pred_index = predict_landmark_id(ids_index, feats_index, ids_train, feats_train, landmark_dict, topk=3)
pred_test = predict_landmark_id(ids_test, feats_test, ids_train, feats_train, landmark_dict, topk=3)

In [19]:
pred_index = predict_landmark_id(ids_index, feats_index, ids_train, feats_train, landmark_dict, topk=3)
pred_test = predict_landmark_id(ids_test, feats_test, ids_train, feats_train, landmark_dict, topk=3)

build index...
query search done.


100%|██████████| 761757/761757 [03:03<00:00, 4144.05it/s]


build index...
query search done.


100%|██████████| 117577/117577 [00:26<00:00, 4458.22it/s]


In [22]:
a = pred_test.loc[retrieval_solution[retrieval_solution['images'] != 'None']['id']]
a[a['score'] > 0.5]

Unnamed: 0_level_0,landmarks,landmark_id,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00084cdf8f600d00,159483 1.665004373,159483,1.665004
00141b8a5a729084,172966 1.232237160,172966,1.232237
0044d82ea7654ece,22363 2.465871274,22363,2.465871
00d5b448fa93e1b8,4465 2.467962444,4465,2.467963
013098904123b014,165890 2.204839647,165890,2.204840
01b4c4975d1bebbe,105914 2.612013161,105914,2.612013
027a19e10127d5c4,135993 2.793360889,135993,2.793361
0289a20454afe4d2,141495 2.700329542,141495,2.700330
02cb34a8dc4030da,135591 2.102216244,135591,2.102216
03182100f70cf0cf,44991 2.842160702,44991,2.842161


In [24]:
# Ours
retrieval_solution = pd.read_csv('../input/v2_1/retrieval_solution_v2.1.csv')
retrieval_solution = retrieval_solution.sort_values('id')
assert (retrieval_solution['id'] == ids_test).all()
actual_test_ids = retrieval_solution[retrieval_solution['images'] != 'None']['id']

topk=100
subm = pd.DataFrame(index=ids_test)
subm['images'] = np.apply_along_axis(' '.join, axis=1, arr=ids_index[topk_idx])
subm['index_id_list'] = subm['images'].apply(lambda x: x.split(' ')[:topk])

# Make higher score, ealier position for re-ranking.
pred_index = pred_index.sort_values('score', ascending=False)

for thresh in [1.0]:
    setting = f'our_reranking_thresh{thresh}'
    images = []
    for test_id in tqdm.tqdm(actual_test_ids):
        pred, score = pred_test.loc[test_id, ['landmark_id', 'score']]
        ids = subm.loc[test_id, 'index_id_list']
        if score < thresh:
            images.append(subm.loc[test_id, 'images'])
            continue

        retrieved_list = pred_index.loc[ids, 'landmark_id']
        whole_ids_same = pred_index[pred_index['landmark_id'] == pred].index
        # keep the order by referring to the original index
        diff = sorted(set(whole_ids_same) - set(ids), key=whole_ids_same.get_loc)

        retrieved_list = pd.concat([
            retrieved_list,
            pd.Series(pred, index=diff)
        ])

        # use mergesort to keep relative order of original list.
        predefined_limit_topk = 100
        reranked_ids = (pred != retrieved_list).sort_values(kind='mergesort').index[:predefined_limit_topk]
        images.append(' '.join(reranked_ids))

    subm = pd.DataFrame(index=ids_test)
    subm.index.name = 'id'
    subm['images'] = ''
    subm.loc[retrieval_solution.set_index('id')['images'] != 'None', 'images'] = images
    subm = subm.reset_index()

    output_name = f'../output/{setting}.csv'
    subm[['id', 'images']].to_csv(output_name, index=False)
    print('saved to ' + output_name)

    cmd = f'kaggle c submit -c landmark-retrieval-2019 -f {output_name} -m "" '
    print(cmd)
    subprocess.run(cmd, shell=True)

100%|██████████| 1129/1129 [02:42<00:00,  7.09it/s]


saved to ../output/our_reranking_thresh1.0.csv
kaggle c submit -c landmark-retrieval-2019 -f ../output/our_reranking_thresh1.0.csv -m "" 


In [None]:
# QE + Ours

topk = 100
alpha = 3
qe = True
dba = False
n_qe = 10
setting = f'alpha_QE_{n_qe}'

feats_concat = np.concatenate([feats_test, feats_index], axis=0)

cpu_index = faiss.IndexFlatIP(feats_concat.shape[1])
gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
gpu_index.add(feats_concat)
sims, topk_idx = gpu_index.search(x=feats_concat, k=n_qe)

weights = np.expand_dims(sims ** alpha, axis=-1).astype(np.float32)
feats_concat = (feats_concat[topk_idx] * weights).sum(axis=1)
feats_concat = utils.l2norm_numpy(feats_concat.astype(np.float32))

split_at = [len(feats_test)]
if qe and dba:
    reranked_feats_test, reranked_feats_index = np.split(feats_concat, split_at, axis=0)
elif not qe and dba:
    _, reranked_feats_index = np.split(feats_concat, split_at, axis=0)
    reranked_feats_test = feats_test
elif qe and not dba:
    reranked_feats_test, _ = np.split(feats_concat, split_at, axis=0)
    reranked_feats_index = feats_index
else:
    raise ValueError

print('build index...')
cpu_index = faiss.IndexFlatL2(reranked_feats_index.shape[1])
gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
gpu_index.add(reranked_feats_index)
dists, topk_idx = gpu_index.search(x=reranked_feats_test, k=topk)
print('query search done.')

subm = pd.DataFrame(ids_test, columns=['id'])
subm['images'] = np.apply_along_axis(' '.join, axis=1, arr=ids_index[topk_idx])

assert np.all(subm['id'] == pred_test.index)
subm['index_id_list'] = subm['images'].apply(lambda x: x.split(' ')[:topk])

# Make higher score, ealier position for re-ranking.
pred_index = pred_index.sort_values('score', ascending=False)

images = []
for test_id, pred, ids in tqdm.tqdm(zip(subm['id'], pred_test['landmark_id'], subm['index_id_list']),
                                    total=len(subm)):
    retrieved_list = pred_index.loc[ids, 'landmark_id']
    whole_ids_same = pred_index[pred_index['landmark_id'] == pred].index
#     diff = set(whole_ids_same) - set(ids)
    diff = sorted(set(whole_ids_same) - set(ids), key=whole_ids_same.get_loc)
    
    retrieved_list = pd.concat([
        retrieved_list,
        pd.Series(pred, index=diff)
    ])

    # use mergesort to keep relative order of original list.
    predefined_limit_topk = 100
    reranked_ids = (pred != retrieved_list).sort_values(kind='mergesort').index[:predefined_limit_topk]
    images.append(' '.join(reranked_ids))

subm['images'] = images


output_name = f'../output/{setting}_reranking.csv.gz'
subm[['id', 'images']].to_csv(output_name, compression='gzip', index=False)
print('saved to ' + output_name)

cmd = f'kaggle c submit -c landmark-retrieval-2019 -f {output_name} -m "" '
print(cmd)
subprocess.run(cmd, shell=True)

In [None]:
#  QE alpha experiments
for n_qe in [1, 2, 3, 5, 10]:

    setting = f'alpha_QE_{n_qe}'

    feats_concat = np.concatenate([feats_test, feats_index], axis=0)

    cpu_index = faiss.IndexFlatIP(feats_concat.shape[1])
    gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
    gpu_index.add(feats_concat)
    sims, topk_idx = gpu_index.search(x=feats_concat, k=n_qe)

    weights = np.expand_dims(sims ** alpha, axis=-1).astype(np.float32)

    feats_concat = (feats_concat[topk_idx] * weights).sum(axis=1)

    feats_concat = utils.l2norm_numpy(feats_concat.astype(np.float32))

    split_at = [len(feats_test)]
    if qe and dba:
        reranked_feats_test, reranked_feats_index = np.split(feats_concat, split_at, axis=0)
    elif not qe and dba:
        _, reranked_feats_index = np.split(feats_concat, split_at, axis=0)
        reranked_feats_test = feats_test
    elif qe and not dba:
        reranked_feats_test, _ = np.split(feats_concat, split_at, axis=0)
        reranked_feats_index = feats_index
    else:
        raise ValueError

    print('build index...')
    cpu_index = faiss.IndexFlatL2(reranked_feats_index.shape[1])
    gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
    gpu_index.add(reranked_feats_index)
    dists, topk_idx = gpu_index.search(x=reranked_feats_test, k=topk)
    print('query search done.')
    
    subm = pd.DataFrame(ids_test, columns=['id'])
    subm['images'] = np.apply_along_axis(' '.join, axis=1, arr=ids_index[topk_idx])
    output_name = f'../output/{setting}.csv.gz'
    subm[['id', 'images']].to_csv(output_name, compression='gzip', index=False)
    print('saved to ' + output_name)

    cmd = f'kaggle c submit -c landmark-retrieval-2019 -f {output_name} -m "" '
    print(cmd)
    subprocess.run(cmd, shell=True)


In [5]:
#  AQE K experiments
topk = 100
alpha = 1
qe = True
dba = False

for n_qe in [10]:

    setting = f'AQE_{n_qe}'

    feats_concat = np.concatenate([feats_test, feats_index], axis=0)

    cpu_index = faiss.IndexFlatIP(feats_concat.shape[1])
    gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
    gpu_index.add(feats_concat)
    sims, topk_idx = gpu_index.search(x=feats_concat, k=n_qe)

    weights = np.expand_dims(sims ** alpha, axis=-1).astype(np.float32)

    feats_concat = (feats_concat[topk_idx] * weights).sum(axis=1)

    feats_concat = utils.l2norm_numpy(feats_concat.astype(np.float32))

    split_at = [len(feats_test)]
    if qe and dba:
        reranked_feats_test, reranked_feats_index = np.split(feats_concat, split_at, axis=0)
    elif not qe and dba:
        _, reranked_feats_index = np.split(feats_concat, split_at, axis=0)
        reranked_feats_test = feats_test
    elif qe and not dba:
        reranked_feats_test, _ = np.split(feats_concat, split_at, axis=0)
        reranked_feats_index = feats_index
    else:
        raise ValueError

    print('build index...')
    cpu_index = faiss.IndexFlatL2(reranked_feats_index.shape[1])
    gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
    gpu_index.add(reranked_feats_index)
    dists, topk_idx = gpu_index.search(x=reranked_feats_test, k=topk)
    print('query search done.')
    
    subm = pd.DataFrame(ids_test, columns=['id'])
    subm['images'] = np.apply_along_axis(' '.join, axis=1, arr=ids_index[topk_idx])
    output_name = f'../output/{setting}.csv.gz'
    subm[['id', 'images']].to_csv(output_name, compression='gzip', index=False)
    print('saved to ' + output_name)

    cmd = f'kaggle c submit -c landmark-retrieval-2019 -f {output_name} -m "" '
    print(cmd)
    subprocess.run(cmd, shell=True)


build index...
query search done.
saved to ../output/AQE_10.csv.gz
kaggle c submit -c landmark-retrieval-2019 -f ../output/AQE_10.csv.gz -m "" 


In [None]:
# EGT

from easydict import EasyDict as edict
from src.reranking import Diffusion, explore_exploit

ds = edict({
    'ids_index': ids_index,
    'ids_test': ids_test,
    'feats_index': feats_index,
    'feats_test': feats_test,
})

threshs = [np.inf, 0.9]
explore_k = 100

qidxs_dict = {id_: idx for idx, id_ in enumerate(ds.ids_test)}
queries = [edict(idx=qidxs_dict[id_], id_=id_) for id_ in ds.ids_test]

# Search with GpuMultiple
co = faiss.GpuMultipleClonerOptions()
co.shard = True
vres = []
for _ in range(2):
    res = faiss.StandardGpuResources()
    vres.append(res)

cpu_index = faiss.IndexFlatIP(ds.feats_index.shape[1])
gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
gpu_index.add(ds.feats_index)

ii_sims, ii_ids = gpu_index.search(x=ds.feats_index, k=100)
ti_sims, ti_ids = gpu_index.search(x=ds.feats_test, k=100)

allpair = edict({
    'ti_sims': ti_sims, 'ti_ids': ti_ids,
    'ii_sims': ii_sims, 'ii_ids': ii_ids,
})

for thresh in threshs:
    images = []
    for q in tqdm.tqdm(queries,
                       total=len(queries),
                       mininterval=0.1,
                       desc='Explore-Exploit'):
        reranked_ids = explore_exploit(q, ds, allpair, thresh, explore_k)
        assert len(np.unique(reranked_ids[:100])) == 100

        images.append(' '.join(reranked_ids))

    subm = pd.DataFrame(ids_test, columns=['id'])
    subm['images'] = images

    output_name = f'../output/egt_thresh{thresh}.csv.gz'
    subm[['id', 'images']].to_csv(output_name, compression='gzip', index=False)
    print('saved to ' + output_name)

    cmd = f'kaggle c submit -c landmark-retrieval-2019 -f {output_name} -m "" '
    print(cmd)
    subprocess.run(cmd, shell=True)

Explore-Exploit:   2%|▏         | 2300/117577 [12:02<9:44:16,  3.29it/s] 

In [None]:
# Yang's Diffusion

truncation_size = 1000
kd_size = 50
kq_size = 10
gamma = 3

diffusion = Diffusion(feats_index, './tmp')
offline = diffusion.get_offline_results(truncation_size, kd_size)

sims, ids = diffusion.knn.search(feats_test, kq_size)
sims = sims ** gamma
qr_num = ids.shape[0]

all_scores = np.empty((qr_num, truncation_size), dtype=np.float32)
all_ranks = np.empty((qr_num, truncation_size), dtype=np.int)
for i in tqdm.tqdm(range(qr_num), desc='[search] query'):
    scores = sims[i] @ offline[ids[i]]
    parts = np.argpartition(-scores, truncation_size)[:truncation_size]
    ranks = np.argsort(-scores[parts])

    all_scores[i] = scores[parts][ranks]
    all_ranks[i] = parts[ranks]

subm = pd.DataFrame(ids_test, columns=['id'])
subm['images'] = np.apply_along_axis(' '.join, axis=1, arr=ids_index[all_ranks[:, :100]])

output_name = f'../output/yangs_diffusion.csv.gz'
subm[['id', 'images']].to_csv(output_name, compression='gzip', index=False)
print('saved to ' + output_name)

cmd = f'kaggle c submit -c landmark-retrieval-2019 -f {output_name} -m "" '
print(cmd)
subprocess.run(cmd, shell=True)

In [None]:
# Iscen's Diffusion
# 1M x 1M ぐらいの行列を保持する必要があって無理だった

from src.reranking import sim_kernel, normalize_connection_graph, topK_W, find_trunc_graph, dfs_trunk, cg_diffusion, fsr_rankR
from scipy.sparse import csr_matrix

K = 100 # approx 50 mutual nns
QUERYKNN = 10
R = 2000
alpha = 0.9

Q = feats_test[retrieval_solution['images'] != 'None'].T
X = feats_index

sim  = np.dot(X, Q)
qsim = sim_kernel(sim).T

sortidxs = np.argsort(-qsim, axis = 1)
for i in range(len(qsim)):
    qsim[i,sortidxs[i,QUERYKNN:]] = 0

qsim = sim_kernel(qsim)

print('build index...')
cpu_index = faiss.IndexFlatIP(X.shape[1])
gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
gpu_index.add(X)
sims, topk_idx = gpu_index.search(x=X, k=K)
print('query search done.')

import scipy.sparse as sparse

def get_affinity(sims, ids, gamma=3):
    """Create affinity matrix for the mutual kNN graph of the whole dataset
    Args:
        sims: similarities of kNN
        ids: indexes of kNN
    Returns:
        affinity: affinity matrix
    """
    num = sims.shape[0]
    sims[sims < 0] = 0  # similarity should be non-negative
    sims = sims ** gamma
    # vec_ids: feature vectors' ids
    # mut_ids: mutual (reciprocal) nearest neighbors' ids
    # mut_sims: similarites between feature vectors and their mutual nearest neighbors
    vec_ids, mut_ids, mut_sims = [], [], []
    print(f'per num: {num}')
    for i in range(num):
        # check reciprocity: i is in j's kNN and j is in i's kNN
        ismutual = np.isin(ids[ids[i]], i).any(axis=1)
        if ismutual.any():
            vec_ids.append(i * np.ones(ismutual.sum()))
            mut_ids.append(ids[i, ismutual])
            mut_sims.append(sims[i, ismutual])
    print('map')
    vec_ids, mut_ids, mut_sims = map(
        np.concatenate, [vec_ids, mut_ids, mut_sims])
    affinity = sparse.csc_matrix((mut_sims, (vec_ids, mut_ids)),
                                 shape=(num, num), dtype=np.float32)
    affinity[range(num), range(num)] = 0
    return affinity

# dists = sim_kernel(dists)
# A = csr_matrix((dists.ravel(), (np.arange(X.shape[0]).repeat(K), topk_idx.ravel())),
#                shape=(X.shape[0], X.shape[0]))
W = get_affinity(sims, topk_idx, gamma=3)
Wn = normalize_connection_graph(W)

# plain_ranks = np.argsort(-sim, axis=0)
cg_ranks =  cg_diffusion(qsim, Wn, alpha)
# cg_trunk_ranks =  dfs_trunk(sim, A, alpha = alpha, QUERYKNN = QUERYKNN )
fast_spectral_ranks = fsr_rankR(qsim, Wn, alpha, R)

alg_names = ['Diffusion cg'  'Spectral R=2000']
alg_ranks = [cg_ranks,  fast_spectral_ranks]

for ranks, aname in zip(alg_ranks, alg_names):
    ranks = ranks.T[:, :100]

    subm = pd.DataFrame(index=ids_test)
    subm.index.name = 'id'
    subm['images'] = ''
    subm.loc[retrieval_solution.set_index('id')['images'] != 'None', 'images'] = np.apply_along_axis(' '.join, axis=1, arr=ids_index[ranks[:, :100]])

    output_name = f'../output/{aname}.csv.gz'
    subm = subm.reset_index()
    subm[['id', 'images']].to_csv(output_name, compression='gzip', index=False)
    print('saved to ' + output_name)

    cmd = f'kaggle c submit -c landmark-retrieval-2019 -f {output_name} -m "" '
    print(cmd)
    subprocess.run(cmd, shell=True)

In [3]:
# k-reciprocal
# https://github.com/zhunzhong07/person-re-ranking/tree/master/python-version
from scipy.spatial.distance import cdist
"""
API
q_g_dist: query-gallery distance matrix, numpy array, shape [num_query, num_gallery]
q_q_dist: query-query distance matrix, numpy array, shape [num_query, num_query]
g_g_dist: gallery-gallery distance matrix, numpy array, shape [num_gallery, num_gallery]
k1, k2, lambda_value: parameters, the original paper is (k1=20, k2=6, lambda_value=0.3)
Returns:
  final_dist: re-ranked distance, numpy array, shape [num_query, num_gallery]
"""
prob_f = feats_test[retrieval_solution['images'] != 'None']
gal_f = feats_index
feat = np.concatenate([prob_f, gal_f], axis=0)
k1=20
k2=6
lambda_value=0.3
topk = 1000

cpu_index = faiss.IndexFlatL2(feat.shape[1])
gpu_index = faiss.index_cpu_to_gpu_multiple_py(vres, cpu_index, co)
gpu_index.add(feat)
dists, initial_rank = gpu_index.search(x=feat, k=topk)
print('search done.')

# dists = np.power(dists, 2)
V = np.zeros_like(dists).astype(np.float32)
original_dist = dists

query_num = gal_f.shape[0]
gallery_num = feat.shape[0]
all_num = gallery_num

for i in range(all_num):
    # k-reciprocal neighbors
    forward_k_neigh_index = initial_rank[i,:k1+1]
    backward_k_neigh_index = initial_rank[forward_k_neigh_index,:k1+1]
    fi = np.where(backward_k_neigh_index==i)[0]
    k_reciprocal_index = forward_k_neigh_index[fi]
    k_reciprocal_expansion_index = k_reciprocal_index
    for j in range(len(k_reciprocal_index)):
        candidate = k_reciprocal_index[j]
        candidate_forward_k_neigh_index = initial_rank[candidate,:int(np.around(k1/2.))+1]
        candidate_backward_k_neigh_index = initial_rank[candidate_forward_k_neigh_index,:int(np.around(k1/2.))+1]
        fi_candidate = np.where(candidate_backward_k_neigh_index == candidate)[0]
        candidate_k_reciprocal_index = candidate_forward_k_neigh_index[fi_candidate]
        if len(np.intersect1d(candidate_k_reciprocal_index,k_reciprocal_index))> 2./3*len(candidate_k_reciprocal_index):
            k_reciprocal_expansion_index = np.append(k_reciprocal_expansion_index,candidate_k_reciprocal_index)

    k_reciprocal_expansion_index = np.unique(k_reciprocal_expansion_index)
    weight = np.exp(-original_dist[i,k_reciprocal_expansion_index])
    V[i,k_reciprocal_expansion_index] = 1.*weight/np.sum(weight)

original_dist = original_dist[:query_num,]
if k2 != 1:
    V_qe = np.zeros_like(V,dtype=np.float32)
    for i in range(all_num):
        V_qe[i,:] = np.mean(V[initial_rank[i,:k2],:],axis=0)
    V = V_qe
    del V_qe
del initial_rank
invIndex = []
for i in range(gallery_num):
    invIndex.append(np.where(V[:,i] != 0)[0])

jaccard_dist = np.zeros_like(original_dist,dtype = np.float32)


for i in range(query_num):
    temp_min = np.zeros(shape=[1,gallery_num],dtype=np.float32)
    indNonZero = np.where(V[i,:] != 0)[0]
    indImages = []
    indImages = [invIndex[ind] for ind in indNonZero]
    for j in range(len(indNonZero)):
        temp_min[0,indImages[j]] = temp_min[0,indImages[j]]+ np.minimum(V[i,indNonZero[j]],V[indImages[j],indNonZero[j]])
    jaccard_dist[i] = 1-temp_min/(2.-temp_min)

final_dist = jaccard_dist*(1-lambda_value) + original_dist*lambda_value
final_dist = final_dist[:query_num,query_num:]

search done.
