In [1]:
import faiss

Reference: [Comprehensive Guide to Approximate Nearest Neighbors Algorithms](https://towardsdatascience.com/comprehensive-guide-to-approximate-nearest-neighbors-algorithms-8b94f057d6b6)

In [2]:
import pandas as pd
import numpy as np
from time import time
from tqdm import tqdm
tqdm.pandas()

# Experiment selector: these four values decide which run's result files the
# loading cells read.
model_name = 'seresnext101'
fold = 2
checkpoint = '20600'
algo = 'dist_global_org'

# Positional arguments for the result-path templates:
#   {0}=model_name, {1}=fold, {2}=checkpoint, {3}=model_name, {4}=fold, {5}=algo
nums = [model_name, fold, checkpoint, model_name, fold, algo]

In [3]:
train = pd.read_csv('../WC_input/train.csv')

In [4]:
train.set_index('Image').loc['PM-WWA-20140813-067.jpg']

Id    -1
Name: PM-WWA-20140813-067.jpg, dtype: object

In [5]:
dic = train.set_index('Image').to_dict(orient='dict')['Id']

In [6]:
# Load this run's submission file (it has no header row) and use its first
# column as the index.
top20_path = '../WC_result/{0}_{1}/out_{2}/{3}_sub_fold{4}_{5}.csv'.format(*nums)
top20 = pd.read_csv(top20_path, header=None).set_index(0)

In [7]:
# Load the image encodings. The file has no header row: column 0 is the image
# file name and every remaining column is one component of the embedding.
enc_raw = pd.read_csv('../WC_result/{0}_{1}/out_{2}/encoding_org_img.csv'.format(*nums), header=None)

# Pack the component columns into one list-valued `embeddings` column.
# Slicing with `iloc[:, 1:]` replaces the previous hard-coded column position
# (2050-1), so the cell works for any embedding width rather than only 2048.
enc = pd.DataFrame({
    'Image': enc_raw.iloc[:, 0],
    'embeddings': enc_raw.iloc[:, 1:].values.tolist(),
})

enc.head(1)

Unnamed: 0,Image,embeddings
0,PM-WWA-20180811-093.jpg,"[0.030311882000000002, 0.0005835553, -0.031396..."


In [8]:
train.shape

(4539, 2)

In [9]:
# Left-join the labels in `train` onto the encodings by image name; rows with no
# match in `train` get NaN in `Id`. Not idempotent: re-running this cell merges
# a second `Id` column in, which pandas then suffixes as `Id_x` / `Id_y`.
enc = enc.merge(train, on='Image', how='left')

In [10]:
# Drop the rows whose Id is the string '-1'. Rows with a NaN Id (no match in
# `train`) pass this filter, because NaN != '-1' evaluates to True.
enc = enc[enc.Id!='-1']

In [11]:
# Renumber the rows 0..n-1 after the filter. Later cells rely on this: index
# labels are looked up by row position and `test['id']` is used to index `dat`.
enc = enc.reset_index(drop=True)

## Exhaustive Search Usage


In [12]:
USE_GPU = True

class ExactIndex():
    """Exhaustive (exact) L2 nearest-neighbour search on top of `faiss.IndexFlatL2`.

    Parameters
    ----------
    vectors : array-like of shape (n, d)
        Vectors to index. Stored as a C-contiguous float32 array.
    labels : indexable
        `labels[i]` is what `query` reports for a hit on row `i` of `vectors`,
        so it is expected to be aligned with `vectors` row for row.
    """

    def __init__(self, vectors, labels):
        # Converting first lets any array-like (not only an ndarray) be passed
        # and guarantees the dtype/layout that `build` hands to the index.
        self.vectors = np.ascontiguousarray(vectors, dtype='float32')
        self.dimension = self.vectors.shape[1]
        self.labels = labels

    def build(self):
        """Create the flat L2 index and add every stored vector to it."""
        # CPU index only; the GPU transfer that used to be sketched here in
        # commented-out code was never enabled.
        self.index = faiss.IndexFlatL2(self.dimension)
        self.index.add(self.vectors)

    def query(self, vectors, k=10):
        """Search the index for the `k` nearest stored vectors.

        Parameters
        ----------
        vectors : array-like of shape (d,) or (m, d)
            Query vector(s). Converted to a C-contiguous float32 2-D array
            before being passed to the index.
        k : int, default 10
            Number of neighbours requested per query row.

        Returns
        -------
        distances : ndarray
            The distance array exactly as returned by `index.search`
            (one row per query row).
        labels : list
            Labels of the hits for the FIRST query row only. An entry is
            `None` where the search reported a negative index, i.e. no
            neighbour was available for that slot (for example when `k` is
            larger than the number of indexed vectors).
        """
        queries = np.ascontiguousarray(vectors, dtype='float32')
        if queries.ndim == 1:
            # A single vector is treated as a batch of one.
            queries = queries.reshape(1, -1)
        distances, indices = self.index.search(queries, k)
        # Missing results come back as -1. Indexing `labels` with -1 would
        # silently return the LAST label, so map those slots to None instead.
        return distances, [self.labels[i] if i >= 0 else None for i in indices[0]]

In [13]:
# Stack the per-row embedding lists into one (n_images, dim) float32 matrix.
# `np.array(series.tolist())` yields the same values as
# `series.apply(pd.Series).values` without building one pandas Series per row,
# which is by far the slowest part of the original expression.
embedding_matrix = np.array(enc["embeddings"].tolist(), dtype='float32')

# Labels are the image file names, aligned row for row with the matrix.
index = ExactIndex(embedding_matrix, enc["Image"].values)
index.build()

In [14]:
enc["Image"].values

array(['PM-WWA-20180811-093.jpg', 'PM-WWA-20120430-011.jpg',
       'PM-WWA-20110724-023.jpg', ..., 'PM-WWA-20090811-050.jpg',
       'PM-WWA-20060627-020.jpg', 'PM-WWA-20060715-071.jpg'], dtype=object)

In [15]:
# All embeddings as one C-contiguous (n_images, dim) float32 matrix. Built
# straight from the list column; `.apply(pd.Series)` gives the same values but
# creates one pandas Series per row and is much slower.
result = np.array(enc["embeddings"].tolist(), dtype='float32')

In [16]:
result[0]

array([ 0.03031188,  0.00058356, -0.03139683, ..., -0.04197725,
       -0.01696076,  0.06060556], dtype=float32)

In [17]:
result[1]

array([-0.01400099,  0.00361327,  0.0593184 , ..., -0.02270603,
        0.03372436, -0.01720172], dtype=float32)

In [18]:
enc.head(1)

Unnamed: 0,Image,embeddings,Id
0,PM-WWA-20180811-093.jpg,"[0.030311882000000002, 0.0005835553, -0.031396...",


In [19]:
# Keep the encodings whose image appears in the `top20` index. `reset_index()`
# turns the old row label (the row's position in `enc`) into a column, which is
# exposed as `id`; the `Id` column is exposed as `class`.
in_top20 = enc.Image.isin(top20.index)
test = enc[in_top20].reset_index().rename(columns={'index': 'id', 'Id': 'class'})

In [20]:
test.shape

(808, 4)

In [21]:
test

Unnamed: 0,id,Image,embeddings,class
0,0,PM-WWA-20180811-093.jpg,"[0.030311882000000002, 0.0005835553, -0.031396...",
1,18,PM-WWA-20170710-031.jpg,"[-0.008558646, 0.003506818, 0.0126826210000000...",
2,19,PM-WWA-20170622-226.jpg,"[-0.008336851999999999, -0.0007319453999999999...",
3,20,PM-WWA-20180813-271.jpg,"[0.011478172, 0.0031907053000000004, -0.019210...",
4,21,PM-WWA-20180506-348.jpg,"[0.18167746, -0.0012695227, 0.07532703, -0.018...",
5,23,PM-WWA-20170417-174.jpg,"[0.0096659055, -0.003666343, -0.02681862, -0.0...",
6,24,PM-WWA-20170528-041.jpg,"[-0.01026584, -0.002028815, 0.02457689, 0.0450...",
7,29,PM-WWA-20170731-007.jpg,"[-0.009453551999999999, 0.0026841874, 0.022789...",
8,32,PM-WWA-20180506-365.jpg,"[-0.0142374225, 0.00018124631999999997, 0.0177...",
9,49,PM-WWA-20170528-221.jpg,"[0.013722275, -0.0017804452, -0.011037735, 0.0...",


In [22]:
test[test.Image.isin(top20.index)].shape

(808, 4)

In [23]:
test

Unnamed: 0,id,Image,embeddings,class
0,0,PM-WWA-20180811-093.jpg,"[0.030311882000000002, 0.0005835553, -0.031396...",
1,18,PM-WWA-20170710-031.jpg,"[-0.008558646, 0.003506818, 0.0126826210000000...",
2,19,PM-WWA-20170622-226.jpg,"[-0.008336851999999999, -0.0007319453999999999...",
3,20,PM-WWA-20180813-271.jpg,"[0.011478172, 0.0031907053000000004, -0.019210...",
4,21,PM-WWA-20180506-348.jpg,"[0.18167746, -0.0012695227, 0.07532703, -0.018...",
5,23,PM-WWA-20170417-174.jpg,"[0.0096659055, -0.003666343, -0.02681862, -0.0...",
6,24,PM-WWA-20170528-041.jpg,"[-0.01026584, -0.002028815, 0.02457689, 0.0450...",
7,29,PM-WWA-20170731-007.jpg,"[-0.009453551999999999, 0.0026841874, 0.022789...",
8,32,PM-WWA-20180506-365.jpg,"[-0.0142374225, 0.00018124631999999997, 0.0177...",
9,49,PM-WWA-20170528-221.jpg,"[0.013722275, -0.0017804452, -0.011037735, 0.0...",


In [24]:
# Query matrix: every embedding in `enc` as a C-contiguous (n_images, dim)
# float32 array, so that `dat[i]` is the embedding of row `i` of `enc`.
# Built straight from the list column; `.apply(pd.Series)` gives the same
# values but creates one pandas Series per row and is much slower.
dat = np.array(enc["embeddings"].tolist(), dtype='float32')

In [25]:
# Placeholder columns. Both are overwritten in full by the next two cells, so
# the NaN values written here are never read.
test['top20dist'] = np.nan
test['top20imgs'] = np.nan

In [26]:
# For every test row, search the index with that row's own embedding (`id` is
# the row's position in `dat`) and keep the raw (distances, labels) pair for
# the 100 nearest hits. The pair is split into two columns in the next cell.
def _nearest_100(row_pos):
    return index.query(dat[row_pos].reshape(1, -1), 100)

test['top20imgs'] = test['id'].map(_nearest_100)

In [27]:
# Split each raw (distances, labels) result into its two parts. The first hit
# is dropped from both -- presumably the query image matching itself;
# TODO confirm that the self-match always ranks first.
# Order matters: `top20dist` must be derived before `top20imgs` is overwritten,
# and this cell cannot be re-run without re-running the query cell first.
test['top20dist'] = test['top20imgs'].map(lambda x: x[0][0][1:])
test['top20imgs'] = test['top20imgs'].map(lambda x: x[1][1:])

In [28]:
test['top20imgs'].head()

0    [PM-WWA-20160408-598.jpg, PM-WWA-20100723-339....
1    [PM-WWA-20110724-031.jpg, PM-WWA-20160319-207....
2    [PM-WWA-20170625-283.jpg, PM-WWA-20060819-009....
3    [PM-WWA-20060818-192.jpg, PM-WWA-20060530-117....
4    [PM-WWA-20180818-156.jpg, PM-WWA-20110720-104....
Name: top20imgs, dtype: object

In [29]:
test['top20dist'].head()

0    [0.9966525, 1.4590219, 1.4743505, 1.5267613, 1...
1    [1.2870718, 1.3537942, 1.4264284, 1.4435966, 1...
2    [0.800068, 1.3947868, 1.410857, 1.4308887, 1.5...
3    [1.285229, 1.3624868, 1.3715265, 1.3976867, 1....
4    [1.4545264, 1.4719937, 1.4918422, 1.4954445, 1...
Name: top20dist, dtype: object

In [32]:
# NOTE(review): absolute, machine-specific path. `train.csv` was already read
# from '../WC_input/' earlier in the notebook; presumably this is the same
# directory -- confirm, and prefer one configurable data directory.
fp = '/home/pt-support/Humpback-Whale-Identification-1st-/WC_input/'
# Rebinds `train`, which earlier cells loaded from the relative path.
train = pd.read_csv(fp+'train.csv') 
# Image file name -> Id lookup built from this second read.
image_to_id = dict(zip(train.Image, train.Id))

In [33]:
# One column per neighbour rank. Expand each list-valued column once and reuse
# the result; the original expanded `top20imgs` twice, which is the slow step.
neighbour_imgs = test['top20imgs'].apply(pd.Series)
neighbour_dist = test['top20dist'].apply(pd.Series)

# Look every neighbour image up in `image_to_id`; unknown images become NaN.
# The original tested membership in `dic` but then read from `image_to_id`,
# which raises KeyError whenever the two dictionaries do not share a key.
# A per-column `Series.map` is used because `DataFrame.applymap` is deprecated.
neighbour_class = neighbour_imgs.apply(
    lambda col: col.map(lambda img: image_to_id.get(img, np.nan))
)

# Column MultiIndex: level 0 is "img" / "dist" / "class", level 1 is the rank.
res = pd.concat([neighbour_imgs, neighbour_dist, neighbour_class],
                axis=1, keys=["img", "dist", "class"])
# res = res.swaplevel(0, 1, axis=1).sort_index(axis=1)

In [34]:
# Rank-0 (closest remaining) neighbour of every test row.
top1_img = res['img'][0]
top1_dist = res['dist'][0]
top1_class = res['class'][0]

# "Confident" rows: the closest neighbour is nearer than 1 and has a known class.
is_confident = (top1_dist < 1) & top1_class.notna()

# NOTE(review): `Image` holds the NEIGHBOUR's file name, not the test image's
# own name (`test.Image`) -- confirm this is what a pseudo-label file should
# contain.
confident_labels_test = pd.concat(
    [top1_img[is_confident], top1_class[is_confident]],
    axis=1, keys=['Image', 'Id']
).reset_index(drop=True)

In [35]:
confident_labels_test.to_csv(fp + 'pseudo_labels.csv', index=False)

In [None]:
# NOTE(review): `jhgfdsa` is not defined in any cell shown here, so this cell
# raises NameError. Presumably a deliberate tripwire that halts "Run All"
# before the exploratory cells below -- confirm, then delete it or replace it
# with an explicit, explained stop.
jhgfdsa

In [None]:
confident_labels_test.sample()

In [None]:
pd.DataFrame(test.Image).join(test['top20imgs'].apply(pd.Series)).to_csv('faiss.csv', header=False, index=False)

In [None]:
exp = test['top20imgs'].apply(pd.Series)

In [None]:
test['top20imgs'].apply(pd.Series).applymap(lambda x: image_to_id[x] if x in dic.keys() else np.nan)

In [None]:
# Map every neighbour image to its Id; images missing from `image_to_id`
# become NaN. A single `.get` replaces the original's membership test against
# `dic` followed by a lookup in `image_to_id`, which raised KeyError whenever
# the two dictionaries did not share a key. A per-column `Series.map` is used
# because `DataFrame.applymap` is deprecated.
exp_class = exp.apply(lambda col: col.map(lambda img: image_to_id.get(img, np.nan)))

In [None]:
res = pd.concat([exp, exp_class], axis=1, keys=["img", "class"])

In [None]:
res = res.swaplevel(0, 1, axis=1).sort_index(axis=1)

In [None]:
# NOTE(review): `i` is not assigned by any earlier cell shown here (it is first
# bound by the `for` loop further down), so on a fresh kernel this cell raises
# NameError.
# For row `i`: the neighbour ranks of the first (up to) 20 neighbours that
# have a non-null class.
res.iloc[i].swaplevel(0, 1)['class'].dropna()[:20].index

In [None]:
res.iloc[i].swaplevel(0, 1)['img'][res.iloc[i].swaplevel(0, 1)['class'].dropna()[:20].index]

In [None]:
# For each test row, collect the images of its first (up to) 20 neighbours that
# have a non-null class, renumbered 0..k-1.
collector = []
for i in range(exp.shape[0]):
    # One swapped view per row: outer level is 'class' / 'img', inner level is the rank.
    row = res.iloc[i].swaplevel(0, 1)
    labelled_ranks = row['class'].dropna()[:20].index
    collector.append(row['img'][labelled_ranks].reset_index(drop=True))

In [None]:
result = pd.DataFrame(test.Image).join(pd.concat(collector, axis=1, ignore_index=True).T)

In [None]:
result.shape

In [None]:
result.set_index('Image').loc['PM-WWA-20180811-093.jpg']

In [None]:
result.to_csv('faiss_remove_test.csv', header=False, index=False)