In [15]:
import cv2
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
import os
import seaborn as sns
import math
import random

import timeit
import json
# sns.set_theme()


def kmeans_trans(img, K=10, attempts=10, epsilon=0.1, max_iter=10, lab=False):
    """
    input: img - pixel array that can be reshaped to rows of 3 channels
                 (read as BGR when lab is True)
           K - number of color clusters
           attempts - number of restarts handed to cv2.kmeans
           epsilon, max_iter - termination criteria of one k-means run
           lab - when exactly True, cluster in LAB space and convert the
                 result back to BGR
    return: (image with every pixel replaced by its uint8 cluster center,
             per-pixel label array as returned by cv2.kmeans)
    """
    use_lab = lab is True
    if use_lab:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)

    # cv2.kmeans wants one float32 sample per row.
    flat_pixels = img.reshape((-1, 3))
    samples = np.float32(flat_pixels)

    # Stop a run after max_iter iterations or once the centers move less than epsilon.
    stop_flags = cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_MAX_ITER
    criteria = (stop_flags, max_iter, epsilon)
    _compactness, label, centers = cv2.kmeans(
        samples, K, None, criteria, attempts, cv2.KMEANS_RANDOM_CENTERS
    )

    # Paint every pixel with its (uint8-truncated) cluster center.
    palette = np.uint8(centers)
    res = palette[label.ravel()].reshape(img.shape)
    if use_lab:
        res = cv2.cvtColor(res, cv2.COLOR_LAB2BGR)
    return res, label

def proportion_list(label):
    """
    input: a label list of an image after kmeans transformation
           (any shape; every element counts as one pixel)
    return: a sorted color proportion list, in percent, largest first
    """
    labels = np.asarray(label)
    _, counts = np.unique(labels, return_counts=True)
    # Divide by the number of elements rather than len(): len() only counts
    # rows, which is wrong for anything that is not shaped (N,) or (N, 1).
    shares = counts / labels.size * 100
    return np.sort(shares)[::-1]
    
def get_proportion_list(img, K=10, attempts=10, epsilon=0.1, max_iter=10, lab=False):
    """
    input: an image plus the k-means settings forwarded to kmeans_trans
    return: the sorted color proportion list of the image's cluster labels
    """
    # Only the labels matter here; the quantized image is discarded.
    _quantized, pixel_labels = kmeans_trans(img, K, attempts, epsilon, max_iter, lab)
    return proportion_list(pixel_labels)

def euclidean_dist_df(proportion_df):
    """
    input: a DataFrame with one column per item
    return: a square DataFrame of pairwise *squared* Euclidean distances
            between the columns; rows and columns carry the input column
            labels and the row index is named "name"
    """
    labels = proportion_df.columns
    distances = euclidean_distances(proportion_df.T, squared=True)
    # Build the index directly instead of going through a temporary "name"
    # column, which would overwrite (and then drop) an input column that
    # happens to be called "name".
    return pd.DataFrame(distances, index=pd.Index(labels, name="name"), columns=labels)

def embedding(proportion_df, max_digit=None):
    """
        embedding proportion euclidean lists to hamming binary strings

        Every value is floored and written in unary: `v` ones followed by
        zeros up to a fixed width, and the codes of one column are
        concatenated in row order.

        input: proportion_df - DataFrame with one column per item
               max_digit - width of one unary code; defaults to the largest
                           floored value in proportion_df. Pass the width of
                           the dataset when embedding a single query so the
                           query string lines up with the dataset strings.
        return: dict mapping each column label to its bit string
    """
    euclid_df = np.floor(proportion_df)
    if max_digit is None:
        max_digit = np.max(euclid_df.max())
    width = int(max_digit)

    ham_dict = {}
    for col in proportion_df.columns:
        pieces = []
        for num in euclid_df[col]:
            # Clamp into [0, width] so every code has exactly `width` bits,
            # even when an explicit max_digit is smaller than a value.
            ones = max(0, min(int(num), width))
            pieces.append("1" * ones + "0" * (width - ones))
        ham_dict[col] = "".join(pieces)
    return ham_dict

def hashfunction(K, max_digit, dim, seed=None):
    """
    Build a bit-sampling hash function for fixed-width bit strings.

    input: K - number of bit positions to sample (length of each hash code)
           max_digit, dim - positions are drawn from
                            [0, max_digit * dim - 1], so max_digit * dim
                            must be an integer no larger than the length of
                            the strings that will be hashed
           seed - optional seed for reproducible positions; when None the
                  module-level `random` generator is used
    return: a function h(ham_dict) that maps {key: bit string} to
            {hash code: [keys whose sampled bits equal that code]}
    """
    # A private generator keeps seeded calls reproducible without disturbing
    # the global random state; unseeded calls behave as they always did.
    rng = random if seed is None else random.Random(seed)
    code_length = max_digit * dim
    positions = [rng.randint(0, code_length - 1) for _ in range(K)]

    def h(ham_dict):
        hashtable = {}
        for key, bits in ham_dict.items():
            hashcode = "".join(bits[j] for j in positions)
            hashtable.setdefault(hashcode, []).append(key)
        return hashtable
    return h

def BER(b1, b2):
    """
    Bit error rate between two binary strings.

    input: b1, b2 - strings of '0'/'1' characters with the same length
    return: number of differing positions divided by the string length
    raises: ValueError if the lengths differ or a string is not a valid
            binary literal (an empty string included)
    """
    if len(b1) != len(b2):
        # XOR-ing the integer values would right-align the strings and
        # produce a meaningless rate (possibly above 1).
        raise ValueError(
            f"BER needs equal-length bit strings, got {len(b1)} and {len(b2)}"
        )
    differing = bin(int(b1, 2) ^ int(b2, 2)).count("1")
    return differing / len(b1)

In [16]:
# Sanity check: the strings differ in the first two of five positions, so the rate is 2/5.
BER("01101", "10101")

0.4

In [2]:
# Template of one experiment record; the fields are filled in per run.
exp_res = {
    "group":"",
    "feature":"",
    "search":"",
    "T_cons":"",
    "T_search":"",
    "fnr":"",
    "fpr":""
}
groups = ["sth_else", "color_modified", "blur_modified", "flip_modified"]
Ks = [10]
# Only the first group is processed for now.
for group in groups[0:1]:
    for K in Ks:
        images = os.listdir(f"images/{group}")
        if ".DS_Store" in images:
            images.remove(".DS_Store")
        exp_res["group"] = group

        # feature extraction & dataset construction
        proportion_dict = {}
        t1 = timeit.default_timer()
        for image in images:
            img = cv2.imread(f"images/{group}/{image}")
            if img is None:
                # cv2.imread returns None instead of raising when a file
                # cannot be read or decoded; skip it rather than crash later.
                print(f"Skipping unreadable image: images/{group}/{image}")
                continue
            proportion_dict[image] = get_proportion_list(img, K=K, epsilon=0.0001, max_iter=500, lab=True)
        t2 = timeit.default_timer()
        if not proportion_dict:
            # Fail with a clear message instead of a ZeroDivisionError below.
            raise RuntimeError(f"No readable images found in images/{group}")
        seconds_per_image = (t2 - t1) / float(len(proportion_dict))
        print(f'Dataset construction time:  {seconds_per_image} per image')
        # One column per image, one row per color cluster (largest share first).
        proportion_df = pd.DataFrame(proportion_dict)
        exp_res["feature"] = f"Color Quant/K={K}"
        exp_res["T_cons"]= seconds_per_image

#         #preprocessing
#         dataset = []
#         dataset_label = []
#         queries = []
#         queries_label = []
#         for p in proportion_dict:
#             if 'origin' in p:
#                 dataset_label.append(p)
#                 dataset.append(list(proportion_dict[p]))
#             else:
#                 queries_label.append(p)
#                 queries.append(proportion_dict[p])
#         dataset /= np.linalg.norm(dataset, axis=1).reshape(-1, 1)
#         queries /= np.linalg.norm(queries, axis=1).reshape(-1, 1)
#         dataset = np.array(dataset,)
#         queries = np.array(queries,)

#         # Perform linear scan using NumPy to get answers to the queries.
#         print('Solving queries using linear scan')
#         t1 = timeit.default_timer()
#         answers = []
#         for query in queries:
#             answers.append(np.dot(list(dataset), query).argmax())
#         t2 = timeit.default_timer()
#         print('Done')
#         print('Linear scan time: {} per query'.format((t2 - t1) / float(len(queries))))
#         exp_res["search"] = "Linear Search"
#         exp_res["T_search"] = (t2 - t1) / len(queries)
#         answers_label = [dataset_label[ans] for ans in answers]

#         #Rate
#         def fnr(queries_label, answers_label):
#             fn = 0
#             for i in range(len(queries_label)):
#                 if str.split(queries_label[i], "_")[0] != str.split(answers_label[i], "_")[0]:
#                     fn += 1
#             return fn/len(queries_label)
#         def fpr(queries_label, answers_label):
#             fp = 0
#             p = 0
#             for i in range(len(queries_label)):
#                 if str.split(queries_label[i], "_")[0] == str.split(answers_label[i], "_")[0]:
#                     p += 1
#                     if str.split(queries_label[i], "_")[1][:4] == "else":
#                         fp +=1
#             return fp/p
#         exp_res["fnr"] = fnr(queries_label, answers_label)
#         exp_res["fpr"] = fpr(queries_label, answers_label)

#         #output to file
#         with open("exp_res.json", "a") as outfile:  
#             json.dump(exp_res, outfile, indent=4) 

Dataset construction time:  4.6455394553076985 per image


In [18]:
# Encode every column of proportion_df as a unary bit string and display the mapping.
hamming_dict = embedding(proportion_df)
hamming_dict

{'2_origin.jpg': '1111111111111111100000000000000000111111111111111000000000000000000011111111111110000000000000000000001111111111111000000000000000000000111111111100000000000000000000000011111111110000000000000000000000001111111000000000000000000000000000111110000000000000000000000000000011110000000000000000000000000000001000000000000000000000000000000000',
 '4_else.jpg': '1111111111111111111100000000000000111111111111111110000000000000000011111111111110000000000000000000001111111111100000000000000000000000111111110000000000000000000000000011111100000000000000000000000000001111100000000000000000000000000000111110000000000000000000000000000011111000000000000000000000000000001111000000000000000000000000000000',
 '5_else.jpg': '111111111111111111100000000000000011111111111111100000000000000000001111111111110000000000000000000000111111111110000000000000000000000011111111110000000000000000000000001111111100000000000000000000000000111111100000000000000000000000000011111100000000000000000000

In [23]:
# Bit error rate between the bit strings of two images; both keys must exist in hamming_dict.
BER(hamming_dict["1_origin.jpg"], hamming_dict["6_origin.jpg"])

0.14705882352941177

In [24]:
# Display the proportion table: one column per image, rows are cluster shares in percent, largest first.
proportion_df


Unnamed: 0,2_origin.jpg,4_else.jpg,5_else.jpg,2_else2.jpg,3_else.jpg,2_else.jpg,5_origin.jpg,6_origin.jpg,6_else.jpg,3_origin.jpg,4_origin.jpg,1_else.jpg,1_origin.jpg
0,17.01913,20.571274,19.18485,15.718944,12.843828,20.853935,25.345256,16.523694,16.371527,24.104674,15.925412,34.22254,26.90707
1,15.051836,17.672629,15.881231,15.678254,11.735351,15.837037,23.958271,13.949758,14.735881,11.616195,14.628092,14.787027,26.859694
2,13.679728,13.730081,12.064258,13.537812,11.522099,14.972917,16.826627,13.60256,13.910954,10.929339,12.399894,12.347664,14.55102
3,13.4946,11.855827,11.370262,10.860403,10.735124,13.087731,10.044709,12.43817,10.500596,10.241387,10.9888,10.229844,8.480685
4,10.98087,8.706775,10.414129,9.902401,9.293238,12.61088,6.648783,11.133691,9.703655,9.253895,10.635657,9.715772,6.41035
5,10.809627,6.845528,8.54219,9.093453,9.192565,8.667593,6.515648,7.324455,8.081787,8.779899,10.130323,8.600294,5.343659
6,7.856834,5.635772,7.168483,8.989658,9.134111,6.829977,4.325882,7.137669,7.223708,8.617512,8.03954,4.973999,4.188411
7,5.320272,5.236856,6.921468,6.509274,8.756319,4.028125,3.614506,7.127724,7.082488,6.622778,6.335885,2.534635,2.923834
8,4.226165,5.090515,5.01088,5.526002,8.563635,1.634606,2.165922,6.144933,6.984324,6.274962,5.543899,1.891549,2.641035
9,1.560938,4.654743,3.44225,4.183799,8.223731,1.477199,0.554396,4.617347,5.40508,3.559359,5.372499,0.696677,1.694242


In [27]:
# NOTE(review): `falconn` and `dataset` are not defined in any cell visible
# here (`dataset` only appears in commented-out code) - confirm they exist
# before this cell runs on a fresh kernel.
number_of_tables = 50
params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = len(dataset[0])
params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
# Tie the table count to number_of_tables: the probe tuning later starts from
# that variable, so the two values must not drift apart.
params_cp.l = number_of_tables
params_cp.num_rotations = 1
params_cp.seed = 5721840
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# NOTE(review): 18 is presumably the number of hash bits per table - confirm
# against the FALCONN documentation and the dataset size.
falconn.compute_number_of_hash_functions(18, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
table = falconn.LSHIndex(params_cp)
table.setup(dataset)
t2 = timeit.default_timer()
print('Done')
print(f'Construction time: {t2-t1}')

Constructing the LSH table
Done
Construction time: 0.048085303000334534


In [28]:
# Query handle for the LSH index `table`; the probe count is configured on this object later.
query_object = table.construct_query_object()

In [38]:
# Find the smallest probe count whose candidate lists contain the expected
# answer for at least 90% of the queries, then time nearest-neighbor lookups.
# NOTE(review): `queries` and `answers` are only assigned in commented-out
# code in the visible cells - confirm they are defined before running this.
print('Choosing number of probes')
# Start the search at number_of_tables probes.
number_of_probes = number_of_tables

def evaluate_number_of_probes(number_of_probes):
    """Return the fraction of queries whose expected answer is among the candidates.

    Sets the probe count on the shared `query_object` as a side effect.
    """
    query_object.set_num_probes(number_of_probes)
    score = 0
    for (i, query) in enumerate(queries):
        if answers[i] in query_object.get_candidates_with_duplicates(query):
            score += 1
    return float(score) / len(queries)

# Phase 1: double the probe count until the 0.9 accuracy target is reached.
# There is no upper bound, so this loops forever if the target is unreachable.
while True:
    accuracy = evaluate_number_of_probes(number_of_probes)
    print('{} -> {}'.format(number_of_probes, accuracy))
    if accuracy >= 0.9:
        break
    number_of_probes = number_of_probes * 2
# Phase 2: if doubling was needed, binary-search between the last failing
# count (half of the current one) and the first passing count.
if number_of_probes > number_of_tables:
    left = number_of_probes // 2
    right = number_of_probes
    while right - left > 1:
        number_of_probes = (left + right) // 2
        accuracy = evaluate_number_of_probes(number_of_probes)
        print('{} -> {}'.format(number_of_probes, accuracy))
        if accuracy >= 0.9:
            right = number_of_probes
        else:
            left = number_of_probes
    # `right` is the smallest count known to meet the target.
    number_of_probes = right
print('Done')
print('{} probes'.format(number_of_probes))

# final evaluation
# NOTE(review): query_object keeps the probe count of the last evaluate call,
# which after the binary search need not equal number_of_probes - confirm.
t1 = timeit.default_timer()
score = 0
for (i, query) in enumerate(queries):
    if query_object.find_nearest_neighbor(query) == answers[i]:
        score += 1
t2 = timeit.default_timer()

# Average seconds per query and the share of queries whose nearest neighbor
# matches the expected answer.
print('Query time: {}'.format((t2 - t1) / len(queries)))
print('Precision: {}'.format(float(score) / len(queries)))

Choosing number of probes
50 -> 1.0
Done
50 probes
Query time: 0.00017233337496236345
Precision: 1.0


In [39]:
# Length of the candidate list (duplicates included) returned for the first query.
len(query_object.get_candidates_with_duplicates(queries[0]))

151

In [321]:
# Build L bit-sampling hash tables over the unary embedding of proportion_df.
max_digit = int(np.max(np.floor(proportion_df).max()))
# K sampled bits per hash code, L independent tables.
K, L = 50, 5
# The embedding does not depend on the sampled hash function: compute it once.
hamming_codes = embedding(proportion_df)
final_tables = []
# NOTE(review): ft merges the buckets of all tables although their hash codes
# come from different hash functions; no visible cell reads it.
ft={}
gs = []
for i in range(L):
    # Each bit string holds max_digit bits per row of proportion_df, so take
    # the dimension from the data instead of hard-coding 10.
    g = hashfunction(K, max_digit, dim=len(proportion_df))
    gs.append(g)
    table = g(hamming_codes)
    final_tables.append(table)
    ft = {**ft, **table}
final_tables

[{'00000100000100000001011111000001000001000010100110': ['5_redmotor.jpg'],
  '00000100000100000001011111000001000001010010100110': ['5_yellowmotor.jpg'],
  '00000100000100000001011011000001000001000010100110': ['6_originwoman.jpg',
   '3_yellowgem.jpg',
   '7_coloredzebra.jpg',
   '6_coloredwoman.jpg'],
  '00000110000000000001011011000001001001010010100110': ['2_origingorilla.jpg',
   '2_graygorilla.jpg'],
  '00000100000100000001001011000001000001000010100110': ['3_whitegem.jpg'],
  '00000110000100000001011011000001000001000010100110': ['1_blackcar.jpg'],
  '00000110000100000001011011000001001001010010100110': ['4_greenman.jpg'],
  '00100110000100000001011011000001001001010010100110': ['7_originzebra.jpg'],
  '00100100000100000001011011000001001001010010100110': ['4_blueman.jpg'],
  '00000100000100000001011011000001000001010010100110': ['1_redcar.jpg'],
  '00000100000100000001011011000001001001010010100110': ['4_redman.jpg']},
 {'01001001001001010011001001000000000000000010000001': ['

In [322]:
#query

query_name = "3_whitegem.jpg"
# Take the query's bit string from the embedding of the whole table: embedding
# a one-column frame would size the unary codes by the query's own maximum,
# so the sampled bit positions would no longer line up with the dataset
# strings (or would run past the end of the query string).
query_code = embedding(proportion_df)[query_name]

res = []
for i, g in enumerate(gs):
    # Hash the single query with table i's hash function and read its bucket.
    ql = list(g({query_name: query_code}).keys())[0]
    res.append(final_tables[i].get(ql, []))
res

[['3_whitegem.jpg'],
 ['3_yellowgem.jpg', '3_whitegem.jpg'],
 ['3_yellowgem.jpg', '3_whitegem.jpg'],
 ['3_whitegem.jpg'],
 ['3_whitegem.jpg']]

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.

PackagesNotFoundError: The following packages are not available from current channels:

  - falconn

Current channels:

  - https://repo.anaconda.com/pkgs/main/osx-64
  - https://repo.anaconda.com/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/r/osx-64
  - https://repo.anaconda.com/pkgs/r/noarch

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.


