In [1]:
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import os
from img2vec_pytorch import Img2Vec
from PIL import Image
import xml.etree.ElementTree as ET

  warn(f"Failed to load image Python extension: {e}")


In [2]:
# Connection settings for the local Elasticsearch node. Each value can be
# overridden through an environment variable so that real credentials do not
# have to be written into the notebook; the defaults reproduce the original
# local development setup.
elastic_client = Elasticsearch(
    hosts=[os.environ.get('ELASTICSEARCH_URL', 'http://localhost:9200')],
    basic_auth=(os.environ.get('ELASTICSEARCH_USER', 'elastic'),
                os.environ.get('ELASTICSEARCH_PASSWORD', 'master')))

# Image embedding model (img2vec_pytorch, default constructor arguments);
# used by generate_embedding() to embed image crops.
model = Img2Vec()

In [3]:
def getSimilarity(vector:list, embedding_field:str, index_name:str, size:int, k:int, candidate:int):
    """Run an approximate k-nearest-neighbour search against Elasticsearch.

    Parameters:
        vector: query embedding sent as the kNN ``query_vector``.
        embedding_field: name of the index field the kNN search runs on.
        index_name: index to search.
        size: maximum number of hits to return.
        k: number of nearest neighbours requested from the kNN search.
        candidate: value sent as ``num_candidates``.

    Returns:
        The response of ``elastic_client.search``. Only the ``no`` and
        ``class_label`` fields are requested for each hit.
    """
    query = {
        "size": size,
        "knn": {
            "field": embedding_field,
            "query_vector": vector,
            "k": k,
            "num_candidates": candidate,
        },
        "fields": ["no", "class_label"],
        # Boolean False disables _source retrieval; the string "false" is
        # documented as a source-filter pattern, not as the off switch.
        "_source": False,
    }
    return elastic_client.search(index=index_name, body=query)

In [4]:
def frequency_histogram(subclasses:list, k:int):
    """Return the ``k`` most frequent labels of ``subclasses`` with their counts.

    Labels are ranked by descending ``(count, label)``, so ties on the count
    are broken by the larger label first. The result is a dict mapping
    label -> count, inserted in ranking order; it is empty when ``k`` is
    zero or negative.
    """
    ranked = [(subclasses.count(label), label) for label in set(subclasses)]
    ranked.sort(reverse=True)
    # max() keeps a non-positive k from turning into a "drop from the end" slice.
    return {label: occurrences for occurrences, label in ranked[:max(k, 0)]}

In [5]:
def score_histogram(subclasses:list, k:int):
    """Return the ``k`` labels with the highest accumulated score.

    Each entry of ``subclasses`` is a string ``"label1,label2;score"``: the
    text before the first ``;`` is a comma-separated list of labels and the
    text after it is parsed as a float that is added to every listed label.

    The result maps label -> total score, ordered by descending total; labels
    with equal totals keep the order in which they were first seen.
    """
    totals = {}
    for record in subclasses:
        parts = record.split(";")
        for name in parts[0].split(","):
            weight = float(parts[1])
            if name in totals:
                totals[name] = totals[name] + weight
            else:
                totals[name] = weight

    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    # max() keeps a non-positive k from turning into a "drop from the end" slice.
    return dict(ranked[:max(k, 0)])

In [6]:
def process_result(accuracy_dict, k, n, type):
    """Increment the counter stored in ``accuracy_dict`` under ``"<k>-<n>-<type>"``.

    The counter starts at 1 the first time a key is seen. The dict is
    modified in place and nothing is returned.
    """
    key = f"{k}-{n}-{type}"
    accuracy_dict[key] = accuracy_dict.get(key, 0) + 1

In [7]:
def get_process_result(accuracy_dict, k, n, type):
    """Return the counter stored under ``"<k>-<n>-<type>"``, or 0 if it is absent."""
    return accuracy_dict.get(f"{k}-{n}-{type}", 0)

In [8]:
def print_process_result(accuracy_dict, k_list, n_list):
    """Print positive/negative counts and accuracy for every (k, n) pair.

    Parameters:
        accuracy_dict: counters keyed ``"<k>-<n>-positive"`` / ``"<k>-<n>-negative"``.
        k_list: values of k to report (outer loop).
        n_list: values of n to report (inner loop).

    Accuracy is ``positive / (positive + negative)``. When no outcome was
    recorded for a pair the accuracy is undefined and is printed as ``nan``
    instead of raising ``ZeroDivisionError``.
    """
    for k in k_list:
        for n in n_list:
            positive = get_process_result(accuracy_dict, k, n, 'positive')
            negative = get_process_result(accuracy_dict, k, n, 'negative')
            total = positive + negative
            accuracy = positive / total if total else float('nan')
            print("k={} - n={} - Positive: {} - Negative: {} - "
                "Accuracy: {} ".format(k,n,positive,negative,accuracy))

In [9]:
def transform_process_result(accuracy_dict, k_list, n_list):
    """Build a ``len(k_list) x len(n_list)`` matrix of accuracies.

    Parameters:
        accuracy_dict: counters keyed ``"<k>-<n>-positive"`` / ``"<k>-<n>-negative"``.
        k_list: values of k, one matrix row each.
        n_list: values of n, one matrix column each.

    Returns:
        A NumPy float array where cell ``[i][j]`` is
        ``positive / (positive + negative)`` for ``k_list[i]`` and ``n_list[j]``.
        Cells with no recorded outcome hold ``nan`` (the accuracy is undefined)
        instead of raising ``ZeroDivisionError``.
    """
    matrix = np.zeros((len(k_list), len(n_list)))
    for row, k in enumerate(k_list):
        for col, n in enumerate(n_list):
            positive = get_process_result(accuracy_dict, k, n, 'positive')
            negative = get_process_result(accuracy_dict, k, n, 'negative')
            total = positive + negative
            matrix[row][col] = positive / total if total else np.nan
    return matrix

In [10]:
def generate_embedding(segment):
    """Embed ``segment`` with the module-level ``model`` and return the vector as a list.

    ``segment`` is passed unchanged to ``model.get_vec``; the result is
    converted with ``.tolist()``.
    """
    embedding = model.get_vec(segment)
    as_list = embedding.tolist()
    return as_list

In [11]:
def get_files(path:str, extension:str):
    """List the entries of directory ``path`` whose names end with ``extension``.

    Names are returned without the directory prefix, in the order produced
    by ``os.listdir``.
    """
    matching = []
    for entry in os.listdir(path):
        if entry.endswith(extension):
            matching.append(entry)
    return matching

In [12]:
def get_classes_segments(path:str, file_name:str):
    """Read the labelled bounding boxes from an annotation XML file.

    Parameters:
        path: directory containing the annotation file.
        file_name: name of the XML file inside ``path``.

    Returns:
        A list with one dict per ``<bndbox>`` found inside each top-level
        ``<object>`` element::

            {'class': <text of the object's <name>>, 'segment_values': [xmin, xmax, ymin, ymax]}

        ``class`` is ``None`` when the object has no ``<name>`` child.

    The corner values are looked up by tag name, so the returned order is
    always ``[xmin, xmax, ymin, ymax]`` no matter how the children are ordered
    in the file. If any of the four tags is missing, the box's children are
    returned in document order instead.
    """
    corner_tags = ('xmin', 'xmax', 'ymin', 'ymax')
    segments = []
    root = ET.parse(os.path.join(path, file_name)).getroot()
    for obj in root.findall('object'):
        # Resolve the label per object so it can neither leak from the previous
        # object nor depend on <name> appearing before <bndbox>.
        name_node = obj.find('name')
        label = name_node.text if name_node is not None else None
        for box in obj.findall('bndbox'):
            corners = [box.find(tag) for tag in corner_tags]
            # Compare with None explicitly: an Element without children is falsy.
            if all(node is not None for node in corners):
                values = [int(node.text) for node in corners]
            else:
                values = [int(node.text) for node in box]
            segments.append({'class': label, 'segment_values': values})
    return segments

In [13]:
# Directory that is scanned for the test images and their annotation XML files.
# The trailing slash is kept so that plain concatenation with a file name
# yields a valid path.
dataset_path = "./datasets/road/test/"
# Elasticsearch index queried for the nearest-neighbour search.
index_name = "road_image_segment"
# Name of the index field passed as the kNN "field" of the search request.
field = "embedding"

In [14]:
# Evaluates k-NN classification accuracy over every annotated segment of the
# test images. For each segment: crop it, embed it, fetch the max_n nearest
# indexed segments, then for every (k, n) pair check whether the annotated
# class is among the k most frequent class labels of the first n hits.
# Ranking strategy: frequency (sum of occurrences) of the returned labels.
k_list = [1,2,3,4,5,6,7,8,9,10]
n_list = [1,5,10,25,50,75,100]
file_list = get_files(dataset_path, ".jpg")
accuracy_dict = {}
max_n = 100
candidate = 100
id = 0
file_id = 0
class_label = ""

print("\nInitializing the ranking strategy based on sum of occurrencies!!!\n")
for file in file_list:
    file_id += 1
    image_file = os.path.join(dataset_path, file)
    # The annotation file shares the image's base name.
    xml_file = os.path.splitext(file)[0] + ".xml"
    classes_segments = get_classes_segments(dataset_path, xml_file)
    # convert() returns an independent in-memory copy, so the file handle can
    # be released right away instead of staying open for the whole run.
    with Image.open(image_file) as source_image:
        image = source_image.convert("RGB")
    print("Image: ", file_id, " -> ", image_file)
    for class_segment in classes_segments:
        id += 1
        class_ = class_segment.get("class")
        values = class_segment.get("segment_values")
        xmin = values[0]
        xmax = values[1]
        ymin = values[2]
        ymax = values[3]
        segment = image.crop((xmin,ymin,xmax,ymax))
        vector = generate_embedding(segment)

        result = getSimilarity(vector, field, index_name, max_n, max_n, candidate)

        # Collect the class labels of the hits in ranking order. Values under
        # "fields" are lists (Elasticsearch returns every requested field as an
        # array), so each entry of hit_list is itself a list of labels.
        hit_list = []
        hits = 0
        for hit in result['hits']['hits']:
            hits += 1
            try:
                hit_list.append(hit["fields"]["class_label"])
            except KeyError:
                # Hit without a class label: report it and leave it out of the ranking.
                print('Error ', hit.get('fields', {}).get('no'))

        print("Query id: ", id, " - Classes: ", class_, " - Hits: ", hits, " - ", xml_file)

        # The flattened labels of the first n hits do not depend on k, so
        # build them once per n instead of once per (k, n) pair.
        labels_by_n = {
            n: [label for labels in hit_list[:n] for label in labels]
            for n in n_list
        }

        for k in k_list:
            for n in n_list:
                histogram_res = frequency_histogram(labels_by_n[n], k)
                outcome = 'positive' if class_ in histogram_res else 'negative'
                process_result(accuracy_dict, k, n, outcome)

print_process_result(accuracy_dict, k_list, n_list)
matrix = transform_process_result(accuracy_dict, k_list, n_list)
print("Accuracy by k and n")
print(matrix)


Initializing the ranking strategy based on sum of occurrencies!!!

Image:  1  ->  ./datasets/road/test/103_png_jpg.rf.57f21999ee94bfa091a9ac8ee7289954.jpg


  result = elastic_client.search(


Query id:  1  - Classes:  D40  - Hits:  100  -  103_png_jpg.rf.57f21999ee94bfa091a9ac8ee7289954.xml
Query id:  2  - Classes:  D40  - Hits:  100  -  103_png_jpg.rf.57f21999ee94bfa091a9ac8ee7289954.xml
Query id:  3  - Classes:  D00  - Hits:  100  -  103_png_jpg.rf.57f21999ee94bfa091a9ac8ee7289954.xml
Image:  2  ->  ./datasets/road/test/109_png_jpg.rf.d86165eeac5d7517ddeb964c3df1e409.jpg
Query id:  4  - Classes:  D40  - Hits:  100  -  109_png_jpg.rf.d86165eeac5d7517ddeb964c3df1e409.xml
Query id:  5  - Classes:  D40  - Hits:  100  -  109_png_jpg.rf.d86165eeac5d7517ddeb964c3df1e409.xml
Query id:  6  - Classes:  D40  - Hits:  100  -  109_png_jpg.rf.d86165eeac5d7517ddeb964c3df1e409.xml
Query id:  7  - Classes:  D00  - Hits:  100  -  109_png_jpg.rf.d86165eeac5d7517ddeb964c3df1e409.xml
Query id:  8  - Classes:  D00  - Hits:  100  -  109_png_jpg.rf.d86165eeac5d7517ddeb964c3df1e409.xml
Query id:  9  - Classes:  D40  - Hits:  100  -  109_png_jpg.rf.d86165eeac5d7517ddeb964c3df1e409.xml
Query id:  1