In [16]:
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import os
from img2vec_pytorch import Img2Vec
from PIL import Image
from urllib.parse import urlparse, unquote

In [17]:
# Connection settings are read from the environment so real credentials do not
# have to be committed with the notebook. The previous literals are kept as
# fallbacks so the notebook still runs unchanged against a local dev cluster.
elastic_client = Elasticsearch(
    hosts=[os.environ.get('ELASTICSEARCH_URL', 'http://localhost:9200')],
    basic_auth=(os.environ.get('ELASTICSEARCH_USER', 'elastic'),
                os.environ.get('ELASTICSEARCH_PASSWORD', 'master')))
# Feature extractor (library defaults) used to turn query images into vectors.
model = Img2Vec()

In [18]:
def getSimilarity(vector:list, embedding_field:str, index_name:str, size:int, k:int, candidate:int):
    """Run a kNN search against an Elasticsearch index.

    Args:
        vector: Query embedding, sent as the kNN ``query_vector``.
        embedding_field: Name of the indexed vector field to search.
        index_name: Index to query.
        size: Value of the request's ``size`` option (maximum hits returned).
        k: Number of nearest neighbours requested from the kNN search.
        candidate: Value of the kNN ``num_candidates`` option.

    Returns:
        Whatever ``elastic_client.search`` returns. The request asks for the
        ``no``, ``class_label`` and ``class_id`` fields, which callers read
        from ``hit["fields"]``.
    """
    query = {
        "size": size,
        "knn": {
            "field": str(embedding_field),
            "query_vector": vector,
            "k": k,
            "num_candidates": candidate,
        },
        "fields": ["no", "class_label", "class_id"],
        # Must be the boolean False to switch _source retrieval off: a string
        # value is treated by the _source option as a field-name pattern.
        "_source": False,
    }
    return elastic_client.search(index=index_name, body=query)

In [5]:
#def frequency_histogram(subclasses:list, k:int):
#    result = {}
#    ctr=1
#    for value, key in sorted(((subclasses.count(e), e) for e in set(subclasses)), reverse=True):
#        if (ctr > k): break
#        result[key] = value
#        ctr+=1
#        
#    return result

In [6]:
# def score_histogram(subclasses:list, k:int):
#     temp_result = {}
#     result = {}
#     ctr=1
#     for entry in subclasses:
#         elements = entry.split(";")
#         subclass_list = elements[0].split(",")
#         for subclass in subclass_list:
#             if (subclass not in temp_result):
#                 temp_result[subclass] = float(elements[1])
#             else:
#                 temp_result[subclass] = temp_result.get(subclass) + float(elements[1])

#     for key, value in sorted(temp_result.items(), key=lambda x:x[1], reverse=True):
#         if (ctr > k): break
#         result[key] = value
#         ctr+=1

#     return result

In [19]:
def histogram(classes:list, k:int):
    """Return the ``k`` most frequent values of ``classes`` with their counts.

    The result is a dict mapping value -> number of occurrences, inserted
    from most to least frequent. Ties on the count are ordered by the value
    itself, descending.
    """
    # (count, value) pairs so that a plain descending sort ranks by frequency
    # first and by value second.
    ranked = [(classes.count(label), label) for label in set(classes)]
    ranked.sort(reverse=True)
    return {
        label: count
        for position, (count, label) in enumerate(ranked, start=1)
        if position <= k
    }

In [20]:
def process_result(accuracy_dict, k, n, type):
    """Increment the outcome counter stored under the key ``"<k>-<n>-<type>"``.

    ``accuracy_dict`` is updated in place; a missing key starts at 1.
    """
    counter_key = f"{k}-{n}-{type}"
    accuracy_dict[counter_key] = accuracy_dict.get(counter_key, 0) + 1

In [21]:
def get_process_result(accuracy_dict, k, n, type):
    """Return the counter stored under ``"<k>-<n>-<type>"``, or 0 if absent."""
    counter_key = f"{k}-{n}-{type}"
    return accuracy_dict.get(counter_key, 0)

In [22]:
def print_process_result(accuracy_dict, k_list, n_list):
    """Print positive/negative counts and accuracy for every (k, n) pair.

    Args:
        accuracy_dict: Counters keyed ``"<k>-<n>-positive|negative"``, read
            through ``get_process_result``.
        k_list: Values of k to report (outer loop).
        n_list: Values of n to report (inner loop).

    Accuracy is ``positive / (positive + negative)``. When a pair has no
    recorded outcome at all, ``nan`` is printed instead of raising
    ``ZeroDivisionError``.
    """
    for k in k_list:
        for n in n_list:
            positive = get_process_result(accuracy_dict, k, n, 'positive')
            negative = get_process_result(accuracy_dict, k, n, 'negative')
            total = positive + negative
            # Nothing counted for this (k, n): accuracy is undefined.
            accuracy = positive / total if total else float('nan')
            print("k={} - n={} - Positive: {} - Negative: {} - "
                "Accuracy: {} ".format(k,n,positive,negative,accuracy))

In [23]:
def transform_process_result(accuracy_dict, k_list, n_list):
    """Build the accuracy matrix for every (k, n) pair.

    Args:
        accuracy_dict: Counters keyed ``"<k>-<n>-positive|negative"``, read
            through ``get_process_result``.
        k_list: Values of k; one matrix row per entry, in order.
        n_list: Values of n; one matrix column per entry, in order.

    Returns:
        A ``numpy`` array of shape ``(len(k_list), len(n_list))`` holding
        ``positive / (positive + negative)``. Cells with no recorded outcome
        are ``nan`` instead of raising ``ZeroDivisionError``.
    """
    matrix = np.zeros((len(k_list), len(n_list)))
    for i, k in enumerate(k_list):
        for j, n in enumerate(n_list):
            positive = get_process_result(accuracy_dict, k, n, 'positive')
            negative = get_process_result(accuracy_dict, k, n, 'negative')
            total = positive + negative
            # Nothing counted for this (k, n): accuracy is undefined.
            matrix[i][j] = positive / total if total else np.nan
    return matrix

In [24]:
def generate_embedding(filename):
    """Return the embedding of an image file as a plain Python list.

    The image is opened, converted to RGB, passed to ``model.get_vec`` and
    the result is converted with ``.tolist()``.
    """
    # Image.open is lazy and keeps the file open; the context manager releases
    # the handle as soon as the RGB copy has been produced.
    with Image.open(filename) as source:
        img = source.convert("RGB")
    vec = model.get_vec(img)
    return vec.tolist()

In [13]:
#def get_files(path:str, extension:str):
#    file_list = [f for f in os.listdir(path) if f.endswith(extension)]
#    return file_list

In [14]:
#def get_classes(path:str, file_name:str):
#    tree = ET.parse(path+file_name)
#    root = tree.getroot()
#    classes=[]
#    for child in root:
#        if (child.tag == 'object'):
#            for new_child in child:
#                if (new_child.tag == 'name'):
#                    classes.append(new_child.text)
#    return classes

In [25]:
# Dataset / index selection. To run against the small sample instead, use:
#   dataset_path = "datasets/small/images/"
#   dataset_file = "datasets/small/sb_test.csv"
#   index_name = "bs_index"
# dataset_path is prepended to each image file name, so it must keep its
# trailing slash; dataset_file is the CSV describing the test images.
dataset_path, dataset_file = (
    "datasets/full/images/",
    "datasets/full/sb_test.csv",
)
# Elasticsearch index to query and the name of its vector field.
index_name, field = "bs_index_full", "embedding"

In [26]:
# low_memory=False has to be passed to read_csv to take effect (it was
# previously a stand-alone assignment that pandas never saw). It makes pandas
# infer column types from the whole file instead of chunk by chunk.
df = pd.read_csv(dataset_file, low_memory=False)
df

Unnamed: 0,foto,class_id,padrao_construtivo
0,https://geo-criciuma-fotos.s3-us-east-2.amazon...,3,Médio
1,https://geo-criciuma-fotos.s3-us-east-2.amazon...,5,Baixo
2,https://geo-criciuma-fotos.s3-us-east-2.amazon...,4,Médio Baixo
3,https://geo-criciuma-fotos.s3-us-east-2.amazon...,4,Médio Baixo
4,https://geo-criciuma-fotos.s3-us-east-2.amazon...,4,Médio Baixo
...,...,...,...
280,https://geo-criciuma-fotos.s3-us-east-2.amazon...,4,Médio Baixo
281,https://geo-criciuma-fotos.s3-us-east-2.amazon...,3,Médio
282,https://geo-criciuma-fotos.s3-us-east-2.amazon...,2,Médio Alto
283,https://geo-criciuma-fotos.s3-us-east-2.amazon...,4,Médio Baixo


In [27]:
def extract_filename(url):
    """Return the decoded last path segment (the file name) of ``url``.

    Query string and fragment are ignored because only the parsed path is
    used. Trailing slashes are dropped before taking the last segment, and
    percent-encoded characters are decoded.
    """
    path = urlparse(url).path
    # Ignore trailing slashes so ".../img.jpg/" still yields "img.jpg", then
    # decode percent-escapes such as %20.
    return unquote(path.rstrip("/").split("/")[-1])

In [28]:
# Derive the local image file name of every row from its photo URL.
df['arquivo'] = df['foto'].map(extract_filename)
df

Unnamed: 0,foto,class_id,padrao_construtivo,arquivo
0,https://geo-criciuma-fotos.s3-us-east-2.amazon...,3,Médio,1_1568663546679.jpg
1,https://geo-criciuma-fotos.s3-us-east-2.amazon...,5,Baixo,y_1570043910041.jpg
2,https://geo-criciuma-fotos.s3-us-east-2.amazon...,4,Médio Baixo,13_1606482873688.jpg
3,https://geo-criciuma-fotos.s3-us-east-2.amazon...,4,Médio Baixo,60_1572867183701.jfif
4,https://geo-criciuma-fotos.s3-us-east-2.amazon...,4,Médio Baixo,6_1571915256945.jfif
...,...,...,...,...
280,https://geo-criciuma-fotos.s3-us-east-2.amazon...,4,Médio Baixo,4_1571676077736.jfif
281,https://geo-criciuma-fotos.s3-us-east-2.amazon...,3,Médio,foto_fachada_1604491446238.jpg
282,https://geo-criciuma-fotos.s3-us-east-2.amazon...,2,Médio Alto,01_1592499630633.jpg
283,https://geo-criciuma-fotos.s3-us-east-2.amazon...,4,Médio Baixo,08_1606482642722.jpg


In [15]:
# Evaluates kNN classification accuracy over the test set: every test image is
# embedded and used as a query; for each (k, n) pair the query counts as
# 'positive' when its own class_id is among the k most frequent class ids of
# the first n hits, and as 'negative' otherwise.
print("Index name: ",index_name)
k_list = [1,2,3,4,5]
n_list = [1,10,25,50,75,100]
accuracy_dict = {}
max_n = 100
candidate = 100
hits = 0
hit_list = []
ctr_queries = 0
for index, row in df.iterrows():
    ctr_queries+=1
    image_file = dataset_path+row.arquivo
    query_vector = generate_embedding(image_file)
    # Fetch max_n neighbours once; smaller n values reuse a prefix of the hits.
    result = getSimilarity(query_vector, field, index_name, max_n, max_n, candidate)

    # class_id of every returned neighbour, in the order the hits came back.
    hit_list = [hit["fields"]["class_id"][0] for hit in result['hits']['hits']]
    hits = len(hit_list)

    print("Query id: "+str(ctr_queries)+" - Class label: "+ row.padrao_construtivo + 
          " - Class id: "+ str(row.class_id)  +" - Hits: "+str(hits)) 

    # Compared as a string, matching how class ids come back from the index
    # (histogram keys look like '4').
    expected_class = str(row.class_id)
    for k in k_list: 
        for n in n_list:
            histogram_res = histogram(hit_list[:n], k)
            outcome = 'positive' if expected_class in histogram_res else 'negative'
            process_result(accuracy_dict, k, n, outcome)

print_process_result(accuracy_dict, k_list, n_list)
print("Accuracy by k and n")
matrix = transform_process_result(accuracy_dict, k_list, n_list)
print(matrix)

Index name:  bs_index_full


  result = elastic_client.search(


Query id: 1 - Class label: Médio - Class id: 3 - Hits: 100
Query id: 2 - Class label: Baixo - Class id: 5 - Hits: 100
Query id: 3 - Class label: Médio Baixo - Class id: 4 - Hits: 100
Query id: 4 - Class label: Médio Baixo - Class id: 4 - Hits: 100
Query id: 5 - Class label: Médio Baixo - Class id: 4 - Hits: 100
Query id: 6 - Class label: Médio Baixo - Class id: 4 - Hits: 100
Query id: 7 - Class label: Médio Baixo - Class id: 4 - Hits: 100
Query id: 8 - Class label: Médio Baixo - Class id: 4 - Hits: 100
Query id: 9 - Class label: Médio Alto - Class id: 2 - Hits: 100
Query id: 10 - Class label: Médio Baixo - Class id: 4 - Hits: 100
Query id: 11 - Class label: Médio - Class id: 3 - Hits: 100
Query id: 12 - Class label: Baixo - Class id: 5 - Hits: 100
Query id: 13 - Class label: Médio - Class id: 3 - Hits: 100
Query id: 14 - Class label: Médio - Class id: 3 - Hits: 100
Query id: 15 - Class label: Médio - Class id: 3 - Hits: 100
Query id: 16 - Class label: Médio - Class id: 3 - Hits: 100
Qu

In [43]:
# Runs one query for a single test row and prints the class histogram of its
# nearest neighbours. It deliberately leaves accuracy_dict alone so that
# results accumulated elsewhere are not discarded by running this cell.
print("Index name: ",index_name)
k = 5
n = 10
candidate = 100

# Position of the test row to inspect (named so the builtin id() stays usable).
row_position = 280
row = df.iloc[row_position]
print(row)

image_file = dataset_path+row.arquivo
query_vector = generate_embedding(image_file)
result = getSimilarity(query_vector, field, index_name, n, n, candidate)
#print(result)

# class_id of every returned neighbour, in the order the hits came back.
hit_list = [hit["fields"]["class_id"][0] for hit in result['hits']['hits']]
hits = len(hit_list)

print("Query - Class label: "+ row.padrao_construtivo + 
      " - Class id: "+ str(row.class_id)  +" - Hits: "+str(hits)) 

histogram_res = histogram(hit_list[:n], k)
print(histogram_res)
#print(histogram_res)
# if (str(row.class_id) in histogram_res): 
#     process_result(accuracy_dict, k, n, 'positive')
# else:
#     process_result(accuracy_dict, k, n, 'negative')

# print_process_result(accuracy_dict, k_list, n_list)
# print("Accuracy by k and n")
# matrix = transform_process_result(accuracy_dict, k_list, n_list)
# print(matrix)

Index name:  bs_index_full
foto                  https://geo-criciuma-fotos.s3-us-east-2.amazon...
class_id                                                              3
padrao_construtivo                                                Médio
arquivo               936118aa-80d3-4177-b105-f865eeb8d6fe_159664441...
Name: 100, dtype: object
Query - Class label: Médio - Class id: 3 - Hits: 10
{'4': 6, '3': 4}


  result = elastic_client.search(


In [22]:
# #Performs queries to group returned patent subclasses for each input patent
# #Ranking strategy based on sum of occurrencies taking into account the reverse mode 
# k_list = [1,2,3,4,5,6,7,8,9,10]
# n_list = [10,25,50,75,100]
# accuracy_dict = {}
# max_n = 100
# candidate = 100
# id = 0

# print("\nInitializing the ranking strategy based on sum of occurrencies!!!\n")
# for index, row in df.iterrows():
#     subclass_list = []
#     hit_list = []

#     id += 1
#     image_file = dataset_path+row.arquivo
#     classes = row.padrao_construtivo
#     vector = generate_embedding(image_file)
    
#     result = getSimilarity(vector, field, index_name, max_n, max_n, candidate)

#     hit_list.clear()
#     hits=0

#     subclass_dict = {}
#     for hit in result['hits']['hits']:
#         try:
#             hit_list.append(hit["fields"]["classes"])
#         except:
#             print('Error ',hit['fields']['no'])
#         hits+=1

#     print("Query id: "+str(id)+" - Classes: "+classes+" - Hits: "+str(hits)+" - "+row.arquivo) #This last part concatenate the subclasses using the ";" character

#     for k in k_list: 
#         for n in n_list:
#             ctr_hit = 0
#             for subclass in hit_list:
#                 ctr_hit+=1
#                 if (ctr_hit > n): break
#                 subclass_list.extend(subclass)

#             histogram_res = frequency_histogram(subclass_list, k)
#             subclass_list.clear()

#             for classid in classes:
#                 if (classid in histogram_res): 
#                     process_result(accuracy_dict, k, n, 'positive')
#                 else:
#                     process_result(accuracy_dict, k, n, 'negative')
    
# print_process_result(accuracy_dict, k_list, n_list)
# matrix = transform_process_result(accuracy_dict, k_list, n_list)
# print("Accuracy by k and n")
# print(matrix)