In [1]:
#pip install torch
#pip install sentence-transformers
import os
from collections import Counter

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
model = SentenceTransformer('all-MiniLM-L6-v2') #Loads the pre-trained SBERT model  
from elasticsearch.helpers import bulk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Connection settings are read from the environment so that real credentials do
# not have to be committed with the notebook. The fallbacks are the original
# local-development values, so the cell behaves as before when nothing is set.
elastic_client = Elasticsearch(
    hosts=[os.environ.get("ELASTICSEARCH_URL", "http://localhost:9200")],
    basic_auth=(
        os.environ.get("ELASTICSEARCH_USER", "elastic"),
        os.environ.get("ELASTICSEARCH_PASSWORD", "master"),
    ),
)

1_dataset_title_abstract_200_16_teste

In [13]:
# Load the evaluation set; the preview below shows two columns, `examiner`
# (the label) and `text` (presumably title + abstract, going by the file name).
patent_data = pd.read_csv("datasets/patents/1_dataset_title_abstract_200_16_teste.csv")
print(patent_data.head())

# Elasticsearch index that the similarity queries run against.
index_name  = "patents_1"

# Number of rows in the evaluation set (one query is issued per row later on).
rows = patent_data.shape[0]
print(rows)

      examiner                                               text
0  Examiner 01  cyclonic separator for separating particles fr...
1  Examiner 01  air filtration device. an air filtration devic...
2  Examiner 01  carbon dioxide enhanced complex-adsorption pro...
3  Examiner 01  device for the separation of liquid and/or sol...
4  Examiner 01  drift eliminator, light trap, and method of fo...
3200


In [4]:
def getSimilarity(vector:list, embedding_field:str, index_name:str, size:int, k:int, candidate:int):
    """Run a kNN search against ``index_name`` and return the raw response.

    Uses the module-level ``elastic_client``.

    Args:
        vector: Query embedding, sent as ``knn.query_vector``.
        embedding_field: Name of the indexed vector field to search (``knn.field``).
        index_name: Index to query.
        size: Maximum number of hits to return (``size``).
        k: Number of nearest neighbours requested (``knn.k``).
        candidate: Value sent as ``knn.num_candidates``.

    Returns:
        Whatever ``elastic_client.search`` returns. Each hit carries the
        requested ``no`` and ``examiner`` values under ``hit["fields"]``.
    """
    result = elastic_client.search(
        index=index_name,
        body={
            "size": size,
            "knn": {
                "field": embedding_field,
                "query_vector": vector,
                "k": k,
                "num_candidates": candidate
            },
            "fields": ["no", "examiner"],
            # Must be the boolean: the string "false" is read as a source
            # include pattern instead of switching `_source` off.
            "_source": False
        }
    )
    return result

In [5]:
def histogram(examiner:list, k:int):
    """Return the ``k`` most frequent values of ``examiner`` with their counts.

    Args:
        examiner: Sequence of hashable labels (one entry per hit).
        k: Maximum number of distinct labels to keep; ``k <= 0`` yields ``{}``.

    Returns:
        dict mapping label -> occurrence count, inserted from most to least
        frequent. Ties on count are broken by the label itself in descending
        order, because the ranking sorts ``(count, label)`` pairs in reverse.
    """
    # Count every label in a single pass instead of calling list.count()
    # once per distinct label.
    counts = Counter(examiner)
    ranked = sorted(((count, label) for label, count in counts.items()), reverse=True)

    result = {}
    for rank, (count, label) in enumerate(ranked, start=1):
        if rank > k:
            break
        result[label] = count
    return result

In [6]:
def process_result(accuracy_dict, k, n, type):
    """Add one to the tally kept in ``accuracy_dict`` for ``(k, n, type)``.

    The tally lives under the string key ``"<k>-<n>-<type>"``; a missing key
    starts at 1. The dictionary is modified in place and nothing is returned.
    """
    key = f"{k}-{n}-{type}"
    accuracy_dict[key] = accuracy_dict.get(key, 0) + 1

In [8]:
def get_process_result(accuracy_dict, k, n, type):
    """Return the tally stored for ``(k, n, type)``, or 0 if none was recorded.

    The lookup key is the string ``"<k>-<n>-<type>"``.
    """
    key = f"{k}-{n}-{type}"
    return accuracy_dict.get(key, 0)

In [9]:
def print_process_result(accuracy_dict, k_list, n_list):
    """Print positive/negative tallies and accuracy for every ``(k, n)`` pair.

    Args:
        accuracy_dict: Tallies as written by ``process_result``.
        k_list: Values of ``k`` to report, in output order.
        n_list: Values of ``n`` to report for each ``k``.

    Accuracy is ``positive / (positive + negative)``. When a pair has no
    recorded outcomes the accuracy is printed as ``nan``.
    """
    for k in k_list:
        for n in n_list:
            positive = get_process_result(accuracy_dict, k, n, 'positive')
            negative = get_process_result(accuracy_dict, k, n, 'negative')
            total = positive + negative
            # A pair that was never evaluated has total == 0; report NaN
            # rather than raising ZeroDivisionError.
            accuracy = positive / total if total else float('nan')
            print("k={} - n={} - Positive: {} - Negative: {} - "
                "Accuracy: {} ".format(k,n,positive,negative,accuracy))

In [11]:
def transform_process_result(accuracy_dict, k_list, n_list):
    """Build the accuracy matrix for every ``(k, n)`` pair.

    Args:
        accuracy_dict: Tallies as written by ``process_result``.
        k_list: Values of ``k``; one matrix row each, in this order.
        n_list: Values of ``n``; one matrix column each, in this order.

    Returns:
        ``numpy.ndarray`` of shape ``(len(k_list), len(n_list))`` where entry
        ``[i][j]`` is ``positive / (positive + negative)`` for
        ``(k_list[i], n_list[j])``, or NaN when that pair has no recorded
        outcomes.
    """
    matrix = np.zeros((len(k_list), len(n_list)))
    for i, k in enumerate(k_list):
        for j, n in enumerate(n_list):
            positive = get_process_result(accuracy_dict, k, n, 'positive')
            negative = get_process_result(accuracy_dict, k, n, 'negative')
            total = positive + negative
            # A pair that was never evaluated has total == 0; store NaN
            # rather than raising ZeroDivisionError.
            matrix[i][j] = positive / total if total else np.nan
    return matrix

In [12]:
# Evaluation loop: for every patent in `patent_data`, embed its text, retrieve
# the nearest neighbours from the index, and check whether the patent's own
# examiner is among the k most frequent examiners of the first n hits.
# Outcomes are tallied per (k, n) in `accuracy_dict` and summarised at the end.
print("Index name: ",index_name)
# k: how many of the most frequent examiners are kept by histogram().
k_list = [1,2,3,4,5,6,7,8,9,10]
# n: how many leading hits are considered (hit_list[:n]).
n_list = [10,25,50,75,100]
accuracy_dict = {}
# max_n is passed as both `size` and `k` of getSimilarity; it equals max(n_list).
max_n = 100
# Passed as `candidate` (knn num_candidates) to getSimilarity.
candidate = 100
# NOTE(review): ctr_hit, positive, negative and examiner_list are not used
# anywhere in this cell — confirm whether a later cell needs them.
ctr_hit = 0
hits = 0
positive = negative = 0
examiner_list = []
hit_list = []
ctr_queries = 0 
for index, row in patent_data.iterrows():
    ctr_queries+=1
    # Embed the query text with the sentence-transformer loaded in the first cell.
    query_vector = model.encode(row.text).tolist()
    # NOTE(review): cannot tell from here whether the query patents are also
    # stored in the index; if they are, each query can retrieve itself — confirm.
    result = getSimilarity(query_vector, "embedding", index_name, max_n, max_n, candidate)

    # Collect the examiner of every returned hit, reusing the same list object.
    hit_list.clear()
    hits=0
    for hit in result['hits']['hits']:
        hit_list.append(hit["fields"]["examiner"][0])
        hits+=1

    # One progress line per query (string concatenation requires row.examiner to be a str).
    print("Query id: "+str(ctr_queries)+" - Examiner: "+ row.examiner  +" - Hits: "+str(hits)) 

    for k in k_list: 
        for n in n_list:
            # presumably hits arrive ordered by similarity, so hit_list[:n] is
            # the n closest — TODO confirm
            histogram_res = histogram(hit_list[:n], k)
            # Positive when the query's examiner is one of the top-k examiners.
            if (row.examiner  in histogram_res): 
                process_result(accuracy_dict, k, n, 'positive')
            else:
                process_result(accuracy_dict, k, n, 'negative')

# Summary: per-(k, n) tallies, then the accuracy matrix (rows = k_list, columns = n_list).
print_process_result(accuracy_dict, k_list, n_list)
print("Accuracy by k and n")
matrix = transform_process_result(accuracy_dict, k_list, n_list)
print(matrix)

Index name:  patents_1


  result = elastic_client.search(


Query id: 1 - Examiner: Examiner 01 - Hits: 100
Query id: 2 - Examiner: Examiner 01 - Hits: 100
Query id: 3 - Examiner: Examiner 01 - Hits: 100
Query id: 4 - Examiner: Examiner 01 - Hits: 100
Query id: 5 - Examiner: Examiner 01 - Hits: 100
Query id: 6 - Examiner: Examiner 01 - Hits: 100
Query id: 7 - Examiner: Examiner 01 - Hits: 100
Query id: 8 - Examiner: Examiner 01 - Hits: 100
Query id: 9 - Examiner: Examiner 01 - Hits: 100
Query id: 10 - Examiner: Examiner 01 - Hits: 100
Query id: 11 - Examiner: Examiner 01 - Hits: 100
Query id: 12 - Examiner: Examiner 01 - Hits: 100
Query id: 13 - Examiner: Examiner 01 - Hits: 100
Query id: 14 - Examiner: Examiner 01 - Hits: 100
Query id: 15 - Examiner: Examiner 01 - Hits: 100
Query id: 16 - Examiner: Examiner 01 - Hits: 100
Query id: 17 - Examiner: Examiner 01 - Hits: 100
Query id: 18 - Examiner: Examiner 01 - Hits: 100
Query id: 19 - Examiner: Examiner 01 - Hits: 100
Query id: 20 - Examiner: Examiner 01 - Hits: 100
Query id: 21 - Examiner: Exam