### Load Models

In [1]:
import numpy as np, gensim, linecache, requests
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# --- Retriever ---------------------------------------------------------------
# Pre-computed patent embedding matrix plus a companion array that, going by
# its file name, holds the norm of each embedding (TODO confirm).
retriever_patent_vectors_B = np.load('./patent_embeddings_roberta_large.npy')
retriever_patent_vectors_B_norm = np.load('./patent_embeddings_roberta_large_norm.npy')

# Sentence encoder used to embed the question.
# NOTE(review): presumably the same model that produced the patent embeddings
# above -- confirm, otherwise the similarities are not comparable.
retriever_model = SentenceTransformer('stsb-roberta-large')

# --- Reader (extractive question answering) ----------------------------------
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

QA_model = pipeline('question-answering', tokenizer=tokenizer, model=model)
# Example call:
# QA_input = {
#     'question': 'Why is model conversion important?',
#     'context': 'The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.'
# }
# res = QA_model(QA_input)

### Compute the cosine similarity between the question and every patent, then keep the top patents most likely to answer it

In [20]:
# How many of the most similar patents to retrieve and run through the QA model
num_results = 10
# Natural-language question that is embedded for retrieval and passed to the QA model
question = "What do wireless communication device contain??"

In [21]:
# Rank every patent embedding by cosine similarity to the question and collect
# the ids of the `num_results` best matches.
#
# cosine similarity formula:
#   CS = (A . B) / (A_norm * B_norm)
#   A = embedding of the question
#   B = embedding from the patent embedding matrix

A = retriever_model.encode(question, show_progress_bar=False)
A_norm = np.linalg.norm(A)

# A . B for every patent at once
AB = A.dot(np.transpose(retriever_patent_vectors_B))
# A_norm * B_norm for every patent
A_norm_dot_B_norm = np.array(A_norm).dot(np.transpose(retriever_patent_vectors_B_norm))

# calculate cosine similarities
cosine_similarity_retriever = AB / A_norm_dot_B_norm

# argsort() sorts ascending, so the best matches sit at the tail: take the last
# `num_results` indices (0-based) and reverse them to get best-first order.
# num_results <= 0 is handled explicitly because a `[-0:]` slice would return
# every patent instead of none.
top_index = cosine_similarity_retriever.argsort()
top_index = top_index[-num_results:][::-1] if num_results > 0 else top_index[:0]

top_patents_id = []
for i in top_index:
    # linecache line numbers are 1-based, hence i+1; the id file is read as one
    # patent id per line, in the same order as the embedding rows (assumed --
    # TODO confirm against how the embeddings were built).
    patent_id = linecache.getline('./librarypatent_full_id_lines.txt', i+1)
    # getline() keeps the terminating newline of a found line; drop it.
    # NOTE: getline() returns '' on any error (missing file, line out of
    # range), which ends up as an empty id in the list.
    top_patents_id.append(patent_id[:-1])

In [30]:
# Debug view: re-encode the question, then print the raw embedding followed by
# its norm, one per line.
A = retriever_model.encode(question, show_progress_bar=False)
A_norm = np.linalg.norm(A)
print(A, A_norm, sep='\n')

[ 0.6836851   0.03060884  0.35621378 ...  0.06875013 -0.14872573
  1.1083986 ]
24.873655


In [32]:
# 0-based indices of the most similar patents, best match first
print(top_index)

[145866 660582 145345 208137 633421 177709  99029  23910 206886 252673]


### Retrieve and Divide Patent Text

In [23]:
# Fetch the top-ranked patents from the search service in a single multi-get
# request, keyed by patent id.
# NOTE(review): the endpoint is a hard-coded LAN address; consider moving it
# to a configuration cell.
r = requests.post(
    'http://192.168.50.213:3000/api/search/_mget',
    json={"ids": top_patents_id},
    # Without a timeout, requests waits forever if the service is unreachable,
    # which would hang the kernel.
    timeout=30,
)
# Fail here, with the HTTP status in the message, instead of later with an
# opaque KeyError / JSON error when the response body is parsed.
r.raise_for_status()

### Divide Patent text into lists

In [24]:
# Build one record per returned patent:
#   {'title', 'id', 'text', 'score', 'answer'}
# 'text' is the abstract followed by the independent claims, de-accented.
# 'score' and 'answer' start as placeholders (0 and '').
patent_text_list = []

for doc in r.json()['docs']:

    # Entries without a '_source' carry no patent text (presumably ids the
    # search service could not resolve -- TODO confirm); skip them.
    if '_source' not in doc:
        continue

    source = doc['_source']

    # extract text: abstract + all independent claims, joined by spaces
    claims = ' '.join(claim['text'] for claim in source['ind_claims'])
    text = gensim.utils.deaccent(source['abs'] + ' ' + claims)

    # The id is read straight into the record rather than into a variable
    # named `id`, which would shadow the builtin for the rest of the kernel.
    patent_text_list.append({
        'title': source['title'],
        'id': source['id'],
        'text': text,
        'score': 0,
        'answer': '',
    })

### Answer Question

In [25]:
# Shape of one QA pipeline result, for reference:
# {'score': 0.21171504259109497, 'start': 59, 'end': 84, 'answer': 'gives freedom to the user'}

for patent in patent_text_list:

    # Ask the question against this patent's text
    res = QA_model({ 'question': question, 'context': patent['text'] })

    print(res)

    # Store the extracted answer and its score on the patent record itself
    patent.update(score=res['score'], answer=res['answer'])

{'score': 0.2014942765235901, 'start': 4746, 'end': 4768, 'answer': 'reserved or empty bits'}
{'score': 0.18846938014030457, 'start': 37, 'end': 66, 'answer': 'a plurality of wireless units'}
{'score': 0.5465201139450073, 'start': 1677, 'end': 1697, 'answer': 'at least one antenna'}
{'score': 0.5028700828552246, 'start': 3576, 'end': 3588, 'answer': 'content data'}
{'score': 0.18052873015403748, 'start': 2920, 'end': 2984, 'answer': 'a memory configured to store at least one predetermined criteria'}
{'score': 0.18052873015403748, 'start': 2920, 'end': 2984, 'answer': 'a memory configured to store at least one predetermined criteria'}
{'score': 0.18052873015403748, 'start': 2920, 'end': 2984, 'answer': 'a memory configured to store at least one predetermined criteria'}
{'score': 0.10627047717571259, 'start': 5307, 'end': 5316, 'answer': 'impedance'}
{'score': 0.11491251736879349, 'start': 67, 'end': 92, 'answer': 'a position detection unit'}
{'score': 0.4374890625476837, 'start': 3718, 

In [26]:
# Order the patent records from highest to lowest QA score; the original
# patent_text_list is left untouched (records with equal scores keep their
# relative order).
answer_scores = list(patent_text_list)
answer_scores.sort(key=lambda record: record['score'], reverse=True)

In [27]:
# Top answer: answer_scores is sorted by descending QA score, so index 0 is the
# highest-scoring patent record. Raises IndexError if no patent was answered.
print(answer_scores[0])

{'title': 'Device, System and Method of Wireless Communication Over Non-Contiguous Channels', 'id': '8582551', 'text': '\nSome demonstrative embodiments include devices, systems and/or methods of wireless communication over non-contiguous channels. For example, a device may include a wireless communication unit capable of transmitting symbols of a wireless communication packet to a wireless communication device over a plurality of non-contiguous wireless communication channels, wherein the wireless communication unit is to transmit, as part of a preamble of the packet, signaling information defining transmission characteristics over the plurality of non-contiguous channels.\n An apparatus comprising: a wireless communication unit capable of transmitting symbols of a wireless communication packet to a wireless communication device over a plurality of non-contiguous wireless communication channels, wherein the wireless communication unit is to transmit, as part of a preamble of the packe