In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import ConnectionError, NotFoundError
from elasticsearch.helpers import bulk
import json
import pandas as pd
import numpy as np
import string
import random
import itertools

from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import json
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from elastic import Elastic


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the product-manual corpus once; later cells read it through the
# notebook-level name `corpus`.
# JSON interchange files are UTF-8 (RFC 8259), so decode explicitly instead of
# relying on the platform's locale encoding (e.g. cp1252 on Windows), which
# would garble or reject non-ASCII characters in the paragraph texts.
with open('./corpus.json', 'r', encoding='utf-8') as corpusfile:
    corpus = json.load(corpusfile)

In [3]:
# Instantiate the `Elastic` helper imported from the `elastic` module above.
# Later cells call it to delete/create the index, index documents and search.
elastic = Elastic()

In [4]:
# Product whose manual paragraphs are embedded and indexed below. Both values
# are passed to get_product_contexts, which compares them with the
# 'manufacturer' and 'product_id' fields of the corpus entries.
manufacturer = "RussellHobbs"
product_id = "25270_56"

In [5]:
def unique_id(size):
    """Return a random identifier made of ``size`` characters.

    Characters are drawn (with replacement) from the lowercase ASCII letters
    and digits, excluding the easily confused ``l``, ``i``, ``o``, ``0`` and
    ``1``.

    Note: despite the name, uniqueness is not guaranteed -- the value is
    random, so collisions are only unlikely, not impossible.

    Args:
        size: Number of characters to generate; ``0`` yields an empty string.

    Returns:
        str: The generated identifier.
    """
    # The exclusion set has to be lowercase: the alphabet contains no uppercase
    # letters, so excluding 'LIO' would silently leave l/i/o in place.
    # Sorting fixes the order of the alphabet, which makes the output
    # reproducible under random.seed() (the iteration order of a set of
    # strings can differ between interpreter runs).
    chars = sorted(set(string.ascii_lowercase + string.digits).difference('lio01'))
    return ''.join(random.choices(chars, k=size))



def get_product_contexts(manufacturer, product_id):
    """Collect the non-empty English paragraphs of one product's manual.

    Scans the notebook-level ``corpus`` for entries whose ``manufacturer``
    matches, then for products whose ``product_id`` matches, and gathers every
    English paragraph whose ``paragraphText`` is not the empty string. Each
    paragraph gets an id of the form
    ``<manufacturer>_<product_id>_<5 random characters>`` (via ``unique_id``).

    Args:
        manufacturer: Value compared with each corpus entry's ``manufacturer``.
        product_id: Value compared with each product's ``product_id``.

    Returns:
        pandas.DataFrame: Columns ``uid`` and ``text``, one row per paragraph
        in corpus order. When nothing matches, an empty frame with the same
        two columns is returned.
    """
    uids = []
    texts = []
    for entry in corpus:
        if entry['manufacturer'] != manufacturer:
            continue
        for product in entry['products']:
            if product['product_id'] != product_id:
                continue
            # A missing 'en' key, None and an empty list all mean that there
            # is no English text for this product.
            paragraphs = product['languages'].get('en') or []
            for paragraph in paragraphs:
                paragraph_text = paragraph['paragraphText']
                if paragraph_text == '':
                    continue
                uids.append(
                    entry['manufacturer'] + '_' + product['product_id'] + '_' + unique_id(5)
                )
                texts.append(paragraph_text)

    # Build the frame once, after the scan. This avoids re-creating it for
    # every paragraph and guarantees a well-formed (possibly empty) result
    # even when no paragraph was found.
    return pd.DataFrame({'uid': uids, 'text': texts}, columns=['uid', 'text'])

In [6]:
# Drop the "coffee" index; the following cell creates it again.
# NOTE(review): whether this call tolerates a missing index (first run against
# a fresh cluster) depends on Elastic.delete_index -- confirm.
elastic.delete_index(name="coffee")

Index coffee has been deleted successfully!


In [7]:
# Create the "coffee" index from the configuration file ./index_config.json.
# NOTE(review): presumably that file declares the vector field(s) queried by
# `retrieve` (e.g. 'finetuned_all_mini_embedding') -- confirm.
elastic.create_index(name="coffee", config_file="./index_config.json")

[INFO] index coffee has been created!


In [8]:
from embedders import (
    BertEmbedder,
    FinetunedAllMiniLMEmbedder,
    SwivelEmbedder,
    UniversalSentenceEmbedder,
    allMiniLMEmbedder,
)

# Instantiate every embedder up front. `retrieve` looks these objects up by
# their notebook-level names, so the names must stay exactly as they are.
bert = BertEmbedder()
swivel = SwivelEmbedder()
universal = UniversalSentenceEmbedder()
all_mini = allMiniLMEmbedder()
finetuned_all_mini = FinetunedAllMiniLMEmbedder()

# One row per non-empty English paragraph of the selected product.
contexts = get_product_contexts(manufacturer, product_id)

# Only the fine-tuned MiniLM embedding is computed for indexing. To index with
# another embedder, add a '<embedder>_embedding' column in the same way; the
# swivel / bert / universal_sentence results need `[0]` before `.tolist()`,
# exactly as `retrieve` does for the query vector.
contexts['finetuned_all_mini_embedding'] = contexts['text'].apply(
    lambda paragraph: finetuned_all_mini.embed_single_text(paragraph).tolist()
)





Loaded pre-trained model https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/2 successfully!
Loaded pre-trained model https://tfhub.dev/tensorflow/cord-19/swivel-128d/3 successfully!
Loaded pre-trained model https://tfhub.dev/google/universal-sentence-encoder/4 successfully!
Loaded pre-trained model sentence-transformers/all-MiniLM-L6-v2 successfully!
Loaded pre-trained model sentence-transformers/all-MiniLM-L6-v2 successfully!


In [9]:
# Inspect the frame: one row per paragraph with its uid, text and embedding.
contexts

Unnamed: 0,uid,text,finetuned_all_mini_embedding
0,RussellHobbs_25270_56_2vikf,GB DE FR NL IT ES PT DK SE NO FI RU CZ SK PL H...,"[-0.0713464766740799, 0.0666460320353508, -0.0..."
1,RussellHobbs_25270_56_ecrkf,"Follow basic safety precautions, including: Th...","[-0.07415638118982315, 0.06053953617811203, 0...."
2,RussellHobbs_25270_56_wpjt8,1. Lid 2. Slot 3. Filter holder 4. Paper filte...,"[0.021055476740002632, 0.033697012811899185, 0..."
3,RussellHobbs_25270_56_tz3he,"Fill the reservoir to the max mark, and run th...","[0.01735161803662777, 0.030152225866913795, 0...."
4,RussellHobbs_25270_56_wf4b5,1. Remove the carafe from the hotplate. 2. Ope...,"[0.030607910826802254, 0.047216761857271194, -..."
5,RussellHobbs_25270_56_jtjkz,It isn’t neccessary to set the clock to use yo...,"[-0.03425657004117966, 0.0227992944419384, -0...."
6,RussellHobbs_25270_56_qw2jk,Pressing the f button will make the brewed cof...,"[0.029520494863390923, -0.0193497184664011, -0..."
7,RussellHobbs_25270_56_eumy5,Note: If you want to use the coffee strength f...,"[0.0007969274884089828, 0.028213204815983772, ..."
8,RussellHobbs_25270_56_gxgpm,The status light will operate as follows: Brew...,"[-0.01741929166018963, -0.023695865646004677, ..."
9,RussellHobbs_25270_56_z6tg3,Note: Make sure the clock is set to the correc...,"[-0.022040795534849167, 0.016449250280857086, ..."


In [10]:
# Tag every row with the action to perform and the target index.
# NOTE(review): these two keys are presumably consumed by the bulk helper
# inside Elastic.index_documents -- confirm.
contexts['_op_type'], contexts['_index'] = "index", "coffee"

# Keep only the metadata, the uid and the embedding; the paragraph text itself
# is not part of the records that get indexed.
embeddings = contexts.loc[:, ['_op_type', '_index', 'uid', 'finetuned_all_mini_embedding']]

# One plain dict per row.
index_data = embeddings.to_dict('records')
print(len(index_data))


15


In [11]:
# Hand the prepared records (one dict per paragraph) to the Elastic helper for indexing.
elastic.index_documents(index_data)

In [12]:
def retrieve(query: str, n_returns: int, index_name: str, embedder):
    '''
    Embed ``query`` and return the best-matching indexed documents.

    The query is embedded with the selected notebook-level embedder and
    compared, via an Elasticsearch ``script_score`` query, with the
    ``'<embedder>_embedding'`` field of every document in ``index_name``.

    Args:
        query: Free-text query to embed.
        n_returns: Number of hits requested from ``elastic.search``.
        index_name: Name of the index to search.
        embedder: One of 'swivel', 'bert', 'universal_sentence', 'all_mini'
            or 'finetuned_all_mini'.

    Returns:
        numpy.ndarray: Object array of shape ``(n_hits, 2)`` whose rows are
        ``[uid, cosine_similarity]`` in the order returned by the search
        (``uid`` as str, the score as float), e.g.
            [['0', 0.963506],
             ['1', 0.9492332000000001],
             ['75', 0.9438302999999999]]
        The shape is ``(0, 2)`` when there are no hits.

    Raises:
        ValueError: If ``embedder`` is not one of the supported names.
    '''
    # Each entry turns the query text into a plain list of floats. The lambdas
    # keep the lookup lazy, so only the selected embedder object has to exist.
    embed_query = {
        'swivel': lambda text: swivel.embed_single_text(text)[0].tolist(),
        'bert': lambda text: bert.embed_single_text(text)[0].tolist(),
        'universal_sentence': lambda text: universal.embed_single_text(text)[0].tolist(),
        'all_mini': lambda text: all_mini.embed_single_text(text).tolist(),
        'finetuned_all_mini': lambda text: finetuned_all_mini.embed_single_text(text).tolist(),
    }
    # The isinstance guard keeps unhashable values on the ValueError path.
    if not isinstance(embedder, str) or embedder not in embed_query:
        raise ValueError(
            "embedder must be one of 'swivel', 'bert', 'universal_sentence', "
            f"'all_mini' or 'finetuned_all_mini', got {embedder!r}"
        )
    query_vector = embed_query[embedder](query)

    # `embedder` was validated against the fixed set of names above, so it is
    # safe to interpolate it into the script source.
    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": f"cosineSimilarity(params.query_vector, '{embedder}_embedding') + 1.0", # Add 1.0 because ES doesnt support negative score
                "params": {"query_vector": query_vector}
            }
        }
    }

    response = elastic.search(index_name, script_query, n_returns, list_fields_to_return=["uid"])

    # Undo the +1.0 offset so the reported value is the raw cosine similarity.
    res = [
        [hit["_source"]["uid"], hit["_score"] - 1.0]
        for hit in response["hits"]["hits"]
    ]
    # dtype=object keeps each score a float; a plain np.array() would coerce
    # the mixed str/float rows to an all-string array. The reshape keeps the
    # result two-dimensional even when there are no hits.
    return np.array(res, dtype=object).reshape(-1, 2)

In [13]:
"""
    Similarity search
"""

# Free-text query: embedded for retrieval here and reused as the `question`
# argument of the question-answering pipeline in a later cell.
query = "setting the clock"

# Request the 5 best hits from the "coffee" index; each returned row is
# [uid, cosine score].
relevant_contexts_uid = retrieve(query=query, n_returns=5, index_name="coffee", embedder='finetuned_all_mini')

In [16]:
# Map every retrieved uid back to its paragraph text, keeping the hit order.
# `hit[0]` is the uid; the first row of `contexts` carrying that uid is used.
relevant_contexts = [
    contexts.loc[contexts["uid"] == hit[0], "text"].iloc[0]
    for hit in relevant_contexts_uid
]
relevant_contexts

In [18]:
# Display the retrieved paragraphs (the same list the previous cell builds and shows).
relevant_contexts

['It isn’t neccessary to set the clock to use your coffee maker but you will need to set it if you want to use the timer functions. 1. Put the plug into the power socket. 2. Use the hr and min buttons to set the correct time in 24 hour format. • The clock will retain its settings until the coffee maker is unplugged.',
 'Note: Make sure the clock is set to the correct time. Use the hr and min buttons. 1. Set the timer to the time you want to start brewing. a) Press the T button. b) Use the hr and min buttons to set the time you want brewing to start. 5 If you don’t press either the hr or min buttons within 5 seconds, the coffee maker will revert to “normal”, and the current time will show. Press the T button to start again. 2. When the time is programmed, press the W button. The T button will light up and the T icon will appear on the display. Your coffee maker is now set to make coffee at the selected time. 3. If you want to use the coffee strength feature when using the timer, press t

In [19]:
"""
    Question answering
"""

# NOTE(review): `model` is not referenced by any later cell visible in this
# notebook -- confirm whether this load is still needed.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Question-answering pipeline; it is called below with `question=` and
# `context=` and its result is read through the 'score' and 'answer' keys.
nlp  = pipeline("question-answering", model="deepset/roberta-base-squad2")

Downloading: 100%|██████████| 571/571 [00:00<00:00, 71.3kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 496M/496M [01:02<00:00, 7.97MB/s] 
Downloading: 100%|██████████| 79.0/79.0 [00:00<?, ?B/s]
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 1.06MB/s] 
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 675kB/s] 
Downloading: 100%|██████████| 772/772 [00:00<00:00, 49.3kB/s]


In [37]:
# Run the QA pipeline once per retrieved paragraph, using the search query as
# the question, then order the results from highest to lowest 'score'.
answers = [nlp(question=query, context=passage) for passage in relevant_contexts]
answers.sort(key=lambda result: result['score'], reverse=True)


In [48]:
# Print the question together with each extracted answer, in the order of
# `answers` (sorted by descending score in the previous cell).
for result in answers:
    print("Question: ", query)
    print("--Answer:", result['answer'])
    print()


Question:  setting the clock
--Answer: the correct time

Question:  setting the clock
--Answer: The clock will retain its settings until the coffee maker is unplugged

Question:  setting the clock
--Answer: 3. To wake the appliance, press the min button

Question:  setting the clock
--Answer: press the f and T buttons together, and hold them for 3 seconds

Question:  setting the clock
--Answer: Press the W button

