In [1]:
import json
from typing import Optional

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


#### Define Retriever Class
Given the inputs below, performs the retrieval step of RAG (retrieval-augmented generation):
- model_name: for the embeddings (see: https://www.sbert.net/docs/pretrained_models.html)
- sentences: a list of strings to be embedded
- similarity_metric: a function to evaluate similarities (defaults to cosine_similarity)
- query: prompt against which to rank the embedded sentences (passed to `retrieve`)
<br><br>
Methods:
    - make_embeddings():
      + maps original sentences to embedding space; fills attribute "embeddings"
    - retrieve(query, n):
      + finds the n closest sentences in embedding space according to the provided similarity_metric

In [2]:
from typing import List, Callable

class Retriever:
    """Retrieval step of a RAG pipeline over a fixed list of sentences.

    The sentences are embedded with a SentenceTransformer model; a query is
    embedded with the same model and the sentences are ranked by
    ``similarity_metric(query_embedding, sentence_embeddings)``.

    Args:
        model_name: Name handed to ``SentenceTransformer`` to load the
            embedding model.
        sentences: The strings to embed and search.
        similarity_metric: Callable invoked as ``metric(query_emb, corpus_emb)``;
            row 0 of its result is read as the per-sentence scores.
            Defaults to ``cosine_similarity``.
    """

    def __init__(self, 
                 model_name: str, 
                 sentences: List[str],
                 similarity_metric: Callable = cosine_similarity
                 ):
        self.model = SentenceTransformer(model_name)
        self.sentences = sentences
        self.metric = similarity_metric
        # Filled by make_embeddings(); None means "not computed yet".
        self.embeddings = None

    def make_embeddings(self):
        """Encode ``self.sentences`` and store the result in ``self.embeddings``."""
        self.embeddings = self.model.encode(self.sentences)

    def retrieve(self, query: str, n: int = 10):
        """Return up to ``n`` sentences ranked by descending similarity to ``query``.

        Embeddings are computed on first use if ``make_embeddings()`` has not
        been called yet.

        Args:
            query: Text to compare the sentences against.
            n: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            A list of ``{'response': sentence, 'score': score}`` dicts, best
            match first. Empty when there are no sentences or ``n <= 0``.
        """
        # Nothing to rank. Without this guard an empty corpus is passed on to
        # the similarity metric (which presumably rejects empty input -- the
        # default cosine_similarity validates its arrays), and a negative n
        # would silently mean "all but the last |n|" through the slice below.
        if n <= 0 or len(self.sentences) == 0:
            return []

        if self.embeddings is None:
            self.make_embeddings()

        query_embedding = self.model.encode([query])
        # One query was encoded, so row 0 holds its score against each sentence.
        query_scores = self.metric(query_embedding, self.embeddings)[0]
        # argsort is ascending: reverse for best-first, then keep the first n.
        top_n_idx = np.argsort(query_scores)[::-1][:n]
        return [
            {
                'response': self.sentences[i],
                'score': query_scores[i]
            }
            for i in top_n_idx
        ]

Load Best Buy Worker Data

In [3]:
# Decode the JSON file explicitly as UTF-8. Without `encoding`, open() uses the
# platform's default text encoding, which is not UTF-8 everywhere; sequences
# such as "â€™" in retrieved text are what UTF-8 looks like when decoded as cp1252.
with open('../../data/BestBuyWorkers.json', 'r', encoding='utf-8') as f:
    workers = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: '../data/BestBuyWorkers.json'

Format sentences from title and text

In [6]:
def format_reddit_entry(entry: dict) -> Optional[str]:
    """Build a single-line sentence from a Reddit entry's title and text.

    Args:
        entry: Mapping that may hold ``'reddit_title'`` and ``'reddit_text'``.
            A missing key, ``None`` or an empty/whitespace-only value counts
            as "no content" for that field.

    Returns:
        ``"<title>; <text>"`` when both fields have content, the single
        non-empty field when only one does, and ``None`` when neither does.
        Newlines inside the text are replaced by spaces.
    """
    # `.get(...) or ''` covers a missing key, None and '' in one step.
    title = (entry.get('reddit_title') or '').strip()
    # Strip first, then flatten inner newlines so the sentence stays on one line.
    text = (entry.get('reddit_text') or '').strip().replace('\n', ' ')

    parts = [part for part in (title, text) if part]
    return '; '.join(parts) if parts else None
# Format every worker record and keep only the truthy results, i.e. drop the
# records for which format_reddit_entry produced nothing.
sentences = [
    sentence
    for sentence in map(format_reddit_entry, workers)
    if sentence
]

In [33]:
# Build the retriever over the formatted sentences.
retriever = Retriever(
    model_name='multi-qa-mpnet-base-dot-v1',
    sentences=sentences,
)
# Embed the corpus now, so retrieve() does not have to do it on its first call.
retriever.make_embeddings()


In [41]:
# Question to run against the corpus; an alternative is kept below for reference.
query = 'What do employees of Best Buy think of the company?'
#query = 'What do employees share about promotions at Best Buy?'

# Fetch the 20 best-scoring sentences for the query.
responses = retriever.retrieve(query=query, n=20)

# Show the query, then one "score: sentence" line per hit, best first.
print(f'Query: {query}\n---')
for response in responses:
    print(f'{response["score"]:.3f}: {response["response"]}')

Query: What do employees of Best Buy think of the company?
---
0.802: What is your opinion on Best Buy currently.
0.759: Why are you at Best Buy?
0.750: Don’t listen to this guy, I work there and the team environment is outstanding everyone stands around talking to each other and let’s the antisocial people ring up the customers. You’ll enjoy Best Buy as long as you aren’t antisocial and you actually enjoy technology
0.738: Life as a Best Buy worker 💀
0.732: Being a veteran and Best Buy employee, I can see what you and OP mean. It really depends on the individual, but the blanket statement is accurate enough.
0.721: Best Buy used to be a great company to work for before Corie the CEO took over. Employees used to love working there, staffing was great, it wasn’t all about stupid sales metrics. Times have changed
0.719: And one last thing: have an open mind. Best Buy will not be a career for 98% of current employees. Use what skills and knowledge you learn here to springboar

Naive Generation using HuggingFace

In [2]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFaceHub
import os

# Take the Hugging Face token from the environment (None when the variable is unset).
huggingfacehub_api_token = os.environ.get('HUGGINGFACE_API_TOKEN')

# Prompt with two placeholders: the user question and the retrieved responses.
# The closing ``` line is the marker the answer-extraction cell splits on.
template = """
Please construct an answer to the question: '{query}' by summarizing the testimonials provided 
in the responses below (delimeted by --).

Responses: {responses}
```
"""

prompt = PromptTemplate(
    input_variables=["query", "responses"],
    template=template,
)

In [87]:
# Hosted Falcon-7B-Instruct model, authenticated with the token read earlier.
llm = HuggingFaceHub(
    huggingfacehub_api_token=huggingfacehub_api_token,
    repo_id='tiiuae/falcon-7b-instruct',
)
# Chain that fills the prompt template and sends it to the model.
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [90]:
# Pass the 8 best hits to the model, each one prefixed with the "--" delimiter
# that the prompt template announces.
out = llm_chain.run(
    query=query,
    responses=''.join('--' + response['response'] for response in responses[:8]),
)

In [93]:
# The prompt ends with a ``` line, so when the model output repeats the prompt
# the answer is the piece after the first fence. If the output has no fence at
# all, show the whole text instead of failing with an IndexError on [1].
segments = out.split('```')
display((segments[1] if len(segments) > 1 else segments[0]).strip())

"The employees of Best Buy have mixed opinions about the company. Some employees enjoy the team environment and the opportunity to interact with customers, while others find it difficult to work with antisocial individuals. The majority of employees have a positive attitude towards the company, but some are concerned about the company's future due to recent financial struggles."