In [35]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
import pandas as pd

#### Define Retriever Class
Given the inputs below performs the retrieval step of RAG:
- model_name: for the embeddings (see: https://www.sbert.net/docs/pretrained_models.html)
- sentences: a list of strings to be embedded
- similarity_metric: a function to evaluate similarities (defaults to cosine_similarity)
- query: prompt against which to 
</br></br>
Methods:
    - make_embeddings():
      + maps original sentences to embedding space; fills attribute "embeddings"
    - retrieve(query, n):
      + finds the n closest sentences in embedding space according to the provided similarity_metric

In [36]:
from typing import List, Callable

class Retriever:
    def __init__(self, 
                 model_name: str, 
                 sentences: List[str],
                 similarity_metric: Callable = cosine_similarity,
                 split_sentences: bool = False
                 ):
        self.model = SentenceTransformer(model_name)
        self.sentences = sentences
        self.metric = similarity_metric
        self.split_sentences = split_sentences
        self.embeddings = None

    def _sentence_token_length(self, sentence: str):
        return len(self.model.tokenizer.tokenize(sentence))
    
    def split_sentence(self, sentence: str):
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.model.max_seq_length, 
            chunk_overlap=20, 
            length_function=self._sentence_token_length
        )
        return splitter.split_text(sentence)
    
    def prep_sentences(self):
        # Split sentences into chunks such that each chunk is less than the max token length of the model
        self.sentences = [self.split_sentence(s) for s in self.sentences]
        self.sentences = [s for sublist in self.sentences for s in sublist]

    def make_embeddings(self):
        if self.split_sentences:
            self.prep_sentences()
        self.embeddings = self.model.encode(self.sentences)

    def retrieve(self, query: str, n: int = 10):
        if self.embeddings is None:
            self.make_embeddings()
        query_embedding = self.model.encode([query])
        scores = self.metric(query_embedding, self.embeddings)
        top_n_idx = np.argsort(scores[0])[::-1][:n]
        return [
            {
                'response': self.sentences[i],
                'score': scores[0][i] 
            }
            for i in top_n_idx
        ]

Load Best Buy Worker Data

In [3]:
with open('../../data/BestBuyWorkers.json', 'r') as f:
    workers = json.load(f)

Format sentences from title and text

In [4]:
def format_reddit_entry(entry: dict) -> str:
    title = entry['reddit_title'].strip() if entry['reddit_title'] else ''
    text = entry['reddit_text'].strip() if entry['reddit_text'] else ''
    text = text.replace('\n', ' ')
    if len(title)>0:
        if len(text)>0:
            return title+'; '+text
        return title
    else:
        if len(text)>0:
            return text
    return None
sentences = [format_reddit_entry(worker) for worker in workers]
# clean out the None entries
sentences = [sentence for sentence in sentences if sentence]

In [7]:
original_sentences = sentences

In [5]:
sentence_length = [len(sentence.split()) for sentence in sentences]
idx = np.argmax(sentence_length)
print(sentence_length[idx])
print(sentences[idx])


1665


In [6]:
retriever = Retriever('multi-qa-mpnet-base-dot-v1', sentences)

In [17]:
sentences



In [15]:
print(self.prompts.get(self.default_prompt_name, None))

None


In [13]:
print(prompt)

(None,)


In [6]:
retriever = Retriever('multi-qa-mpnet-base-dot-v1', sentences)

In [24]:
retriever.model.tokenizer.tokenize(original_sentences[idx])

['retaliation',
 'prevention',
 'policy',
 ':',
 'a',
 'former',
 'employee',
 'story',
 ';',
 'i',
 'began',
 'my',
 'employment',
 'at',
 'one',
 'of',
 'orlando',
 '’',
 's',
 'best',
 'buy',
 'stores',
 'in',
 '2018',
 '.',
 'following',
 'a',
 'year',
 'after',
 ',',
 'i',
 'was',
 'sexually',
 'harassed',
 'by',
 'my',
 'supervisor',
 'in',
 'late',
 '2019',
 '.',
 'after',
 'reporting',
 'that',
 'incident',
 ',',
 'my',
 'employment',
 'at',
 'best',
 'buy',
 'took',
 'the',
 'worst',
 'of',
 'turns',
 '.',
 'while',
 'the',
 'store',
 'as',
 'a',
 'whole',
 'could',
 'agree',
 'that',
 'my',
 'supervisor',
 '’',
 's',
 'overall',
 'behavior',
 'at',
 'work',
 'was',
 'questionable',
 '(',
 'just',
 'before',
 'being',
 'promoted',
 'as',
 'my',
 'supervisor',
 ',',
 'he',
 'had',
 'been',
 'documented',
 'for',
 'threatening',
 'to',
 'physically',
 'harm',
 'a',
 'customer',
 '.',
 ')',
 ',',
 'his',
 'sales',
 'for',
 'the',
 'store',
 'were',
 'competitive',
 '##ly',
 'high

In [23]:
len(retriever.model.tokenizer.tokenize(original_sentences[idx]))

1971

In [20]:
out = retriever.model.tokenize(original_sentences[idx])
print(out['input_ids'].shape)

torch.Size([9366, 3])


In [21]:
print(len(out))

2


In [19]:
embedding = retriever.model.encode(sentences[idx])
print(embedding.shape)

(768,)


In [18]:
print(out['attention_mask'].shape)

torch.Size([9366, 3])


In [11]:
out = retriever.model.tokenize(sentences[0])
print(out['input_ids'].shape)

torch.Size([37, 3])


In [34]:
retriever.model.max_seq_length

512

In [34]:
def sentence_token_length(sentence: str):
    return len(retriever.model.tokenizer.tokenize(sentence))

splitter = RecursiveCharacterTextSplitter(chunk_size=retriever.model.max_seq_length, chunk_overlap=20, length_function=sentence_token_length)

# Example text that you want to split
#text = "This is an example document that contains several sentences. The goal is to split this document into parts that do not exceed the maximum sequence length allowed by the transformer model."
text = original_sentences[idx]
# Split the text
parts = splitter.split_text(text)

# Print the split parts
for i, part in enumerate(parts, 1):
    print(f"Part {i}: {part}")


Part 1: Retaliation Prevention Policy: A Former Employee Story; I began my employment at one of Orlando’s Best Buy stores in 2018. Following a year after, I was sexually harassed by my supervisor in late 2019. After reporting that incident, my employment at Best Buy took the worst of turns.  While the store as a whole could agree that my supervisor’s overall behavior at work was questionable (just before being promoted as my supervisor, he had been documented for threatening to physically harm a customer.), his sales for the store were competitively high.  My former supervisor had the highest amount of Total Tech Support sales out of all the employees in the store, while his moods were not always pleasant, he was a crucial part of the team since he helped carry the store’s revenue.  During his short time as my supervisor, he crossed several boundaries which led to his termination. I do want to mention that during the process, the human resources representative assigned to my case durin

In [37]:
splitter.split_text('test123 hello my name is; what?!')

['test123 hello my name is; what?!']

In [27]:
help(TextSplitter)

Help on class TextSplitter in module langchain_text_splitters.base:

class TextSplitter(langchain_core.documents.transformers.BaseDocumentTransformer, abc.ABC)
 |  TextSplitter(chunk_size: 'int' = 4000, chunk_overlap: 'int' = 200, length_function: 'Callable[[str], int]' = <built-in function len>, keep_separator: 'bool' = False, add_start_index: 'bool' = False, strip_whitespace: 'bool' = True) -> 'None'
 |  
 |  Interface for splitting text into chunks.
 |  
 |  Method resolution order:
 |      TextSplitter
 |      langchain_core.documents.transformers.BaseDocumentTransformer
 |      abc.ABC
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, chunk_size: 'int' = 4000, chunk_overlap: 'int' = 200, length_function: 'Callable[[str], int]' = <built-in function len>, keep_separator: 'bool' = False, add_start_index: 'bool' = False, strip_whitespace: 'bool' = True) -> 'None'
 |      Create a new TextSplitter.
 |      
 |      Args:
 |          chunk_size: Maximum size

In [33]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter

# Load your SentenceTransformer model
tokenizer = retriever.model.tokenizer

# Initialize TextSplitter with the tokenizer from SentenceTransformer
text_splitter = RecursiveCharacterTextSplitter(
    tokenizer = tokenizer,
    chunk_size = 512,
    chunk_overlap  = 50
)

# Example text that you want to split
text = "This is an example document that contains several sentences. The goal is to split this document into parts that do not exceed the maximum sequence length allowed by the transformer model."

# Split the text
parts = text_splitter.split_text(text)

# Print the split parts
for i, part in enumerate(parts, 1):
    print(f"Part {i}: {part}")


TypeError: TextSplitter.__init__() got an unexpected keyword argument 'tokenizer'

In [33]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [52]:
from langchain.text_splitter import SpacyTextSplitter
text_splitter = SpaCyTextSplitter()
docs = text_splitter.split_text(text)

NameError: name 'SpaCyTextSplitter' is not defined

In [53]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 512,
    chunk_overlap  = 50
)
docs = text_splitter.create_documents([sentences[idx]])
print(len(docs[0].page_content))

508


In [43]:
len(docs[0].page_content)

508

In [None]:
retriever.model.

In [25]:
sentences[idx]



In [32]:
tokenizer = retriever.model.tokenizer
''.join(tokenizer.decode(seg) for seg in out['input_ids'])

'<s> r </s><s> e </s><s> t </s><s> a </s><s> l </s><s> i </s><s> a </s><s> t </s><s> i </s><s> o </s><s> n </s><s> </s> <pad><s> p </s><s> r </s><s> e </s><s> v </s><s> e </s><s> n </s><s> t </s><s> i </s><s> o </s><s> n </s><s> </s> <pad><s> p </s><s> o </s><s> l </s><s> i </s><s> c </s><s> y </s><s> : </s><s> </s> <pad><s> a </s><s> </s> <pad><s> f </s><s> o </s><s> r </s><s> m </s><s> e </s><s> r </s><s> </s> <pad><s> e </s><s> m </s><s> p </s><s> l </s><s> o </s><s> y </s><s> e </s><s> e </s><s> </s> <pad><s> s </s><s> t </s><s> o </s><s> r </s><s> y </s><s> ; </s><s> </s> <pad><s> i </s><s> </s> <pad><s> b </s><s> e </s><s> g </s><s> a </s><s> n </s><s> </s> <pad><s> m </s><s> y </s><s> </s> <pad><s> e </s><s> m </s><s> p </s><s> l </s><s> o </s><s> y </s><s> m </s><s> e </s><s> n </s><s> t </s><s> </s> <pad><s> a </s><s> t </s><s> </s> <pad><s> o </s><s> n </s><s> e </s><s> </s> <pad><s> o </s><s> f </s><s> </s> <pad><s> o </s><s> r </s><s> l </s><s> a </s><s> n </s><s> d </s><s>

In [7]:
retriever.model.tokenize(['hello world'])

{'input_ids': tensor([[   0, 7596, 2092,    2]]),
 'attention_mask': tensor([[1, 1, 1, 1]])}

In [None]:
retriever.make_embeddings()


In [41]:
query = 'What do employees of Best Buy think of the company?'
#query = 'What do employees share about promotions at Best Buy?'
responses = retriever.retrieve(query, n=20)

print(f'Query: {query}\n---')
for response in responses:
    print(f'{response["score"]:.3f}: {response["response"]}')

Query: What do employees of Best Buy think of the company?
---
0.802: What is your opinion on Best Buy currently.
0.759: Why are you at Best Buy?
0.750: Don’t listen to this guy, I work there and the team environment is outstanding everyone stands around talking to each other and let’s the antisocial people ring up the customers. You’ll enjoy Best Buy as long as you aren’t antisocial and you actually enjoy technology
0.738: Life as a Best Buy worker 💀
0.732: Being a veteran and Best Buy employee, I can see what you and OP mean. It really depends on the individual, but the blanket statement is accurate enough.
0.721: Best Buy used to be a great company to work for before Corie the CEO took over. Employees used to love working there, staffing was great, it wasn’t all about stupid sales metrics. Times have changed
0.719: And one last thing: have an open mind. Best Buy will not be a career for 98% of current employees. Use what skills and knowledge you learn here to springboard yourself in

Naive Generation using HuggingFace

In [2]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFaceHub
import os

huggingfacehub_api_token = os.getenv('HUGGINGFACE_API_TOKEN')

template = """
Please construct an answer to the question: '{query}' by summarizing the testimonials provided 
in the responses below (delimeted by --).

Responses: {responses}
```
"""

prompt = PromptTemplate(template=template, input_variables=["query", "responses"])

In [87]:
llm = HuggingFaceHub(repo_id='tiiuae/falcon-7b-instruct', huggingfacehub_api_token=huggingfacehub_api_token)
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [90]:
out = llm_chain.run(query=query, 
              responses='--'.join(['']+[response['response'] for response in responses[:8]])
)

In [93]:
display(out.split('```')[1].strip())

"The employees of Best Buy have mixed opinions about the company. Some employees enjoy the team environment and the opportunity to interact with customers, while others find it difficult to work with antisocial individuals. The majority of employees have a positive attitude towards the company, but some are concerned about the company's future due to recent financial struggles."