In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from typing import List
import pandas as pd

In [3]:
from src.opensearch import OpenSearchClient
from src.embedding_helper import EmbeddingHelper
from src.doc_processor_helper import get_all_data_paths
from transformers import pipeline
from src.logger import logger

# Running all pdfs

In [4]:
# config
models_retrieval = ['intfloat/e5-small-v2', 'thenlper/gte-small']
questions_retrieval = ['query: What is the name of the company?', 'What is the name of the company?']
k_retrieval = 20
query_mask = ".  The name of the company is [MASK]."
query_qa = "What is the name of the company?"
pipe_qa = pipeline("question-answering", model='distilbert-base-cased-distilled-squad') # QA
pipe_mlm = pipeline('fill-mask', model='bert-base-cased') # MLM
path_pdfs = get_all_data_paths()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-09-07 08:59:58,824 - src.logger - INFO - found 5 pdfs in /Users/meninderpurewal/My Drive/Mikey/Code/Retrieval/data


In [5]:
def extract(texts: List, query: str, pipe: pipeline, mlm: bool)->List:
    l = []
    for r in texts:
        txt = r.replace('passage: ', '')
        if mlm:
            txt = txt + query
            ans = pipe(txt)
            ans = [(a['token_str'], a['score']) for a in ans]
        else:
            ans = pipe(question=query, context=txt)
            ans = [(ans['answer'], ans['score'])]
        l.extend(ans)
    #l.sort(key=lambda x: x[1], reverse=True)
    return l

In [6]:
results = [] # list of lists [model_retrieval, path_pdf, response_score, response_text, mlm_score, mlm_value, qa_score, qa_value]

for idx_retrieval in [0,1]:
    model_id = models_retrieval[idx_retrieval]
    logger.info('Retrieval model_id: %s', model_id)
    eh = EmbeddingHelper(model_id=model_id)
    osc = OpenSearchClient(model_id=model_id)
    qe = eh.get_embeddings_query(questions_retrieval[idx_retrieval], model_id=model_id)
    for path_pdf in path_pdfs:
        logger.info('Retrieval for path_pdf: %s', path_pdf.name)
        response = osc.search_documents(qe, path_pdf.name, k=k_retrieval)
        response_text = [r[1] for r in response]

        logger.info('Extraction using MLM on %s: ', path_pdf.name)
        mlm = extract(response_text, query_mask, pipe_mlm, mlm=True)
        logger.info('Extraction using QA on %s: ', path_pdf.name)
        qa = extract(response_text, query_qa, pipe_qa, mlm=False)
        for r, m, q in zip(response, mlm, qa):
            results.append([model_id, path_pdf.name, r[0], r[1], m[1], m[0], q[1], q[0]])



2023-09-07 09:00:01,838 - src.logger - INFO - Retrieval model_id: intfloat/e5-small-v2
2023-09-07 09:00:02,261 - src.logger - INFO - Loaded model intfloat/e5-small-v2
2023-09-07 09:00:02,313 - src.logger - INFO - {'name': '87fee34ad386', 'cluster_name': 'docker-cluster', 'cluster_uuid': '-a_peNY5SsWnBbMYhkB_OA', 'version': {'distribution': 'opensearch', 'number': '2.6.0', 'build_type': 'tar', 'build_hash': '7203a5af21a8a009aece1474446b437a3c674db6', 'build_date': '2023-02-24T18:58:37.352296474Z', 'build_snapshot': False, 'lucene_version': '9.5.0', 'minimum_wire_compatibility_version': '7.10.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}
2023-09-07 09:00:02,313 - src.logger - INFO - Using index name intfloat-e5-small-v2-index
2023-09-07 09:00:02,313 - src.logger - INFO - Getting embeddings for query: What is the name of the company?
2023-09-07 09:00:02,337 - src.logger - INFO - Retrieval for path_pdf: amazon_10k_2022.pd

In [7]:
cols = ['model_id', 'path_pdf', 'response_score', 'response_text', 'mlm_score', 'mlm_value', 'qa_score', 'qa_value']
df = pd.DataFrame(results, columns=cols)

In [8]:
# save pandas dataframe as a pickle file
df.to_pickle("./data/results.pkl")
# load it back
# df = pd.read_pickle("./data/results.pkl")
# df.to_csv('./data/results.csv', index=False)


Analysis

In [9]:
df.sort_values(by=['mlm_score'], ascending=False, inplace=True)

In [24]:
for path_pdf in path_pdfs:
    print(path_pdf.name)
    df_pdf = df[df['path_pdf'] == path_pdf.name].copy()
    df_pdf.sort_values(by=['qa_score'], ascending=False, inplace=True)
    print(df_pdf.head(10))
    print('------------------')

amazon_10k_2022.pdf
               model_id             path_pdf  response_score  \
0    thenlper/gte-small  amazon_10k_2022.pdf        0.751230   
100  thenlper/gte-small  amazon_10k_2022.pdf        0.751230   
8    thenlper/gte-small  amazon_10k_2022.pdf        0.739897   
108  thenlper/gte-small  amazon_10k_2022.pdf        0.739897   
115  thenlper/gte-small  amazon_10k_2022.pdf        0.730996   
15   thenlper/gte-small  amazon_10k_2022.pdf        0.730996   
103  thenlper/gte-small  amazon_10k_2022.pdf        0.748023   
3    thenlper/gte-small  amazon_10k_2022.pdf        0.748023   
9    thenlper/gte-small  amazon_10k_2022.pdf        0.739638   
109  thenlper/gte-small  amazon_10k_2022.pdf        0.739638   

                                         response_text  mlm_score  \
0    cash on hand. We expect to fund the acquisitio...   0.032546   
100  cash on hand. We expect to fund the acquisitio...   0.032546   
8    We Face Risks Related to Adequately Protecting...   0.007496   

In [None]:
QUERY_COMPANY = '''What is the name of the company being discussed?'''
QUERY_REVENUE = '''What is the annual revenue of the company?'''
QUERY_TICKER = '''What is the ticker of the company?'''

# Run One pdf

## Retrieval

In [26]:
query = 'query: What should I name this company?'
query = 'query: What is the name of the company?'
#query = 'query: What is the annual revenue of the company?'
model_id='intfloat/e5-small-v2'
eh = EmbeddingHelper(model_id=model_id)
qe = eh.get_embeddings_query(query, model_id=model_id)

2023-09-07 11:53:54,335 - src.logger - INFO - Loaded model intfloat/e5-small-v2
2023-09-07 11:53:54,342 - src.logger - INFO - Getting embeddings for query: What should I name this company?


In [27]:
osc = OpenSearchClient(model_id=model_id)
response= osc.search_documents(qe, 'apple_10k_2022.pdf', k=20)

2023-09-07 11:53:54,936 - src.logger - INFO - {'name': '87fee34ad386', 'cluster_name': 'docker-cluster', 'cluster_uuid': '-a_peNY5SsWnBbMYhkB_OA', 'version': {'distribution': 'opensearch', 'number': '2.6.0', 'build_type': 'tar', 'build_hash': '7203a5af21a8a009aece1474446b437a3c674db6', 'build_date': '2023-02-24T18:58:37.352296474Z', 'build_snapshot': False, 'lucene_version': '9.5.0', 'minimum_wire_compatibility_version': '7.10.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}
2023-09-07 11:53:54,936 - src.logger - INFO - Using index name intfloat-e5-small-v2-index


In [28]:
response.sort(key=lambda x: x[0], reverse=True)
print(len(response))
response_text = [r[1] for r in response]

20


In [29]:
response[:3]

[(0.74294984,
  'passage: have a material adverse impact on the Company’s business, results of operations and financial condition.'),
 (0.73366475, 'passage: Apple Inc. | 2022 Form 10-K | 8'),
 (0.72973746,
  'passage: Apple Inc. | 2022 Form 10-K | 7\n\nBusiness Risks\n\nTo remain competitive and stimulate customer demand, the Company must successfully manage frequent introductions and transitions of products and services.')]

## Extraction

In [10]:
# extraction: MLM
pipe_mlm = pipeline('fill-mask', model='bert-base-cased') # MLM
l = []
for r in response_text:
    txt = r.replace('passage: ', '')
    txt = txt + ".  The name of the company is [MASK]."
    #txt = r + '''The company's annual revenue was $[MASK] million.'''
    ans = pipe_mlm(txt)
    ans = [(a['token_str'], a['score']) for a in ans]
    l.extend(ans)
l.sort(key=lambda x: x[1], reverse=True)
print("mlm:")
print(l)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


mlm:
[('Messenger', 0.7873184680938721), ('the', 0.35498788952827454), ('The', 0.34480974078178406), ('Government', 0.06383132934570312), ('unknown', 0.061747707426548004), ('LLC', 0.05567697808146477), ('unknown', 0.05320306494832039), ('Regulation', 0.048365090042352676), ('LLC', 0.04782203957438469), ('LLC', 0.04610437527298927), ('unknown', 0.029059216380119324), ('LLC', 0.029029149562120438), ('LLC', 0.02785695157945156), ('LLC', 0.025983288884162903), ('LLC', 0.02523551695048809), ('LLC', 0.024454236030578613), (':', 0.022109152749180794), ('Inc', 0.021660607308149338), ('regulated', 0.021331172436475754), ('LLC', 0.020701885223388672), ('LLC', 0.020700717344880104), ('Inc', 0.020132074132561684), ('LLC', 0.019516415894031525), ('unknown', 0.017448069527745247), ('LLC', 0.017401231452822685), ('Enterprise', 0.01704832911491394), ('LLC', 0.017036080360412598), ('[UNK]', 0.016596578061580658), ('[UNK]', 0.016510510817170143), ('.', 0.016416257247328758), ('LLC', 0.01609698869287967

In [11]:
# extraction: QA
pipe_qa = pipeline("question-answering", model='distilbert-base-cased-distilled-squad') # QA
l = []
question = query.replace('query: ', '')
for r in response_text:
    text = r.replace('passage: ', '')
    ans = pipe_qa(question=question, context=text)
    ans = [(ans['answer'], ans['score'])]
    l.extend(ans)
#l.sort(key=lambda x: x[1], reverse=True)
print("qa")
print(l)

qa
[('company', 0.06052592024207115), ('Texas', 0.09065748751163483), ('developers', 0.5530598163604736), ('IDPC', 0.00030595457064919174), ('Compensation', 0.0004959810175932944), ('Meta', 0.5752513408660889), ('Reality Labs', 0.7245837450027466), ('online', 0.009905783459544182), ('metaverse', 0.34949374198913574), ('antitrust investigation', 0.005311804823577404), ('California', 0.007139567285776138), ('online', 0.0031969475094228983), ('Intellectual Property', 0.9862333536148071), ('FTC', 0.13476067781448364), ('Apps Products\n\n\n\nFacebook', 0.02249363623559475), ('Executive Overview of Full Year 2022', 0.7630600333213806), ('Meta Platforms', 0.31121528148651123), ('Meta', 0.9851541519165039), ('Instagram and WhatsApp', 0.44499194622039795), ('Business', 0.422544002532959)]
