In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from src.doc_processor_helper import get_all_data_paths, get_sub_documents
from src.embedding_helper import EmbeddingHelper
from src.opensearch import OpenSearchClient

# Run One

In [None]:
## Get Data
# Process a single PDF: the entry at index 2 of the discovered data paths.
path_pdfs = get_all_data_paths()
path_pdf = path_pdfs[2]
# Split the PDF, prefix every chunk's text with 'passage: ', then keep the first 300.
sub_documents = [
    'passage: ' + chunk.page_content
    for chunk in get_sub_documents(path_pdf, chunk_size=1250, chunk_overlap=100)
][:300]

In [None]:
## Get Embeddings
model_id='intfloat/e5-small-v2' # this model needs "passage" and "query" prepended to the text
eh = EmbeddingHelper(model_id=model_id)
# Embed the prefixed texts from the previous cell; chunk_size=10 is presumably the
# number of texts sent to the model per batch -- TODO confirm in EmbeddingHelper.
embeddings_list = eh.get_embeddings_batch(sub_documents, chunk_size=10)

In [None]:
## Write to OpenSearch
osc = OpenSearchClient(model_id=model_id)
# overwrite=True: if an index for this model already exists it is deleted and
# recreated, so anything indexed earlier for this model is lost.
osc.create_index(overwrite=True)
# Hand the embeddings, their source texts, and the PDF's file name to the client.
osc.index_documents(embeddings_list, sub_documents, filename=path_pdf.name)

# Run Batch

In [3]:
from pathlib import Path
def write_embeddings(
    model_id:str,
    path_pdf:Path,
    osc:OpenSearchClient,
    prepend:bool,
    *,
    chunk_size:int=1250,
    chunk_overlap:int=100,
    max_sub_documents:"int | None"=300,
    batch_size:int=10,
):
    """Split one PDF into sub-documents, embed them, and index them through ``osc``.

    Args:
        model_id: Embedding model identifier handed to ``EmbeddingHelper``.
        path_pdf: Path of the PDF to process; ``path_pdf.name`` is passed to
            ``osc.index_documents`` as ``filename``.
        osc: Client whose ``index_documents`` receives the embeddings and texts.
            This function does not create the index itself.
        prepend: If True, every text is prefixed with ``'passage: '`` before
            embedding (the notebook does this for ``intfloat/e5-small-v2``);
            if False, the texts are embedded unchanged.
        chunk_size: Forwarded to ``get_sub_documents`` (default 1250).
        chunk_overlap: Forwarded to ``get_sub_documents`` (default 100).
        max_sub_documents: Keep at most this many leading sub-documents
            (default 300); ``None`` keeps all of them.
        batch_size: Forwarded to ``EmbeddingHelper.get_embeddings_batch`` as its
            ``chunk_size`` argument (default 10).

    Raises:
        AssertionError: If the number of embeddings returned differs from the
            number of texts that were embedded.
    """
    chunks = get_sub_documents(path_pdf, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = [chunk.page_content for chunk in chunks]
    if prepend:
        texts = ['passage: ' + text for text in texts]
    # Slicing with None keeps the whole list, so None disables the cap.
    texts = texts[:max_sub_documents]

    eh = EmbeddingHelper(model_id=model_id)
    embeddings_list = eh.get_embeddings_batch(texts, chunk_size=batch_size)

    # Raised explicitly (rather than via `assert`) so the check is not stripped
    # under `python -O`; the exception type is unchanged for existing callers.
    if len(texts) != len(embeddings_list):
        raise AssertionError('lengths do not match for '+str(path_pdf))
    osc.index_documents(embeddings_list, texts, filename=path_pdf.name)

In [4]:
model_id='intfloat/e5-small-v2' # this model needs "passage" and "query" prepended to the text
path_pdfs = get_all_data_paths()
osc = OpenSearchClient(model_id=model_id)
# overwrite=True: an existing index for this model is deleted and recreated,
# so every run of this cell starts from an empty index.
osc.create_index(overwrite=True)
for path_pdf in path_pdfs:
    print(f'Writing {path_pdf.name} to OpenSearch')
    # prepend=True adds the 'passage: ' prefix to each text before it is embedded.
    write_embeddings(model_id, path_pdf, osc, prepend=True)
    print(f'Finished writing {path_pdf.name} to OpenSearch')

2023-09-05 21:24:42,484 - src.logger - INFO - found 5 pdfs in /Users/meninderpurewal/My Drive/Mikey/Code/Retrieval/data
2023-09-05 21:24:42,569 - src.logger - INFO - {'name': '87fee34ad386', 'cluster_name': 'docker-cluster', 'cluster_uuid': '-a_peNY5SsWnBbMYhkB_OA', 'version': {'distribution': 'opensearch', 'number': '2.6.0', 'build_type': 'tar', 'build_hash': '7203a5af21a8a009aece1474446b437a3c674db6', 'build_date': '2023-02-24T18:58:37.352296474Z', 'build_snapshot': False, 'lucene_version': '9.5.0', 'minimum_wire_compatibility_version': '7.10.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}
2023-09-05 21:24:42,605 - src.logger - INFO - intfloat-e5-small-v2-index exists
2023-09-05 21:24:42,692 - src.logger - INFO - deleted intfloat-e5-small-v2-index because it already existed
2023-09-05 21:24:42,693 - src.logger - INFO - Now creating intfloat-e5-small-v2-index


Writing amazon_10k_2022.pdf to OpenSearch


2023-09-05 21:24:51,433 - src.logger - INFO - 1 doc inputted
2023-09-05 21:24:51,434 - src.logger - INFO - first doc has 282140 chars
2023-09-05 21:24:51,434 - src.logger - INFO - # of sub-docs from all 1 docs is 288
100%|██████████| 288/288 [00:02<00:00, 118.75it/s]
2023-09-05 21:25:19,020 - src.logger - INFO - Done
2023-09-05 21:25:19,036 - src.logger - INFO - [{'epoch': '1693963519', 'timestamp': '01:25:19', 'count': '288'}]


Finished writing amazon_10k_2022.pdf to OpenSearch
Writing google_10k_2022.pdf to OpenSearch


2023-09-05 21:25:24,013 - src.logger - INFO - 1 doc inputted
2023-09-05 21:25:24,013 - src.logger - INFO - first doc has 334124 chars
2023-09-05 21:25:24,013 - src.logger - INFO - # of sub-docs from all 1 docs is 341
100%|██████████| 300/300 [00:02<00:00, 141.61it/s]
2023-09-05 21:25:50,435 - src.logger - INFO - Done
2023-09-05 21:25:50,455 - src.logger - INFO - [{'epoch': '1693963550', 'timestamp': '01:25:50', 'count': '588'}]


Finished writing google_10k_2022.pdf to OpenSearch
Writing meta_10k_2022.pdf to OpenSearch


2023-09-05 21:26:03,268 - src.logger - INFO - 1 doc inputted
2023-09-05 21:26:03,269 - src.logger - INFO - first doc has 490502 chars
2023-09-05 21:26:03,269 - src.logger - INFO - # of sub-docs from all 1 docs is 540
100%|██████████| 300/300 [00:02<00:00, 146.72it/s]
2023-09-05 21:26:28,651 - src.logger - INFO - Done
2023-09-05 21:26:28,678 - src.logger - INFO - [{'epoch': '1693963588', 'timestamp': '01:26:28', 'count': '888'}]


Finished writing meta_10k_2022.pdf to OpenSearch
Writing tesla_10k_2022.pdf to OpenSearch


2023-09-05 21:26:54,534 - src.logger - INFO - 1 doc inputted
2023-09-05 21:26:54,534 - src.logger - INFO - first doc has 868975 chars
2023-09-05 21:26:54,534 - src.logger - INFO - # of sub-docs from all 1 docs is 926
100%|██████████| 300/300 [00:02<00:00, 147.41it/s]
2023-09-05 21:27:17,867 - src.logger - INFO - Done
2023-09-05 21:27:17,916 - src.logger - INFO - [{'epoch': '1693963637', 'timestamp': '01:27:17', 'count': '1188'}]


Finished writing tesla_10k_2022.pdf to OpenSearch
Writing apple_10k_2022.pdf to OpenSearch


2023-09-05 21:27:22,353 - src.logger - INFO - 1 doc inputted
2023-09-05 21:27:22,353 - src.logger - INFO - first doc has 287558 chars
2023-09-05 21:27:22,353 - src.logger - INFO - # of sub-docs from all 1 docs is 311
100%|██████████| 300/300 [00:02<00:00, 121.82it/s]
2023-09-05 21:27:50,856 - src.logger - INFO - Done
2023-09-05 21:27:50,872 - src.logger - INFO - [{'epoch': '1693963670', 'timestamp': '01:27:50', 'count': '1488'}]


Finished writing apple_10k_2022.pdf to OpenSearch


In [5]:
# Index every PDF again with a second embedding model; the client is built with this
# model_id and works against a separate index (thenlper-gte-small-index in the logs).
model_id='thenlper/gte-small'
path_pdfs = get_all_data_paths()
osc = OpenSearchClient(model_id=model_id)
# overwrite=True: an existing index for this model is deleted and recreated.
osc.create_index(overwrite=True)
for path_pdf in path_pdfs:
    print(f'Writing {path_pdf.name} to OpenSearch')
    # prepend=False: texts are embedded as-is, without the 'passage: ' prefix.
    write_embeddings(model_id, path_pdf, osc, prepend=False)
    print(f'Finished writing {path_pdf.name} to OpenSearch')

2023-09-05 21:27:51,803 - src.logger - INFO - found 5 pdfs in /Users/meninderpurewal/My Drive/Mikey/Code/Retrieval/data
2023-09-05 21:27:51,887 - src.logger - INFO - {'name': '87fee34ad386', 'cluster_name': 'docker-cluster', 'cluster_uuid': '-a_peNY5SsWnBbMYhkB_OA', 'version': {'distribution': 'opensearch', 'number': '2.6.0', 'build_type': 'tar', 'build_hash': '7203a5af21a8a009aece1474446b437a3c674db6', 'build_date': '2023-02-24T18:58:37.352296474Z', 'build_snapshot': False, 'lucene_version': '9.5.0', 'minimum_wire_compatibility_version': '7.10.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}
2023-09-05 21:27:51,914 - src.logger - INFO - thenlper-gte-small-index exists
2023-09-05 21:27:52,010 - src.logger - INFO - deleted thenlper-gte-small-index because it already existed
2023-09-05 21:27:52,011 - src.logger - INFO - Now creating thenlper-gte-small-index


Writing amazon_10k_2022.pdf to OpenSearch


2023-09-05 21:28:00,102 - src.logger - INFO - 1 doc inputted
2023-09-05 21:28:00,102 - src.logger - INFO - first doc has 282140 chars
2023-09-05 21:28:00,102 - src.logger - INFO - # of sub-docs from all 1 docs is 288
100%|██████████| 288/288 [00:02<00:00, 120.53it/s]
2023-09-05 21:28:27,604 - src.logger - INFO - Done
2023-09-05 21:28:27,630 - src.logger - INFO - [{'epoch': '1693963707', 'timestamp': '01:28:27', 'count': '288'}]


Finished writing amazon_10k_2022.pdf to OpenSearch
Writing google_10k_2022.pdf to OpenSearch


2023-09-05 21:28:32,811 - src.logger - INFO - 1 doc inputted
2023-09-05 21:28:32,811 - src.logger - INFO - first doc has 334124 chars
2023-09-05 21:28:32,812 - src.logger - INFO - # of sub-docs from all 1 docs is 341
100%|██████████| 300/300 [00:02<00:00, 128.82it/s]
2023-09-05 21:28:58,898 - src.logger - INFO - Done
2023-09-05 21:28:58,937 - src.logger - INFO - [{'epoch': '1693963738', 'timestamp': '01:28:58', 'count': '588'}]


Finished writing google_10k_2022.pdf to OpenSearch
Writing meta_10k_2022.pdf to OpenSearch


2023-09-05 21:29:11,485 - src.logger - INFO - 1 doc inputted
2023-09-05 21:29:11,485 - src.logger - INFO - first doc has 490502 chars
2023-09-05 21:29:11,485 - src.logger - INFO - # of sub-docs from all 1 docs is 540
100%|██████████| 300/300 [00:02<00:00, 123.41it/s]
2023-09-05 21:29:36,853 - src.logger - INFO - Done
2023-09-05 21:29:36,868 - src.logger - INFO - [{'epoch': '1693963776', 'timestamp': '01:29:36', 'count': '888'}]


Finished writing meta_10k_2022.pdf to OpenSearch
Writing tesla_10k_2022.pdf to OpenSearch


2023-09-05 21:30:02,765 - src.logger - INFO - 1 doc inputted
2023-09-05 21:30:02,765 - src.logger - INFO - first doc has 868975 chars
2023-09-05 21:30:02,765 - src.logger - INFO - # of sub-docs from all 1 docs is 926
100%|██████████| 300/300 [00:02<00:00, 114.83it/s]
2023-09-05 21:30:25,919 - src.logger - INFO - Done
2023-09-05 21:30:25,932 - src.logger - INFO - [{'epoch': '1693963825', 'timestamp': '01:30:25', 'count': '1188'}]


Finished writing tesla_10k_2022.pdf to OpenSearch
Writing apple_10k_2022.pdf to OpenSearch


2023-09-05 21:30:30,600 - src.logger - INFO - 1 doc inputted
2023-09-05 21:30:30,601 - src.logger - INFO - first doc has 287558 chars
2023-09-05 21:30:30,601 - src.logger - INFO - # of sub-docs from all 1 docs is 311
100%|██████████| 300/300 [00:02<00:00, 123.26it/s]
2023-09-05 21:30:58,798 - src.logger - INFO - Done
2023-09-05 21:30:58,830 - src.logger - INFO - [{'epoch': '1693963858', 'timestamp': '01:30:58', 'count': '1488'}]


Finished writing apple_10k_2022.pdf to OpenSearch


In [6]:
# estimate size of db
# docs * sub-docs * vector size * 4 bytes * 2 models
#   5 PDFs, capped at 300 sub-docs each (an upper bound: one PDF produced only 288),
#   vector size 384 (presumably the embedding dimension of both models -- confirm),
#   4 bytes per value (presumably float32 storage -- confirm), one index per model.
# Counts raw vector data only; stored text and index overhead are not included.
5 * 300 * 384 * 4 * 2 / 1024 / 1024 # in MB

4.39453125

In [7]:
import torch