In [6]:
import os

from dotenv import load_dotenv

# Load environment variables through python-dotenv before any client is built;
# the PINECONE_API_KEY lookup in the next cell relies on this having run.
load_dotenv()

True

In [7]:
# Fail fast with a readable message when the key is missing.
# The previous self-assignment (os.environ[k] = os.getenv(k)) raised an opaque
# TypeError when the variable was unset and changed nothing when it was set,
# so only the presence check is kept. The key is deliberately not bound to a
# notebook variable so it cannot end up in a rendered output.
if not os.getenv('PINECONE_API_KEY'):
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )

In [31]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone import Pinecone as PineconeClient
from pinecone import ServerlessSpec

In [32]:
# Notebook-level Pinecone client, reused below to list indexes and to open the
# index handle.
# NOTE(review): no API key is passed here — presumably the client reads
# PINECONE_API_KEY from the environment; confirm against the client docs.
pc = PineconeClient()

In [33]:
# Show the indexes visible to this client (the recorded run printed an empty list).
pc.list_indexes()

[]

In [38]:
def create_pinecone_index(
    indexname: str,
    drop_index: bool = False,
    dimension: int = 1536,
    metric: str = 'dotproduct',
    cloud: str = 'aws',
    region: str = 'us-east-1',
) -> str:
    """Create a serverless Pinecone index unless one with that name already exists.

    Prints the names of the existing indexes, and a notice when the requested
    index is missing, before creating it.

    Args:
        indexname: Name of the index to create.
        drop_index: When True and the index already exists, delete it and
            create it again so the caller ends up with an empty index.
        dimension: Vector dimension of the index.
        metric: Similarity metric of the index.
        cloud: Cloud provider forwarded to ``ServerlessSpec``.
        region: Cloud region forwarded to ``ServerlessSpec``.

    Returns:
        ``"Index already exists"`` when the index is present and ``drop_index``
        is False; ``"Index created"`` in every other case.
    """
    pc = PineconeClient()

    # list_indexes() yields index descriptions rather than plain strings, so
    # membership has to be tested against the names. (The previous
    # `index_list is not []` identity test was always True as well.)
    existing_names = list(pc.list_indexes().names())
    print(existing_names)

    if indexname in existing_names:
        if not drop_index:
            return "Index already exists"
        # Drop the stale index; it is recreated below with the requested settings.
        pc.delete_index(indexname)
    else:
        print(f"Index {indexname} does not exist")

    pc.create_index(
        name=indexname,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(cloud=cloud, region=region),
    )
    return "Index created"

In [39]:
# Make sure the index used by the rest of the notebook exists.
create_pinecone_index('langchain-hybrid')

[]
Index langchain-hybrid does not exist


In [40]:
# Handle to the 'langchain-hybrid' index; used for the stats check below and
# passed to the hybrid retriever later on.
index = pc.Index('langchain-hybrid')

In [42]:
# Inspect the index statistics (dimension, metric, vector count) to confirm the
# index is reachable and still empty before anything is added.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [43]:
from langchain_openai import OpenAIEmbeddings

In [44]:
# Dense embedding model handed to the hybrid retriever; the length check in the
# next cell shows it produces 1536-dimensional vectors, matching the index.
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

In [46]:
# Sanity check: the embedding length must equal the index dimension (1536).
len(embeddings.embed_query("Test Embedding Dimension"))

1536

In [47]:
from pinecone_text.sparse import BM25Encoder

In [49]:
# `default` is a factory on the class, so call it there instead of first
# building a throwaway BM25Encoder instance just to reach it.
encoder = BM25Encoder.default()

In [55]:
from langchain_community.document_loaders import TextLoader

# Path of the call transcript to index. Set the TRANSCRIPT_PATH environment
# variable to point at your own copy; the fallback is the original author's
# local path, kept so existing runs behave exactly as before.
transcript_path = os.getenv("TRANSCRIPT_PATH", "/Users/mohitverma/Downloads/db2ohk8mp.txt")

# NOTE: despite its name, `loader` holds the list of loaded Documents (the
# result of .load()), not the TextLoader itself. The name is kept because later
# cells reference it.
loader = TextLoader(file_path=transcript_path).load()

In [56]:
# Display the loaded documents.
# NOTE(review): the rendered output contains the raw call transcript — consider
# clearing it before sharing or committing the notebook.
loader

[Document(metadata={'source': '/Users/mohitverma/Downloads/db2ohk8mp.txt'}, page_content="Call Direction: Inbound\n\n<br>StartMs:0.64|EndMs:1.14|customer:Hello?<br>StartMs:2.4|EndMs:5.62|Agent:Good afternoon. May I speak to Renee? Can you hear me?<br>StartMs:2.88|EndMs:3.38|customer:Hello?<br>StartMs:5.12|EndMs:7.06|customer:Yes. It is me. Yes. I can.<br>StartMs:6.64|EndMs:14.405|Agent:Hi. My name is Raven from Satteroff Legal Group on a recorded line calling you about your tech on file. Do you have time to complete that?<br>StartMs:11.165|EndMs:11.665|customer:Okay.<br>StartMs:14.845|EndMs:17.505|customer:I actually do. And this is about the talcum powder. Right?<br>StartMs:17.79|EndMs:21.01|Agent:Yes, ma'am. The tech about the survey you submitted to us.<br>StartMs:21.91|EndMs:32.975|customer:Wait. Wait. Wait. Yeah. About the okay. So I've I actually have to tell you something. The only reason I submitted it is because I didn't feel like changing all the info, but it's actually not f

In [58]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

In [75]:
def split_text(text: list[Document], chunk_size: int = 500, chunk_overlap: int=100) -> list[Document]:
    """Break documents into overlapping chunks.

    Args:
        text: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(text)


In [76]:
# Split the loaded documents using split_text's defaults (chunk_size=500, chunk_overlap=100).
doc_chunks = split_text(loader)

In [77]:
# Preview only the first few chunks: dumping every chunk floods the notebook
# and writes the whole call transcript into the saved output.
doc_chunks[:3]

[Document(metadata={'source': '/Users/mohitverma/Downloads/db2ohk8mp.txt'}, page_content='Call Direction: Inbound'),
 Document(metadata={'source': '/Users/mohitverma/Downloads/db2ohk8mp.txt'}, page_content='<br>StartMs:0.64|EndMs:1.14|customer:Hello?<br>StartMs:2.4|EndMs:5.62|Agent:Good afternoon. May I speak to Renee? Can you hear me?<br>StartMs:2.88|EndMs:3.38|customer:Hello?<br>StartMs:5.12|EndMs:7.06|customer:Yes. It is me. Yes. I can.<br>StartMs:6.64|EndMs:14.405|Agent:Hi. My name is Raven from Satteroff Legal Group on a recorded line calling you about your tech on file. Do you have time to complete that?<br>StartMs:11.165|EndMs:11.665|customer:Okay.<br>StartMs:14.845|EndMs:17.505|customer:I'),
 Document(metadata={'source': '/Users/mohitverma/Downloads/db2ohk8mp.txt'}, page_content="that?<br>StartMs:11.165|EndMs:11.665|customer:Okay.<br>StartMs:14.845|EndMs:17.505|customer:I actually do. And this is about the talcum powder. Right?<br>StartMs:17.79|EndMs:21.01|Agent:Yes, ma'am. The

In [78]:
def convert_chunks_into_corpus(doc_chunks: list[Document]) -> list[str]:
    """Collect the ``page_content`` of every chunk, in order, into a new list."""
    return [document.page_content for document in doc_chunks]

In [79]:
# Plain-text corpus (one string per chunk); used below both to fit the BM25
# encoder and as the texts added to the index.
corpus = convert_chunks_into_corpus(doc_chunks)

In [80]:
# Preview only the first few entries: dumping the whole corpus floods the
# notebook and writes the full call transcript into the saved output.
corpus[:3]

['Call Direction: Inbound',
 '<br>StartMs:0.64|EndMs:1.14|customer:Hello?<br>StartMs:2.4|EndMs:5.62|Agent:Good afternoon. May I speak to Renee? Can you hear me?<br>StartMs:2.88|EndMs:3.38|customer:Hello?<br>StartMs:5.12|EndMs:7.06|customer:Yes. It is me. Yes. I can.<br>StartMs:6.64|EndMs:14.405|Agent:Hi. My name is Raven from Satteroff Legal Group on a recorded line calling you about your tech on file. Do you have time to complete that?<br>StartMs:11.165|EndMs:11.665|customer:Okay.<br>StartMs:14.845|EndMs:17.505|customer:I',
 "that?<br>StartMs:11.165|EndMs:11.665|customer:Okay.<br>StartMs:14.845|EndMs:17.505|customer:I actually do. And this is about the talcum powder. Right?<br>StartMs:17.79|EndMs:21.01|Agent:Yes, ma'am. The tech about the survey you submitted to us.<br>StartMs:21.91|EndMs:32.975|customer:Wait. Wait. Wait. Yeah. About the okay. So I've I actually have to tell you something. The only reason I submitted it is because I didn't feel like changing all the info, but it's act

In [81]:
# Fit the BM25 encoder on the transcript corpus.
# NOTE(review): `encoder` was created with default parameters above; fitting
# here presumably replaces them — confirm against the pinecone-text docs.
encoder.fit(corpus)

100%|██████████| 69/69 [00:00<00:00, 499.30it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x115fbafc0>

In [85]:
# Persist the fitted encoder to sparse.json (relative to the working directory)
# so it can be reloaded in the next cell.
encoder.dump('sparse.json')

In [86]:
# Reload the saved encoder state from sparse.json into a fresh encoder; this is
# the instance handed to the hybrid retriever as its sparse encoder.
BM25 = BM25Encoder().load('sparse.json')

In [None]:
# load() needs the path of the dumped parameters; calling it without one
# raises TypeError. Use the same file written by encoder.dump above.
encoder.load('sparse.json')

In [87]:
# Hybrid retriever over the Pinecone index: `embeddings` supplies the dense
# side and the BM25 encoder reloaded from sparse.json supplies the sparse side.
# NOTE(review): the variable name is misspelled ("retreiver"); it is left
# unchanged because later cells reference it.
retreiver=PineconeHybridSearchRetriever(
    index=index,
    sparse_encoder=BM25,
    embeddings=embeddings
)

In [89]:
# Display the retriever's configuration to confirm the embeddings, sparse
# encoder and index were wired in.
retreiver

PineconeHybridSearchRetriever(embeddings=OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x112b6e6f0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x114e22180>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x134a06d50>, index=<pinecone.data.index.Index object at 0x11295a

In [90]:
# Add every corpus chunk to the index through the retriever.
# NOTE(review): only the raw text is passed, so per-chunk metadata such as
# 'source' is not sent along — the query results below carry only a 'score'.
retreiver.add_texts(
    texts=corpus
)

100%|██████████| 3/3 [00:10<00:00,  3.60s/it]


In [91]:
# Synchronous query; the recorded output is a list of Documents whose metadata
# holds the match score.
retreiver.invoke("Agent name")

[Document(metadata={'score': 0.307389528}, page_content='name is Martha.<br>StartMs:403.34|EndMs:405.04|Agent:Martha. And your last name?<br>StartMs:405.745|EndMs:406.245|customer:Copas,<br>StartMs:407.42|EndMs:409.36|Agent:Could you spell that for me, please?<br>StartMs:409.985|EndMs:411.445|customer:b o p a s.<br>StartMs:414.715|EndMs:415.215|Agent:Okay.<br>StartMs:425.8|EndMs:435.055|Agent:Alright. So I just have to state this. I stated it with your daughter already, but I have to because you are I just have to make sure that'),
 Document(metadata={'score': 0.289944172}, page_content='took it upon herself because she found the she found it<br>StartMs:388.34|EndMs:389.33|Agent:Just okay.<br>StartMs:392.775|EndMs:396.15|customer:the the claim for the the cancer powder<br>StartMs:397.49|EndMs:397.99|customer:lawyers.<br>StartMs:398.655|EndMs:400.835|Agent:Okay. What is your first name,<br>StartMs:401.49|EndMs:402.55|customer:My name is Martha.<br>StartMs:403.34|EndMs:405.04|Agent:Marth

In [None]:
# Calling the async method without `await` only builds a coroutine object and
# never runs the query. Await the retriever's async entry point instead
# (Jupyter/IPython allows `await` at the top level of a cell).
# `ainvoke` is used because `aget_relevant_documents` is deprecated in recent
# langchain-core releases.
await retreiver.ainvoke("Agent name")