In [None]:
from dotenv import load_dotenv

# Read key=value pairs from a local .env file into the process environment so
# that later cells can pick up secrets (e.g. PINECONE_API_KEY) via os.getenv.
load_dotenv()

In [None]:
import os
import docx
import re
from langchain.docstore.document import Document

# Collect the Word articles to index. os.listdir returns entries in arbitrary
# order, so sort them: the vector IDs assigned later are positional and must
# be reproducible across runs. Skip anything that is not a .docx file
# (e.g. .DS_Store) and Word's "~$..." lock files, which would otherwise be
# handed to docx.Document and abort the whole cell.
files = sorted(
    name for name in os.listdir('articles')
    if name.lower().endswith('.docx') and not name.startswith('~$')
)

docs = []

for f in files:
    # Open file
    d = docx.Document(os.path.join('articles', f))
    # Get text from paragraphs and join them into a single string
    text = ' '.join(p.text for p in d.paragraphs)
    # Collapse runs of spaces (e.g. the ones produced by joining empty paragraphs)
    text = re.sub(' +', ' ', text)
    # Create document object; the source is the file name without its
    # extension (splitext instead of slicing off a fixed 5 characters)
    doc = Document(page_content=text, metadata={'source': os.path.splitext(f)[0]})
    docs.append(doc)

In [None]:
from langchain.text_splitter import SpacyTextSplitter

# Split each article into overlapping chunks for embedding. chunk_size=1000 and
# chunk_overlap=200 bound the chunk length and the overlap between neighbouring
# chunks (presumably measured in characters, the splitter's default — confirm);
# separator=" " is presumably the string used to join sentences inside a chunk.
# Each chunk keeps the metadata (the 'source' name) of the article it came from,
# which the next cell relies on.
parsed = SpacyTextSplitter(chunk_size=1000, chunk_overlap=200, separator=" ").split_documents(docs)

In [None]:
# Number the chunks of each article 1..n so every record can be identified as
# (source, section). Chunks of the same article are expected to be adjacent in
# `parsed`, so the counter restarts whenever the source changes.
split_files = []
section = 0
# Sentinel that never equals a real source, so the first chunk always starts a
# new article without peeking at parsed[-1] (which is what parsed[i-1] read
# when i == 0).
previous_source = object()

for d in parsed:
    source = d.metadata["source"]
    if source != previous_source:
        section = 0
        previous_source = source

    section += 1

    new_doc = {
        "source": source,
        "section": section,
        "page_content": d.page_content,
    }
    split_files.append(new_doc)

In [None]:
import pandas as pd

# One row per chunk with the columns source / section / page_content; the row
# order matters because the upload cell derives the vector IDs from row positions.
df = pd.DataFrame(split_files)
# Preview the first rows as a quick sanity check
df.head()

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the stored passages and the queries.
# NOTE(review): the Pinecone index below is created with dimension=768, so the
# embedding size of this model has to match that value — confirm if the model changes.
retriever = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [None]:
from pinecone import Pinecone

# Pinecone client; the API key is read from the PINECONE_API_KEY environment
# variable. os.getenv returns None when the variable is unset, so make sure the
# .env file (or the shell environment) defines it before running this cell.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [None]:
from pinecone import ServerlessSpec
index_name = "abstractive-question-answering"

# Create the index only if it does not exist yet. list_indexes() returns index
# descriptions rather than plain strings, so the membership test has to run
# against .names(); comparing the name with the raw result never matches,
# which made every re-run of this cell try to create the index again.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=768,  # has to equal the embedding size of the retriever model
        metric="dotproduct",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# connect to abstractive-question-answering index we created
index = pc.Index(index_name)

In [None]:
from tqdm.auto import tqdm
# Embed and upload the chunks in batches of 64 rows
batch_size = 64

total_rows = len(df)
for start in tqdm(range(0, total_rows, batch_size)):
    # the final batch may be shorter than batch_size
    stop = min(start + batch_size, total_rows)
    rows = df.iloc[start:stop]
    # encode the chunk texts and convert the result to plain Python lists
    texts = rows["page_content"].tolist()
    vectors = retriever.encode(texts).tolist()
    # every column of a row is stored as that vector's metadata
    records = rows.to_dict(orient="records")
    # IDs are the row positions rendered as strings
    row_ids = [str(pos) for pos in range(start, stop)]
    # send (id, values, metadata) triples to the index
    _ = index.upsert(vectors=list(zip(row_ids, vectors, records)))
# check that we have all vectors in index
index.describe_index_stats()

In [None]:
def query_pinecone(query, top_k):
    """Return the ``top_k`` indexed passages closest to ``query``.

    Args:
        query: Question or search text (a single string).
        top_k: Number of matches to request from the index.

    Returns:
        The response of ``index.query``, with the metadata stored at upsert
        time (source, section, page_content) included for each match.
    """
    # Encode the bare string rather than a one-element list: that yields one
    # flat vector, which is the shape the ``vector`` argument of query() takes.
    # Wrapping the query in a list produced a nested [[...]] batch instead.
    xq = retriever.encode(query).tolist()
    # search pinecone index for context passage with the answer
    return index.query(vector=xq, top_k=top_k, include_metadata=True)

In [None]:
# Example call: replace the placeholder with a real question; the cell output
# is the query response for the 5 best matches, metadata included.
query_pinecone("<Your query here>", top_k=5)