<a href="https://colab.research.google.com/github/navneetkrc/langchain_colab_experiments/blob/main/QA_app_using_Pinecone_openai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install

In [None]:
!pip install openai pinecone-client python-docx

# Import

In [None]:
import pinecone
from openai.embeddings_utils import get_embedding
from tqdm import tqdm
import docx
import os
import openai

# Use the OPENAI_API_KEY environment variable when it is set, so the secret does
# not have to be saved inside the notebook (and is not overwritten by the
# placeholder). Without the variable, the placeholder must be replaced by hand.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY_HERE")

# Mount drive and specify the folder

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True requests a fresh
# mount even when the drive is already mounted from an earlier run.
drive.mount('/content/drive', force_remount=True)

In [None]:
# Folder on the mounted Drive whose files are parsed below; replace the
# placeholder with the path of your own documents folder.
docs_path = "/content/drive/MyDrive/Your Docs Folder"

# Parse Documents

In [None]:
# Collect the text of every paragraph of every Word document in docs_path.
text_chunks = []
# os.listdir returns entries in arbitrary order; sorting keeps the chunk order
# (and therefore the position-based vector ids assigned later) stable across runs.
for f_name in sorted(os.listdir(docs_path)):
  doc_path = os.path.join(docs_path, f_name)
  # python-docx can only open .docx files: skip sub-folders, other file types,
  # and the "~$..." lock files Word creates next to a document that is open.
  is_docx_file = os.path.isfile(doc_path) and f_name.lower().endswith(".docx")
  if not is_docx_file or f_name.startswith("~$"):
    continue
  doc = docx.Document(doc_path)
  for para in doc.paragraphs:
    text_chunks.append(para.text)

In [None]:
# Keep only the paragraphs that contain at least 10 whitespace-separated words,
# and trim leading/trailing whitespace (newlines included) from the ones kept.
text_chunks = [
    paragraph.strip()
    for paragraph in text_chunks
    if len(paragraph.split()) >= 10
]

# Generate embeddings

In [None]:
# Pair every text chunk with its text-embedding-ada-002 vector. One request is
# made per chunk, in order, and tqdm shows the progress.
chunks_with_embeddigns = [
    {
        "text": chunk_text,
        "embedding": get_embedding(chunk_text, engine='text-embedding-ada-002'),
    }
    for chunk_text in tqdm(text_chunks)
]

# Upload to Pinecone

In [None]:
# Take the Pinecone credentials from the environment when they are provided, so
# the API key is not stored in the notebook. The fallbacks are the original
# values: an empty key (fill it in by hand) and the "us-east1-gcp" environment.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY", ""),
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-east1-gcp")
)

In [None]:
# Name of the Pinecone index that stores the document embeddings.
index_name = "tiktok-trends-2023"

# Create the index only when it is missing, so re-running this cell reuses the
# existing one. The dimension has to equal the length of the stored vectors
# (1536 for text-embedding-ada-002).
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(index_name, dimension=1536)

# Handle used by the cells below to upsert and query vectors.
index = pinecone.Index(index_name)

In [None]:
batch_size = 64  # process everything in batches of 64
for i in tqdm(range(0, len(chunks_with_embeddigns), batch_size)):
    # chunks_with_embeddigns is a plain list of dicts, so slice it directly
    # (it has no .iloc); the last slice is simply shorter than batch_size
    data_batch = chunks_with_embeddigns[i: i+batch_size]
    # ids are the positions of the chunks in the full list, as strings
    ids_batch = [str(n) for n in range(i, i + len(data_batch))]
    # get embeddings
    embeds = [item['embedding'] for item in data_batch]
    # one metadata dict per chunk; 'text' must be the chunk string itself because
    # it is read back later and joined into the prompt -- you can add more fields here
    meta = [{'text': item['text']} for item in data_batch]
    # upsert (id, vector, metadata) tuples to Pinecone
    index.upsert(vectors=list(zip(ids_batch, embeds, meta)))

# Query Index

In [None]:
def search_docs(query):
  """Return the stored chunks most similar to ``query``.

  The query is embedded with ``text-embedding-ada-002`` (the model used for the
  stored chunks) and the 5 closest vectors are requested from the module-level
  Pinecone ``index``, including their metadata.

  Args:
    query: the text to search for.

  Returns:
    The ``'matches'`` entry of the Pinecone query response. The rest of this
    notebook reads ``match['score']`` and ``match['metadata']`` from each item.
  """
  xq = openai.Embedding.create(input=query, engine="text-embedding-ada-002")['data'][0]['embedding']
  res = index.query([xq], top_k=5, include_metadata=True)
  return res['matches']

In [None]:
# Try the retrieval step on its own: run one sample query and print every
# returned match as "<score with 2 decimals>: <metadata>".
matches = search_docs("What are some predictions for tiktok?")
for match in matches:
    print(f"{match['score']:.2f}: {match['metadata']}")

# Construct Prompt

In [None]:
def construct_prompt(query):
  """Build the completion prompt for ``query`` from the retrieved context.

  Calls ``search_docs(query)`` and joins the ``metadata['text']`` of every match
  with newlines to form the context.

  Args:
    query: the user's question.

  Returns:
    A string laid out as: instructions, blank line, ``Context: ...``, blank
    line, ``Question: ...``, and a trailing ``Answer: `` for the model to
    complete.
  """
  context = "\n".join(match['metadata']['text'] for match in search_docs(query))

  # The instruction tells the model to answer only from the supplied context
  # and to say 'I don't know.' otherwise.
  instructions = (
      "Answer the question as truthfully as possible using the context below, "
      "and if the answer is not within the context, say 'I don't know.'"
  )
  return f"{instructions}\n\nContext: {context}\n\nQuestion: {query}\nAnswer: "

# Run the model

In [None]:
def answer_question(query):
  """Answer ``query`` with text-davinci-003 using the retrieved context.

  The prompt comes from ``construct_prompt(query)`` and is sent to the legacy
  Completions endpoint with temperature 0.0 and at most 500 generated tokens.

  Args:
    query: the user's question.

  Returns:
    The generated answer with surrounding whitespace removed.
  """
  prompt = construct_prompt(query)
  res = openai.Completion.create(
      prompt=prompt,
      model="text-davinci-003",
      max_tokens=500,
      temperature=0.0,
  )

  # Completion choices carry the generated string in `.text`; `.message` only
  # exists on chat-completion responses.
  return res.choices[0].text.strip()

In [None]:
# End-to-end run: answer_question builds the prompt from the retrieved context,
# queries the model, and the result is printed.
print(answer_question("What will be the top platform in 2023?"))