<a href="https://colab.research.google.com/github/mekhiya/vector-database-ai-apps/blob/main/RAG_OPENAI_wikipidea_article.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
# RAG
import warnings
warnings.filterwarnings('ignore')
!pip install python-dotenv

In [62]:
%%writefile requirements.txt
# Requirements file for this notebook.
# Note which version of Python is used, for example 3.9.6.
# List every pip dependency here, pinned to a version.

# for example:
# torch==2.0.1
# matplotlib==3.7.2

python-dotenv==1.0.0

numpy==1.25.2
pandas==2.1.3
scikit-learn==1.3.2
sentence-transformers==2.2.2
matplotlib==3.8.2
torch==2.1.1

langchain==0.0.346
# This notebook imports `OpenAI` from the openai package and calls
# client.embeddings.create / client.completions.create, i.e. the 1.x client
# interface. The earlier 0.28.1 pin does not provide that class.
openai>=1.0.0,<2

pinecone-client==3.0.0dev4
pinecone-datasets==0.5.0rc11
pinecone-text==0.7.1

tiktoken==0.5.2
tqdm==4.66.1

datasets==2.15.0
deepface==0.0.79

In [64]:
# Install the pinned dependencies written to requirements.txt by the cell above.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

In [65]:
%%writefile DLAIUtils.py
import os
import sys
from dotenv import load_dotenv, find_dotenv

class Utils:
  """Helpers for locating API keys and deriving a per-user Pinecone index name."""

  def __init__(self):
    pass

  def create_dlai_index_name(self, index_name):
    """Return `index_name` suffixed with a slug derived from the OpenAI API key.

    The slug is the last 36 characters of the key, lower-cased, with
    underscores replaced by hyphens.

    Args:
      index_name: Prefix for the generated name.

    Returns:
      A string of the form '<index_name>-<slug>'.

    Raises:
      ValueError: If no OpenAI API key is configured.
    """
    # NOTE(review): the generated name contains the tail of a secret key, so
    # anything that prints or logs the index name exposes part of that key.
    if self.is_colab(): # google colab
      from google.colab import userdata
      openai_key = userdata.get("OPENAI_API_KEY")
    else: # jupyter notebook
      # Go through the getter so a key stored only in a .env file is found,
      # the same way the other methods of this class read it.
      openai_key = self.get_openai_api_key()
    if not openai_key:
      # Fail with a clear message instead of a TypeError from slicing None.
      raise ValueError("OPENAI_API_KEY is not set; cannot derive an index name")
    return f'{index_name}-{openai_key[-36:].lower().replace("_", "-")}'

  def is_colab(self):
    """Return True when the google.colab module is loaded in this process."""
    return 'google.colab' in sys.modules

  def get_openai_api_key(self):
    """Load the .env file located by find_dotenv() and return OPENAI_API_KEY.

    Returns:
      The value of the OPENAI_API_KEY environment variable, or None if unset.
    """
    _ = load_dotenv(find_dotenv())
    return os.getenv("OPENAI_API_KEY")

  def get_pinecone_api_key(self):
    """Load the .env file located by find_dotenv() and return PINECONE_API_KEY.

    Returns:
      The value of the PINECONE_API_KEY environment variable, or None if unset.
    """
    _ = load_dotenv(find_dotenv())
    return os.getenv("PINECONE_API_KEY")

In [7]:
# Run the generated module as a script. DLAIUtils.py only imports its
# dependencies and defines the Utils class, so a successful run prints nothing;
# a syntax error or a missing dependency would show up here.
!python DLAIUtils.py

In [66]:
# Extra installs on top of requirements.txt. %pip (rather than !pip) installs
# into the environment of the running kernel.
# NOTE(review): these are unpinned, and -U moves sentence-transformers past the
# 2.2.2 pin listed in requirements.txt — confirm that upgrade is intended.
%pip install datasets
%pip install -U sentence-transformers
%pip install pinecone-client

In [67]:
# The notebook uses the `OpenAI` client class, which requires the 1.x series of
# the openai package. A bare `pip install openai` is a no-op when an older
# release is already installed, so state the required range explicitly.
%pip install "openai>=1.0.0,<2"

In [13]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from DLAIUtils import Utils
import DLAIUtils
from openai import OpenAI


import os
import time
import torch

import ast
import pandas as pd

In [10]:
from tqdm.auto import tqdm

In [14]:
# Create the helper and read the Pinecone API key; the helper loads a .env file
# (if one is found) and returns the PINECONE_API_KEY environment variable.
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [68]:
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()
PINCECODE_API_KEY = PINECONE_API_KEY  # misspelt legacy name kept as an alias
OPENAI_API_KEY = utils.get_openai_api_key()

# Never echo the secrets themselves: notebook outputs are saved with the file.
# Report only whether each key was found, and stop early if one is missing.
for key_name, key_value in (('PINECONE_API_KEY', PINECONE_API_KEY),
                            ('OPENAI_API_KEY', OPENAI_API_KEY)):
  if not key_value:
    raise ValueError(f'{key_name} is not set; add it to the environment or a .env file')
  print(f'{key_name}: loaded')

In [69]:
# Per-user index name: the fixed prefix 'dl-ai' followed directly by the last 36
# characters of the OpenAI key, lower-cased, with underscores turned into hyphens.
# NOTE(review): this puts the tail of a secret key into the index name and into
# the cell output below.
INDEX_NAME = f'dl-ai{OPENAI_API_KEY[-36:].lower().replace("_", "-")}'
INDEX_NAME

In [71]:
# Connect to Pinecone with the API key loaded earlier.
pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Start from a clean slate: if an index with this name already exists, drop it
# so the create call in the next cell does not collide with it.
if any(existing.name == INDEX_NAME for existing in pinecone.list_indexes()):
  pinecone.delete_index(INDEX_NAME)
  print(f'Deleting Index {INDEX_NAME}')
print(INDEX_NAME)

In [72]:
# Create a serverless cosine-similarity index on AWS us-west-2.
# NOTE(review): dimension 1536 presumably matches the width of the vectors in
# the CSV and of the query embeddings — confirm against the data.
pinecone.create_index(
  name=INDEX_NAME,
  dimension=1536,
  metric='cosine',
  spec=ServerlessSpec(cloud='aws', region='us-west-2'),
)

# Handle used by the later cells for upserts and queries.
index = pinecone.Index(INDEX_NAME)
print(index)

In [73]:
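# One-time data download: the next cell reads ./sample_data/wiki.csv, so
# uncomment and run these lines first if that file is not already present.
# !wget -q -O wiki.csv.zip "https://www.dropbox.com/scl/fi/yxzmsrv2sgl249zcspeqb/lesson2-wiki.csv.zip?rlkey=paehnoxjl3s5x53d1bedt4pmc&dl=0"
# !mkdir sample_data
# !unzip wiki.csv.zip -d sample_data/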

In [74]:
# Upper bound on how many rows of the CSV are read.
max_articles_num = 500

# Read only the first `max_articles_num` rows of the pre-embedded articles file.
df = pd.read_csv(
  './sample_data/wiki.csv',
  nrows=max_articles_num,
)
df.head()

In [75]:
UPSERT_BATCH_SIZE = 200  # number of vectors sent to Pinecone per upsert call

prepped = []
print(df.shape[0])
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
  # The 'metadata' and 'values' columns hold Python literals stored as text,
  # so they are parsed back into objects before being sent to the index.
  meta = ast.literal_eval(row['metadata'])
  prepped.append({'id':row['id'],
                  'values':ast.literal_eval(row['values']),
                  'metadata':meta})
  if len(prepped) >= UPSERT_BATCH_SIZE:
    index.upsert(prepped)
    prepped = []

# Flush the final partial batch. Without this, the rows left over when the row
# count is not a multiple of the batch size (e.g. 500 rows in batches of 200)
# are never written to the index.
if prepped:
  index.upsert(prepped)
  prepped = []

In [76]:
# Display the index statistics as a check on what the upsert loop wrote.
index.describe_index_stats()

In [None]:
def get_embeddings(articles, model="text-embedding-ada-002"):
   """Request embeddings for `articles` from the OpenAI embeddings endpoint.

   Relies on the module-level `openai_client`, which is looked up when the
   function is called, so the client must exist before the first call.

   Args:
      articles: Input passed through unchanged as the `input` argument.
      model: Name of the embedding model to use.

   Returns:
      Whatever `openai_client.embeddings.create` returns.
   """
   response = openai_client.embeddings.create(model=model, input=articles)
   return response

In [53]:
# Re-read the OpenAI key and build the client used by the embedding and
# completion calls below.
OPENAI_API_KEY = utils.get_openai_api_key()
openai_client = OpenAI(api_key=OPENAI_API_KEY)

def get_embeddings(articles, model='text-embedding-ada-002'):
  """Request embeddings for `articles` from the OpenAI embeddings endpoint.

  Args:
    articles: Input passed through unchanged as the `input` argument.
    model: Name of the embedding model to use.

  Returns:
    Whatever `openai_client.embeddings.create` returns.
  """
  response = openai_client.embeddings.create(model=model, input=articles)
  return response

In [None]:
# Querying the vector db to fetch grounding text for a question
query = "what is great wall of china?"

# Embed the question, then ask the index for its three nearest neighbours,
# including the stored metadata that carries the article text.
embed = get_embeddings([query])
res = index.query(
  vector=embed.data[0].embedding,
  top_k=3,
  include_metadata=True,
)

# Pull the 'text' field out of each match and show the passages one per line.
text = [match['metadata']['text'] for match in res['matches']]
print('\n'.join(text))

In [77]:
# Building the prompt: retrieved passages first, then the question
query = "write an article titled: what is great wall of china?"
embed = get_embeddings([query])
res = index.query(
  vector=embed.data[0].embedding,
  top_k=3,
  include_metadata=True,
)

# Article text of each retrieved match, used as grounding context.
contexts = [match['metadata']['text'] for match in res['matches']]

# NOTE(review): "base on" in the instruction is probably meant to read
# "based on"; the string is left as written because it is sent to the model.
prompt_start = "Answer the question base on the context below. \n\nContext:\n"
prompt_end = f"\n\nQuestion: {query}\nAnswer:"

# Passages are separated by a '---' rule so they stay distinguishable.
prompt = "".join([prompt_start, "\n\n---\n\n".join(contexts), prompt_end])
print(prompt)

In [78]:
# Send the assembled prompt to the completions endpoint. temperature=0 together
# with top_p=1 and zero penalties requests the least randomised sampling.
res = openai_client.completions.create(
  prompt=prompt,
  model="gpt-3.5-turbo-instruct",
  max_tokens=636,
  temperature=0,
  top_p=1,
  presence_penalty=0,
  frequency_penalty=0,
  stop=None,
)

# Print a divider, then the text of the first returned choice.
print(80 * '-')
print(res.choices[0].text)