<a href="https://colab.research.google.com/github/mekhiya/vector-database-ai-apps/blob/main/Semantic_Search_Vector_db.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [2]:
%%writefile requirements.txt
# requirements file
# note which revision of python, for example 3.9.6
# in this file, insert all the pip install needs, include revision

#for example:
#torch==2.0.1
#matplotlib==3.7.2

python-dotenv==1.0.0

numpy==1.25.2
pandas==2.1.3
scikit-learn==1.3.2
sentence-transformers==2.2.2
matplotlib==3.8.2
torch==2.1.1

langchain==0.0.346
openai==0.28.1 ## From the notebooks

pinecone-client==3.0.0dev4
pinecone-datasets==0.5.0rc11
pinecone-text==0.7.1

tiktoken==0.5.2
tqdm==4.66.1

datasets==2.15.0
deepface==0.0.79

Writing requirements.txt


In [9]:
# !pip install -r requirements.txt

In [3]:
%%writefile DLAIUtils.py
import os
import sys
from dotenv import load_dotenv, find_dotenv

class Utils:
  def __init__(self):
    pass

  def create_dlai_index_name(self, index_name):
    openai_key = ''
    if self.is_colab(): # google colab
      from google.colab import userdata
      openai_key = userdata.get("OPENAI_API_KEY")
    else: # jupyter notebook
      openai_key = os.getenv("OPENAI_API_KEY")
    return f'{index_name}-{openai_key[-36:].lower().replace("_", "-")}'

  def is_colab(self):
    return 'google.colab' in sys.modules

  def get_openai_api_key(self):
    _ = load_dotenv(find_dotenv())
    return os.getenv("OPENAI_API_KEY")

  def get_pinecone_api_key(self):
    _ = load_dotenv(find_dotenv())
    return os.getenv("PINECONE_API_KEY")

Writing DLAIUtils.py


In [5]:
!python -m pip install python-dotenv
!python DLAIUtils.py



In [55]:
'''
add .env file with keys for

OPENAI_API_KEY = ''
PINECONE_API_KEY = ''
'''

"\nadd .env file with keys for\n\nOPENAI_API_KEY = ''\nPINECONE_API_KEY = ''\n"

In [8]:
# !pip install datasets
# !pip install -U sentence-transformers
# !pip install pinecone-client

In [7]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from DLAIUtils import Utils
import DLAIUtils

import os
import time
import torch

In [10]:
from tqdm.auto import tqdm

In [11]:
dataset = load_dataset('quora', split='train[240000:290000]')

Downloading data:   0%|          | 0.00/35.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

In [12]:
dataset[:1]

{'questions': [{'id': [207550, 351729],
   'text': ['What is the truth of life?', "What's the evil truth of life?"]}],
 'is_duplicate': [False]}

In [73]:
questions = []
for record in dataset['questions']:
  questions.extend(record['text'])
question = list(set(questions))
print('\n'.join(questions[:10]))
print('-'*50)
print(f'Number of questions: {len(questions)}')

What is the truth of life?
What's the evil truth of life?
Which is the best smartphone under 20K in India?
Which is the best smartphone with in 20k in India?
Steps taken by Canadian government to improve literacy rate?
Can I send homemade herbal hair oil from India to US via postal or private courier services?
What is a good way to lose 30 pounds in 2 months?
What can I do to lose 30 pounds in 2 months?
Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?
How do you graph x + 2y = -2?
--------------------------------------------------
Number of questions: 100000


In [14]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [67]:
query = 'which city is the costliest in the world?'
xq = model.encode(query)
xq.shape

(384,)

In [68]:
utils = Utils()
PINCECODE_API_KEY = utils.get_pinecone_api_key()
OPENAI_API_KEY = utils.get_openai_api_key()

In [17]:
print(PINCECODE_API_KEY)
print(OPENAI_API_KEY)
OPENAI_API_KEY[-36:].lower().replace("_", "-")

ee8a2b59-62cb-4d6f-a8f2-26870d88a0f5
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


'4moiewzrt3blbkfjjsgo0r70hk3mut42mfj4'

In [18]:
INDEX_NAME = 'dl-ai' + OPENAI_API_KEY[-36:].lower().replace("_", "-")
INDEX_NAME

'dl-ai4moiewzrt3blbkfjjsgo0r70hk3mut42mfj4'

In [74]:
pinecone = Pinecone(api_key=PINCECODE_API_KEY)
#INDEX_NAME = utils.create_dlai_index_name('dl-ai')

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
  pinecone.delete_index(INDEX_NAME)
  print(f'deleting index : {INDEX_NAME}')
print(INDEX_NAME)

deleting index : dl-ai4moiewzrt3blbkfjjsgo0r70hk3mut42mfj4
dl-ai4moiewzrt3blbkfjjsgo0r70hk3mut42mfj4


In [75]:
pinecone.create_index(name=INDEX_NAME,
                      dimension=model.get_sentence_embedding_dimension(),
                      metric='cosine',
                      spec=ServerlessSpec(cloud='aws', region='us-west-2'))

index = pinecone.Index(INDEX_NAME)
print(index)

<pinecone.data.index.Index object at 0x7e5a21310af0>


In [97]:
# i = 0
# i_end = 200
# ids = [str(x) for x in range(i, i_end)]
# print(f'ids is {ids}')
# s = [str(n) for n in range(0,10)]
# print(s)

In [76]:
#Creating embeddings & upsert pinecone
batch_size = 200
vector_limit = 10000
print(f'len(questions) is {len(questions)}')
print(f'batch_size is {batch_size}')
questions = questions[:vector_limit]
# len(questions)

for i in tqdm(range(0,len(questions), batch_size)):
  i_end = min(i+batch_size, len(questions))
  ids = [str(x) for x in range(i, i_end)]
  metadatas = [{'text' : text } for text in questions[i:i_end]]
  xc = model.encode(questions[i:i_end])

  # print(f'len(questions) is {len(questions)}')
  # print(f'i is {i}')
  # print(f'i_end is {i_end}')
  # print(f'ids is {ids}')
  # print(f'metadatas is {metadatas}')
  # print(f'xc.shape is {xc.shape}')
  ## print(f'xc[1,:] is {xc[1,:]}')
  ##break

  records = zip(ids, xc, metadatas)
  index.upsert(records)

len(questions) is 100000
batch_size is 200


  0%|          | 0/50 [00:00<?, ?it/s]

In [84]:
## Run Query
def run_query(query):
  embedding = model.encode(query).tolist()
  results = index.query(top_k=10, vector=embedding, include_metadata=True, include_value=False)
  for result in results['matches']:
    print(f"{round(result['score'],2)} : {result['metadata']['text']}")


In [95]:
query = 'what breakfast should I have after run?'
run_query(query)

0.59 : What are the breakfast items that can be cooked in 5 minutes?
0.55 : How soon should I eat after a workout?
0.54 : What are some vegetarian breakfast items which could be prepared in 5 minutes?
0.51 : What is the best food to eat on an empty stomach?
0.47 : What should i eat after workout for reducing body fat and gaining muscle ?
0.46 : What is the first you do in the morning?
0.43 : Do I have to eat salad?
0.43 : What should a single guy do for the healthy meals?
0.41 : What is the correct time to take green tea for weight loss?
0.41 : Which is the best time to workout, morning or evening?


In [96]:
query = 'which city is coldest?'
run_query(query)

0.76 : Which is the coldest country in the world?
0.71 : What are the coldest countries in the world?
0.63 : How cold is Scotland?
0.6 : Where is the most beautiful city in the world?
0.58 : Which is the most beautiful city in world?
0.52 : How hot does Austin, TX get?
0.51 : Which is the best city to reside in India?
0.48 : Where is the best place to travel?
0.48 : What is the weather like in port Townsend, WA compared to Tacoma?
0.48 : Why do most people hate winters?
