In [10]:
# Install a blanket 'ignore' filter: no category or module is given, so the
# filter applies to every warning raised after this cell runs.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [11]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm

import os
import time
import torch

In [13]:
# Load a slice of the Quora 'train' split (the split string requests rows
# 240000 up to 290000) and bind it to `dataset` for the cells below.
# NOTE(review): trust_remote_code=True presumably lets the dataset's own
# loading script run locally — confirm the source is trusted.
dataset = load_dataset('quora', split='train[240000:290000]', trust_remote_code=True)

Downloading data: 100%|██████████| 58.2M/58.2M [00:01<00:00, 40.8MB/s]
Generating train split: 100%|██████████| 404290/404290 [00:33<00:00, 12017.99 examples/s]


In [14]:
# Flatten the dataset into one list of question strings: every record's
# 'text' entry is an iterable of questions that gets appended in order.
questions = []
for record in dataset['questions']:
    questions.extend(record['text'])

# De-duplicate while keeping first-seen order. dict.fromkeys() preserves
# insertion order, whereas set() iteration order for strings changes between
# interpreter sessions (hash randomisation), which would make the subset that
# is later indexed — and the IDs assigned to it — differ on every kernel
# restart.
# NOTE: the de-duplicated list stays bound to `question` (singular) because
# the upsert cell further down slices that name; `questions` remains the raw,
# un-deduplicated list here.
question = list(dict.fromkeys(questions))

print('\n'.join(questions[:10]))
print('-' * 50)
print(f'Number of questions: {len(questions)}')
print(f'Number of unique questions: {len(question)}')

What is the truth of life?
What's the evil truth of life?
Which is the best smartphone under 20K in India?
Which is the best smartphone with in 20k in India?
Steps taken by Canadian government to improve literacy rate?
Can I send homemade herbal hair oil from India to US via postal or private courier services?
What is a good way to lose 30 pounds in 2 months?
What can I do to lose 30 pounds in 2 months?
Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?
How do you graph x + 2y = -2?
--------------------------------------------------
Number of questions: 100000


In [15]:
# Pick the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU and say so.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
    print('Sorry no cuda.')

# Sentence encoder used for both the corpus and the queries.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

Sorry no cuda.


In [16]:
# Sanity check: encode one sample query and display the embedding's shape
# as the cell output (the recorded output below shows (384,)).
query = 'which city is the most populated in the world?'
xq = model.encode(query)
xq.shape

(384,)

In [21]:
import os
import sys
from dotenv import load_dotenv, find_dotenv

class Utils:
  """Helpers for reading API keys and deriving a per-key Pinecone index name."""

  def create_dlai_index_name(self, index_name):
    """Return `index_name` suffixed with a slug derived from the OpenAI API key.

    The suffix is the last 36 characters of the key (the whole key if it is
    shorter), lower-cased, with underscores replaced by hyphens.

    On Colab the key is read from `google.colab.userdata`; otherwise it is
    read from the `OPENAI_API_KEY` environment variable. This method does not
    load a .env file itself, so the variable must already be in the
    environment when it is called.

    NOTE(review): the returned name embeds part of the secret key — avoid
    printing or sharing it.

    Args:
      index_name: prefix for the generated index name.

    Returns:
      The string `"<index_name>-<slug>"`.

    Raises:
      ValueError: if no OpenAI API key is available (missing or empty).
    """
    if self.is_colab(): # google colab
      from google.colab import userdata
      openai_key = userdata.get("OPENAI_API_KEY")
    else: # jupyter notebook
      openai_key = os.getenv("OPENAI_API_KEY")

    # Fail with a clear message instead of "NoneType is not subscriptable",
    # and avoid producing a name that ends in a dangling hyphen.
    if not openai_key:
      raise ValueError(
        "OPENAI_API_KEY is not set; it is required to derive the index name")

    return f'{index_name}-{openai_key[-36:].lower().replace("_", "-")}'

  def is_colab(self):
    """Return True when the `google.colab` module has been imported."""
    return 'google.colab' in sys.modules

  def get_openai_api_key(self):
    """Load the nearest .env file (if any) and return `OPENAI_API_KEY`, or None."""
    _ = load_dotenv(find_dotenv())
    return os.getenv("OPENAI_API_KEY")

  def get_pinecone_api_key(self):
    """Load the nearest .env file (if any) and return `PINECONE_API_KEY`, or None."""
    _ = load_dotenv(find_dotenv())
    return os.getenv("PINECONE_API_KEY")

In [22]:
# Instantiate the helper and read the Pinecone key. The getter returns
# os.getenv("PINECONE_API_KEY") after attempting to load a .env file, so the
# value is None when the variable is not defined anywhere.
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [24]:
# Connect to Pinecone and derive the index name for this session.
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

# Start from a clean slate: drop any index already registered under this name.
existing_names = {entry.name for entry in pinecone.list_indexes()}
if INDEX_NAME in existing_names:
    pinecone.delete_index(INDEX_NAME)

# NOTE(review): INDEX_NAME is built from the tail of the OpenAI API key, so
# this print puts key material into the saved notebook output.
print(INDEX_NAME)

# Cosine-metric serverless index whose dimension matches the encoder's
# embedding size.
pinecone.create_index(
    name=INDEX_NAME,
    dimension=model.get_sentence_embedding_dimension(),
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-east-1'),
)

# Handle used by the upsert and query cells below.
index = pinecone.Index(INDEX_NAME)
print(index)

dl-ai-dqqqxduwtcecsctr1h1epizf2ykv7cwxgrka
<pinecone.data.index.Index object at 0x0000023D7EFDAC90>


In [25]:
import json

batch_size = 200
vector_limit = 10000

# Index only the first `vector_limit` entries of the de-duplicated list
# (`question`, built in the flattening cell above).
questions = question[:vector_limit]

for start in tqdm(range(0, len(questions), batch_size)):
    # The last batch may be shorter than batch_size.
    stop = min(start + batch_size, len(questions))
    batch = questions[start:stop]

    # IDs are the stringified positions of the questions in `questions`.
    ids = list(map(str, range(start, stop)))
    # Keep the original text alongside each vector so queries can display it.
    metadatas = [{'text': sentence} for sentence in batch]
    # One embedding per question in the batch.
    xc = model.encode(batch)

    # Send (id, embedding, metadata) triples to Pinecone.
    records = zip(ids, xc, metadatas)
    index.upsert(vectors=records)

100%|██████████| 50/50 [01:53<00:00,  2.26s/it]


In [26]:
def run_query(query, top_k=10):
  """Embed `query`, search the Pinecone index, and print the best matches.

  Uses the notebook-level `model` (encoder) and `index` (Pinecone index).
  One line is printed per match: the similarity score with two decimals,
  then the stored question text. Nothing is returned, so calling this as the
  last statement of a cell produces no extra cell output.

  Args:
    query: the question text to search for.
    top_k: number of matches to request from the index (default 10).
  """
  embedding = model.encode(query).tolist()
  results = index.query(top_k=top_k, vector=embedding,
                        include_metadata=True, include_values=False)
  for result in results['matches']:
    # Fixed two-decimal formatting keeps the score column aligned
    # (round() would print 0.6 rather than 0.60).
    print(f"{result['score']:.2f}: {result['metadata']['text']}")

In [27]:
# Search the index with a population question; the matches are printed as
# "score: text" lines by run_query.
run_query('which city has the highest population in the world?')

0.64: Which city has the most museums per capita?
0.59: Which is the highest peak of the world?
0.59: Which are the top 10 largest cities of India by area?
0.58: What is the most dangerous city in USA?
0.55: How many cities in china?
0.54: What are the 20 most richest countries in the world?
0.53: Why is Uttar Pradesh the most populous state in India?
0.51: How many hotels are situated in cities?
0.49: How do I Construct a multiple bar chart to show population in 10000 of the given cities?
0.49: What were the largest cities in the Roman Empire?


In [28]:
# Second search; note this rebinds the notebook-level `query` variable.
query = 'how do i make chocolate cake?'
run_query(query)

0.87: How do I make cake?
0.61: What is a cake mix?
0.6: How do I bake a cake without an oven?
0.55: What is the difference between chocolate and truffles and how are they made?
0.53: How do you make a perfume out of Skittles?
0.52: How do I make my chocolate last longer (preservation)?
0.51: Where can I found adorable baked cupcakes in Gold Coast?
0.49: How can I make a banana pudding without bananas?
0.48: Where can I get great range of flavours for cupcakes at Gold Coast?
0.43: Why is there no chocolate-flavored chewing gum?
