In [6]:
#!pip install  "pinecone-client[grpc]"==2.2.1 


In [7]:
#!pip install sentence-transformers==2.2.2

In [8]:
#! pip install pinecone-datasets=='0.5.0rc11'

In [2]:
from pinecone_datasets import load_dataset

dataset = load_dataset('quora_all-MiniLM-L6-bm25')
# drop the existing 'metadata' column and rename 'blob' to take its place as
# the new 'metadata' column (note: 'sparse_values' is NOT dropped here)
dataset.documents.drop(['metadata'], axis=1, inplace=True)
dataset.documents.rename(columns={'blob': 'metadata'}, inplace=True)
# alternative (disabled): use 80K rows of the dataset between rows 240K -> 320K
# dataset.documents.drop(dataset.documents.index[320_000:], inplace=True)
# dataset.documents.drop(dataset.documents.index[:240_000], inplace=True)
# we will use 10K rows of the dataset between rows 310K -> 320K
# (order matters: the tail is trimmed first so that the positions of the
# leading rows are still valid when the head is trimmed)
dataset.documents.drop(dataset.documents.index[320_000:], inplace=True)
dataset.documents.drop(dataset.documents.index[:310_000], inplace=True)
# preview the first rows of the trimmed dataset
dataset.head()

Unnamed: 0,id,values,sparse_values,metadata
310000,104041,"[0.106361754, -0.030782843, 0.003027487, -0.03...","{'indices': [5048, 5532, 6296, 10281, 11322, 1...",{'text': ' Which highly situated people in US ...
310001,104042,"[-0.029425986, -0.016797818, -0.0015894377, 0....","{'indices': [11104, 13677, 27058, 32833, 39832...",{'text': ' What is a good word for 隐忍 in Engli...
310002,104043,"[0.009556795, -0.041521855, -0.04541965, 0.064...","{'indices': [13546, 13677, 24734, 38179, 39832...",{'text': ' Where is the best place to sell raw...
310003,104044,"[0.060474053, -0.04924797, -0.024456615, 0.053...","{'indices': [11393, 12632, 13730, 13930, 20064...",{'text': ' How does one get to have many sexua...
310004,104045,"[0.04064414, 0.007470246, -0.015795719, 0.0602...","{'indices': [11393, 12632, 13930, 20064, 22399...",{'text': ' How many sexual partners have you h...


In [3]:
# sanity check: size of the dataset after the row trimming (expected: 10,000)
print(len(dataset))

10000


In [4]:
import os
import pinecone

# Credentials are read from the environment. When a variable is unset or
# empty, the placeholder string on the right is used instead and must be
# replaced by hand.
#   - API key: available at app.pinecone.io
#   - environment: shown next to the API key in the Pinecone console
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY') or 'PINECONE_API_KEY'
PINECONE_ENV = os.getenv('PINECONE_ENVIRONMENT') or 'PINECONE_ENVIRONMENT'

# configure the pinecone client with the credentials resolved above
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

In [5]:
# Report only WHETHER the key is configured. Never echo the secret itself:
# cell output is saved inside the notebook file and is easily shared or
# committed by accident.
api_key_is_set = bool(os.environ.get('PINECONE_API_KEY'))
print(f"PINECONE_API_KEY set: {api_key_is_set}")

None


In [7]:
# name of the Pinecone index used throughout this notebook
index_name = "semantic-search-fast"

In [8]:
# length of the first document's dense vector (taken from the 'values' column)
len(dataset.documents['values'].iloc[0])

384

In [9]:
import time

# only create index if it doesn't exist
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        # the index dimension is taken from the length of the first vector
        dimension=len(dataset.documents.iloc[0]['values']),
        metric='cosine'
    )
    # wait until Pinecone reports the index as ready; a fixed one-second
    # sleep does not guarantee that initialization has finished
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)


In [10]:

# now connect to the index; `index` is the handle that the upsert and query
# cells call
index = pinecone.Index(index_name)

In [11]:
# iterate over the dataset in batches of 100 documents and upsert each batch
document_batches = dataset.iter_documents(batch_size=100)
for batch in document_batches:
    index.upsert(batch)

In [12]:
from sentence_transformers import SentenceTransformer
import torch

# run on the GPU when torch reports CUDA as available, otherwise on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# load the sentence-embedding model onto the selected device and display it
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [13]:
query = "which city has the highest population in the world?"

# encode the query text and convert the embedding to a plain Python list
query_embedding = model.encode(query)
xq = query_embedding.tolist()

# ask the index for the top 5 matches, including their stored metadata
xc = index.query(vector=xq, top_k=5, include_metadata=True)
xc

{'matches': [{'id': '109231',
              'metadata': {'text': ' Where is the most beautiful city in the '
                                   'world?'},
              'score': 0.696097791,
              'values': []},
             {'id': '109230',
              'metadata': {'text': ' What is the greatest, most beautiful city '
                                   'in the world?'},
              'score': 0.658223569,
              'values': []},
             {'id': '106974',
              'metadata': {'text': ' Which is the most polluted city In '
                                   'India?'},
              'score': 0.578280807,
              'values': []},
             {'id': '108898',
              'metadata': {'text': ' Which is the coolest country in the '
                                   'world?'},
              'score': 0.557058036,
              'values': []},
             {'id': '110500',
              'metadata': {'text': ' Which country has the most healthy '
                

In [14]:
# print one "<score rounded to 2 decimals>: <text>" line per returned match
for result in xc['matches']:
    score = round(result['score'], 2)
    text = result['metadata']['text']
    print(f"{score}: {text}")

0.7:  Where is the most beautiful city in the world?
0.66:  What is the greatest, most beautiful city in the world?
0.58:  Which is the most polluted city In India?
0.56:  Which is the coolest country in the world?
0.53:  Which country has the most healthy people?


In [15]:
# clean up: delete the index once we are done with it. The existence check
# makes this cell safe to re-run after the index has already been deleted.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)