In [2]:
#!pip install pinecone-client openai sentence-transformers tiktoken datasets

In [1]:
from openai import OpenAI
from datetime import datetime
import hashlib
import re
import os 
from sentence_transformers import CrossEncoder

from tqdm import tqdm
import numpy as np
from torch import nn
import logging
from pinecone import Pinecone,ServerlessSpec
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
import tqdm

  from tqdm.autonotebook import tqdm, trange





In [2]:
api_key=os.environ.get("OPENAI_API_KEY")
pinecone_key=os.environ.get('PINCONE_KEY')


In [3]:
client=OpenAI(api_key=api_key)
INDEX_NAME='semantic-search-test'
NAMESPACE='default'
ENGINE ='text-embedding-3-large' # has vector size 3072
pc=Pinecone(api_key=pinecone_key)

In [4]:
def get_embeddings(texts, engine=ENGINE):
    response = client.embeddings.create(input=texts, model=engine)
  
    return [item.embedding for item in response.data] 

def get_embedding(text, engine=ENGINE):
    if isinstance(text, list):
        return get_embeddings(text, engine)
    else:
        return get_embeddings([text], engine)[0]

# Test both cases
print(len(get_embedding('hi')))  # Should return the length of a single embedding
print([len(embedding) for embedding in get_embedding(['hi', 'hello'])])  # Should return lengths of multiple embeddings


3072
[3072, 3072]


In [5]:
if INDEX_NAME not in pc.list_indexes().names():
    print(f'creating index {INDEX_NAME}')
    pc.create_index(
        name=INDEX_NAME,# le nom d'index
        dimension=3072,
        metric='cosine',
        spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) )# the similarity to  use when seaching the index
index = pc.Index(name=INDEX_NAME)

In [6]:
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {'default': {'vector_count': 983}},
 'total_vector_count': 983}

In [7]:
def my_hash(s):
    return hashlib.md5(s.encode()).hexdigest()
                       
                       
my_hash("salam")

'de6838252f95d3b9e803b28df33b4baa'

In [32]:
def prepare_for_pinecone(texts, engine=ENGINE):
    now = datetime.utcnow()
    print("le text est \n")
    print(texts)
    embeddings = get_embeddings(texts, engine=engine)
    # Create tuples of (ID, embedding, metadata) for each text
    return [
        (my_hash(text),  # unique ID for the string
         embedding,      # the embedding for the text
         dict(text=text, date_uploaded=now))  # metadata dictionary
        for text, embedding in zip(texts, embeddings)
    ]

# Example usage:
texts = ['Bonjour', 'je', "what", "fin", "wahat"]
prepared_data = prepare_for_pinecone(texts)

# Unpacking the first result
for _id, embedding, metadata in prepared_data:
    print('ID:', _id)
    print('Embedding length:', len(embedding))
    print('Metadata:', metadata)


le text est 

['Bonjour', 'je', 'what', 'fin', 'wahat']
ID: ebc58ab2cb4848d04ec23d83f7ddf985
Embedding length: 3072
Metadata: {'text': 'Bonjour', 'date_uploaded': datetime.datetime(2024, 10, 13, 11, 42, 43, 228563)}
ID: 79563e90630af3525dff01b6638b0886
Embedding length: 3072
Metadata: {'text': 'je', 'date_uploaded': datetime.datetime(2024, 10, 13, 11, 42, 43, 228563)}
ID: 4a2028eceac5e1f4d252ea13c71ecec6
Embedding length: 3072
Metadata: {'text': 'what', 'date_uploaded': datetime.datetime(2024, 10, 13, 11, 42, 43, 228563)}
ID: d79695776a5b40f7cadbee1f91a85c82
Embedding length: 3072
Metadata: {'text': 'fin', 'date_uploaded': datetime.datetime(2024, 10, 13, 11, 42, 43, 228563)}
ID: 95bdfaa8cc9121e57e87ee55bc9c43c7
Embedding length: 3072
Metadata: {'text': 'wahat', 'date_uploaded': datetime.datetime(2024, 10, 13, 11, 42, 43, 228563)}


In [9]:
def upload_texts_to_pinecone(texts,namespace=NAMESPACE,batch_size=None , Show_progress_bar=False):
    total_upserted=0
    if not batch_size:
        batch_size=len(texts)
    _range=range(0,len(texts),batch_size)
    for i in tqdm(_range) if Show_progress_bar else _range:
        batch=texts[i:i+batch_size]
        prepared_texts = prepare_for_pinecone(batch)
        total_upserted+=index.upsert(
            vectors=prepared_texts,
            namespace=namespace)['upserted_count']
    return total_upserted
upload_texts_to_pinecone(texts)

5

In [10]:
def query_from_pinecone(query, top_k=3, include_metadata=True):
    # get embedding from THE SAME embedder as the documents
    query_embedding = get_embedding(query, engine=ENGINE)

    return index.query(
      vector=query_embedding,
      top_k=top_k,
      namespace=NAMESPACE,
      include_metadata=include_metadata  
    ).get('matches')

In [11]:
def delete_texts_from_pinecone(texts, namespace=NAMESPACE):
    hashes = [hashlib.md5(text.encode()).hexdigest() for text in texts]

    return index.delete(ids=hashes, namespace=namespace)

# OPEN SOURCE ALTERNATIVE TO EMBEDDING

In [12]:
from datasets import load_dataset
dataset = load_dataset("xtreme","MLQA.en.en")
dataset['train']=dataset["test"]
dataset['test'] = dataset['validation']
del dataset['validation']
dataset

DatasetDict({
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1148
    })
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11590
    })
})

In [13]:
dataset['train'][1] , dataset['train'][1]


({'id': 'f251ea56c4f1aa1df270137f7e6d89c0cc1b6ef4',
  'title': 'Area 51',
  'context': 'In 1994, five unnamed civilian contractors and the widows of contractors Walter Kasza and Robert Frost sued the USAF and the United States Environmental Protection Agency. Their suit, in which they were represented by George Washington University law professor Jonathan Turley, alleged they had been present when large quantities of unknown chemicals had been burned in open pits and trenches at Groom. Biopsies taken from the complainants were analyzed by Rutgers University biochemists, who found high levels of dioxin, dibenzofuran, and trichloroethylene in their body fat. The complainants alleged they had sustained skin, liver, and respiratory injuries due to their work at Groom, and that this had contributed to the deaths of Frost and Kasza. The suit sought compensation for the injuries they had sustained, claiming the USAF had illegally handled toxic materials, and that the EPA had failed in its dut

In [34]:
unique_passages = list(set(dataset['test']['context']))
unique_passages=unique_passages[:5]

for idx in tqdm(range(0, len(unique_passages), 32)):
    passages = unique_passages[idx:idx + 32]
    print("#################")
    print(passages)
    print("###############")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]

#################
['There are many species of bacteria and other microorganisms that live on or inside the healthy human body. In fact, 90% of the cells in (or on) a human body are microbes, by number (much less by mass or volume). Some of these symbionts are necessary for our health. Those that neither help nor harm humans are called commensal organisms.', 'The artwork was created by their associate, George Hardie.  Hipgnosis offered the band a choice of seven designs, but all four members agreed that the prism was by far the best. The final design depicts a glass prism dispersing light into colour. The design represents three elements: the band\'s stage lighting, the album lyrics, and Wright\'s request for a "simple and bold" design.  The spectrum of light continues through to the gatefold – an idea that Waters came up with. Added shortly afterwards, the gatefold design also includes a visual representation of the heartbeat sound used throughout the album, and the back of the album c




In [35]:
from tqdm import tqdm
for idx in tqdm(range(0, len(unique_passages), 32)):
    passages = unique_passages[idx:idx + 32]
    upload_texts_to_pinecone(passages)

  0%|                                                                                                                                                                                                        | 0/1 [00:00<?, ?it/s]

le text est 

['There are many species of bacteria and other microorganisms that live on or inside the healthy human body. In fact, 90% of the cells in (or on) a human body are microbes, by number (much less by mass or volume). Some of these symbionts are necessary for our health. Those that neither help nor harm humans are called commensal organisms.', 'The artwork was created by their associate, George Hardie.  Hipgnosis offered the band a choice of seven designs, but all four members agreed that the prism was by far the best. The final design depicts a glass prism dispersing light into colour. The design represents three elements: the band\'s stage lighting, the album lyrics, and Wright\'s request for a "simple and bold" design.  The spectrum of light continues through to the gatefold – an idea that Waters came up with. Added shortly afterwards, the gatefold design also includes a visual representation of the heartbeat sound used throughout the album, and the back of the album cover

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.89s/it]


In [31]:
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {'default': {'vector_count': 983}},
 'total_vector_count': 983}

In [None]:
dataset['test'][0]

In [32]:
query_from_pinecone("Does an infection for Sandflies go away over time ?")

[{'id': '2f90090e21f19450887d5f3ff781e541',
  'metadata': {'date_uploaded': '2024-10-11T08:04:25.984832',
               'text': 'Pappataci fever is prevalent in the subtropical zone of '
                       'the Eastern Hemisphere between 20°N and 45°N, '
                       'particularly in Southern Europe, North Africa, the '
                       'Balkans, Eastern Mediterranean, Iraq, Iran, Pakistan, '
                       'Afghanistan and India.The disease is transmitted by the '
                       'bites of phlebotomine sandflies of the Genus '
                       'Phlebotomus, in particular, Phlebotomus papatasi, '
                       'Phlebotomus perniciosus and Phlebotomus perfiliewi. The '
                       'sandfly becomes infected when biting an infected human '
                       'in the period between 48 hours before the onset of '
                       'fever and 24 hours after the end of the fever, and '
                       'remains infec

#  Using cross-encoder

In [34]:
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np
from torch import nn



In [35]:
from copy import copy

def get_results_from_pinecone(query, top_k=3, re_rank_model=None, verbose=True, correct_hash=None):
    results_from_pinecone = query_from_pinecone(query, top_k=top_k)
    if not results_from_pinecone:
        return []
    if verbose:
        print("Query:", query)
    final_results = []

    retrieved_correct_position, reranked_correct_position = None, None
    for idx, result_from_pinecone in enumerate(results_from_pinecone):
        if correct_hash and result_from_pinecone['id'] == correct_hash:
            retrieved_correct_position = idx

    if re_rank_model is not None:
        if verbose:
            print('Document ID (Hash)\t\tRetrieval Score\tCE Score\tText')

        sentence_combinations = [[query, result_from_pinecone['metadata']['text']] for result_from_pinecone in results_from_pinecone]

        # Compute the similarity scores for these combinations
        similarity_scores = re_rank_model.predict(sentence_combinations, activation_fct=nn.Sigmoid())

        # Sort the scores in decreasing order
        sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
        sim_scores_sort = list(reversed(np.sort(similarity_scores)))
        top_re_rank_score = sim_scores_sort[0]

        # Print the scores
        for idx in sim_scores_argsort:
            result_from_pinecone = results_from_pinecone[idx]
            if correct_hash and result_from_pinecone['id'] == correct_hash:
                reranked_correct_position = idx
            final_results.append({'score': similarity_scores[idx], 'id': result_from_pinecone['id'], 'metadata': result_from_pinecone['metadata']})
            if verbose:
                print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{similarity_scores[idx]:.6f}\t{result_from_pinecone['metadata']['text'][:50]}")
        return {'final_results': final_results, 'retrieved_correct_position': retrieved_correct_position, 'reranked_correct_position': reranked_correct_position, 'results_from_pinecone': results_from_pinecone, 'top_re_rank_score': top_re_rank_score}

    if verbose:
        print('Document ID (Hash)\t\tRetrieval Score\tText')
    for result_from_pinecone in results_from_pinecone:
        final_results.append(result_from_pinecone)
        if verbose:
            print(f"{result_from_pinecone['id']}\t{result_from_pinecone['score']:.2f}\t{result_from_pinecone['metadata']['text'][:50]}")

    return {'final_results': final_results, 'retrieved_correct_position': retrieved_correct_position, 'reranked_correct_position': reranked_correct_position}

In [36]:
# Pre-trained cross encoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2', num_labels=1)

q_to_hash = {data['question']: my_hash(data['context']) for data in dataset['test']}

config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [39]:
unique_inputs = list(set(dataset['test']['question']))
len(unique_inputs)

1148

In [40]:
query = unique_inputs[0]

query_result = get_results_from_pinecone(
    query, 
    top_k=2, # grab 2 results
    re_rank_model=cross_encoder, 
    correct_hash=q_to_hash[query],
    verbose=False
    )

query_result['retrieved_correct_position'], query_result['reranked_correct_position']

  attn_output = torch.nn.functional.scaled_dot_product_attention(


(0, 0)

In [46]:
from sentence_transformers import SentenceTransformer


In [47]:
bi_encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")


In [48]:
docs = dataset['test']['context']
doc_emb = bi_encoder.encode(docs, batch_size=32, show_progress_bar=True)

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

In [49]:
doc_emb.shape


(1148, 768)

768


In [None]:
query = unique_inputs[0]
print(query)

for t in dataset['test']:
    if t['question'] == query:
        print(t['context'])

In [55]:
from sentence_transformers.util import semantic_search

def find_most_similar(text, embeddings, documents, k=3):
    query_embedding = bi_encoder.encode([text], show_progress_bar=False)
    similarities = semantic_search(query_embedding, embeddings, top_k=k)
    return [(documents[sim['corpus_id']], sim['score'], sim['corpus_id']) for sim in similarities[0]]

In [56]:
from random import sample

query = sample(dataset['test']['question'], 1)[0]
print(query)

What is Merced also known as?


In [57]:
def eval_ranking_open_source(query, top_k=3, re_rank_model=None):
    ans = {'retrieved_correct_position': None}
    correct_hash = q_to_hash[query]
    results = find_most_similar(query, doc_emb, docs, k=top_k)
    for idx, (passage, score, doc_idx) in enumerate(results):
        if correct_hash == my_hash(passage):
            ans['retrieved_correct_position'] =  idx
    if re_rank_model is not None:
        ans['reranked_correct_position'] = None
        sentence_combinations = [(query, r[0]) for r in results]

        # Compute the similarity scores for these combinations
        similarity_scores = re_rank_model.predict(sentence_combinations, activation_fct=nn.Sigmoid())

        # Sort the scores in decreasing order
        sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
        for i, idx in enumerate(sim_scores_argsort):
            r = results[idx]
            if correct_hash and my_hash(r[0]) == correct_hash:
                ans['reranked_correct_position'] = i

    return ans

In [59]:
eval_ranking_open_source(query, top_k=3, re_rank_model=cross_encoder)


{'retrieved_correct_position': 0, 'reranked_correct_position': 0}