In [28]:
from openai import OpenAI
from pinecone import Pinecone,ServerlessSpec
import hashlib
from tqdm import tqdm
import os

In [29]:
# Read the Pinecone API key from the environment (None when the variable is unset).
# NOTE(review): the variable name is spelled "PINCONE_KEY" — confirm it matches the environment.
pinecone_key = os.getenv("PINCONE_KEY")

In [30]:
# Confirm the key was loaded WITHOUT echoing the secret into the saved output.
# NOTE(review): the raw key value was previously stored in this cell's output;
# treat it as compromised and rotate it.
pinecone_key is not None

True

In [31]:
# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Names shared by the helper functions defined in later cells.
INDEX_NAME = 'semantic-search-rag'   # Pinecone index to create / connect to
NAMESPACE = "default"                # default Pinecone namespace for upserts and queries
ENGINE = 'text-embedding-3-small'    # default embedding model name

# Pinecone client authenticated with the key loaded above.
pc = Pinecone(api_key=pinecone_key)
              

In [32]:
def get_embeddings(texts, engine=ENGINE):
    """Embed a batch of texts with the OpenAI embeddings endpoint.

    Args:
        texts: Collection of strings to embed.
        engine: Embedding model name sent as ``model``; defaults to ENGINE.

    Returns:
        A list with the ``embedding`` of every item in ``response.data``, in
        response order. An empty ``texts`` yields ``[]`` without any API call.
    """
    # Nothing to embed: skip the network round trip (the embeddings endpoint is
    # documented not to accept empty input) and return an empty batch.
    if not texts:
        return []

    response = client.embeddings.create(input=texts, model=engine)
    # response.data is already iterable; no need to copy it into a list first.
    return [item.embedding for item in response.data]
def get_embedding(text, engine=ENGINE):
    """Embed a single string by sending it through get_embeddings as a one-item batch."""
    batch = get_embeddings([text], engine)
    return batch[0]

# Smoke test: (length of one embedding vector, number of vectors returned for two texts).
(
    len(get_embedding('hi')),
    len(get_embeddings(['hi', 'hello'])),
)

(1536, 2)

In [33]:
def upload_texts_to_pinecone(texts, namespace=NAMESPACE, batch_size=None, show_progress_bar=False, urls=None):
    """Embed ``texts`` and upsert them into the Pinecone ``index`` in batches.

    Relies on the notebook-level ``index`` and ``prepare_for_pinecone``, which
    are looked up when the function is called.

    NOTE(review): this notebook defines upload_texts_to_pinecone in two cells
    with the same body; keep them in sync or drop one.

    Args:
        texts: Sized sequence of strings to upload.
        namespace: Pinecone namespace to upsert into.
        batch_size: Texts per upsert call; a falsy value (None/0) sends
            everything in a single batch.
        show_progress_bar: When true, wrap the batch loop in tqdm.
        urls: Optional sequence sliced in step with ``texts`` and forwarded to
            prepare_for_pinecone.

    Returns:
        The sum of the ``upserted_count`` values reported by the upsert calls
        (0 when ``texts`` is empty).
    """
    # With no texts and no batch_size the step below would be len(texts) == 0,
    # and range(0, 0, 0) raises ValueError. Nothing to upload -> nothing upserted.
    if len(texts) == 0:
        return 0

    total_upserted = 0
    if not batch_size:
        batch_size = len(texts)

    _range = range(0, len(texts), batch_size)
    for i in tqdm(_range) if show_progress_bar else _range:
        text_batch = texts[i: i + batch_size]
        if urls:
            # Slice urls with the same window so each text keeps its own URL.
            url_batch = urls[i: i + batch_size]
            prepared_texts = prepare_for_pinecone(text_batch, urls=url_batch)
        else:
            prepared_texts = prepare_for_pinecone(text_batch)

        # Upload the prepared (id, vector, metadata) tuples and tally the count.
        total_upserted += index.upsert(
            vectors=prepared_texts,
            namespace=namespace
        )['upserted_count']

    return total_upserted

In [34]:
# Create the serverless index the first time through; later runs reuse it.
if INDEX_NAME not in pc.list_indexes().names():
    print(f'Creating index {INDEX_NAME}')
    pc.create_index(
        name=INDEX_NAME,
        dimension=1536,   # vector size of the OpenAI embedder used in this notebook
        metric='cosine',  # similarity metric applied at query time
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

# Handle to the index, used by the upload and query helpers.
index = pc.Index(name=INDEX_NAME)
index

<pinecone.data.index.Index at 0x1e04a29dcd0>

In [35]:
# Show the index's current stats (dimension, fullness, namespaces, vector count).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [36]:
def my_hash(s):
    """Return the hexadecimal MD5 digest of the string ``s``."""
    digest = hashlib.md5()
    digest.update(s.encode())
    return digest.hexdigest()

# Example digest.
my_hash('I love to hash it')

'ae76cc4dfd345ecaeea9b8ba0d5c3437'

In [37]:
def prepare_for_pinecone(texts, engine=ENGINE, urls=None):
    """Turn raw texts into ``(id, vector, metadata)`` tuples for upserting.

    Each id is ``my_hash(text)``; the metadata dict holds the text itself and a
    shared ``date_uploaded`` timestamp. When ``urls`` is given and has exactly
    one entry per text, each metadata dict also gets a ``url`` key; otherwise
    ``urls`` is ignored.

    NOTE(review): ``datetime`` is imported by a separate, later cell of this
    notebook, so that cell must have run before this function is called.
    """
    # One timestamp for the whole batch.
    uploaded_at = datetime.utcnow()

    vectors = get_embeddings(texts, engine=engine)

    records = []
    for text, vector in zip(texts, vectors):
        metadata = {'text': text, 'date_uploaded': uploaded_at}
        records.append((my_hash(text), vector, metadata))

    # Attach source URLs only when they pair up one-to-one with the texts.
    urls_line_up = bool(urls) and len(urls) == len(texts)
    if urls_line_up:
        for record, url in zip(records, urls):
            record[-1]['url'] = url

    return records

In [38]:
from datetime import datetime

In [22]:
# Prepare a single text and unpack the resulting (id, vector, metadata) triple.
texts = ['hi']
_id, embedding, metadata = prepare_for_pinecone(texts)[0]

print(
    'ID:  ', _id,
    '\nLEN: ', len(embedding),
    '\nMETA:', metadata,
)


ID:   49f68a5c8493ec2c0bf489821c21fc3b 
LEN:  1536 
META: {'text': 'hi', 'date_uploaded': datetime.datetime(2024, 10, 11, 14, 22, 44, 464988)}


In [39]:
# Topic page whose links are scraped below; article hrefs are joined onto base_url.
base_url = 'https://faq.ssa.gov'
medicare_faqs = f'{base_url}/en-US/topic?id=CAT-01092'
print(medicare_faqs)

from bs4 import BeautifulSoup
import requests

# Collect every site-relative article link found on the medicare_faqs page.
urls = []
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status stops us from scraping links out of an HTTP error page.
r = requests.get(medicare_faqs, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')
for link in soup.find_all('a'):
    href = link.attrs.get('href')
    # Keep only anchors with a site-relative href that points at an article.
    if href and href.startswith('/') and 'article' in href:
        urls.append(base_url + href)

https://faq.ssa.gov/en-US/topic?id=CAT-01092


In [40]:
# Show the article URLs collected from the topic page.
urls

['https://faq.ssa.gov/en-us/Topic/article/KA-01735',
 'https://faq.ssa.gov/en-us/Topic/article/KA-02713',
 'https://faq.ssa.gov/en-us/Topic/article/KA-02125',
 'https://faq.ssa.gov/en-us/Topic/article/KA-02131',
 'https://faq.ssa.gov/en-us/Topic/article/KA-02166',
 'https://faq.ssa.gov/en-us/Topic/article/KA-02983',
 'https://faq.ssa.gov/en-us/Topic/article/KA-02995',
 'https://faq.ssa.gov/en-us/Topic/article/KA-02137',
 'https://faq.ssa.gov/en-us/Topic/article/KA-02154',
 'https://faq.ssa.gov/en-us/Topic/article/KA-02113',
 'https://faq.ssa.gov/en-us/Topic/article/KA-02148',
 'https://faq.ssa.gov/en-us/Topic/article/KA-02989']

In [41]:
# Download every article and keep the text of its <body>.
# texts[i] must stay aligned with urls[i] (both are passed to the upload
# helper together), so a failed request raises instead of being skipped.
texts = []
for url in tqdm(urls):
    r = requests.get(url, timeout=30)  # bound the wait on a stalled connection
    r.raise_for_status()               # never index an HTTP error page as content
    soup = BeautifulSoup(r.content, 'html.parser')
    body = soup.find('body').get_text()
    texts.append(body)

texts[0]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:19<00:00,  1.65s/it]


'\n\n\n\nYou’re offline. This is a read only version of the page.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to content\n\n\n  \n\n\n\n\n\n\n\nProtect Yourself from Scams \n\n\n\n \n\n \n\n\n\n\nProtect Yourself from Scams\n\n\n\nSkip to main content Social Security Search  Menu  Español  Sign in\n\n\n\n\nFrequently Asked Questions\n\n\n\n\nLast Modified: \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nFAQ Home\n\n\nTopics\n\n\r\n\t\t\t\t\tKA-01735\r\n\t\t\t\t\n\n\n\n\n\n Print\n\n\n\nHow do I get a replacement Medicare card? \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nViews: \n\n\n\nIf your Medicare card was lost, stolen, or destroyed, you can request a replacement online at Medicare.gov.\nYou can print an official copy of your card from your online Medicare account \nor call 1-800-MEDICARE (1-800-633-4227 TTY 1-877-486-2048) to order a replacement card to be sent in the mail.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nComments (0)\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nFooter menu

In [42]:
def upload_texts_to_pinecone(texts, namespace=NAMESPACE, batch_size=None, show_progress_bar=False, urls=None):
    """Embed ``texts`` and upsert them into the Pinecone ``index`` in batches.

    Relies on the notebook-level ``index`` and ``prepare_for_pinecone``, which
    are looked up when the function is called.

    NOTE(review): this notebook defines upload_texts_to_pinecone in two cells
    with the same body; keep them in sync or drop one.

    Args:
        texts: Sized sequence of strings to upload.
        namespace: Pinecone namespace to upsert into.
        batch_size: Texts per upsert call; a falsy value (None/0) sends
            everything in a single batch.
        show_progress_bar: When true, wrap the batch loop in tqdm.
        urls: Optional sequence sliced in step with ``texts`` and forwarded to
            prepare_for_pinecone.

    Returns:
        The sum of the ``upserted_count`` values reported by the upsert calls
        (0 when ``texts`` is empty).
    """
    # With no texts and no batch_size the step below would be len(texts) == 0,
    # and range(0, 0, 0) raises ValueError. Nothing to upload -> nothing upserted.
    if len(texts) == 0:
        return 0

    total_upserted = 0
    if not batch_size:
        batch_size = len(texts)

    _range = range(0, len(texts), batch_size)
    for i in tqdm(_range) if show_progress_bar else _range:
        text_batch = texts[i: i + batch_size]
        if urls:
            # Slice urls with the same window so each text keeps its own URL.
            url_batch = urls[i: i + batch_size]
            prepared_texts = prepare_for_pinecone(text_batch, urls=url_batch)
        else:
            prepared_texts = prepare_for_pinecone(text_batch)

        # Upload the prepared (id, vector, metadata) tuples and tally the count.
        total_upserted += index.upsert(
            vectors=prepared_texts,
            namespace=namespace
        )['upserted_count']

    return total_upserted

In [43]:
# Upload all scraped pages with their source URLs, BATCH_SIZE texts per upsert call.
BATCH_SIZE = 4
upload_texts_to_pinecone(
    texts,
    urls=urls,
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.12s/it]


12

In [None]:
 # NOTE(review): this unexecuted cell calls query_from_pinecone before the cell
 # that defines it, and repeats the query cell further down. On a fresh
 # top-to-bottom run it raises NameError; move it below the definition or drop it.
 results = query_from_pinecone('I lost my medicare card', top_k=3)
 for result in results:
    print(result['metadata']['url'], result['score'], result['metadata']['text'][:50])

In [45]:
def query_from_pinecone(query, top_k=3, include_metadata=True):
    """Embed ``query`` and return the ``matches`` of a Pinecone similarity search.

    Args:
        query: Text to search for.
        top_k: Number of matches to request.
        include_metadata: Whether the matches carry their stored metadata.

    Returns:
        Whatever the query response holds under ``'matches'``.
    """
    # The query must be embedded with the same model (ENGINE) as the documents.
    query_vector = get_embedding(query, engine=ENGINE)

    response = index.query(
        vector=query_vector,
        top_k=top_k,
        namespace=NAMESPACE,
        include_metadata=include_metadata,
    )
    return response.get('matches')

In [46]:
 # Query the index and print url, score and the first 50 characters of each match.
 results = query_from_pinecone('I lost my medicare card', top_k=3)
 for result in results:
    meta = result['metadata']
    print(meta['url'], result['score'], meta['text'][:50])

https://faq.ssa.gov/en-us/Topic/article/KA-01735 0.688788414 



You’re offline. This is a read only version of
https://faq.ssa.gov/en-us/Topic/article/KA-02713 0.518452227 



You’re offline. This is a read only version of
https://faq.ssa.gov/en-us/Topic/article/KA-02113 0.504733 



You’re offline. This is a read only version of


Collecting supabase
  Downloading supabase-2.9.0-py3-none-any.whl (16 kB)
Collecting realtime<3.0.0,>=2.0.0
  Downloading realtime-2.0.5-py3-none-any.whl (20 kB)
Collecting postgrest<0.18.0,>=0.17.0
  Downloading postgrest-0.17.1-py3-none-any.whl (22 kB)
Collecting storage3<0.9.0,>=0.8.0
  Downloading storage3-0.8.1-py3-none-any.whl (16 kB)
Collecting gotrue<3.0.0,>=2.7.0
  Downloading gotrue-2.9.2-py3-none-any.whl (48 kB)
     ---------------------------------------- 48.6/48.6 kB 2.4 MB/s eta 0:00:00
Collecting supafunc<0.7.0,>=0.6.0
  Downloading supafunc-0.6.1-py3-none-any.whl (6.6 kB)
Collecting strenum<0.5.0,>=0.4.9
  Downloading StrEnum-0.4.15-py3-none-any.whl (8.9 kB)
Collecting deprecation<3.0.0,>=2.1.0
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting aiohttp<4.0.0,>=3.10.6
  Downloading aiohttp-3.10.10-cp39-cp39-win_amd64.whl (381 kB)
     -------------------------------------- 381.8/381.8 kB 4.8 MB/s eta 0:00:00
Collecting yarl<2.0,>=1.12.0
  Downloadin

ERROR: Could not install packages due to an OSError: [WinError 5] Accès refusé: 'C:\\Users\\simo_\\anaconda3\\Lib\\site-packages\\~arl\\_quoting_c.cp39-win_amd64.pyd'
Consider using the `--user` option or check the permissions.

