In [None]:
# https://www.pinecone.io/learn/openai-gen-qa/
# Pincone is a service that provides Indexing and vector search as a service

In [None]:
%pip install -qU openai pinecone datasets cohere tiktoken --upgrade

^C
Note: you may need to restart the kernel to use updated packages.


In [13]:
# Get the openai secret key:
import getpass

secret_key = getpass.getpass('Please enter your openai key:')

In [3]:
from openai import OpenAI
from IPython.display import display, Math, Markdown

client = OpenAI(api_key=secret_key)

In [29]:
from pinecone import Pinecone, PodSpec
import os

PINECONE_API_KEY = getpass.getpass("Please enter your pinecone key: ")

# Initialize connection (get API key at app.pinecone.io):
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [None]:
prompt = "Tell me about the history of the United States of America."

response = client.chat.completions.create(
    model ="gpt-4o-mini",
    messages=[
        {
            "role": "user",
            "content": prompt
        }
    ],
    max_tokens=4000,
    temperature=1,
)
response.choices[0].message.content

"The history of the United States is rich and complex, spanning several centuries and involving various cultural and political developments. Below is an overview of key events and periods in U.S. history:\n\n### Pre-Columbian and Indigenous Peoples (Before 1492)\nBefore European contact, the Americas were inhabited by a diverse array of indigenous peoples, with distinct cultures, languages, and societies. Major civilizations such as the Aztecs, Maya, and Inca thrived in other parts of the Americas, while North America was home to tribes like the Sioux, Navajo, and Iroquois.\n\n### European Exploration and Colonization (1492-1607)\nChristopher Columbus's voyage in 1492 marked the beginning of European exploration of the Americas. Subsequent expeditions led to the establishment of various colonies by Spain, France, England, and the Netherlands. The Spanish established settlements in the Southwest, while the French focused on the fur trade in the Northeast and along the Mississippi River.

In [49]:
# First let's make it simpler to get answers
def askGpt(prompt):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user", 
                "content": prompt
            }
        ],
        max_tokens=4000,
        temperature=1,
    )
    return response.choices[0].message.content

query = (
    "Which training approach is best for sentence transformers when I only have related sentence pairs?"
)

askGpt(query)

"When you have only related sentence pairs, a few training approaches can be especially effective for training sentence transformers:\n\n1. **Contrastive Learning**: In this approach, you focus on minimizing the distance between related sentence pairs while maximizing the distance between unrelated sentences (or sampling negative instances). If you only have related pairs, you can still generate negative samples by using random sentences from your dataset or other datasets.\n\n2. **Triplet Loss**: You can create triplets consisting of an anchor (one sentence), a positive (a related sentence), and a negative (an unrelated sentence) sample. The goal is to ensure that the positive sample is closer to the anchor than the negative one. Similar to contrastive learning, you'll need to create or sample negative pairs.\n\n3. **Siamese Networks**: You can use a Siamese network architecture where two identical subnetworks process the two sentences in a pair. The loss can be based on the similarit

In [17]:
embed_model = "text-embedding-ada-002"

res = client.embeddings.create(
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch"
    ], model=embed_model
)

In [None]:
# Vector embeddings for each document generated by the model
res.data

[Embedding(embedding=[-0.0031264047138392925, 0.011723595671355724, -0.005217977799475193, -0.027155056595802307, -0.016435954719781876, 0.03246741741895676, -0.01624719239771366, -0.0010095506440848112, -0.025928091257810593, -0.006556179840117693, 0.0201303381472826, 0.01667865179479122, -0.009155056439340115, 0.02344719134271145, -0.010247191414237022, 0.013449438847601414, 0.025213483721017838, -0.016853932291269302, 0.0121617978438735, -0.01624719239771366, -0.0043820226565003395, -0.006495506037026644, -0.004368539433926344, 0.020723596215248108, -0.01062471978366375, -0.0037584269884973764, 0.013712359592318535, -0.026251686736941338, -0.0003962781047448516, -0.002147190971300006, 0.005831460934132338, -0.010092135518789291, -0.028179775923490524, -0.016206741333007812, -0.004317977465689182, 0.007469663396477699, -0.002902247244492173, -0.031550563871860504, 0.024000000208616257, -0.03338427096605301, -0.0003642556257545948, 0.013051685877144337, 0.007186517119407654, -0.005642

In [19]:
# We have created two vectors (one for each sentence input)
len(res.data)

2

In [20]:
# We have created two 1536-dimensional vectors
len(res.data[0].embedding), len(res.data[1].embedding)

(1536, 1536)

In [21]:
# We can also get the vector for a single sentence
res.data[0].embedding

[-0.0031264047138392925,
 0.011723595671355724,
 -0.005217977799475193,
 -0.027155056595802307,
 -0.016435954719781876,
 0.03246741741895676,
 -0.01624719239771366,
 -0.0010095506440848112,
 -0.025928091257810593,
 -0.006556179840117693,
 0.0201303381472826,
 0.01667865179479122,
 -0.009155056439340115,
 0.02344719134271145,
 -0.010247191414237022,
 0.013449438847601414,
 0.025213483721017838,
 -0.016853932291269302,
 0.0121617978438735,
 -0.01624719239771366,
 -0.0043820226565003395,
 -0.006495506037026644,
 -0.004368539433926344,
 0.020723596215248108,
 -0.01062471978366375,
 -0.0037584269884973764,
 0.013712359592318535,
 -0.026251686736941338,
 -0.0003962781047448516,
 -0.002147190971300006,
 0.005831460934132338,
 -0.010092135518789291,
 -0.028179775923490524,
 -0.016206741333007812,
 -0.004317977465689182,
 0.007469663396477699,
 -0.002902247244492173,
 -0.031550563871860504,
 0.024000000208616257,
 -0.03338427096605301,
 -0.0003642556257545948,
 0.013051685877144337,
 0.00718651

In [None]:
#Downloading data from a youtubbe transcription dataset
from datasets import load_dataset

data = load_dataset('jamescalam/youtube-transcriptions', split='train')
data

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 208619/208619 [00:00<00:00, 1711554.12 examples/s]


Dataset({
    features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],
    num_rows: 208619
})

In [24]:
data[0]

{'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
 'published': '2021-07-06 13:00:03 UTC',
 'url': 'https://youtu.be/35Pdoyi6ZoQ',
 'video_id': '35Pdoyi6ZoQ',
 'channel_id': 'UCv83tO5cePwHMt1952IVVHw',
 'id': '35Pdoyi6ZoQ-t0.0',
 'text': 'Hi, welcome to the video.',
 'start': 0.0,
 'end': 9.36}

In [None]:
# tqdm.auto automatically selects the best progress bar visualization
from tqdm.auto import tqdm

new_data = []

window = 20  # number of sentences to combine
stride = 4  # number of sentences to 'stride' over, used to create overlap

for i in tqdm(range(0, len(data), stride)):
    i_end = min(len(data)-1, i+window)
    if data[i]['title'] != data[i_end]['title']:
        # in this case we skip this entry as we have start/end of two videos
        continue
    text = ' '.join(data[i:i_end]['text'])
    # create the new merged dataset
    new_data.append({
        'start': data[i]['start'],
        'end': data[i_end]['end'],
        'title': data[i]['title'],
        'text': text,
        'id': data[i]['id'],
        'url': data[i]['url'],
        'published': data[i]['published'],
        'channel_id': data[i]['channel_id']
    })

100%|██████████| 52155/52155 [00:24<00:00, 2110.26it/s]


In [26]:
new_data[0]

{'start': 0.0,
 'end': 74.12,
 'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
 'text': "Hi, welcome to the video. So this is the fourth video in a Transformers from Scratch mini series. So if you haven't been following along, we've essentially covered what you can see on the screen. So we got some data. We built a tokenizer with it. And then we've set up our input pipeline ready to begin actually training our model, which is what we're going to cover in this video. So let's move over to the code. And we see here that we have essentially everything we've done so far. So we've built our input data, our input pipeline. And we're now at a point where we have a data loader, PyTorch data loader, ready. And we can begin training a model with it. So there are a few things to be aware of. So I mean, first, let's just have a quick look at the structure of our data.",
 'id': '35Pdoyi6ZoQ-t0.0',
 'url': 'https://youtu.be/35Pdoyi6ZoQ',
 'published': '2021-07-06 13:0

In [None]:
#Previously I created this index on the pinecone dashboard
index_name = "firstdatabase"
environment = "us-east-1"
pc = Pinecone()  # This reads the PINECONE_API_KEY env var

# Create the index if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # Using the same vector dimensions as text-embedding-ada-002
        metric="cosine",
        spec=PodSpec(
            environment=environment,
            pod_type="p1.x1",
            pods=1,
            metadata_config={"indexed": ["batch"]},
        ),
    )

In [31]:
# Connect to Index:
index = pc.Index(name=index_name)

In [32]:
# Describe the Index:
description = pc.describe_index(name=index_name)
print(description)

{'deletion_protection': 'disabled',
 'dimension': 1536,
 'host': 'firstdatabase-tri9qtn.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'firstdatabase',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'},
 'tags': {'embedding_model': 'text-embedding-3-small'},
 'vector_type': 'dense'}


In [35]:
# Insert the data into the Pinecone index
# We will insert the embeddings in batches to avoid rate limiting
# We will also insert the metadata along with the embeddings
from tqdm.auto import tqdm
import datetime
from time import sleep

batch_size = 100  # how many embeddings we create and insert at once

for i in tqdm(range(0, len(new_data), batch_size)):
    # find end of batch
    i_end = min(len(new_data), i+batch_size)
    meta_batch = new_data[i:i_end]
    # get ids
    ids_batch = [x['id'] for x in meta_batch]
    # get texts to encode
    texts = [x['text'] for x in meta_batch]
    # create embeddings (try-except added to avoid RateLimitError)
    try:
        res = client.embeddings.create(input=texts, model=embed_model)
    except:
        done = False
        while not done:
            sleep(5)
            try:
                res = client.embeddings.create(input=texts, model=embed_model)
                done = True
            except:
                pass
    embeds = [record.embedding for record in res.data]
    # cleanup metadata
    # we only need a few fields from the metadata
    meta_batch = [{
        'start': x['start'],
        'end': x['end'],
        'title': x['title'],
        'text': x['text'],
        'url': x['url'],
        'published': x['published'],
        'channel_id': x['channel_id']
    } for x in meta_batch]
    to_upsert = list(zip(ids_batch, embeds, meta_batch))
    
    # Upsert to Pinecone
    index.upsert(vectors=to_upsert)

100%|██████████| 487/487 [22:34<00:00,  2.78s/it]


In [38]:
res = client.embeddings.create(
    input=[query],
    model=embed_model
)

# retrieve from Pinecone
text_vector = res.data[0].embedding

# get relevant contexts (including the questions)
res = index.query(vector=text_vector, top_k=2, include_metadata=True)

In [39]:
res

{'matches': [{'id': 'NNS5pOpjvAQ-t503.36',
              'metadata': {'channel_id': 'UCv83tO5cePwHMt1952IVVHw',
                           'end': 673.76,
                           'published': '2021-11-04 13:00:10 UTC',
                           'start': 503.36,
                           'text': 'And another thing is that we also need '
                                   'something called hard negatives in the '
                                   'training data in order for this model to '
                                   'perform well. So what I mean by hard '
                                   'negative is, say we have our, you know, we '
                                   'have our source sentence A here and we '
                                   'have this source B, which is like a '
                                   'similar sentence, a high similarity '
                                   'sentence. They mean basically the same '
                                   "thing. W

In [43]:
limit = 3750
query = (
    "Which training approach is best for sentence transformers when I only have related sentence pairs?"
)

def retrieve(query):
    res = client.embeddings.create(
        input=[query],
        model=embed_model
    )

    # retrieve from Pinecone
    text_vector = res.data[0].embedding

    # get relevant contexts
    res = index.query(vector=text_vector, top_k=3, include_metadata=True)
    contexts = [
        x['metadata']['text'] for x in res['matches']
    ]

    # build our prompt with the retrieved contexts included
    prompt_start = (
        "Answer the question based on the context below.\n\n"+
        "Context:\n"
    )
    prompt_end = (
        f"\n\nQuestion: {query}\nAnswer:"
    )
    # append contexts until hitting limit
    # if we hit the limit, we stop and use the contexts up to that point
    for i in range(1, len(contexts)):
        if len("\n\n---\n\n".join(contexts[:i])) >= limit:
            prompt = (
                prompt_start +
                "\n\n---\n\n".join(contexts[:i-1]) +
                prompt_end
            )
            break
        elif i == len(contexts)-1:
            prompt = (
                prompt_start +
                "\n\n---\n\n".join(contexts) +
                prompt_end
            )
    return prompt

In [44]:
# First we retrieve relevant items from Pinecone
query_with_contexts = retrieve(query)
query_with_contexts

"Answer the question based on the context below.\n\nContext:\nAnd another thing is that we also need something called hard negatives in the training data in order for this model to perform well. So what I mean by hard negative is, say we have our, you know, we have our source sentence A here and we have this source B, which is like a similar sentence, a high similarity sentence. They mean basically the same thing. We'd also have to add a source C and this source C will have to be similar in the words that uses to source A, but actually means something different. So it's harder for the model to differentiate between them. Again, the model would have to figure out, you know, these two sentences are not similar, even though they seem similar at first, but they're not. So it makes the task, the training task harder for the model, which of course makes the model better. So that is training approach number one. And we've mentioned the parallel data there. That's the data set we're going to b

In [45]:
print(query_with_contexts)

Answer the question based on the context below.

Context:
And another thing is that we also need something called hard negatives in the training data in order for this model to perform well. So what I mean by hard negative is, say we have our, you know, we have our source sentence A here and we have this source B, which is like a similar sentence, a high similarity sentence. They mean basically the same thing. We'd also have to add a source C and this source C will have to be similar in the words that uses to source A, but actually means something different. So it's harder for the model to differentiate between them. Again, the model would have to figure out, you know, these two sentences are not similar, even though they seem similar at first, but they're not. So it makes the task, the training task harder for the model, which of course makes the model better. So that is training approach number one. And we've mentioned the parallel data there. That's the data set we're going to be us

In [50]:
askGpt(query_with_contexts)

"The best training approach for sentence transformers when you only have related sentence pairs is to use Natural Language Inference (NLI) with multiple negatives ranking loss. This approach allows you to train or fine-tune the model using only entailment (positive) pairs while also being able to utilize contradictory pairs to further improve performance, even if you don't have a full dataset of different types of pairs."