In [21]:
from datasets import load_dataset
from dotenv import load_dotenv, find_dotenv
import lancedb
from lancedb.context import contextualize
from lancedb.embeddings import with_embeddings
from openai import OpenAI
import os
import pandas as pd


load_dotenv(find_dotenv())
gptAPI = os.getenv("OPENAI_API_KEY")

In [2]:
uri = "data/sample-lancedb"
db = lancedb.connect(uri)
table = db.create_table("my_table",
                         data=[{"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
                               {"vector": [5.9, 26.5], "item": "bar", "price": 20.0}],
                        exist_ok=True)
result = table.search([100, 100]).limit(2).to_pandas()

In [3]:
result

Unnamed: 0,vector,item,price,_distance
0,"[5.9, 26.5]",bar,20.0,14257.05957
1,"[3.1, 4.1]",foo,10.0,18586.421875


In [12]:
## Load the dataset
data = load_dataset('jamescalam/youtube-transcriptions', split='train')
data

Downloading readme: 100%|██████████████████| 2.13k/2.13k [00:00<00:00, 3.50MB/s]
Downloading data: 100%|████████████████████| 79.8M/79.8M [00:11<00:00, 6.83MB/s]
Generating train split: 208619 examples [00:00, 1948306.01 examples/s]


Dataset({
    features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],
    num_rows: 208619
})

In [15]:
## Prepare the context
df = (contextualize(data.to_pandas())
      .groupby("title").text_col("text")
      .window(20).stride(4)
      .to_pandas())
df.head(3)

Unnamed: 0,title,published,url,video_id,channel_id,id,text,start,end
177622,$5 MILLION AI for FREE,2022-08-12 15:18:07,https://youtu.be/3EjtHs_lXnk,3EjtHs_lXnk,UCfzlCWGWYyIQ0aLC5w48gBQ,3EjtHs_lXnk-t0.0,Imagine an AI where all in the same model you ...,0.0,24.0
177626,$5 MILLION AI for FREE,2022-08-12 15:18:07,https://youtu.be/3EjtHs_lXnk,3EjtHs_lXnk,UCfzlCWGWYyIQ0aLC5w48gBQ,3EjtHs_lXnk-t33.0,"So when you're done, you probably want to keep...",33.0,45.0
177630,$5 MILLION AI for FREE,2022-08-12 15:18:07,https://youtu.be/3EjtHs_lXnk,3EjtHs_lXnk,UCfzlCWGWYyIQ0aLC5w48gBQ,3EjtHs_lXnk-t66.0,You can download multiple size variants all th...,66.0,77.0


In [16]:
df.tail(3)

Unnamed: 0,title,published,url,video_id,channel_id,id,text,start,end
23451,🤗 Hugging Face just released *Diffusers* - for...,2022-07-26 15:27:46 UTC,https://youtu.be/UzkdOg7wWmI,UzkdOg7wWmI,UCv83tO5cePwHMt1952IVVHw,UzkdOg7wWmI-t887.76,"But yeah, it's literally very early days with ...",887.76,894.8
23455,🤗 Hugging Face just released *Diffusers* - for...,2022-07-26 15:27:46 UTC,https://youtu.be/UzkdOg7wWmI,UzkdOg7wWmI,UCv83tO5cePwHMt1952IVVHw,UzkdOg7wWmI-t904.72,So I hope this is interesting to see. I'm pret...,904.72,912.16
23459,🤗 Hugging Face just released *Diffusers* - for...,2022-07-26 15:27:46 UTC,https://youtu.be/UzkdOg7wWmI,UzkdOg7wWmI,UCv83tO5cePwHMt1952IVVHw,UzkdOg7wWmI-t923.76,So thank you very much for watching. I hope th...,923.76,929.36


In [19]:
## Call OPENAI Embeddings API
client = OpenAI()
assert len(client.models.list().data) > 0

In [20]:
## Chose and embeddings model
def embed_func(c):    
    rs = client.embeddings.create(input=c, model="text-embedding-ada-002")
    return [rs.data[0].embedding]

In [22]:
## Create LanceDB Table
data = with_embeddings(embed_func, df, show_progress=True)
data.to_pandas().head(1)

  data = with_embeddings(embed_func, df, show_progress=True)




100%|███████████████████████████████████████████| 53/53 [17:55<00:00, 20.29s/it]


ArrowInvalid: Added column's length must match table's length. Expected length 52250 but got length 53