In [10]:
# Install the pinned versions of every package this notebook uses.
!pip install \
  pymilvus==2.3.4 \
  langchain==0.0.352 \
  openai==1.6.1 \
  pytube==15.0.0 \
  youtube-transcript-api==0.6.1 \
  pyarrow==14.0.2 \
  typing_extensions==4.9.0 \
  comet-ml==3.35.5

# Restart the runtime after pip installing (CTRL + M). Otherwise, the runtime
# remembers the old version of pyArrow, which causes issues for pyMilvus.


Collecting pymilvus==2.3.4
  Downloading pymilvus-2.3.4-py3-none-any.whl (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.3/172.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain==0.0.352
  Downloading langchain-0.0.352-py3-none-any.whl (794 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m794.4/794.4 kB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai==1.6.1
  Downloading openai-1.6.1-py3-none-any.whl (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.4/225.4 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytube==15.0.0
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting youtube-transcript-api==0.6.1
  Downloading youtube_transcript_api-0.6.1-py3-none-any.whl (24 kB)
Collecting typing_extensions==4.9.0
  Downloading ty

In [1]:
import json
import os

from pymilvus import (MilvusClient
                      , connections
                      , Collection
                      , CollectionSchema
                      , FieldSchema
                      , DataType
                      , utility)


# --- Notebook configuration ---------------------------------------------------
# Connection settings are read from the environment when available, so real
# credentials never have to be saved inside the notebook. The placeholder
# strings remain as fallbacks and can still be edited by hand.
COLLECTION_NAME = 'youtube'
EMBEDDING_DIMENSION = 1536  # Embedding vector size in this example
ZILLIZ_CLUSTER_URI = os.environ.get('ZILLIZ_CLUSTER_URI', 'YOUR ZILLIZ URI')  # Endpoint URI obtained from Zilliz Cloud
ZILLIZ_API_KEY = os.environ.get('ZILLIZ_API_KEY', 'YOUR ZILLIZ API KEY')

# YouTube videos whose transcripts are loaded by the cells below.
YT_VIDEO_URLS = [
    "https://www.youtube.com/watch?v=Q4OBx3S0Ysw&t=118s",
    "https://youtu.be/4OZip0cgOho?si=KHUsA4J8L3rbZAAZ",
]

# Open a pymilvus connection to the Zilliz cluster.
connections.connect(
    uri=ZILLIZ_CLUSTER_URI,
    token=ZILLIZ_API_KEY,
    secure=True,
)

# Also build a MilvusClient for the same cluster and credentials.
client = MilvusClient(uri=ZILLIZ_CLUSTER_URI, token=ZILLIZ_API_KEY)

# Start from a clean slate: drop a pre-existing collection with this name.
if utility.has_collection(COLLECTION_NAME):
    utility.drop_collection(COLLECTION_NAME)

# Describe the collection: a string primary key, the video's index, title and
# author, the chunk's position within the video, the chunk text, and the
# embedding vector of that text.
fields = [
    FieldSchema(
        name='id',
        dtype=DataType.VARCHAR,
        is_primary=True,
        auto_id=False,
        max_length=36,
    ),
    FieldSchema(name='video_id', dtype=DataType.INT64),
    FieldSchema(
        name='title',
        dtype=DataType.VARCHAR,
        description='Title texts',
        max_length=500,
    ),
    FieldSchema(
        name='author',
        dtype=DataType.VARCHAR,
        description='Author',
        max_length=200,
    ),
    FieldSchema(name='part_id', dtype=DataType.INT64),
    FieldSchema(name='max_part_id', dtype=DataType.INT64),
    FieldSchema(
        name='text',
        dtype=DataType.VARCHAR,
        description='Text of chunk',
        max_length=2000,
    ),
    FieldSchema(
        name='embedding',
        dtype=DataType.FLOAT_VECTOR,
        description='Embedding vectors',
        dim=EMBEDDING_DIMENSION,
    ),
]

# Wrap the field list in a schema and create the collection from it.
schema = CollectionSchema(fields=fields)
collection = Collection(name=COLLECTION_NAME, schema=schema)

# Index settings for the vector field: AUTOINDEX with the 'IP' metric and no
# extra parameters.
index_params = dict(
    index_type='AUTOINDEX',
    metric_type='IP',
    params={},
)

# Build the index on the "embedding" field; the returned status is the cell's
# displayed value.
collection.create_index(field_name="embedding", index_params=index_params)



DEBUG:pymilvus.milvus_client.milvus_client:Created new connection using: 62c844c69a1f4c29a8146d1ea1f1d3d8


Status(code=0, message=)

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
from openai import OpenAI
from pymilvus import MilvusClient, connections
from uuid import uuid4
from langchain.document_loaders import YoutubeLoader
import youtube_transcript_api
import pytube

# Connect to the Zilliz cluster and build a MilvusClient, using the URI and
# token defined in the configuration cell.
connections.connect(uri=ZILLIZ_CLUSTER_URI, token=ZILLIZ_API_KEY, secure=True)

client = MilvusClient(
    uri=ZILLIZ_CLUSTER_URI,
    token=ZILLIZ_API_KEY)

# OpenAI client used to compute the embeddings. The key is taken from the
# OPENAI_API_KEY environment variable when it is set, so it does not have to
# be pasted into the notebook; the placeholder is kept as the fallback.
openai_client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "YOUR OPENAI API KEY"),
)

# Extract embedding from text using OpenAI: string -> vector.
# This function is directly from https://docs.zilliz.com/docs/similarity-search-with-zilliz-cloud-and-openai, but with "text-embedding-ada-002" added.
def create_embedding_from_string(text, model='text-embedding-ada-002'):
    """Return the OpenAI embedding vector for ``text``.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to use. Defaults to
            'text-embedding-ada-002', the model this notebook was written
            for. The collection's embedding field is created with
            ``dim=EMBEDDING_DIMENSION``, so any other model must produce
            vectors of that size.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    response = openai_client.embeddings.create(input=text, model=model)
    return response.data[0].embedding

# Splitter used to cut each transcript into chunks: chunk size 1000 with an
# overlap of 50, both measured with `len`; the start index of every chunk is
# recorded as well.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    length_function=len,
    add_start_index=True,
)

# For every video: load its transcript, split it into chunks, embed each
# chunk and insert one row per chunk into the collection.
for video_id, url in enumerate(YT_VIDEO_URLS):
    yt_data = YoutubeLoader.from_youtube_url(url, add_video_info=True).load()[0]
    video_parts = text_splitter.create_documents([yt_data.page_content])

    # Per-video values are looked up once instead of once per chunk.
    title = yt_data.metadata['title']
    author = yt_data.metadata['author']
    part_count = len(video_parts)

    for part_id, part in enumerate(video_parts):
        # Named `chunk_id` rather than `id` so the builtin `id()` is not
        # shadowed for the rest of the kernel session.
        chunk_id = str(uuid4())
        print(f'uplading document {chunk_id}... {title}')
        client.insert(
            collection_name=COLLECTION_NAME,
            data={
                'id': chunk_id,
                'video_id': video_id,
                'title': title,
                'author': author,
                'part_id': part_id,
                # NOTE(review): this stores the number of chunks, which is one
                # more than the highest part_id (part_id starts at 0) --
                # confirm this is what consumers of the field expect.
                'max_part_id': part_count,
                'text': part.page_content,
                'embedding': create_embedding_from_string(part.page_content),
            })

DEBUG:pymilvus.milvus_client.milvus_client:Created new connection using: 9fbc8ca209b149cc8bc0043da05f882b


uplading document f81e3520-153d-402f-87b6-ff0c4e66994b... Vector Similarity Search using Images with Zilliz
uplading document e89162f4-8ed8-49a6-ab51-781caff8e87a... Vector Similarity Search using Images with Zilliz
uplading document 0f146ba0-e625-4850-9695-bf31c7743652... Vector Similarity Search using Images with Zilliz
uplading document bf00d2d7-8ddf-4905-b0e0-4d6b083d24c4... Vector Similarity Search using Images with Zilliz
uplading document 911ff9f6-8be2-44b8-9c49-f650c2c90a93... Vector Similarity Search using Images with Zilliz
uplading document 269a2e29-c6b3-473b-91ea-dd58e6460e83... Vector Similarity Search using Images with Zilliz
uplading document 74a2fd45-15f7-4186-9f77-87310b907d71... Vector Similarity Search using Images with Zilliz
uplading document 5a6c877e-39c6-4464-a1cf-feb2d333ea9f... Vector Similarity Search using Images with Zilliz
uplading document e8eb56d8-4947-4e49-a4d1-c19a1b76eb41... Vector Similarity Search using Images with Zilliz
uplading document 33b74bd9-3

In [4]:
# The collection must be "LOADED" in Zilliz for the query to work, so load it
# explicitly here instead of relying on it having been loaded by hand.
collection.load()

# NOTE(review): the `title != "none"` filter is presumably meant to match
# every stored row -- confirm no real title equals "none".
results = collection.query(
    expr='title != "none"',
    output_fields=['title', 'author', 'part_id', 'max_part_id', 'text'])

# Save the query results as indented JSON; this file is what gets uploaded
# as an artifact later in the notebook.
with open('data.json', 'w') as file:
    json.dump(results, file, indent=2)

In [5]:
## First, we’ll pip install the Comet library
!pip install comet_ml

## import comet_ml
from comet_ml import Experiment

## Create an experiment with your api key
experiment = Experiment(
    api_key='YOUR COMET API KEY',
    project_name='youtube_transcriptions',
    workspace='YOUR USERNAME'
)




[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/machine-learning-upgrade/youtube-transcriptions/be526e7183d94f949fc4421504c66a74



In [8]:
# `Artifact` is not imported by any of the earlier cells shown in this
# notebook (only `Experiment` is), so import it here; without this the cell
# fails with a NameError on a fresh kernel.
from comet_ml import Artifact

# Package the exported query results (data.json) as a dataset artifact.
artifact = Artifact(name="milvus-query-results", artifact_type="dataset")
artifact.add("data.json")

# Log the artifact on the running experiment, then end the experiment.
experiment.log_artifact(artifact)
experiment.end()

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/machine-learning-upgrade/youtube-transcriptions/22a5d0e5eae144b6892606cd88b49e69

[1;38;5;39mCOMET INFO:[0m Artifact 'milvus-query-results' version 10.0.0 created (previous was: 9.0.0)
[1;38;5;39mCOMET INFO:[0m Scheduling the upload of 1 assets for a size of 21.01 KB, this can take some time
[1;38;5;39mCOMET INFO:[0m Artifact 'machine-learning-upgrade/milvus-query-results:10.0.0' has started uploading asynchronously
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1

In [7]:


from langchain.document_loaders import YoutubeLoader

# Load a single video (with its video info) and display the raw transcript
# text of the first returned document.
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=Q4OBx3S0Ysw&t=118s",
    add_video_info=True,
)
data = loader.load()

data[0].page_content


"foreign [Applause] to perform Vector similarity search on images zillas is a vector database it is designed to handle massive data sets containing vectors vectors are just numerical representations of data so this is your text documents audio and that also includes images traditional methods of searching through these large data sets has been consuming and computationally expensive so zilis uses Advanced algorithms and data structures tailored specifically for Vector similarity search instead of comparing each point individually zilla's organizes these vectors in a way that optimizes similarity queries this allows you to quickly find items that are similar to a given query vector so is this focused on cutting-edge Technologies for data indexing storage and retrieval with an emphasis on GPU accelerated computing for a high level overview of this project we are going to get the connection from The Notebook to Melvis and set up a cluster we're going to import images from Google Drive I'l