### Import Packages

In [1]:
from dotenv import load_dotenv # python-dotenv: loads variables from a .env file into the process environment
load_dotenv()

True

In [2]:
import time

In [3]:
import os
import googleapiclient.discovery
import urllib.parse as p
import pandas as pd
import json
import subprocess
import requests
import re

### Set-up APIs & Variables

Load environment variables.

In [4]:
# NOTE(review): load_dotenv() was already called in the first cell, so this
# second call looks redundant -- confirm before removing.
load_dotenv()

True

Load Firebase credentials.

In [6]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# Use a service account. The key file is looked up as "<PROJECT_ID>.json",
# relative to the current working directory.
project_id = os.getenv('PROJECT_ID')
if not project_id:
    # Without this check the failure is an opaque TypeError from None + '.json'.
    raise RuntimeError(
        "PROJECT_ID is not set; it is needed to locate the "
        "'<PROJECT_ID>.json' service-account key file."
    )
cred = credentials.Certificate(project_id + '.json')

# initialize_app() raises ValueError when the default app already exists,
# which happens whenever this cell is re-run on a live kernel. Reuse the
# existing app in that case so the cell is safe to execute repeatedly.
try:
    app = firebase_admin.get_app()
except ValueError:
    app = firebase_admin.initialize_app(cred)

db = firestore.client(app)
batch = db.batch()

# Semantic Search

In [8]:
from annoy import AnnoyIndex
import pandas as pd  # also imported in the imports cell near the top of the notebook
from sentence_transformers import SentenceTransformer, util
import torch

# Shared sentence-embedding model: index_full_text() calls embedder.encode()
# on the corpus texts and query_corpus() calls it on the query string.
# NOTE(review): constructing the model presumably loads (and may download)
# the 'all-MiniLM-L6-v2' weights -- confirm before running offline.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def create_df_from_collection_list(collection, columns):
    """Build a DataFrame containing only the requested fields of each record.

    Args:
        collection: Re-iterable sequence of mappings (it is traversed once
            per requested column).
        columns: Names of the keys to extract; each becomes one column.

    Returns:
        A ``pandas.DataFrame`` with one row per record and one column per
        entry of ``columns``.

    Raises:
        KeyError: If a record lacks one of the requested keys.
    """
    return pd.DataFrame(
        data={name: [record[name] for record in collection] for name in columns}
    )


def index_full_text(corpus, collection_index_tag="None", n_trees=10):
    """Embed every text in ``corpus`` and build an Annoy index over the embeddings.

    Args:
        corpus: The texts to embed with the module-level ``embedder``
            (callers in this notebook pass a list or a pandas Series).
        collection_index_tag: Currently unused; kept so existing callers
            keep working.
        n_trees: Number of trees passed to ``AnnoyIndex.build``.

    Returns:
        The built ``AnnoyIndex`` (``'angular'`` metric). Item ids are the
        positions of the texts within ``corpus``.

    Side effects:
        Saves the index to ``test.ann`` in the current working directory and
        prints the elapsed time in seconds.
    """
    started = time.perf_counter()
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
    # Create the search index, pass the size of embedding
    search_index = AnnoyIndex(corpus_embeddings.shape[1], 'angular')
    for item_id, embedding in enumerate(corpus_embeddings):
        search_index.add_item(item_id, embedding)
    search_index.build(n_trees)
    search_index.save('test.ann')

    # Elapsed time is "now - start"; the reversed subtraction printed a
    # negative duration.
    print("Index time: ", time.perf_counter() - started)
    return search_index

def query_corpus(query, search_index, top_k=10):
    """Find the indexed items closest to ``query``.

    Args:
        query: Text to embed with the module-level ``embedder``.
        search_index: Index object exposing ``get_nns_by_vector`` (the
            ``AnnoyIndex`` returned by ``index_full_text``).
        top_k: Maximum number of neighbours to request.

    Returns:
        Whatever ``get_nns_by_vector(..., include_distances=True)`` returns;
        callers in this notebook read element 0 as the item ids and
        element 1 as the matching distances.

    Side effects:
        Prints the elapsed time in seconds.
    """
    started = time.perf_counter()

    # Get the query's embedding
    query_embed = embedder.encode(query, convert_to_tensor=True)

    # Retrieve the nearest neighbors
    similar_item_ids = search_index.get_nns_by_vector(query_embed, top_k,
                                                      include_distances=True)
    # Elapsed time is "now - start"; the reversed subtraction printed a
    # negative duration.
    print("Query time: ", time.perf_counter() - started)

    return similar_item_ids

In [11]:
def semantic_search_videos(query, collection):
    """Rank whole videos against ``query`` using their full ``text`` field.

    Args:
        query: Search text handed to ``query_corpus``.
        collection: Records convertible by ``pandas.DataFrame`` that provide
            the ``text``, ``videoId`` and ``transcript`` fields.

    Returns:
        A DataFrame with columns ``texts``, ``distance``, ``id`` and
        ``transcript``, one row per neighbour, indexed by the record's
        position in ``collection``.

    Note:
        A new search index is built over the whole collection on every call.
    """
    text_data = pd.DataFrame(collection)[["text", "videoId", "transcript"]]
    search_index = index_full_text(text_data['text'])

    neighbours = query_corpus(query, search_index)
    positions = neighbours[0]
    distances = neighbours[1]

    # Select the matching rows once and reuse them for every output column.
    hits = text_data.iloc[positions]
    return pd.DataFrame(data={
        'texts': hits['text'],
        'distance': distances,
        'id': hits['videoId'],
        'transcript': hits['transcript'],
    })

In [None]:
def semantic_search_keyword(query, collection):
    """Rank individual transcript segments from every video against ``query``.

    Args:
        query: Search text handed to ``query_corpus``.
        collection: Iterable of video records, each providing ``videoId`` and
            a ``transcript`` that ``pandas.DataFrame`` can turn into a frame
            with ``text`` and ``id`` columns.

    Returns:
        A DataFrame with columns ``text``, ``distance``, ``id`` and
        ``videoId``, one row per neighbouring segment.
    """
    # One frame per video, each segment tagged with the video it belongs to.
    per_video_segments = [
        pd.DataFrame(video['transcript']).assign(videoId=video['videoId'])
        for video in collection
    ]
    sent_data = pd.concat(per_video_segments)[['text', 'id', 'videoId']]

    segment_index = index_full_text(list(sent_data['text']))
    neighbours = query_corpus(query, segment_index)

    # Rows are selected by position, so repeated index labels left over from
    # the concatenation do not matter here.
    hits = sent_data.iloc[neighbours[0]]
    return pd.DataFrame(data={
        'text': hits['text'],
        'distance': neighbours[1],
        'id': hits['id'],
        'videoId': hits['videoId'],
    })


In [12]:
def getTranscripts(channelId):
    """Return every document of the ``Transcripts`` collection as a dict.

    Args:
        channelId: Accepted but not used; the whole collection is streamed
            regardless of its value.

    Returns:
        A list with one ``to_dict()`` result per streamed document, in
        stream order.
    """
    # No filtering is applied (a '#shorts' title filter used to be sketched
    # here but was never enabled).
    return [doc.to_dict() for doc in db.collection(u'Transcripts').stream()]

In [14]:
# Fetch the transcript documents, then run a semantic search over each
# video's 'text' field.
# NOTE: getTranscripts() does not use its channelId argument, so the ID
# passed here does not filter the returned documents.
collection = getTranscripts('UCamLstJyCa-t5gfZegxsFMw')
results = semantic_search_videos("create a list of 5 products / brands launched by creators?", collection)
# Bare last expression so the notebook renders the results DataFrame.
results

Index time:  -3.059123992919922
Query time:  -0.012986898422241211


Unnamed: 0,texts,distance,id,transcript
47,We bought merch from a bunch of top creators ...,1.010816,YVuIm8OLz-8,"[{'end': 6.04, 'start': 0.0, 'text': ' We boug..."
14,"What's up everyone, welcome to the Colin and ...",1.033572,9HsnKZnREgM,"[{'start': 0.0, 'end': 2.36, 'text': ' What's ..."
55,Today on the channel we'll be talking about e...,1.128185,f5WZgw0WGto,"[{'start': 0.0, 'text': ' Today on the channel..."
62,"Back in the car, back when the other breakdow...",1.165087,i05bI03nzv4,"[{'text': ' Back in the car, back when the oth..."
43,What the hell do I do now? This is Andrew Ray...,1.165928,VbNIh88Nq5k,"[{'end': 4.64, 'id': 0, 'start': 0.0, 'text': ..."
1,I am shocked people are still watching my vid...,1.19308,1t5oYKEn-1E,"[{'start': 0.0, 'end': 3.36, 'text': ' I am sh..."
69,"Big Cloud, it's out in the marker right now. ...",1.193215,knl2UOFr8bk,"[{'text': ' Big Cloud, it's out in the marker ..."
77,"In the future, we think Mr. Beast will sell h...",1.212363,o8UBXsiiS24,"[{'start': 0.0, 'end': 4.96, 'id': 0, 'text': ..."
26,"Two years ago, Mr. Beast and Mark Rover raise...",1.217704,JoI4BRPd8us,"[{'id': 0, 'text': ' Two years ago, Mr. Beast ..."
34,The Christie's auction for digital artist Bee...,1.220188,Ptk6P7Lc6rs,"[{'id': 0, 'text': ' The Christie's auction fo..."


In [None]:
#results_2 = semantic_search_keyword("bitcoin", collection)