## Semantic Search over Your Meeting Audio Data

This notebook demonstrates how to quickly enable semantic search over a single audio file, using Hugging Face for transcription and Pinecone for search. Don't have an audio file handy? No problem:
leave the audio path empty and the notebook will use the sample audio instead.

In [None]:
## Installs
!pip install datasets transformers pinecone



In [29]:
from getpass import getpass

# Path to your own audio file (must be compatible with Hugging Face Pipelines).
# Leave it as an empty string to use the sample audio instead.
audio_path = ""

# Placeholder for the transcription output; it is reassigned once the
# speech-recognition pipeline has run.
transcription_result = []

# Ask for the Pinecone API key interactively so it is never stored in the notebook.
api_key = getpass("Please enter your Pinecone API key here: ")

In [30]:
from datasets import load_dataset
from transformers import pipeline

# Build the speech-recognition pipeline under its own name. Assigning the result
# back to `pipeline` would shadow the imported factory function, leaving later
# cells unable to construct another pipeline without re-importing it.
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3",
)

if audio_path == "":
    # No audio file supplied: use the Hugging Face sample code instead, located here
    # https://huggingface.co/learn/audio-course/en/chapter7/transcribe-meeting
    concatenated_librispeech = load_dataset(
        "sanchit-gandhi/concatenated_librispeech", split="train"
    )
    # return_timestamps=True makes the result carry timestamped "chunks",
    # which the later cells turn into searchable records.
    transcription_result = asr_pipeline(
        concatenated_librispeech[0]["audio"]["array"], return_timestamps=True
    )
else:
    # Use your own audio file, check out this for details: https://huggingface.co/openai/whisper-large-v3
    transcription_result = asr_pipeline(audio_path, return_timestamps=True)




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
# Show the timestamped text chunks from the transcription result
print(transcription_result["chunks"])

[{'timestamp': (0.0, 15.1), 'text': ' the second in importance is as follows sovereignty may be defined to be the right of making laws in france the king really exercises a portion of the sovereign power since the laws have no weight'}, {'timestamp': (15.1, 21.72), 'text': " he was in a fevered state of mind owing to the blight his wife's action threatened to cast upon his entire future"}]


In [32]:
## use sentences as chunks, and transform into records for upsertion

# Build one record per transcription chunk; the chunk's position in the
# transcription (as a string) serves as the record id.
records = []
for position, segment in enumerate(transcription_result["chunks"]):
    record = {
        "_id": str(position),
        "sentence": segment["text"],
        # add any other desired metadata here
    }
    records.append(record)

# Import the Pinecone library
from pinecone import Pinecone

# Namespace and index name shared by the upsert, search and cleanup cells
namespace = "meeting-1"
index_name = "meeting-transcription-index"

# Initialize a Pinecone client with your API key
pc = Pinecone(api_key=api_key)

# Integrated-embedding settings: the field map points the model's "text"
# input at each record's "sentence" field.
embed_config = {
    "model": "llama-text-embed-v2",
    "field_map": {"text": "sentence"},
}

# Create a dense index with integrated embedding, but only if it is not there yet
if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed=embed_config,
    )

# Handle to the index; the following cells use it to upsert and query.
index = pc.Index(index_name)

In [33]:
# upsert into pinecone
def batch_upsert(records, batch_size=96, namespace=namespace):
    """Upsert ``records`` into the Pinecone index in fixed-size batches.

    Great for longer audio files and batches of sentences.

    Args:
        records: Sequence of record dicts to upsert.
        batch_size: Maximum number of records sent per request; must be at
            least 1. NOTE(review): the default of 96 presumably matches
            Pinecone's per-request limit for text records — confirm.
        namespace: Namespace to write into. The default is the value the
            notebook-level ``namespace`` had when this function was defined.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would produce an empty range and silently upsert
    # nothing, so reject invalid sizes up front with a clear message.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    for start in range(0, len(records), batch_size):
        batch = records[start:start + batch_size]
        index.upsert_records(namespace=namespace, records=batch)

batch_upsert(records)

In [34]:
import time

# Replace with your own query here if needed
query = "Tell me about the king of France"

# The index may need a few seconds to finish embedding and populating the
# upserted records, depending on the size of your dataset, so pause first.
seconds_to_wait = 10
time.sleep(seconds_to_wait)

# Search request: embed the query text and return the five closest records
search_request = {
    "inputs": {"text": query},
    "top_k": 5,
}
results = index.search(namespace=namespace, query=search_request)

print(results)

{'result': {'hits': [{'_id': '0',
                      '_score': 0.2220812439918518,
                      'fields': {'sentence': ' the second in importance is as '
                                             'follows sovereignty may be '
                                             'defined to be the right of '
                                             'making laws in france the king '
                                             'really exercises a portion of '
                                             'the sovereign power since the '
                                             'laws have no weight'}},
                     {'_id': '1',
                      '_score': 0.02443765476346016,
                      'fields': {'sentence': ' he was in a fevered state of '
                                             'mind owing to the blight his '
                                             "wife's action threatened to cast "
                                             'upon his e

In [36]:
# Cleanup

# Delete the index named by index_name now that the demo is finished.
pc.delete_index(name=index_name)