[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/learn/recommendation/audio-recommender/audio-recommendation.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/learn/recommendation/audio-recommender/audio-recommendation.ipynb)

# Leverage the Power of Large Language Models and Vector Database to Build a Robust Audio Recommendation System

In [1]:
%%bash
pip install pandas
pip install -U openai-whisper
pip install pytube
pip install numpy
pip install pinecone-client

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai-whisper
  Downloading openai-whisper-20230314.tar.gz (792 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 792.9/792.9 kB 14.6 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting tiktoken==0.3.1 (from openai-whisper)
  Downloading tiktoken-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 66.3 MB/s eta 0:00:00
Collecting ffmpeg-python==0.2.0 (from openai-whisper)
  Downloading ffmpeg_python-0.2.

In [2]:
# Import the modules
import os
import torch
import whisper
from pinecone import Pinecone
import numpy as np
import pandas as pd
from pytube import YouTube

  from tqdm.autonotebook import tqdm


In [3]:
def video_to_audio(video_url, destination):

    # Get the video
    video = YouTube(video_url)

    # Convert video to Audio
    audio = video.streams.filter(only_audio=True).first()

    # Save to destination
    output = audio.download(output_path = destination)

    name, ext = os.path.splitext(output)
    new_file = name + '.mp3'

    # Replace spaces with "_"
    new_file = new_file.replace(" ", "_")

    # Change the name of the file
    os.rename(output, new_file)

    return new_file

In [4]:
%%bash
mkdir "audio_data"

In [9]:
# Create URL column
audio_path = "audio_data"

# Have just provided a sample of links for experimentation purpose
list_videos = ["https://www.youtube.com/watch?v=IdTMDpizis8",
              "https://www.youtube.com/watch?v=fLeJJPxua3E",
              "https://www.youtube.com/watch?v=z3FA2kALScU",
              "https://www.youtube.com/watch?v=yBrRpb8aLwk",
              "https://www.youtube.com/watch?v=ERClHCOF14c",
              "https://www.youtube.com/watch?v=b-Pn0yXL9y8",
              "https://www.youtube.com/watch?v=CYfU9WBy_HA",
              "https://www.youtube.com/watch?v=FncTDZxNbM4",
              "https://www.youtube.com/watch?v=JjCFoba5hKE",
              "https://www.youtube.com/watch?v=YUxR3d10lz4",
              "https://www.youtube.com/watch?v=t1XCzWlYWeA",
              "https://www.youtube.com/watch?v=scvDvVbv1hk",
              "https://www.youtube.com/watch?v=z3FA2kALScU&t", 
              "https://www.youtube.com/watch?v=wsNzAuYDgy0", 
              "https://www.youtube.com/watch?v=R0Sq_x6eObE", 
              "https://www.youtube.com/watch?v=RLTgnOuYb6o&t", 
              "https://www.youtube.com/watch?v=UF8uR6Z6KLc", 
              "https://www.youtube.com/watch?v=aF1_oLdruSc", 
              "https://www.youtube.com/watch?v=1bumPyvzCyo", 
              "https://www.youtube.com/watch?v=YUxR3d10lz4&t", 
              "https://www.youtube.com/watch?v=gMWXMMUg5pI"]
# Create dataframe
transcription_df = pd.DataFrame(list_videos, columns=['URLs'])

In [10]:
transcription_df.head()

Unnamed: 0,URLs
0,https://www.youtube.com/watch?v=IdTMDpizis8
1,https://www.youtube.com/watch?v=fLeJJPxua3E
2,https://www.youtube.com/watch?v=z3FA2kALScU
3,https://www.youtube.com/watch?v=yBrRpb8aLwk
4,https://www.youtube.com/watch?v=ERClHCOF14c


In [11]:
# Create the files_name
transcription_df["file_name"] = transcription_df["URLs"].apply(lambda url: video_to_audio(url, audio_path))
transcription_df.head()

Unnamed: 0,URLs,file_name
0,https://www.youtube.com/watch?v=IdTMDpizis8,/content/audio_data/Jocko_Willink_GOOD_(Offici...
1,https://www.youtube.com/watch?v=fLeJJPxua3E,/content/audio_data/Best_Short_Motivational_Sp...
2,https://www.youtube.com/watch?v=z3FA2kALScU,/content/audio_data/STOP_WASTING_TIME_-_Part_1...
3,https://www.youtube.com/watch?v=yBrRpb8aLwk,/content/audio_data/YOUR_VALUE_-_Powerful_Moti...
4,https://www.youtube.com/watch?v=ERClHCOF14c,/content/audio_data/LION_MENTALITY_-_Motivatio...


## Transcription

In [12]:
# Set the device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model
whisper_model = whisper.load_model("large", device=device)

100%|██████████████████████████████████████| 2.87G/2.87G [00:22<00:00, 140MiB/s]


In [13]:
def audio_to_text(audio_file):

    return whisper_model.transcribe(audio_file)["text"]

In [14]:
# Apply the function to all the audio files
transcription_df["transcriptions"] = transcription_df["file_name"].apply(lambda f_name: audio_to_text(f_name))


# Show the first five rows
transcription_df.head()

Unnamed: 0,URLs,file_name,transcriptions
0,https://www.youtube.com/watch?v=IdTMDpizis8,/content/audio_data/Jocko_Willink_GOOD_(Offici...,"one of my direct subordinates, one of my guys..."
1,https://www.youtube.com/watch?v=fLeJJPxua3E,/content/audio_data/Best_Short_Motivational_Sp...,"If you only have 24 hours in a day, your succ..."
2,https://www.youtube.com/watch?v=z3FA2kALScU,/content/audio_data/STOP_WASTING_TIME_-_Part_1...,The one commodity that is most valuable on th...
3,https://www.youtube.com/watch?v=yBrRpb8aLwk,/content/audio_data/YOUR_VALUE_-_Powerful_Moti...,Let me speak to that person that feels like t...
4,https://www.youtube.com/watch?v=ERClHCOF14c,/content/audio_data/LION_MENTALITY_-_Motivatio...,"If the lion is the king of the jungle, how ca..."


In [17]:
import textwrap

In [18]:
wrapper = textwrap.TextWrapper(width=60)
first_transcription = transcription_df.iloc[0]["transcriptions"]
formatted_transcription = wrapper.fill(text=first_transcription)

# Check first transcription
print(formatted_transcription)

 one of my direct subordinates, one of my guys that worked
for me, he would call me up or pull me aside with some major
problem, some issue that was going on, and he'd say, boss,
we got this and that and the other thing and I'd look at him
and I'd say, good. And finally one day he was telling me
about some issue that he was having, some problem, and he
said, I already know what you're gonna say. And I said,
well, what am I gonna say? He said, you're gonna say good.
He said, that's what you always say. When something is wrong
and going bad, you always just look at me and say good. And
I said, well, yeah. When things are going bad, there's gonna
be some good that's gonna come from it. Didn't get the new
high-speed gear we wanted? Good. Didn't get promoted? Good.
More time to get better. Oh, mission got canceled? Good, we
can focus on another one. Didn't get funded? Didn't get the
job you wanted? Got injured? Sprained my ankle? Got tapped
out? Good. Got beat? Good. You learned. Unexpected

In [23]:
transcription_df = transcription_df.drop(["transcription_length"], axis=1)

In [28]:
transcription_df.head()

## Generation of Transcripts' Embeddings 

In [34]:
!pip install openai
import openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.7-py3-none-any.whl (71 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/72.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp (from openai)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
Collecting multidict<7.0,>=4.5 (from aiohttp->openai)
  Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting async-timeout<5.0,>=4.0.0a3 (from aiohttp->openai)
  Downl

In [35]:
# Set up the OpenAI key
openai.api_key = "<YOUR OPENAI KEY>"

In [41]:
def get_embeddings(text_to_embed):

	response = openai.Embedding.create(
    	model= "text-embedding-ada-002",
    	input=[text_to_embed]
	)

	# Extract the AI output embedding as a list of floats
	embedding = response["data"][0]["embedding"]

	return embedding

In [43]:
transcription_df["embedding"] = transcription_df["transcriptions"].astype(str).apply(get_embeddings)

In [44]:
transcription_df.head()

Unnamed: 0,URLs,file_name,transcriptions,embedding
0,https://www.youtube.com/watch?v=RLTgnOuYb6o&t,/content/audio_data/Oprah_Winfreys_Life_Advice...,The three things that I want to leave with yo...,"[0.004491745959967375, -0.010612116195261478, ..."
1,https://www.youtube.com/watch?v=1bumPyvzCyo,/content/audio_data/Arnold_Schwarzenegger_Leav...,I'm here to talk about success. The first rul...,"[-0.015403729863464832, -0.03353560343384743, ..."
2,https://www.youtube.com/watch?v=b-Pn0yXL9y8,/content/audio_data/5_Minutes_to_Start_Your_Da...,"If you want to change the world, start off by...","[0.008127240464091301, 0.004852672573179007, 0..."
3,https://www.youtube.com/watch?v=scvDvVbv1hk,/content/audio_data/CHANGE_YOUR_MINDSET_-_Moti...,Are you a gazelle? Or are you a lion? Let me ...,"[-0.01881832256913185, 0.00985600147396326, 0...."
4,https://www.youtube.com/watch?v=YUxR3d10lz4,/content/audio_data/ITS_TIME_TO_GET_AFTER_IT!_...,Listen to me once you realize who you are you...,"[-0.012264629825949669, -0.02990311197936535, ..."


In [50]:
vector_dim = transcription_df.iloc[0].embedding
len(vector_dim)

1536

## Configure your environment

In [51]:
# find API key in console at app.pinecone.io
api_key = os.getenv('PINECONE_API_KEY') or 'PINECONE_API_KEY'
# find ENV (cloud region) next to API key in console
env = os.getenv('PINECONE_ENVIRONMENT') or 'PINECONE_ENVIRONMENT'

# Initialize connection to pinecone
pinecone.init(
  api_key=api_key,
  environment=env
)

# Index params
my_index_name = "audio-search"
vector_dim = len(transcription_df.iloc[0].embedding)

if my_index_name not in pinecone.list_indexes().names():
  # Create the index
  pinecone.create_index(name = my_index_name,
                      dimension=vector_dim,
                      metric="cosine", shards=1,
                      pod_type='s1.x1')
# Connect to the index
my_index = pinecone.Index(index_name = my_index_name)

In [52]:
# Show information about the vector index
my_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Populate the the Pinecone Vector Index

In [54]:
transcription_df["vector_id"] = transcription_df.index
transcription_df["vector_id"] = transcription_df["vector_id"].apply(str)

# Get all the metadata
final_metadata = []

for index in range(len(transcription_df)):
  final_metadata.append({
      'ID':  index,
      'url': transcription_df.iloc[index].URLs,
      'transcription': transcription_df.iloc[index].transcriptions
  })

audio_IDs = transcription_df.vector_id.tolist()
audio_embeddings = [arr for arr in transcription_df.embedding]

# Create the single list of dictionary format to insert
data_to_upsert = list(zip(audio_IDs, audio_embeddings, final_metadata))

# Upload the final data
my_index.upsert(vectors = data_to_upsert)

# Show information about the vector index
my_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 147}},
 'total_vector_count': 147}

In [None]:
N = 3
my_query_embedding = transcription_df.embedding[0]

# Run the Query Search
my_index.query(vector=my_query_embedding, top_k=N, include_metadata=True)