In [None]:
!pip install datasets==2.16.1
!pip install git+https://github.com/openai/whisper.git
!pip install torch
!pip install transformers==4.31.0
!pip install --upgrade gradio

In [None]:
import whisper
from datasets import load_dataset
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import gc
import os
import torch
from transformers import AutoTokenizer,BertModel

In [None]:
# Hugging Face dataset whose texts form the search corpus.
dataset_name = "mlabonne/guanaco-llama2-1k"

# The tokenizer and the encoder are loaded from the same checkpoint so that the
# token ids produced by one match the vocabulary expected by the other.
bert_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# Speech-to-text model used to transcribe the query audio.
whisper_model = whisper.load_model('large-v3')

In [None]:
# English stop-word set from NLTK's stopwords corpus.
# NOTE(review): nothing in the visible cells reads this set (cleaned_text does
# not filter stop words) — confirm whether it is still needed.
stop_words = set(stopwords.words('english'))
def cleaned_text(text):
    """Normalize a string before it is embedded.

    Steps, in order:
      1. drop every character that is neither a word character nor whitespace;
      2. collapse each run of whitespace to a single space and trim both ends;
      3. tokenize with NLTK's ``word_tokenize`` and re-join the tokens with
         single spaces.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.

    Raises:
        TypeError: If ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        without_punctuation = re.sub(r'[^\w\s]', '', text)
        single_spaced = re.sub(r'\s+', ' ', without_punctuation).strip()
        return ' '.join(word_tokenize(single_spaced))
    raise TypeError("The input must be a string.")

In [None]:
# Fetch the dataset, then keep only the "text" field of every record in the
# "train" split as a plain list of dicts.
data = load_dataset(dataset_name)
data = [{"text": record["text"]} for record in data["train"]]

In [None]:
# Spot-check one raw record.
# NOTE(review): index 113 looks arbitrary — confirm it has no special meaning.
data[113]

In [None]:
# Apply cleaned_text to every record, keeping the same {"text": ...} layout.
cleaned_data = [
    {"text": cleaned_text(record["text"])}
    for record in data
]

In [None]:
def get_embeddings_batch(texts, batch_size=32):
    """Embed texts with the module-level BERT ``tokenizer`` / ``model``.

    Each text is tokenized (truncated to 512 tokens, padded to the longest
    sequence in its batch), passed through the model without gradient
    tracking, and reduced to one vector by averaging the last hidden states
    of its *real* tokens only.

    Padding positions are excluded from the average via the attention mask.
    A plain ``mean(dim=1)`` would also average the padding positions, making
    a text's embedding depend on how long its batch-mates are; a query
    embedded on its own would then not be comparable with corpus texts that
    were embedded in padded batches.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        ``np.ndarray`` of dtype ``float16`` with one row per input text and
        one column per hidden unit. For empty ``texts`` an array of shape
        ``(0, model.config.hidden_size)`` is returned.
    """
    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
            hidden = outputs.last_hidden_state
            # (batch, seq, 1) mask: 1 for real tokens, 0 for padding.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            summed = (hidden * mask).sum(dim=1)
            # clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = summed / token_counts
        # .cpu() is a no-op on CPU tensors and keeps .numpy() valid otherwise.
        all_embeddings.append(pooled.cpu().numpy().astype(np.float16))
        # Release per-batch tensors promptly to keep peak memory low.
        del inputs, outputs, hidden, mask, summed, token_counts, pooled
        gc.collect()
    if not all_embeddings:
        # np.vstack([]) raises; return a well-formed empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float16)
    return np.vstack(all_embeddings)

In [None]:
# Pull the cleaned strings out of their dict wrappers and embed the corpus.
data_texts = [record['text'] for record in cleaned_data]
data_embeddings = get_embeddings_batch(data_texts)

In [None]:
# Show the dimensions of the corpus embedding matrix as a sanity check.
data_embeddings.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def transcribe_and_find_similar(audio_file):
    """Transcribe an audio file and return the most similar corpus text.

    The audio is transcribed with the module-level ``whisper_model``; the
    transcription is normalized with ``cleaned_text`` and embedded with
    ``get_embeddings_batch``. Cosine similarity against ``data_embeddings``
    selects the best match.

    Args:
        audio_file: Audio input passed straight to ``whisper_model.transcribe``.

    Returns:
        The entry of ``data_texts`` with the highest cosine similarity to the
        transcription.
    """
    spoken_text = whisper_model.transcribe(audio_file)['text']
    query_embedding = get_embeddings_batch([cleaned_text(spoken_text)])
    scores = cosine_similarity(query_embedding, data_embeddings)
    # There is a single query row, so the flat argmax is the corpus index.
    best_index = np.argmax(scores)
    return data_texts[best_index]



In [None]:
# Run the whole pipeline on one recording and print the closest corpus text.
# NOTE(review): the path is hard-coded to a Kaggle input directory; change it
# when running the notebook elsewhere.
voice = '/kaggle/input/final-test/ml.mp4'
most_similar_text = transcribe_and_find_similar(voice)
print(f'The most similar text to the input is:\n{most_similar_text}')
