<a href="https://www.kaggle.com/code/laansdole/stt-rag-tts-project?scriptVersionId=246169704" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# STT

In [None]:
!pip install transcribe-anything 

In [None]:
!transcribe-anything https://www.youtube.com/watch?v=dQw4w9WgXcQ

In [None]:
!transcribe-anything "/kaggle/input/mini-speech-diarization/dataset/test/test.wav"

In [None]:
from pydub import AudioSegment
import os

def convert_mp3_to_wav(mp3_file_path, wav_file_path):
    """Decode the MP3 at *mp3_file_path* and write it as WAV to *wav_file_path*."""
    AudioSegment.from_mp3(mp3_file_path).export(wav_file_path, format="wav")

# Convert every MP3 found in `directory_path` into a WAV file of the same base
# name under `wav_directory_path`.
directory_path = "/kaggle/input/mini-speech-diarization/dataset/raw"
wav_directory_path = "/kaggle/working/converted_wav_files"

os.makedirs(wav_directory_path, exist_ok=True)

# sorted() keeps the processing (and log) order deterministic.
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith(".mp3"):
        continue
    mp3_file_path = os.path.join(directory_path, filename)
    # Swap only the trailing extension: str.replace(".mp3", ".wav") would also
    # rewrite any ".mp3" that happens to occur earlier in the file name.
    wav_file_path = os.path.join(
        wav_directory_path, os.path.splitext(filename)[0] + ".wav"
    )
    convert_mp3_to_wav(mp3_file_path, wav_file_path)
    print(f"Converted {mp3_file_path} to {wav_file_path}")


In [None]:
from transcribe_anything import transcribe_anything

# Transcribe the test recording with the "large" model on CUDA, writing results
# to the "diarization" output directory. The initial prompt gives the model
# context about the kind of conversation it is hearing.
# NOTE(review): per the signature notes below, a speaker.json (diarization)
# needs `hugging_face_token`, which is not passed here — confirm this is intended.
transcribe_anything(
    url_or_file="/kaggle/input/mini-speech-diarization/dataset/test/test.wav",
    output_dir="diarization",
    task="transcribe",
    model="large",
    device="cuda",
    initial_prompt="You are listening to a conversation between a professor and a student"
)

# Full function signature:
# def transcribe(
#     url_or_file: str,
#     output_dir: Optional[str] = None,
#     model: Optional[str] = None,              # tiny,small,medium,large
#     task: Optional[str] = None,               # transcribe or translate
#     language: Optional[str] = None,           # auto detected if none, "en" for english...
#     device: Optional[str] = None,             # cuda,cpu,insane,mlx
#     embed: bool = False,                      # Produces a video.mp4 with the subtitles burned in.
#     hugging_face_token: Optional[str] = None, # If you want a speaker.json - speaker diarization task
#     other_args: Optional[list[str]] = None,   # Other args to be passed to the whisper backend
#     initial_prompt: Optional[str] = None,     # Custom prompt for better recognition of specific terms
# ) -> str:


# RAG

In [None]:
!pip install langchain

In [None]:
!pip install langchain_community

In [None]:
!pip install langchain_openai

In [None]:
!pip install sentence-transformers

In [None]:
!pip install faiss-gpu

In [None]:
!pip install -U langsmith

In [None]:
import bs4
from langchain import hub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableSequence
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import RobertaForCausalLM, RobertaTokenizer, pipeline

In [None]:
# Wrap the first element of `text` in a Document and split it with
# chunk_size=1000 and chunk_overlap=100; the last line displays the chunk count.
# NOTE(review): `text` is not defined in any cell visible here — presumably it
# holds the transcript produced by the STT step; confirm it is loaded before
# this cell runs.
doc = Document(page_content=text[0])
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

splits = text_splitter.split_documents([doc])
len(splits)

In [None]:
splits

In [None]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration, pipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch 

# Embed the document chunks with the all-MiniLM-L6-v2 sentence-transformer and
# index them in a FAISS vector store, exposed below as a retriever.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(documents=splits, embedding=embedding_model)

retriever = vectorstore.as_retriever()

# Single source of truth for the local Gemma-2 2B instruction-tuned checkpoint,
# so the tokenizer and the model can never be loaded from different paths.
GEMMA_MODEL_PATH = "/kaggle/input/gemma-2/transformers/gemma-2-2b-it/1/"

tokenizer = AutoTokenizer.from_pretrained(GEMMA_MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    GEMMA_MODEL_PATH,
    device_map="auto",
)

# device_map="auto" has already placed the weights on the available device(s).
# Calling model.to(...) afterwards is redundant and can conflict with the
# dispatch when the model is spread over several devices, so it is not done
# here. Inputs only need to be sent to the device the model reports.
device = model.device

def generate_answer(question, max_new_tokens=512):
    """Answer *question* with the language model, grounded in retrieved chunks.

    The retrieved chunks are joined into a context, followed by a
    "Question: ... / Answer:" prompt, and the model continues from there.

    Parameters:
    question (str): The user's question.
    max_new_tokens (int): Upper bound on the number of generated tokens. The
        prompt length is not counted against it, so a long retrieved context
        can no longer use up the whole generation budget.

    Returns:
    str: The decoded sequence, prompt included, so callers can locate the
        answer after the "Answer:" marker.
    """
    # NOTE(review): recent LangChain releases deprecate get_relevant_documents
    # in favour of retriever.invoke — confirm against the installed version.
    related_docs = retriever.get_relevant_documents(question)
    context = " ".join(doc.page_content for doc in related_docs)
    input_text = f"{context}\n\nQuestion: {question}\nAnswer:"

    # Keep the whole encoding (input_ids and attention_mask) and send it to the
    # device the model itself reports.
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
    )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
# Run the RAG pipeline on a sample question; `answer` holds the full decoded
# text returned by generate_answer and is post-processed in a later cell.
question = "What is this document about?"
answer = generate_answer(question)
print(answer)

In [None]:
def extract_answer(text):
    """Return the segment of *text* that follows the first answer marker.

    The marker is a newline immediately followed by ``Answer:``. The returned
    segment runs up to the next marker (or the end of *text*) and has its
    surrounding whitespace stripped. If *text* has no marker, a fixed fallback
    message is returned instead.
    """
    marker = '\nAnswer:'
    segments = text.split(marker)

    # A single segment means the marker never occurred.
    if len(segments) < 2:
        return "Answer 부분을 찾을 수 없습니다."

    return segments[1].strip()

# Keep only the text after the "Answer:" marker of the generated output;
# `only_answer` is what gets spoken in the TTS step.
only_answer = extract_answer(answer)
print(only_answer)

# TTS

In [None]:
!pip install gtts

In [None]:
from gtts import gTTS
import os

def text_to_speech(text, language='en', output_file='output.mp3'):
    """Synthesize *text* to an MP3 file with gTTS, then try to play it.

    Parameters:
    text (str): Text to convert to speech.
    language (str): Language of the voice (defaults to English, 'en').
    output_file (str): Name of the MP3 file to write (defaults to 'output.mp3').
    """
    import shutil
    import subprocess
    import sys

    tts = gTTS(text=text, lang=language, slow=False)

    tts.save(output_file)
    print(f"음성 파일이 {output_file}로 저장되었습니다.")

    # Playback is best-effort and platform dependent. The file name is never
    # interpolated into a shell string, so names with spaces or shell
    # metacharacters are handled safely.
    if sys.platform.startswith("win"):
        # Opens the file with its associated application, like `start`.
        os.startfile(output_file)
        return

    player = "afplay" if sys.platform == "darwin" else "mpg321"
    if shutil.which(player) is None:
        # No command-line player available (typical on a headless notebook
        # host); the MP3 has been saved, so there is nothing more to do.
        return
    subprocess.run([player, output_file], check=False)

# Speak the extracted answer in English and save the audio as hello.mp3.
text_to_speech(only_answer, language='en', output_file='hello.mp3')
