In [17]:
import os
import torch
from IPython.display import Audio, display
import speech_recognition as sr
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate

# OpenVoice TTS
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
from melo.api import TTS


In [27]:
# --- Notebook configuration ---------------------------------------------------
PDF_PATH = "english_dataset_cleaned.pdf"      # knowledge-base PDF indexed for retrieval
REFERENCE_SPEAKER = "resources/trump.mp3"     # reference audio whose tone colour is cloned
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"         # model name passed to HuggingFaceEmbeddings

# SECURITY: never commit credentials to a notebook. The key is read from the
# environment instead of being hardcoded; any key that was previously stored in
# this cell should be treated as leaked and revoked.
GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "Gemini API key not found. Set the GOOGLE_API_KEY (or GEMINI_API_KEY) "
        "environment variable before running this notebook."
    )

BASE_SPEAKER = "EN-US"                        # speaker name looked up in the TTS model's spk2id table
TOP_K = 3                                     # number of chunks the retriever returns per query
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
OUTPUT_DIR = "outputs_v2"
SRC_PATH = f"{OUTPUT_DIR}/tmp.wav"            # intermediate base-speaker audio, overwritten on every utterance
SPEED = 0.85                                  # speed value passed to tts_to_file

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Setup tone color converter
ckpt_converter = 'checkpoints_v2/converter'
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=DEVICE)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Extract target speaker embedding (only the first element of the returned pair is used)
target_se, _ = se_extractor.get_se(REFERENCE_SPEAKER, tone_color_converter, vad=True)


  WeightNorm.apply(module, name, dim)


Loaded checkpoint 'checkpoints_v2/converter/checkpoint.pth'
missing/unexpected keys: [] []
OpenVoice version: v2
[(0.0, 61.3)]
after vad: dur = 61.3


In [33]:
def load_pdf_text(pdf_path):
    """Load the PDF at ``pdf_path`` with ``PyMuPDFLoader`` and return the loaded documents."""
    return PyMuPDFLoader(pdf_path).load()

def chunk_documents(docs, chunk_size=100, chunk_overlap=5):
    """Split ``docs`` into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split (as returned by ``load_pdf_text``).
        chunk_size: Forwarded to the splitter's ``chunk_size`` argument.
        chunk_overlap: Forwarded to the splitter's ``chunk_overlap`` argument.

    Returns:
        The result of ``split_documents(docs)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

def create_vectorstore(chunks):
    """Embed ``chunks`` with the ``EMBED_MODEL_NAME`` model and return a FAISS index over them."""
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
    index = FAISS.from_documents(chunks, embedder)
    return index

def setup_llm():
    """Create the Gemini chat model used to answer questions.

    Side effect: writes ``GEMINI_API_KEY`` into the ``GOOGLE_API_KEY``
    environment variable before constructing the client.

    Returns:
        A ``ChatGoogleGenerativeAI`` instance for ``gemini-1.5-flash``.
    """
    os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY
    llm_settings = {
        "model": "gemini-1.5-flash",
        "convert_system_message_to_human": True,
        "temperature": 0.5,
    }
    return ChatGoogleGenerativeAI(**llm_settings)

def agentic_rag(query, retriever, llm):
    """Answer ``query`` using retrieved context, then condense the result.

    For each sub-question (currently just ``query`` itself) the retriever is
    asked for relevant documents, their ``page_content`` is joined into a
    context string, and the LLM answers against that context. The collected
    answers are then passed through a second LLM call that produces the final
    concise answer.

    Args:
        query: The user's question.
        retriever: Object whose ``invoke(text)`` returns documents exposing
            ``page_content`` (e.g. ``vectorstore.as_retriever()``).
        llm: Chat model whose ``invoke(prompt)`` result can be parsed by
            ``StrOutputParser``.

    Returns:
        The final answer as a string.
    """
    output_parser = StrOutputParser()

    # Templates do not depend on the sub-question, so build them once.
    answer_template = PromptTemplate.from_template(
        "You are a helpful assistant.\nUse the following context to answer:\n{context}\n\nQuestion: {question}"
    )
    summary_template = PromptTemplate.from_template(
        "You are a smart assistant. Given the answers below, produce a final concise helpful answer:\n\n{answers}"
    )

    subquestions = [query]
    all_answers = []

    for subq in subquestions:
        # `invoke` is the Runnable entry point; `get_relevant_documents` is deprecated.
        docs = retriever.invoke(subq)
        context = "\n\n".join(doc.page_content for doc in docs)
        prompt = answer_template.format_prompt(context=context, question=subq)
        all_answers.append(output_parser.invoke(llm.invoke(prompt)))

    summary_prompt = summary_template.format_prompt(answers="\n\n".join(all_answers))
    return output_parser.invoke(llm.invoke(summary_prompt))


In [23]:
# Loaded TTS resources keyed by (BASE_SPEAKER, DEVICE). Building the TTS model
# and reading the base speaker embedding are expensive, so do it once per
# configuration instead of on every utterance.
_TTS_CACHE = {}


def _load_tts_resources():
    """Return ``(model, speaker_id, source_se)`` for the configured base speaker.

    Raises:
        ValueError: If ``BASE_SPEAKER`` is not present in the model's speaker table.
    """
    cache_key = (BASE_SPEAKER, DEVICE)
    if cache_key not in _TTS_CACHE:
        model = TTS(language="EN", device=DEVICE)
        speaker_ids = dict(model.hps.data.spk2id)
        # Case-insensitive lookup of the configured speaker name.
        matched_key = next((k for k in speaker_ids if k.lower() == BASE_SPEAKER.lower()), None)
        if not matched_key:
            raise ValueError(f"Speaker '{BASE_SPEAKER}' not found.")
        # Embedding files are named with the lower-cased, dash-separated speaker key.
        source_se = torch.load(
            f'checkpoints_v2/base_speakers/ses/{matched_key.lower().replace("_", "-")}.pth',
            map_location=DEVICE,
        )
        # Only populate the cache once everything loaded successfully.
        _TTS_CACHE[cache_key] = (model, speaker_ids[matched_key], source_se)
    return _TTS_CACHE[cache_key]


def speak(text):
    """Synthesize ``text``, convert it to the target speaker's tone colour, and play it.

    The base audio is written to ``SRC_PATH`` and the converted audio to
    ``{OUTPUT_DIR}/final_output.wav``; both files are overwritten on each call.
    Empty or whitespace-only text is ignored.

    Args:
        text: The sentence(s) to speak.

    Returns:
        None.

    Raises:
        ValueError: If ``BASE_SPEAKER`` is not a known speaker of the TTS model.
    """
    # Nothing to synthesize; skip instead of feeding empty text to the TTS model.
    if not text or not text.strip():
        return

    model, speaker_id, source_se = _load_tts_resources()

    # Generate speech
    model.tts_to_file(text, speaker_id, SRC_PATH, speed=SPEED)

    # Voice cloning
    output_path = f"{OUTPUT_DIR}/final_output.wav"
    tone_color_converter.convert(
        audio_src_path=SRC_PATH,
        src_se=source_se,
        tgt_se=target_se,
        output_path=output_path,
        message="@MyShell"
    )

    # Play audio in notebook
    display(Audio(output_path, autoplay=True))


In [31]:
def get_voice_input():
    """Record one utterance from the microphone and transcribe it.

    Uses ``recognize_google`` for transcription.

    Returns:
        The recognized text, or an empty string when the audio could not be
        understood or the recognition request failed.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as mic:
        print("\n🎤 Listening...")
        captured = recognizer.listen(mic)

    try:
        transcript = recognizer.recognize_google(captured)
    except sr.UnknownValueError:
        print("❌ Sorry, I couldn't understand that.")
        return ""
    except sr.RequestError:
        print("❌ Speech recognition service error.")
        return ""

    print(f"💬 You (voice): {transcript}")
    return transcript

def chat_loop():
    """Run the interactive voice/text chat session.

    Speaks a greeting, builds the retriever from ``PDF_PATH`` and the LLM, then
    repeatedly asks the user to choose voice ('v') or text ('t') input, answers
    the query with ``agentic_rag`` and speaks the answer. Typing 'bye' at the
    mode prompt speaks a farewell and ends the session.
    """
    greeting = "Hello guys! It's me Trump. How should I motivate you today?"
    farewell = "Thank you for the great talk. Stay strong, stay smart, and never give up. Jai Narendra modi ji ki , jai shankar ki, jai shree balaji ki , jai siya raam!"

    print(f"🗣️ Trump says: {greeting}")
    speak(greeting)

    # Build the retrieval pipeline: PDF -> chunks -> FAISS index -> retriever.
    pages = load_pdf_text(PDF_PATH)
    pieces = chunk_documents(pages)
    retriever = create_vectorstore(pieces).as_retriever(search_kwargs={"k": TOP_K})
    llm = setup_llm()

    while True:
        choice = input("\n🎧 Type [v] for voice or [t] for text (or 'bye' to exit): ").strip().lower()

        if choice == "bye":
            print(f"\n🗣️ Trump says: {farewell}")
            speak(farewell)
            return

        if choice not in ("v", "t"):
            print("❗Invalid choice. Please type 'v' or 't'.")
            continue

        query = get_voice_input() if choice == "v" else input("💬 You (text): ").strip()

        # An empty query (failed recognition or blank text) goes back to the prompt.
        if not query:
            continue

        print("🤖 Thinking...")
        answer = agentic_rag(query, retriever, llm)
        print(f"\n🗣️ Trump says: {answer}")
        speak(answer)

# Entry point: run this cell to start the interactive chat session.
# The loop blocks on input() until 'bye' is typed at the mode prompt.
chat_loop()


🗣️ Trump says: Hello guys! It's me Trump. How should I motivate you today?
 > Text split to sentences.
Hello guys! It's me Trump. How should I motivate you today?


100%|██████████| 1/1 [00:11<00:00, 11.04s/it]



🎧 Type [v] for voice or [t] for text (or 'bye' to exit):  bye



🗣️ Trump says: Thank you for the great talk. Stay strong, stay smart, and never give up. Jai Narendra modi ji ki , jai shankar ki, jai shree balaji ki , jai siya raam!


  WeightNorm.apply(module, name, dim)


 > Text split to sentences.
Thank you for the great talk. Stay strong, stay smart, and never give up. Jai Narendra modi ji ki , jai shankar ki, jai shree balaji ki , jai siya raam!


100%|██████████| 1/1 [00:31<00:00, 31.91s/it]
