In [1]:
!pip install python-telegram-bot langchain langchain_core langchain_community sentence_transformers faiss-cpu unstructured chromadb Cython tiktoken unstructured[local-inference] langchain_groq nest_asyncio gTTS SpeechRecognition pydub

Collecting python-telegram-bot
  Downloading python_telegram_bot-21.4-py3-none-any.whl.metadata (17 kB)
Collecting langchain
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_core
  Downloading langchain_core-0.2.26-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting unstructured
  Downloading unstructured-0.15.0-py3-none-any.whl.metadata (29 kB)
Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain_groq
  Downloading langchain_groq-0.1.9

In [None]:
import asyncio
import io
import json
import os
import re
import shutil
import tempfile
from datetime import datetime

import nest_asyncio
import speech_recognition as sr
from gtts import gTTS
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain_groq import ChatGroq
from pydub import AudioSegment
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup, KeyboardButton, ReplyKeyboardMarkup
from telegram.ext import Application, CommandHandler, MessageHandler, filters, ContextTypes, CallbackQueryHandler

# The notebook kernel already runs an asyncio event loop; nest_asyncio patches it so
# that application.run_polling() in main() can run inside that loop.
nest_asyncio.apply()

# Credentials come from the environment so real secrets never have to be committed
# with the notebook. The literals are only the original placeholder values, kept as
# a fallback for backward compatibility -- do not replace them with real keys here;
# set TELEGRAM_BOT_TOKEN / GROQ_API_KEY in the environment instead.
TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "trSo")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "gqa5uf8n")

# Shared model objects: one embedding model and one chat model for all users.
embeddings = HuggingFaceEmbeddings()
llm = ChatGroq(api_key=GROQ_API_KEY, model="llama-3.1-70b-versatile")

# In-memory per-user state keyed by Telegram user id. The handlers below store
# "language", "qa_chain" and "pdf_content" in each entry; nothing is persisted.
user_data = {}

# User-facing strings, keyed first by language code ('en', 'fa') and then by message
# key. Handlers look strings up as LANGUAGES[lang][key]; the '*_button' entries are
# the reply-keyboard labels and are also used to build the button regexes in main().
LANGUAGES = {
    # English: text and voice features.
    'en': {
        'welcome': "Welcome! Please choose your language:",
        'language_set': "Language set to English. 🇬🇧\n\nI'm your AI assistant bot! 🤖 Here's what I can do:\n\n"
                        "📚 Analyze PDF documents\n"
                        "❓ Answer questions about the PDF\n"
                        "🔍 Provide insights and summaries\n"
                        "🎤 Respond with voice messages\n"
                        "🗣️ Understand your voice messages\n\n"
                        "To get started, please upload a PDF file.",
        'upload_pdf': "Please upload a PDF file for analysis.",
        'processing_pdf': "Processing your PDF... This might take a moment.",
        'wrong_file': "Oops! That's not a PDF. Please upload a PDF file.",
        'pdf_processed': "Great! I've processed your PDF. You can now ask me anything about its content.\n\n"
                         "💡 TIP: You can type your question or send a voice message. I'll respond in both text and voice!",
        'no_pdf': "I need a PDF to work with. Please upload one first.",
        'processing_question': "Thinking about your question...",
        'processing_voice': "Listening to your message...",
        'voice_not_recognized': "I couldn't quite catch that. Could you please try speaking again?",
        'contact_us': "Need help or have feedback? Reach out to us at @mohammad_ka8. We're here for you! 😊",
        'help': "Here's a quick guide on how to use me:\n\n"
                "1. Start by uploading a PDF file 📄\n"
                "2. Once processed, ask me anything about the PDF content\n"
                "3. You can type your questions or send voice messages 🎤\n"
                "4. I'll respond in both text and voice (for English)\n"
                "5. Use 'Summary' to get a brief overview of the PDF\n"
                "6. Use 'Contact' for support\n"
                "7. Use 'Change Language' to switch languages\n\n"
                "Remember, I'm here to help! Don't hesitate to ask if you need clarification.",
        'summary_no_pdf': "I haven't processed any PDF yet. Please upload a PDF first, then ask for a summary.",
        'generating_summary': "Generating a summary of the PDF... This might take a moment.",
        'summary_intro': "Here's a brief summary of the PDF content:\n\n",
        'change_language': "Change Language",
        'help_button': "Help",
        'summary_button': "Summary",
        'contact_button': "Contact",
        'change_language_button': "Change Language",
    },
    # Persian: text only. This table has no 'processing_voice' or
    # 'voice_not_recognized' entries; handle_voice returns early for any
    # language other than 'en', so those keys are never looked up here.
    'fa': {
        'welcome': "خوش آمدید! لطفاً زبان خود را انتخاب کنید:",
        'language_set': "زبان شما فارسی تنظیم شد. 🇮🇷\n\nمن دستیار هوشمند شما هستم! 🤖 اینها کارهایی هستند که می‌توانم انجام دهم:\n\n"
                        "📚 تحلیل اسناد PDF\n"
                        "❓ پاسخ به سؤالات درباره PDF\n"
                        "🔍 ارائه بینش و خلاصه\n\n"
                        "برای شروع، لطفاً یک فایل PDF آپلود کنید.",
        'upload_pdf': "لطفاً یک فایل PDF برای تحلیل آپلود کنید.",
        'processing_pdf': "در حال پردازش PDF شما... این ممکن است کمی طول بکشد.",
        'wrong_file': "اوپس! این یک PDF نیست. لطفاً یک فایل PDF آپلود کنید.",
        'pdf_processed': "عالی! من PDF شما را پردازش کردم. حالا می‌توانید هر سؤالی درباره محتوای آن بپرسید.",
        'no_pdf': "من برای کار نیاز به یک PDF دارم. لطفاً ابتدا یکی آپلود کنید.",
        'processing_question': "در حال فکر کردن به سؤال شما...",
        'contact_us': "نیاز به کمک دارید یا بازخوردی دارید؟ با ما در @mohammad_ka8 تماس بگیرید. ما اینجا هستیم تا به شما کمک کنیم! 😊",
        'help': "راهنمای سریع استفاده از من:\n\n"
                "۱. با آپلود یک فایل PDF شروع کنید 📄\n"
                "۲. پس از پردازش، هر سؤالی درباره محتوای PDF دارید، بپرسید\n"
                "۳. می‌توانید سؤالات خود را تایپ کنید\n"
                "۴. من به صورت متنی پاسخ خواهم داد\n"
                "۵. از دکمه 'خلاصه' برای دریافت خلاصه‌ای از PDF استفاده کنید\n"
                "۶. برای پشتیبانی از دکمه 'تماس با ما' استفاده کنید\n"
                "۷. برای تغییر زبان از دکمه 'تغییر زبان' استفاده کنید\n\n"
                "به یاد داشته باشید، من اینجا هستم تا کمک کنم! اگر نیاز به توضیح بیشتری دارید، حتماً بپرسید.",
        'summary_no_pdf': "هنوز هیچ PDF‌ای پردازش نکرده‌ام. لطفاً ابتدا یک PDF آپلود کنید، سپس درخواست خلاصه کنید.",
        'generating_summary': "در حال تولید خلاصه‌ای از PDF... این ممکن است کمی طول بکشد.",
        'summary_intro': "خلاصه‌ای از محتوای PDF:\n\n",
        'change_language': "تغییر زبان",
        'help_button': "راهنما",
        'summary_button': "خلاصه",
        'contact_button': "تماس با ما",
        'change_language_button': "تغییر زبان",
    }
}

def create_user_directory(user_id):
    """Ensure the storage folder for a user exists and return its path.

    The folder is ``user_data/<user_id>`` relative to the current working
    directory. Missing parent folders are created; an existing folder is
    left untouched.

    Args:
        user_id: Identifier of the user; converted with ``str()`` for the path.

    Returns:
        The relative path of the user's folder.
    """
    target = os.path.join("user_data", str(user_id))
    os.makedirs(target, exist_ok=True)
    return target

def save_voice_message(user_id, file_path):
    """Move a downloaded voice file into the user's storage folder.

    The file is stored as ``voice_<YYYYmmdd_HHMMSS>.ogg`` inside the folder
    returned by ``create_user_directory``. The timestamp has one-second
    resolution, so two files saved for the same user within the same second
    map to the same name.

    Args:
        user_id: Identifier of the user the recording belongs to.
        file_path: Path of the already-downloaded voice file.

    Returns:
        The new path of the voice file.
    """
    user_dir = create_user_directory(user_id)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    new_file_path = os.path.join(user_dir, f"voice_{timestamp}.ogg")
    # The source usually lives in the system temp directory, which may be on a
    # different filesystem than the working directory. os.rename raises OSError
    # in that case; shutil.move falls back to copy-and-delete.
    shutil.move(file_path, new_file_path)
    return new_file_path

def save_user_question(user_id, question):
    """Append a question to the user's ``questions.json`` log.

    The log is a JSON list of ``{"timestamp", "question"}`` objects stored in
    the user's folder. It is read in full (when it exists), extended by one
    entry stamped with the current local time in ISO format, and rewritten.

    Args:
        user_id: Identifier of the user who asked.
        question: The question text to record.
    """
    log_path = os.path.join(create_user_directory(user_id), "questions.json")

    # Start from the existing history when there is one.
    history = []
    if os.path.exists(log_path):
        with open(log_path, 'r', encoding='utf-8') as src:
            history = json.load(src)

    entry = {"timestamp": datetime.now().isoformat(), "question": question}
    history.append(entry)

    # ensure_ascii=False keeps non-ASCII questions readable in the file.
    with open(log_path, 'w', encoding='utf-8') as dst:
        json.dump(history, dst, ensure_ascii=False, indent=2)

async def start(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle /start by asking the user to pick a language.

    Replies with a bilingual prompt and an inline keyboard with one button per
    language; the buttons carry ``lang_en`` / ``lang_fa`` as callback data.
    """
    language_choices = (
        ("English 🇬🇧", 'lang_en'),
        ("فارسی 🇮🇷", 'lang_fa'),
    )
    # One button per row.
    rows = [[InlineKeyboardButton(label, callback_data=code)] for label, code in language_choices]
    prompt = "Welcome! Please choose your language:\n\nخوش آمدید! لطفاً زبان خود را انتخاب کنید:"
    await update.message.reply_text(prompt, reply_markup=InlineKeyboardMarkup(rows))

async def language_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Handle a press on one of the inline language buttons.

    Stores the chosen language for the user, replaces the language prompt with
    the localized introduction, and sends the help text together with the
    localized reply keyboard (help / summary / contact / change language).

    Callback data is expected to look like ``lang_<code>``. A code that has no
    entry in ``LANGUAGES`` is ignored after the callback is acknowledged.
    """
    query = update.callback_query
    await query.answer()

    lang = query.data.split('_', 1)[1]
    if lang not in LANGUAGES:
        # Storing an unknown code would make every later LANGUAGES[lang] lookup fail.
        return

    user_id = update.effective_user.id
    # Update only the language. Replacing the whole entry would throw away the
    # "qa_chain" / "pdf_content" of an already processed PDF, forcing the user
    # to upload it again after every language change.
    user_data.setdefault(user_id, {})["language"] = lang

    strings = LANGUAGES[lang]
    await query.edit_message_text(strings['language_set'])

    keyboard = [
        [KeyboardButton(strings['help_button']), KeyboardButton(strings['summary_button'])],
        [KeyboardButton(strings['contact_button']), KeyboardButton(strings['change_language_button'])]
    ]
    reply_markup = ReplyKeyboardMarkup(keyboard, resize_keyboard=True)
    await context.bot.send_message(chat_id=user_id, text=strings['help'], reply_markup=reply_markup)

async def change_language(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Show the language picker again.

    Replies with a bilingual prompt and the same inline buttons as /start
    (callback data ``lang_en`` / ``lang_fa``).
    """
    language_choices = (
        ("English 🇬🇧", 'lang_en'),
        ("فارسی 🇮🇷", 'lang_fa'),
    )
    # One button per row.
    rows = [[InlineKeyboardButton(label, callback_data=code)] for label, code in language_choices]
    prompt = "Please choose your language:\n\nلطفاً زبان خود را انتخاب کنید:"
    await update.message.reply_text(prompt, reply_markup=InlineKeyboardMarkup(rows))

async def handle_pdf(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Index an uploaded PDF and prepare a question-answering chain for the user.

    Steps: reject non-PDF documents, download the file to a temporary path,
    load it with ``UnstructuredPDFLoader``, build a vector index over
    1000-character chunks, and wrap the index in a ``RetrievalQA`` chain.
    On success the chain and the loaded documents are stored in ``user_data``
    under ``"qa_chain"`` and ``"pdf_content"`` and the user is notified.

    The temporary file is removed whether or not processing succeeds; any
    exception from download, loading or indexing propagates to the caller.
    """
    user_id = update.effective_user.id
    lang = user_data.get(user_id, {}).get("language", "en")

    if update.message.document.mime_type != "application/pdf":
        await update.message.reply_text(LANGUAGES[lang]['wrong_file'])
        return

    await update.message.reply_text(LANGUAGES[lang]['processing_pdf'])

    pdf_file = await context.bot.get_file(update.message.document.file_id)

    # Only reserve a unique path here and close the handle immediately, so the
    # download and the loader work on a file nobody else holds open.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
        pdf_path = temp_file.name

    try:
        await pdf_file.download_to_drive(custom_path=pdf_path)

        # Parse the PDF once and reuse the documents for both the index and
        # the stored "pdf_content" (from_loaders + load() would parse it twice).
        documents = UnstructuredPDFLoader(pdf_path).load()
        index = VectorstoreIndexCreator(
            embedding=embeddings,
            text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        ).from_documents(documents)

        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=index.vectorstore.as_retriever(),
            input_key="question"
        )
    finally:
        # Always clean up, including when download/parsing/indexing raised.
        os.unlink(pdf_path)

    # The user may upload a PDF before ever choosing a language, in which case
    # there is no entry yet; create it instead of failing with KeyError.
    state = user_data.setdefault(user_id, {})
    state["qa_chain"] = qa_chain
    state["pdf_content"] = documents

    await update.message.reply_text(LANGUAGES[lang]['pdf_processed'])

async def text_to_speech(text: str) -> io.BytesIO:
    """Synthesize ``text`` as English speech with gTTS.

    Args:
        text: The text to speak.

    Returns:
        An in-memory buffer holding the audio written by gTTS, rewound to the
        start so it can be read or uploaded directly.
    """
    audio_buffer = io.BytesIO()
    gTTS(text=text, lang='en').write_to_fp(audio_buffer)
    audio_buffer.seek(0)
    return audio_buffer

async def handle_question(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Answer a typed question about the user's processed PDF.

    If the user has no QA chain yet, replies with the localized "no PDF"
    message. Otherwise the question is logged, a "processing" notice is sent,
    the chain is run, and the answer is sent as text. For English users the
    answer is additionally sent as a voice message.
    """
    message = update.message
    user_id = update.effective_user.id
    state = user_data.get(user_id, {})
    lang = state.get("language", "en")
    strings = LANGUAGES[lang]

    # Guard: nothing to query until a PDF has been processed for this user.
    if "qa_chain" not in state:
        await message.reply_text(strings['no_pdf'])
        return

    question = message.text
    save_user_question(user_id, question)
    chain = state["qa_chain"]

    await message.reply_text(strings['processing_question'])

    answer = chain.run(question)
    await message.reply_text(answer)

    # Voice replies are only produced for English.
    if lang != 'en':
        return
    spoken_answer = await text_to_speech(answer)
    await message.reply_voice(voice=spoken_answer)

async def handle_voice(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Answer a spoken question about the user's processed PDF (English only).

    Voice messages from users whose language is not ``'en'`` are ignored.
    Otherwise the recording is downloaded, archived in the user's folder via
    ``save_voice_message``, converted to WAV, transcribed with
    ``recognize_google``, logged, and answered through the user's QA chain.
    The reply is sent both as text (including the transcription) and as voice.

    The intermediate WAV file is always removed. If the speech cannot be
    understood (``sr.UnknownValueError``) the user is asked to try again;
    other errors (for example a failed recognition request) propagate.
    """
    user_id = update.effective_user.id
    lang = user_data.get(user_id, {}).get("language", "en")

    if lang != 'en':
        return

    if user_id not in user_data or "qa_chain" not in user_data[user_id]:
        await update.message.reply_text(LANGUAGES[lang]['no_pdf'])
        return

    await update.message.reply_text(LANGUAGES[lang]['processing_voice'])

    voice_file = await context.bot.get_file(update.message.voice.file_id)

    # Only reserve a unique path; the handle is closed before the file is
    # written to and moved.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.ogg') as voice_temp_file:
        temp_ogg_path = voice_temp_file.name

    try:
        await voice_file.download_to_drive(custom_path=temp_ogg_path)
    except Exception:
        # Do not leave the placeholder temp file behind when the download fails.
        if os.path.exists(temp_ogg_path):
            os.unlink(temp_ogg_path)
        raise

    saved_voice_path = save_voice_message(user_id, temp_ogg_path)
    wav_filename = os.path.splitext(saved_voice_path)[0] + ".wav"

    try:
        # Conversion and recording are inside the try so the WAV is cleaned up
        # even when one of them fails, not only when recognition fails.
        AudioSegment.from_ogg(saved_voice_path).export(wav_filename, format="wav")

        recognizer = sr.Recognizer()
        with sr.AudioFile(wav_filename) as source:
            recorded_audio = recognizer.record(source)

        question = recognizer.recognize_google(recorded_audio)
        save_user_question(user_id, question)
    except sr.UnknownValueError:
        await update.message.reply_text(LANGUAGES[lang]['voice_not_recognized'])
        return
    finally:
        # The WAV may not exist if the export itself failed.
        if os.path.exists(wav_filename):
            os.unlink(wav_filename)

    qa_chain = user_data[user_id]["qa_chain"]
    answer = qa_chain.run(question)

    await update.message.reply_text(f"I understood: '{question}'\n\n{answer}")

    voice_fp = await text_to_speech(answer)
    await update.message.reply_voice(voice=voice_fp)

async def contact_us(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Reply with the support contact text in the user's language.

    Users without a stored language get the English text.
    """
    profile = user_data.get(update.effective_user.id, {})
    strings = LANGUAGES[profile.get("language", "en")]
    await update.message.reply_text(strings['contact_us'])

async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Reply with the usage guide in the user's language.

    Users without a stored language get the English text.
    """
    profile = user_data.get(update.effective_user.id, {})
    strings = LANGUAGES[profile.get("language", "en")]
    await update.message.reply_text(strings['help'])

async def summary_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Send a summary of the user's processed PDF.

    The summary is produced by running a fixed English summary prompt through
    the user's QA chain. It is sent as text prefixed with the localized intro;
    English users additionally receive it as a voice message. Users without a
    processed PDF get the localized "no PDF" message instead.
    """
    user_id = update.effective_user.id
    state = user_data.get(user_id, {})
    lang = state.get("language", "en")

    # Guard on the key that is actually used below. The stored "pdf_content"
    # is not needed for the summary, so it is no longer read here.
    if "qa_chain" not in state:
        await update.message.reply_text(LANGUAGES[lang]['summary_no_pdf'])
        return

    await update.message.reply_text(LANGUAGES[lang]['generating_summary'])

    qa_chain = state["qa_chain"]

    summary_prompt = "Provide a concise summary of the main points in the PDF document."
    summary = qa_chain.run(summary_prompt)

    full_response = LANGUAGES[lang]['summary_intro'] + summary
    await update.message.reply_text(full_response)
    # Voice replies are only produced for English.
    if lang == 'en':
        voice_fp = await text_to_speech(summary)
        await update.message.reply_voice(voice=voice_fp)

async def handle_button(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    """Route a plain text message to the matching button handler.

    The message text is compared with the reply-keyboard labels of the user's
    language, in the order help, summary, contact, change language. The first
    match decides the handler; any other text is treated as a question about
    the PDF.
    """
    user_id = update.effective_user.id
    lang = user_data.get(user_id, {}).get("language", "en")
    button_text = update.message.text
    labels = LANGUAGES[lang]

    # Ordered (label key, handler) pairs; order mirrors the priority above.
    routes = (
        ('help_button', help_command),
        ('summary_button', summary_command),
        ('contact_button', contact_us),
        ('change_language_button', change_language),
    )
    for label_key, handler in routes:
        if button_text == labels[label_key]:
            await handler(update, context)
            return

    await handle_question(update, context)

def main() -> None:
    """Build the Telegram application, register all handlers and start polling.

    Handler order matters because the first matching handler wins: /start,
    language callbacks, PDF uploads, voice messages, then one handler per
    reply-keyboard button, and finally the catch-all text handler.
    """
    application = Application.builder().token(TELEGRAM_BOT_TOKEN).build()
    application.add_handler(CommandHandler("start", start))
    application.add_handler(CallbackQueryHandler(language_callback, pattern='^lang_'))
    application.add_handler(MessageHandler(filters.Document.PDF, handle_pdf))
    application.add_handler(MessageHandler(filters.VOICE, handle_voice))

    # One handler per keyboard button. The pattern matches the button's label in
    # every language defined in LANGUAGES (not just a hard-coded pair), and each
    # label is escaped so a label containing regex metacharacters still matches
    # literally.
    button_routes = (
        ('help_button', help_command),
        ('summary_button', summary_command),
        ('contact_button', contact_us),
        ('change_language_button', change_language),
    )
    for label_key, callback in button_routes:
        alternatives = "|".join(re.escape(strings[label_key]) for strings in LANGUAGES.values())
        application.add_handler(MessageHandler(filters.Regex(f"^({alternatives})$"), callback))

    # Everything else that is plain text (and not a command) is routed by handle_button.
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_button))
    application.run_polling(allowed_updates=Update.ALL_TYPES)

# Start the bot when this code is executed directly. In a notebook cell
# __name__ is "__main__", so running the cell starts polling; it is skipped
# when the code is imported as a module.
if __name__ == "__main__":
    main()

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ERROR:telegram.ext.Application:No error handlers are registered, logging exception.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/telegram/ext/_application.py", line 1335, in process_update
    await coroutine
  File "/usr/local/lib/python3.10/dist-packages/telegram/ext/_handlers/basehandler.py", line 157, in handle_update
    return await self.callback(update, context)
  File "<ipython-input-2-39c3bf56859d>", line 138, in start
    await update.message.reply_text(
  File "/usr/local/lib/python3.10/dist-packages/telegram/_message.py", line 1763, in reply_text
    return await self.get_bot().send_message(
  File "/usr/local/lib/python3.10/dist-packages/telegram/ext/_extbot.py", line 2907, in send_message
    return await super().send_message(
  File "/usr/local/lib/python3.10/dist-packages/telegram/_bot.py", line 1028, in send_message
    return await self._send_message(
  File "/usr/local/lib/python3.10/dist-packages/telegram/ext/_extbot.py", line 6