### Model

In [1]:
!pip install langchain langchain_community langchain_core tiktoken langchainhub chromadb transformers langchain-huggingface fpdf langchain_experimental >> devnull
!pip install sentence-transformers==2.2.2 >> devnull

In [52]:
from langchain_core.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.docstore.document import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

from transformers import AutoTokenizer
from transformers import pipeline

import torch
import os
import IPython
from tqdm import tqdm

from fpdf import FPDF
def text2pdf(text, output_name='output.pdf'):
    """Render plain text into a single-flow PDF document.

    Args:
        text: The text to write into the document.
        output_name: Path of the PDF file to create.

    The built-in 'Arial' font only covers the Latin-1 character set, so any
    character outside it (curly quotes, long dashes, non-Latin scripts, ...)
    is replaced with '?' instead of aborting the export with an encoding error.
    """
    # Round-trip through Latin-1 so the core font can always encode the text.
    printable_text = text.encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    pdf.write(5, printable_text)  # 5 = line height
    pdf.output(output_name)

In [132]:
# Prompt templates used by AudioLLM, keyed by task name.
# Each template is written with Llama-3-style header tokens
# (<|start_header_id|>system/user/assistant<|end_header_id|>, <|eot_id|>).
# The text inside the triple-quoted strings is sent to the model as-is,
# including its indentation, so edit it with care.
#
# 'full_summary': explain every topic of ONE part of a transcript ({question}).
PROMPTS = {'full_summary': PromptTemplate(
            template="""
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                You get one part of lecture's transcript. Explain each mentioned topic in this part.
                All parts go sequentially each other, so don't repeat what you said previously.<|eot_id|>

                <|start_header_id|>user<|end_header_id|>
                Transcript: {question} <|eot_id|>
                
                <|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["question"],
        ),
        # 'summary': short outline of the main ideas of a transcript ({question}).
        'summary': PromptTemplate(
            template="""
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                You were given lecture's transcript. 
                Outline main ideas as short as possible.<|eot_id|>

                <|start_header_id|>user<|end_header_id|>
                Transcript: {question} <|eot_id|>
                
                <|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["question"],
        ),
        # 'explain': answer {question} using the supplied {context}.
        'explain': PromptTemplate(
            template="""
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                Answer given question using information from the context<|eot_id|>

                <|start_header_id|>user<|end_header_id|>
                Question: {question}
                Context: {context} <|eot_id|>

                <|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["question", "context"],
        )}

In [96]:
class AudioLLM():
    """Lecture assistant built from speech recognition, a vector store and an LLM.

    The class can
      * transcribe audio with a Whisper pipeline and index the transcript in
        the supplied vector store (`_ASR`, `_add_to_db`),
      * produce a detailed (`full_summary`) or short (`summarize`) summary,
      * answer questions using retrieved context (`explain`).

    Attributes read by the bot code:
        transcrib: Text of the most recently transcribed audio, or None.
        _full_summary: Last summary stored by `full_summary(..., is_full=True)`, or None.
    """

    # Character budget used when grouping semantic chunks for one LLM call.
    MAX_CHUNK_CHARS = 5000

    def __init__(self, vectordb, prompts):
        """Set up the ASR pipeline, the LLM client, the retriever and the splitter.

        Args:
            vectordb: Vector store exposing `as_retriever`, `add_documents` and `persist`.
            prompts: Mapping with the keys 'summary', 'explain' and 'full_summary'.
        """
        self._device = "cuda:0" if torch.cuda.is_available() else "cpu"

        self._pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=self._device)
        self._llm = ChatOllama(model='llama3', temperature=0)

        self._prompt_summarize = prompts['summary']
        self._prompt_explain = prompts['explain']
        self._promts_full_summary = prompts['full_summary']

        self._emb_model_name = "sentence-transformers/all-mpnet-base-v2"
        self._vectordb = vectordb
        self._retriever = self._vectordb.as_retriever()
        self._chunk_size = 250  # measured in tokens of the embedding model's tokenizer

        self._text_splitter = SemanticChunker(HuggingFaceEmbeddings(), breakpoint_threshold_type="standard_deviation")
        self._full_summary = None
        # Defined up front so callers can test "has anything been transcribed yet?"
        # without hitting an AttributeError.
        self.transcrib = None

    def _add_to_db(self, summary):
        """Split `summary` into token-bounded chunks and store them in the vector DB.

        Args:
            summary: Text to index (the transcript when called from `_ASR`).
        """
        import uuid  # local import: the notebook's import cell does not provide it

        MARKDOWN_SEPARATORS = ["\n#{1,6} ", "```\n", "\n\\*\\*\\*+\n", "\n---+\n", "\n___+\n", "\n\n", "\n", " ",""]
        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                AutoTokenizer.from_pretrained(self._emb_model_name),
                chunk_size=self._chunk_size,
                chunk_overlap=self._chunk_size // 5,
                add_start_index=True,
                strip_whitespace=True,
                separators=MARKDOWN_SEPARATORS,
            )
        doc = Document(page_content=summary, metadata={"label": "lecture"})
        docs_processed = text_splitter.split_documents([doc])

        # Ids must be unique across uploads: plain "0", "1", ... would clash
        # with the chunks of every previously added audio.
        batch_id = uuid.uuid4().hex
        for i, doc in enumerate(tqdm(docs_processed, desc="Adding the transcripted audio")):
            self._vectordb.add_documents([doc], ids=[f'{batch_id}-{i}'])
        self._vectordb.persist()
        # Rebuild the retriever after the store has been updated.
        self._retriever = self._vectordb.as_retriever()


    def _ASR(self, audio):
        """Transcribe `audio`, remember the text and index it in the vector DB.

        Args:
            audio: Input forwarded unchanged to the speech-recognition pipeline.

        Returns:
            The transcribed text.
        """
        outputs = self._pipeline(audio, max_new_tokens=256)
        self.transcrib = outputs["text"]
        self._add_to_db(outputs["text"])
        return outputs["text"]


    def _semantic_split(self, text):
        """Split a long text into parts of roughly `MAX_CHUNK_CHARS` characters.

        Texts that already fit the budget are returned as a single part.
        Otherwise the text is cut by the semantic chunker and neighbouring
        pieces are merged until a part exceeds the budget; the remaining
        pieces form the last part.

        Args:
            text: Text to split.

        Returns:
            List of text parts, in their original order.
        """
        limit = self.MAX_CHUNK_CHARS
        if len(text) <= limit:
            return [text]

        # create_documents expects a list of texts; a bare string would be
        # iterated character by character.
        pieces = [chunk.page_content for chunk in self._text_splitter.create_documents([text])]

        joined_split = []
        cur_chunk = ""
        for piece in pieces:
            # Keep a space between pieces so sentences are not glued together.
            cur_chunk = f"{cur_chunk} {piece}" if cur_chunk else piece
            if len(cur_chunk) > limit:
                joined_split.append(cur_chunk)
                cur_chunk = ""
        if cur_chunk:
            # Leftover pieces that never reached the budget.
            joined_split.append(cur_chunk)
        return joined_split

    def full_summary(self, input, is_text=True, is_full=False):
        """Explain every topic of a lecture, part by part.

        Args:
            input: Transcript text, or audio for `_ASR` when `is_text` is False.
            is_text: Whether `input` is already text.
            is_full: When True, the result is stored in `self._full_summary`.

        Returns:
            The per-part explanations, each followed by a newline.
        """
        text = input
        if not is_text:
            text = self._ASR(input)
        splitted = self._semantic_split(text)

        # The chain does not depend on the chunk, so build it once.
        chain_summary = (
            {"question": RunnablePassthrough()}
            | self._promts_full_summary
            | self._llm
            | StrOutputParser()
        )

        summary = ''
        for chunk in tqdm(splitted):
            chunk_summary = chain_summary.invoke(chunk)
            summary += chunk_summary + '\n'
        if is_full:
            self._full_summary = summary
        return summary


    def summarize(self, input, is_text=True):
        """Produce a short outline of the main ideas.

        Args:
            input: Text to summarise, or audio for `_ASR` when `is_text` is False.
            is_text: Whether `input` is already text.

        Returns:
            The model's summary as a string.
        """
        text = input
        if not is_text:
            text = self._ASR(input)

        chain_summary = (
            {"question": RunnablePassthrough()}
            | self._prompt_summarize
            | self._llm
            | StrOutputParser()
        )

        summary = chain_summary.invoke(text)
        return summary


    def explain(self, text):
        """Answer a question using context retrieved from the vector DB.

        Args:
            text: The question; it is used both as the retrieval query and as
                the `question` field of the prompt.

        Returns:
            The model's answer as a string.
        """
        def format_docs(docs):
            # Retrieved documents are concatenated with blank lines in between.
            return "\n\n".join([d.page_content for d in docs])

        chain_explain = (
            {"context": self._retriever | format_docs, "question": RunnablePassthrough()}
            | self._prompt_explain
            | self._llm
            | StrOutputParser()
        )

        explained = chain_explain.invoke(text)
        return explained


### Init

In [55]:
# Copy the DB directory from the Kaggle input dataset into /kaggle/working/.
# NOTE(review): presumably needed because the vector store must be writable — confirm.
!cp -r /kaggle/input/data-base/DB /kaggle/working/

0

In [68]:
# Embedding function for the persisted Chroma store; embeddings are left
# un-normalised.
emb_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    encode_kwargs={"normalize_embeddings": False},
)

# Open the vector store from the working copy of the DB directory.
vectordb = Chroma(
    embedding_function=emb_model,
    persist_directory="/kaggle/working/DB",
)

In [133]:
# Single shared assistant instance; the bot handlers below read and update it.
audio_llm = AudioLLM(vectordb, PROMPTS)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [65]:
# Download and run the Ollama install script (pipes a remote script into `sh`).
!curl -fsSL https://ollama.com/install.sh | sh

>>> Downloading ollama...
######################################################################## 100.0%#=#=#                                                                          
>>> Installing ollama to /usr/local/bin...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> NVIDIA GPU installed.
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


0

In [66]:
# Route IPython's `!cmd` shell escapes through os.system.
# NOTE(review): presumably so the `!... &` cells below can leave a process
# running in the background and return immediately; the bare `0` outputs in
# this notebook look like os.system return codes — confirm.
get_ipython().system = os.system

In [137]:
# Start the Ollama server as a background process (`&`).
# NOTE(review): nothing waits for the server to be ready before the next cell — confirm this is reliable.
!ollama serve &

0

2024/07/04 21:10:21 routes.go:1064: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE: OLLAMA_LLM_LIBRARY: OLLAMA_MAX_LOADED_MODELS:1 OLLAMA_MAX_QUEUE:512 OLLAMA_MAX_VRAM:0 OLLAMA_MODELS:/root/.ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:1 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_RUNNERS_DIR: OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES:]"
time=2024-07-04T21:10:21.134Z level=INFO source=images.go:730 msg="total blobs: 5"
time=2024-07-04T21:10:21.134Z level=INFO source=images.go:737 msg="total unused blobs removed: 0"
time=2024-0

In [138]:
# Load the `llama3` model (the model name AudioLLM passes to ChatOllama) in the background.
!ollama run llama3 &

0

time=2024-07-04T21:10:25.709Z level=INFO source=payload.go:44 msg="Dynamic LLM libraries [rocm_v60101 cpu cpu_avx cpu_avx2 cuda_v11]"
time=2024-07-04T21:10:25.823Z level=INFO source=types.go:98 msg="inference compute" id=GPU-cb0e410d-5cf0-d65a-a08f-f664bd7150df library=cuda compute=6.0 driver=12.4 name="Tesla P100-PCIE-16GB" total="15.9 GiB" available="12.5 GiB"


[GIN] 2024/07/04 - 21:10:25 | 200 |      50.479µs |       127.0.0.1 | HEAD     "/"
[GIN] 2024/07/04 - 21:10:25 | 200 |   22.851691ms |       127.0.0.1 | POST     "/api/show"
INFO [main] build info | build=1 commit="7c26775" tid="133878358798336" timestamp=1720127425
INFO [main] system info | n_threads=2 n_threads_batch=-1 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="133878358798336" timestamp=1720127425 total_threads=4
INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="3" port="46063" tid="133878358798336" timestamp=1720127425


[?25l⠙ [?25htime=2024-07-04T21:10:25.967Z level=INFO source=memory.go:309 msg="offload to cuda" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[12.5 GiB]" memory.required.full="5.0 GiB" memory.required.partial="5.0 GiB" memory.required.kv="256.0 MiB" memory.required.allocations="[5.0 GiB]" memory.weights.total="3.9 GiB" memory.weights.repeating="3.5 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="677.5 MiB"
time=2024-07-04T21:10:25.968Z level=INFO source=server.go:368 msg="starting llama server" cmd="/tmp/ollama2709113919/runners/cuda_v11/ollama_llama_server --model /root/.ollama/models/blobs/sha256-6a0746a1ec1aef3e7ec53868f220ff6e389f6f8ef87a01d77c96807de94ca2aa --ctx-size 2048 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 1 --port 46063"
time=2024-07-04T21:10:25.969Z level=INFO source=sched.go:382 msg="loaded runners" count=1
time=2024-07-04T21:10:25.969Z level=INFO s

INFO [main] model loaded | tid="133878358798336" timestamp=1720127429


time=2024-07-04T21:10:29.736Z level=INFO source=server.go:599 msg="llama runner started in 3.77 seconds"
[?25l[?25l[2K[1G[?25h[2K[1G[?25h[?25l[?25h

[GIN] 2024/07/04 - 21:10:29 | 200 |  3.888899547s |       127.0.0.1 | POST     "/api/generate"


### Bot

In [3]:
# Telegram bot client library; imported as `telebot` in the bot cell below.
%pip install pyTelegramBotAPI

In [None]:
# Set mode 774 on the copied DB directory (non-recursive: only the directory itself).
# NOTE(review): presumably so the bot can write new chunks to the store — confirm.
!chmod 774 /kaggle/working/DB

In [None]:
import os
import uuid
import telebot

# The Telegram token is a secret: read it from the environment instead of
# pasting it into the notebook, and fail early with a clear message when it
# is missing.
BOT_TOKEN = os.environ.get("BOT_TOKEN")
if not BOT_TOKEN:
    raise RuntimeError("Set the BOT_TOKEN environment variable to your Telegram bot token")

bot = telebot.TeleBot(BOT_TOKEN)

def _main_menu_markup():
    """Build the inline keyboard offering "Add audio" and "Ask a question"."""
    markup = telebot.types.InlineKeyboardMarkup()
    markup.row(
        telebot.types.InlineKeyboardButton("Add audio", callback_data="add_audio"),
        telebot.types.InlineKeyboardButton("Ask a question", callback_data="ask_question"),
    )
    return markup


@bot.message_handler(commands=['start'])
def start(message):
    """Handle /start: greet the user and show the main menu."""
    bot.send_message(message.chat.id,
                     "Hello! What do you want to do: add your audio to the current knowledge database or ask a question?",
                     reply_markup=_main_menu_markup())


@bot.message_handler(commands=['query'])
def query(message):
    """Handle /query: show the main menu again (same choices as /start, no greeting)."""
    bot.send_message(message.chat.id,
                     "What do you want to do: add your audio to the current knowledge database or ask a question?",
                     reply_markup=_main_menu_markup())
    
@bot.callback_query_handler(func=lambda callback: True)
def callback_message(callback):
    """Dispatch inline-keyboard button presses.

    Supported `callback.data` values:
        add_audio        - ask for an audio and route the next message to `add_audio`.
        ask_question     - ask for a question and route the next message to `ask_question`.
        get_full_summary - summarise the last transcribed audio and send it as a PDF.
        shorter_version  - condense the stored full summary and send it as a PDF.
    """
    chat_id = callback.message.chat.id

    # Acknowledge the press so the client stops showing the button as loading.
    # Best effort only: an expired query must not prevent handling the action.
    try:
        bot.answer_callback_query(callback.id)
    except telebot.apihelper.ApiTelegramException:
        pass

    if callback.data == "add_audio":
        bot.send_message(chat_id, "Send audio to add")
        bot.register_next_step_handler(callback.message, add_audio)

    elif callback.data == "ask_question":
        bot.send_message(chat_id, "Send a question")
        bot.register_next_step_handler(callback.message, ask_question)

    elif callback.data == "get_full_summary":
        # The attribute only exists after an audio has been transcribed.
        transcript = getattr(audio_llm, "transcrib", None)
        if not transcript:
            bot.send_message(chat_id, "There is no transcribed audio yet. Please add an audio first.")
            return

        bot.send_message(chat_id, 'Creating summary')
        summary = audio_llm.full_summary(transcript, is_full=True)
        text2pdf(summary, "summary.pdf")

        markup = telebot.types.InlineKeyboardMarkup()
        btn1 = telebot.types.InlineKeyboardButton("Get a shorter version", callback_data="shorter_version")
        btn2 = telebot.types.InlineKeyboardButton("Ask a question", callback_data="ask_question")
        markup.row(btn1, btn2)
        bot.send_message(chat_id,
                         "What do you want to do: get a shorter version of the summary or ask a question?",
                         reply_markup=markup)

        # `with` closes the file handle once the upload is done.
        with open("summary.pdf", "rb") as f:
            bot.send_document(chat_id, f)

    elif callback.data == "shorter_version":
        full_summary = audio_llm._full_summary
        if not full_summary:
            bot.send_message(chat_id, "There is no summary to shorten yet. Please create a summary first.")
            return

        bot.send_message(chat_id, 'Creating a shorter summary')
        short_summary = audio_llm.summarize(full_summary)
        text2pdf(short_summary, "short_summary.pdf")
        with open("short_summary.pdf", "rb") as f:
            bot.send_document(chat_id, f)
    
    
def add_audio(message):
    """Download the audio attached to `message`, transcribe it and index it.

    Accepts both audio files and voice messages. When the message carries
    neither, the user is told so and nothing else happens.

    Args:
        message: The Telegram message sent after the "Add audio" prompt.
    """
    media = getattr(message, "audio", None) or getattr(message, "voice", None)
    if media is None:
        bot.send_message(message.chat.id,
                         "This message has no audio attached. Press \"Add audio\" again and send an audio file or a voice message.")
        return

    dir_audio = './audio/'
    os.makedirs(dir_audio, exist_ok=True)

    bot.send_message(message.chat.id, 'Adding your audio')

    file_info = bot.get_file(media.file_id)
    # Keep the extension Telegram reports for the file; fall back to .mp3.
    extension = os.path.splitext(file_info.file_path or "")[1] or ".mp3"
    audio_path = os.path.join(dir_audio, str(uuid.uuid4()) + extension)

    downloaded_file = bot.download_file(file_info.file_path)
    with open(audio_path, 'wb') as new_file:
        new_file.write(downloaded_file)

    # Transcribes the file and adds the transcript to the vector DB.
    audio_llm._ASR(audio_path)

    bot.send_message(message.chat.id, "Your audio was successfully added to the database")

    markup = telebot.types.InlineKeyboardMarkup()
    btn1 = telebot.types.InlineKeyboardButton("Get summary of it", callback_data="get_full_summary")
    btn2 = telebot.types.InlineKeyboardButton("Ask a question", callback_data="ask_question")
    markup.row(btn1, btn2)
    bot.send_message(message.chat.id, "What do you want to do next?", reply_markup=markup)
    
    
def ask_question(message):
    """Answer the user's question with `audio_llm.explain` and send the reply.

    A temporary status message is shown while the answer is generated and is
    removed afterwards, also when generation fails.

    Args:
        message: The Telegram message containing the question text.
    """
    question = message.text
    if not question:
        bot.send_message(message.chat.id, "Please send your question as a text message.")
        return

    status = bot.send_message(message.chat.id, "Processing your explanation query")
    try:
        explanation = audio_llm.explain(question)
    finally:
        # Delete the status message by its real id rather than guessing
        # `message.message_id + 1`.
        bot.delete_message(message.chat.id, status.message_id)

    if not explanation:
        explanation = "Sorry, I could not produce an answer."

    # Telegram rejects messages longer than 4096 characters; send in pieces.
    max_len = 4096
    for offset in range(0, len(explanation), max_len):
        bot.send_message(message.chat.id, explanation[offset:offset + max_len])
    
# Start receiving Telegram updates and dispatching them to the handlers above.
# NOTE(review): this call is expected to block the cell while the bot runs — confirm.
bot.infinity_polling()

Adding the transcripted audio: 100%|██████████| 18/18 [00:00<00:00, 34.95it/s]


[GIN] 2024/07/04 - 21:11:38 | 200 |  6.397878874s |       127.0.0.1 | POST     "/api/chat"


100%|██████████| 1/1 [00:28<00:00, 28.09s/it]

[GIN] 2024/07/04 - 21:12:11 | 200 | 28.074989425s |       127.0.0.1 | POST     "/api/chat"





[GIN] 2024/07/04 - 21:12:38 | 200 | 13.432138114s |       127.0.0.1 | POST     "/api/chat"
