In [1]:
import langchain

# Load & process
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import TokenTextSplitter

# Vector store & embeddings
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Conversations
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
)
from langchain.memory import ConversationBufferMemory
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain.llms import OpenAI

# Post-processing
import fitz

# Token counter
import tiktoken
encoder = tiktoken.encoding_for_model("text-embedding-ada-002")
from langchain.callbacks.manager import get_openai_callback

In [2]:
# Read the source paper from disk. Later cells treat each entry of `docs`
# as one page (they index it by page number and read `metadata["page"]`).
loader = PyPDFLoader(file_path='pdfs/2309.13963.pdf')
docs = loader.load()

In [3]:
# Token-based chunking: chunk_size=1000 with chunk_overlap=200 between
# consecutive chunks.
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)

# Embed every chunk with OpenAI embeddings and index the result in a
# Chroma vector store used for similarity search further down.
embeddings = OpenAIEmbeddings()
docsearch = Chroma.from_documents(documents=chunks, embedding=embeddings)

In [4]:
# Conversation memory; its messages fill the "chat_history" placeholder below.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

# Chat model used to answer questions
chat_model = ChatOpenAI()

# Human-turn template: retrieved context followed by the user question
template = """Based ONLY on the context below, answer the following question!
Context:
{context}

Question:
{question}
"""

# Full chat prompt: system instruction -> prior turns -> current human turn
prompt = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template(
            "You are a nice chatbot having a conversation with a human."
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template(template),
    ]
)

In [5]:
def contexts_formatter(contexts):
    """Render retrieved documents as one numbered text block.

    Args:
        contexts: Sequence of objects exposing a ``page_content`` string.

    Returns:
        A string in which each document appears as ``"<n>. <page_content>"``
        (numbering starts at 1) followed by three newlines. Empty input
        yields an empty string.
    """
    return "".join(
        f"{position}. {doc.page_content}\n\n\n"
        for position, doc in enumerate(contexts, start=1)
    )

In [6]:
# Question-answering pipeline: pipe the chat prompt into the chat model so
# that a single call formats the prompt and queries the model.
chain = prompt | chat_model

In [7]:
question = "jelasin apa itu q-former"

# Retrieve the three chunks most similar to the question.
contexts = docsearch.similarity_search(question, k=3)

chain_inputs = {
    "context": contexts_formatter(contexts),
    "question": question,
    "chat_history": memory.buffer_as_messages,
}

# Stream the reply: echo each chunk as it arrives and keep the pieces so the
# complete answer is available afterwards.
# NOTE(review): nothing in the visible cells writes this exchange back into
# `memory`, so "chat_history" appears to stay empty - confirm this is intended.
pieces = []
for res in chain.stream(chain_inputs):
    if not res:
        continue
    print(res.content, end="", flush=True)
    pieces.append(res.content)
answer = "".join(pieces)

# Non-streaming alternative that also records token usage:
# with get_openai_callback() as cb:
#     res_invoke = chain.invoke({"context": contexts_formatter(contexts), "question": question, "chat_history": memory.buffer_as_messages})

Berdasarkan konteks di atas, Q-Former adalah sebuah modul yang berbasis Transformer yang digunakan untuk mengubah urutan masukan yang berkepanjangan menjadi representasi kueri (query) dengan panjang tetap. Modul ini awalnya dikembangkan untuk pencocokan modalitas visual-teks, tetapi dalam konteks ini, Q-Former digunakan untuk mencocokkan audio-teks. Dalam blok Q-Former, embedding kueri yang dapat dilatih (trainable query embeddings) berinteraksi dengan fitur masukan melalui lapisan self-attention dan cross-attention multi-head. Q-Former ini terdiri dari dua blok decoder Transformer dengan masker perhatian kausal dihilangkan.

In [8]:
# Show the complete answer string accumulated from the streamed chunks.
answer

'Berdasarkan konteks di atas, Q-Former adalah sebuah modul yang berbasis Transformer yang digunakan untuk mengubah urutan masukan yang berkepanjangan menjadi representasi kueri (query) dengan panjang tetap. Modul ini awalnya dikembangkan untuk pencocokan modalitas visual-teks, tetapi dalam konteks ini, Q-Former digunakan untuk mencocokkan audio-teks. Dalam blok Q-Former, embedding kueri yang dapat dilatih (trainable query embeddings) berinteraksi dengan fitur masukan melalui lapisan self-attention dan cross-attention multi-head. Q-Former ini terdiri dari dua blok decoder Transformer dengan masker perhatian kausal dihilangkan.'

In [9]:
# Prompt template for source attribution. It takes three placeholders
# ({context}, {question}, {answer}) and asks the model to return a JSON object
# with the keys `page`, `source` and `in-text citation`; later cells read
# `source` and `in-text citation` from the parsed result.
template_reference = """Context:
{context}

Question:
{question}

Answer:
{answer}

Based on the question and answer pair above, find where the answer originates from within the given context.

Return a JSON object with the following keys:
    `page`: (the value is in the form of a list of page numbers, can be more than one page)
    `source`: (the value is in the form of a list of sentences used as a reference for the answer, write exactly as it appears in the context including the in-text citation, using the language used in the context)
    `in-text citation` : (list of in-text citation appears in contexts used as reference for the answer)
"""

In [10]:
# Completion-style LLM used for the attribution step
llm = OpenAI()

# Prompt built from the attribution template
prompt_reference = PromptTemplate.from_template(template=template_reference)

# Parses the model's text output as JSON
json_parser = SimpleJsonOutputParser()

# LCEL pipeline: prompt -> LLM -> JSON parser
reference_chain = prompt_reference | llm | json_parser

In [11]:
# Ask the attribution chain which context sentences back the answer; the
# parsed JSON result is kept in `reference`.
reference = reference_chain.invoke(
    dict(
        context=contexts_formatter(contexts),
        question=question,
        answer=answer,
    )
)

In [29]:
from difflib import SequenceMatcher
import pandas as pd

# Matching blocks must be longer than this many characters to be kept;
# shorter overlaps are treated as noise.
MIN_MATCH_SIZE = 15

# Only the first cited sentence is aligned against the retrieved chunks.
# An absent or empty "source" list yields an empty table instead of a
# KeyError/IndexError.
sources = reference.get("source") or []
first_source = sources[0] if sources else ""

# `matches[n]` is a text fragment and `pages[n]` is the page it came from.
matches = []
pages = []

# Iterate over whatever was retrieved rather than a hard-coded count, so a
# shorter result list cannot raise IndexError.
for doc in contexts:
    blocks = [
        block
        for block in SequenceMatcher(None, doc.page_content, first_source).get_matching_blocks()
        if block.size > MIN_MATCH_SIZE
    ]
    for block in blocks:
        # `block.a` is the offset of the fragment inside the chunk text.
        matches.append(doc.page_content[block.a : block.a + block.size])
        pages.append(doc.metadata["page"])

pd.DataFrame({"match": matches, "pages": pages})

(['Q-Former [20] is a Transformer-based module converting variable-',
  'length input sequences into fixed-length output query representa',
  'tions. It was initially proposed for visual-text modality alignment,',
  'and here is applied to audio-text alignment. In each Q-Former block,',
  'trainable query embeddings Q∈Rnq×dqinteract with the input fea',
  'turesXthrough multi-head self-attention and cross-attention layers,'],
 [1, 1, 1, 1, 1, 1])

# Post-processing

In [13]:
import json

def dict_to_string(input_dict):
    """Serialize ``input_dict`` to a JSON string.

    Args:
        input_dict: Any JSON-serializable object (typically a dict).

    Returns:
        The JSON text, pretty-printed with a two-space indent.
    """
    return json.dumps(input_dict, indent=2)

In [None]:
from forex_python.converter import CurrencyRates

def get_cost(
        model="gpt-3.5-turbo",
        prompt_formatted=None,
        output_from_llm=None
):
    """Estimate token usage and cost of one prompt/completion pair.

    Args:
        model: Model name passed to ``tiktoken.encoding_for_model`` to pick
            the tokenizer.
        prompt_formatted: Full prompt text sent to the model. When omitted,
            it is built from the notebook globals ``prompt``, ``contexts``,
            ``question`` and ``memory`` at call time.
        output_from_llm: Text returned by the model. When omitted, it is
            ``dict_to_string(res)`` using the notebook global ``res`` at
            call time.

    Returns:
        A multi-line report string with total/prompt/completion token counts
        and the total cost in USD and IDR.

    Notes:
        The per-token prices are hard-coded below, and the IDR figure comes
        from ``CurrencyRates().convert``.
    """
    # Defaults are resolved here rather than in the signature: signature
    # defaults run once, when the cell defining the function executes, which
    # both freezes stale notebook state and makes the definition itself fail
    # if any of those globals is missing or not serialisable.
    if prompt_formatted is None:
        prompt_formatted = prompt.format(context=contexts_formatter(contexts), question=question, chat_history=memory.buffer_as_messages)
    if output_from_llm is None:
        # NOTE(review): in the visible cells `res` is the last streamed chunk,
        # not a dict - confirm which object should be costed by default.
        output_from_llm = dict_to_string(res)

    model_encoder = tiktoken.encoding_for_model(model)
    input_tokens_used = len(model_encoder.encode(prompt_formatted)) + 7  # safety margin, just in case
    output_tokens_used = len(model_encoder.encode(output_from_llm))
    total_token = input_tokens_used + output_tokens_used

    # Hard-coded USD prices per 1000 tokens (input / output).
    input_price = round((0.0015/1000) * input_tokens_used, 8)
    output_price = round((0.002/1000) * output_tokens_used, 8)
    total_price_usd = round(input_price + output_price, 8)

    curr = CurrencyRates()
    total_price_idr = curr.convert('USD', 'IDR', total_price_usd)


    return f"""Tokens Used: {total_token}
        Prompt Tokens: {input_tokens_used}
        Completion Tokens: {output_tokens_used}
    Total Cost (USD): ${total_price_usd}
    Total Cost (IDR): Rp{total_price_idr}
    """

In [15]:
# Highlighting Sources
def get_matches(source, contexts=contexts, k=3):
    """Locate ``source`` in the retrieved chunks and return its sentence.

    The first ``k`` entries of ``contexts`` are scanned in order. In the
    first chunk whose text contains ``source``, the span from the start of
    ``source`` up to (not including) the next period is taken as the match.
    Matches of 10 characters or fewer are ignored and scanning continues.

    Args:
        source: Text to look for (the caller may pass only a prefix of a
            sentence).
        contexts: Sequence of documents exposing ``page_content`` and
            ``metadata["page"]``.
        k: Maximum number of leading documents to inspect.

    Returns:
        ``(match_text, page_number)`` for the first acceptable match, or
        ``None`` when nothing matches.
    """
    # An empty needle is "found" at offset 0 of every chunk; treat it as no match.
    if not source:
        return None

    # Slicing (instead of indexing up to k) tolerates fewer than k documents.
    for doc in contexts[:max(k, 0)]:
        text = doc.page_content
        start = text.find(source)
        if start == -1:
            continue

        end = text.find(".", start)
        if end == -1:
            # No terminating period in this chunk: keep the rest of the chunk
            # rather than computing an end index that precedes the start.
            end = len(text)

        match_ = text[start:end]
        if len(match_) > 10:
            return match_, doc.metadata["page"]

    return None
    
def highlight_pdf(path, match_, page_num, output_path):
    """Highlight every occurrence of ``match_`` on one page and save a copy.

    Args:
        path: Path of the PDF to open.
        match_: Text to search for on the page.
        page_num: Index of the page inside the opened document.
        output_path: Where the annotated PDF is written.

    Returns:
        None. The file at ``output_path`` is written even when no occurrence
        is found.
    """
    pdf = fitz.open(path)
    try:
        page = pdf[page_num]

        # Drop "-\n" pairs before searching - presumably line-break
        # hyphenation left over from text extraction.
        matches = page.search_for(match_.replace("-\n", ""))

        for m in matches:
            page.add_highlight_annot(m)

        pdf.save(output_path)
    finally:
        # Release the document even if lookup, annotation or saving fails.
        pdf.close()

In [16]:
# Get references
def get_reference(docs, in_text_citation):
    """Ask the LLM to write out the bibliography entries for given citations.

    The pages in ``docs`` are scanned for a "References" (or "REFERENCES")
    heading. The text after the heading on the last page that contains it is
    sent to the model together with ``in_text_citation``.

    Args:
        docs: Sequence of page documents exposing ``page_content``.
        in_text_citation: Citation markers to resolve (e.g. a list of
            numbers); inserted into the prompt as-is.

    Returns:
        The model's output for the citation prompt.

    Raises:
        ValueError: If no page contains a references heading.
    """
    model = OpenAI()

    get_citation_template = """From the reference list below, rewrite the specified references!
    References:
    {references}

    Please rewrite the references related to the following numbers:
    {in_text_citation}
    """

    GET_CITATION_PROMPT = PromptTemplate.from_template(get_citation_template)

    get_reference_chain = GET_CITATION_PROMPT | model

    references_text = None
    for page in docs:
        text = page.page_content
        # "References" takes precedence over "REFERENCES" on the same page.
        for heading in ("References", "REFERENCES"):
            if heading in text:
                # maxsplit=1 keeps everything after the first heading, so a
                # later occurrence of the word does not truncate the list.
                references_text = text.split(heading, 1)[1]
                break

    # NOTE(review): only the page containing the heading is used; a reference
    # list that continues on following pages would be cut short - confirm.
    if references_text is None:
        raise ValueError("No 'References' section found in the provided documents.")

    result = get_reference_chain.invoke({"references": references_text, "in_text_citation":in_text_citation})
    return result

In [None]:
# Highlight the sentence that the attribution chain cited first.
# The parsed JSON lives in `reference`; `res` is the last streamed chunk from
# the answer loop and has no "source" entry.
sources = reference.get("source") or []
if sources:
    # Search using the first 15 characters of the first cited sentence;
    # get_matches extends the hit up to the next period.
    found = get_matches(sources[0][:15], contexts)

    # get_matches returns None when nothing suitable is found, so unpack
    # only on success.
    if found:
        match_, page_num = found
        highlight_pdf("pdfs/2309.13963.pdf", match_, page_num, "pdfs/highlighted/high_pdf.pdf")

In [19]:
# Highlight one aligned fragment. `pages` is filled in lockstep with
# `matches`, so use the page recorded for that fragment rather than a
# hard-coded page number.
match_idx = 3
highlight_pdf("pdfs/2309.13963.pdf", matches[match_idx], pages[match_idx], "pdfs/highlighted/high_pdf.pdf")

In [24]:
# Resolve the cited markers to full bibliography entries. Printing happens
# inside the guard so that an answer without citations does not hit an
# undefined (or stale) `ref_result`.
citations = reference.get("in-text citation")
if citations:
    ref_result = get_reference(docs, citations).strip()
    print(ref_result)

[20] J. Li, D. Li, S. Savarese, and S. Hoi, “BLIP-2: Bootstrapping Language-Image Pre-Training with Frozen Image Encoders and Large Language Models,” in Proc. ICML , Vienna, 2023.
