- https://dev.classmethod.jp/articles/python-parse-pdf/
- https://colab.research.google.com/github/nyanta012/demo/blob/main/sentence_retrieval.ipynb#scrollTo=_5bY_6TK_yFC
- https://blog.langchain.dev/langchain-chat/
- https://zenn.dev/umi_mori/books/prompt-engineer/viewer/langchain_indexes

In [None]:
# Install the third-party packages at pinned versions: first the LLM /
# embedding / vector-store stack, then PDF parsing (pymupdf) and .env loading.
!pip install transformers==4.30.2 sentencepiece==0.1.99 sentence_transformers==2.2.2 accelerate==0.20.3 langchain==0.0.226 openai==0.27.8 chromadb==0.3.26
!pip install pymupdf==1.22.5 python-dotenv==1.0.0

In [None]:
import torch
from dotenv import load_dotenv
import requests
import fitz

from langchain.chat_models import ChatOpenAI
from langchain import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import Document


# Model id that selects the OpenAI chat model instead of a local
# Hugging Face pipeline.
chatgpt_id = "gpt-3.5-turbo"

# Candidate Hugging Face model ids (the `en_` prefix presumably marks
# English-oriented models). Entries behind `#` are disabled.
en_list = [
    "bigscience/bloom-560m",
    "bigscience/bloom-1b7",
    "bigscience/bloomz-560m",
    "bigscience/bloomz-1b7",
    # "gpt2",
    "gpt2-medium",
    "gpt2-large",
    "gpt2-xl",
    # "facebook/opt-125m",
    "facebook/opt-350m",
    "facebook/opt-1.3b",
    # "cerebras/Cerebras-GPT-111M",
    "cerebras/Cerebras-GPT-256M",
    "cerebras/Cerebras-GPT-590M",
    "cerebras/Cerebras-GPT-1.3B",
    "vicgalle/gpt2-alpaca",
]

# Candidate Hugging Face model ids (the `ja_` prefix presumably marks
# Japanese models). Entries behind `#` are disabled.
ja_list = [
    # "cyberagent/open-calm-small",
    "cyberagent/open-calm-medium",
    "cyberagent/open-calm-large",
    "cyberagent/open-calm-1b",
    # "rinna/japanese-gpt2-xsmall",
    # "rinna/japanese-gpt2-small",
    "rinna/japanese-gpt2-medium",
    # "rinna/japanese-gpt-1b",
    "rinna/japanese-gpt-neox-small",
    "abeja/gpt2-large-japanese",
    # "abeja/gpt-neox-japanese-2.7b",
]


def get_llm(model_id, model_kwargs, pipeline_kwargs):
    """Build the LangChain LLM object for ``model_id``.

    Args:
        model_id: Either ``chatgpt_id`` (selects ``ChatOpenAI``) or a model id
            passed to ``HuggingFacePipeline.from_model_id``.
        model_kwargs: Forwarded as ``model_kwargs`` to the Hugging Face
            pipeline; ignored for the OpenAI model.
        pipeline_kwargs: Forwarded as ``pipeline_kwargs`` to the Hugging Face
            pipeline; ignored for the OpenAI model.

    Returns:
        A ``ChatOpenAI`` instance or a ``HuggingFacePipeline`` instance.
    """
    # Device index for the local pipeline: 0 when CUDA is available, else -1.
    device = 0 if torch.cuda.is_available() else -1

    if model_id != chatgpt_id:
        return HuggingFacePipeline.from_model_id(
            model_id,
            task="text-generation",
            model_kwargs=model_kwargs,
            pipeline_kwargs=pipeline_kwargs,
            device=device,
            verbose=True,
        )

    # Load variables from a .env file before creating the OpenAI client
    # (presumably this is where the API key comes from).
    load_dotenv()
    return ChatOpenAI(model_name=chatgpt_id)


def get_embeddings(model_id):
    """Return the embedding backend that pairs with ``model_id``.

    ``chatgpt_id`` selects ``OpenAIEmbeddings``; every other model id gets the
    same multilingual sentence-transformers model via ``HuggingFaceEmbeddings``.
    """
    if model_id != chatgpt_id:
        sentence_model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        return HuggingFaceEmbeddings(model_name=sentence_model)
    return OpenAIEmbeddings()

In [None]:
# Show whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

In [None]:
# Settings handed to get_llm as `model_kwargs` (length bounds and sampling
# controls).
model_kwargs = dict(
    min_length=20,
    max_length=100,
    repetition_penalty=1.01,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    temperature=0.1,
)

# Settings handed to get_llm as `pipeline_kwargs` (bounds on newly generated
# tokens).
pipeline_kwargs = dict(
    min_new_tokens=5,
    max_new_tokens=50,
)

# Model used for the rest of the notebook.
model_id = "bigscience/bloom-560m"

llm = get_llm(model_id, model_kwargs, pipeline_kwargs)

In [None]:
# Download the PDF that the notebook indexes.
url = "https://buildmedia.readthedocs.org/media/pdf/pdfminer-docs/latest/pdfminer-docs.pdf"
try:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    res = requests.get(url, timeout=30)
    # Turn HTTP error statuses into exceptions so an error page is never
    # mistaken for PDF bytes.
    res.raise_for_status()
except requests.RequestException as err:
    print(f"Failed to download {url}: {err}")
    # requests.get never returns None on its own; setting it here is what
    # makes the later `res is not None` check meaningful.
    res = None

In [None]:
# Build the retrieval QA chain from the downloaded PDF: one Document per page,
# split into chunks, embedded into a Chroma store, and wrapped in a
# ConversationalRetrievalChain.
if res is not None:
    # Parse the downloaded bytes in memory as a PDF.
    doc = fitz.open(stream=res.content, filetype="pdf")

    # These values are the same for every page, so compute them once rather
    # than once per page. Only str/int metadata values are kept.
    total_pages = len(doc)
    shared_metadata = {
        key: value
        for key, value in doc.metadata.items()
        if isinstance(value, (str, int))
    }

    docs = [
        Document(
            # page_content is a text field, so pass the extracted text as-is
            # instead of encoding it to bytes.
            page_content=page.get_text(),
            metadata={
                "page_number": page.number + 1,  # page.number is 0-based here
                "total_pages": total_pages,
                # Spread last, so document-level keys win on a name clash
                # (same precedence as before).
                **shared_metadata,
            },
        )
        for page in doc
    ]

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(docs)

    embeddings = get_embeddings(model_id)
    vectordb = Chroma.from_documents(texts, embeddings)

    # Both names are bound to the same chain; `pdf_qa` is kept in case other
    # cells refer to it.
    qa_chain = pdf_qa = ConversationalRetrievalChain.from_llm(
        llm,
        vectordb.as_retriever(),
        return_source_documents=True,
    )
else:
    print("Exit")

In [None]:
# Scripted two-turn conversation: the second question relies on the history
# recorded after the first one.
chat_history = []
questions = ("What is PDFMiner?", "Tell me some more details.")
for turn, text_input in enumerate(questions, start=1):
    result = qa_chain({"question": text_input, "chat_history": chat_history})
    print("Input: ", text_input)
    print("Answer: ", result["answer"])
    # print("Source: ", result["source_documents"])
    if turn < len(questions):
        # Only turns that are followed by another question are recorded.
        # NOTE(review): the tuple also carries the source documents; confirm
        # the chain accepts history tuples with a third element.
        chat_history.append((text_input, result["answer"], result["source_documents"]))

In [None]:
# Interactive chat: read questions from stdin until the user types "exit",
# recording every turn in the history passed to the next call.
chat_history = []
while (text_input := input()) != "exit":
    result = qa_chain({"question": text_input, "chat_history": chat_history})
    print("Input: ", text_input)
    print("Answer: ", result["answer"])
    turn_record = (text_input, result["answer"], result["source_documents"])
    chat_history.append(turn_record)