In [None]:
!pip install grobid-client langchain openai faiss-cpu PyPDF2 tiktoken chromadb

In [None]:
import openai
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
from langchain.docstore.document import Document
from typing import Any, List, Optional, Type, Union
from langchain.prompts.prompt import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from pprint import pprint

In [None]:
# Loader over every .txt file matched by the recursive glob under ./data;
# each matched file is handed to TextLoader.
loader = DirectoryLoader(
    './data',
    loader_cls=TextLoader,
    glob="**/*.txt",
)

In [None]:
# Load everything the directory loader matches; the result is passed to split() below.
raw_docs = loader.load()

In [None]:
async def split(docs:List[Document]):
    """Cut the given documents into chunks of at most 1500 characters.

    The function is declared ``async`` so callers must ``await`` it, although
    nothing inside the body is awaited.

    Args:
        docs: Documents to be chunked.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for ``docs``.
    """
    # Candidate split points, coarsest first: blank line, newline, space, any character.
    separators = ["\n\n", "\n", " ", ""]
    chunker = RecursiveCharacterTextSplitter(
        separators=separators,
        chunk_overlap=20,
        chunk_size=1500,
    )
    chunks = chunker.split_documents(docs)
    return chunks

In [None]:
# split() is a coroutine function, so it is awaited directly at cell level
# (the notebook kernel supports top-level await).
docs = await split(raw_docs)
# Show how many chunks the split produced.
len(docs)

In [None]:
# OpenAIEmbeddings was used below without ever being imported, which raised
# NameError on a fresh kernel; import it next to FAISS.
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Embed every chunk and build a FAISS index over the resulting vectors.
# NOTE: `docs` must still be the split chunks here — a later cell rebinds `docs`
# to retrieval results, so do not re-run this cell after that one.
# OpenAIEmbeddings presumably picks up the API key from the environment
# (OPENAI_API_KEY) — make sure it is set before running.
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(docs, embeddings)
embeddings

In [None]:
# Query (Korean), roughly: "the important events that happened between Kanghyeon
# and Soyeon and their feelings toward each other".
query = '강현과 소연이 둘 사이에 일어났던 중요한 일들과 서로의 감정상태'
# Plain similarity search against the index built above.
d1 = db.similarity_search(query)
d1

In [None]:
# Same query, but with a score attached to each hit.
# NOTE(review): for FAISS the score is presumably a distance (lower = closer) — confirm.
d2 = db.similarity_search_with_score(query)
d2

In [None]:
# Embed the query explicitly, then search the index with the raw vector
# instead of the query text.
embedding_vector = embeddings.embed_query(query)
d3 = db.similarity_search_by_vector(embedding_vector)
d3

In [None]:
# Persist the index to disk so it can be reloaded without re-embedding.
db.save_local('./data/faiss_kakao')

In [None]:
# Reload the persisted index and run the query against the reloaded copy, so the
# save/load round trip is actually exercised (the original queried the old `db`).
# NOTE: load_local deserializes pickled data — only load index folders you trust.
new_db = FAISS.load_local("./data/faiss_kakao", embeddings)
d1 = new_db.similarity_search(query)
d1

In [None]:
# Earlier Korean draft of the topic, translated: 'The main events that happened between
# Kanghyeon and Soyeon, who are a couple, and their feelings toward each other over time'
topic = 'The major story that happened between Kanghyeon and Soyeon, who are lovers.'

# Body of the prompt; {docs} is the only placeholder filled in by the chain.
summary_prompt_template = """Summarize the chat conversation that is in the DOCS below, so that the content presented in the topic above is well represented.
You must obtain and summarize the necessary data from DOCS so that the content written in topic can be well represented.

The CONVERSATION CONTEXT format is 'year month day time, speaker: message'.    
For example, in '2000, May 3, 3:00 AM, A: Hello', the conversation content is Hello. 
The content of the conversation is the most important.

!IMPORTANT Even if you can't analyze it, guess based on your knowledge. answer unconditionally.

DOCS: {docs}

"""
# The topic is baked into a format-style template, so literal braces in it must be
# doubled or PromptTemplate would treat them as placeholders.
safe_topic = topic.replace("{", "{{").replace("}", "}}")
prefix_summary = f"The topic is '{safe_topic}'."
suffix_summary = "CONCISE SUMMARY IN 3000 WORDS IN ENGLISH:"
# Separate the topic line from the instructions; previously the two sentences were
# glued together with no whitespace ("...lovers.'.Summarize ...").
template = prefix_summary + "\n\n" + summary_prompt_template + suffix_summary
pprint(template)

PROMPT = PromptTemplate(template=template, input_variables=["docs"])

llm = ChatOpenAI(
        temperature=1,
        verbose=True,
        max_retries=3
)
# Retrieve the 10 chunks most similar to the topic; these are what gets summarized.
retriever = db.as_retriever(search_kwargs={'k':10})
# NOTE: this rebinds `docs` (previously the full list of split chunks) to the retrieved
# subset; the summarize cell below relies on that, but re-running the indexing cell
# after this one would index only these documents.
docs = retriever.get_relevant_documents(topic)
pprint(docs)

In [None]:
# Build a map-reduce summarization chain that uses PROMPT for both the map
# prompt and the combine prompt. ("refine" is the other chain_type noted
# as an alternative.)
chain = load_summarize_chain(
    llm=llm,
    chain_type="map_reduce",
    map_prompt=PROMPT,
    combine_prompt=PROMPT,
    # Both stages inject their text through the prompt's {docs} placeholder.
    map_reduce_document_variable_name="docs",
    combine_document_variable_name="docs",
    verbose=True,
)

# Run the chain on the retrieved documents and keep only the chain outputs.
chain_inputs = {"input_documents": docs}
summary = chain(chain_inputs, return_only_outputs=True)
summary