In [None]:
from langchain_community.document_loaders import (
    TextLoader, UnstructuredPDFLoader, UnstructuredHTMLLoader,
    Docx2txtLoader
)

def convert_to_text(file_path):
    """Load a document file and return its content as LangChain documents.

    The loader is picked from the file extension, compared
    case-insensitively:

    * ``pdf``           -> ``UnstructuredPDFLoader``
    * ``txt``           -> ``TextLoader``
    * ``html`` / ``htm`` -> ``UnstructuredHTMLLoader``
    * ``doc`` / ``docx`` -> ``Docx2txtLoader``

    Args:
        file_path: Path to the document (``str`` or ``os.PathLike``).

    Returns:
        The value returned by the selected loader's ``load()``, or an empty
        list when the path has no extension or the extension is unsupported.
    """
    import os  # local import so the function does not rely on another cell

    # splitext inspects only the last path component, so a dot inside a
    # directory name, or an extension-less file literally named "pdf", is not
    # mistaken for a file type (a plain split('.') got both cases wrong).
    extension = os.path.splitext(os.fspath(file_path))[1]
    file_type = extension[1:].lower()

    if file_type == "pdf":
        loader = UnstructuredPDFLoader(file_path)
    elif file_type == "txt":
        loader = TextLoader(file_path)
    elif file_type in ("html", "htm"):
        loader = UnstructuredHTMLLoader(file_path)
    elif file_type in ("doc", "docx"):
        # NOTE(review): "doc" is routed here for backward compatibility;
        # presumably Docx2txtLoader only parses .docx archives, so legacy
        # binary .doc files may fail inside the loader - confirm.
        loader = Docx2txtLoader(file_path)
    else:
        # Unknown or missing extension: callers get an empty result.
        return []
    return loader.load()


In [None]:
# Load the sample document with the extension-based loader.
docs = convert_to_text("2023_–ö–ü-–ú–°–®I_–ú–∞–ª–∞–π–¥–∞—Ö 20230629 2.docx")

# convert_to_text returns [] for unsupported extensions; stop here with a
# clear message instead of letting the later split/index cells run on nothing.
if not docs:
    raise ValueError("No documents were loaded; check the file path and extension.")

print(docs)

In [None]:
from langchain_text_splitters import CharacterTextSplitter

# Split the loaded documents on blank lines, targeting chunks of 1000
# characters with a 200-character overlap between neighbouring chunks.
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200, separator="\n\n")

split_docs = splitter.split_documents(docs)

# Show the resulting chunks.
print(split_docs)

In [None]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding model used to vectorize the chunks; any other available
# embedding model name can be substituted here.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


In [None]:
from langchain_community.vectorstores import FAISS

# Index every chunk in FAISS. from_documents embeds each chunk's page_content
# and stores the Document's metadata alongside it; from_texts on the bare
# strings discarded that metadata.
vectorstore = FAISS.from_documents(split_docs, embeddings)

# Expose the index as a retriever that returns the 5 nearest chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain_openai import OpenAI  # replaces the deprecated langchain.llms import

# LLM used for summarization; temperature=0 to minimise sampling randomness.
llm = OpenAI(temperature=0)

# Build a map_reduce summarization chain that uses the library's default
# prompts. The prompt arguments are intentionally omitted rather than passed
# as None: for map_reduce, `question_prompt` and `reduce_prompt` are not
# accepted options, and `map_prompt=None` would replace the default prompt
# with None, so the chain could not be constructed.
chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    # Token budget used when grouping intermediate summaries before they are
    # combined; it is not a cap on the length of the final summary.
    token_max=1500,
)


In [None]:
# Fetch the chunks to summarize (the retriever is configured with k=5).
# `invoke` replaces the deprecated `get_relevant_documents`.
# NOTE(review): the query is an empty string, so the 5 hits are not tied to
# any topic and the summary covers only those chunks, not the whole document;
# consider a real query or summarizing `split_docs` directly - confirm intent.
docs_to_summarize = retriever.invoke("")

# Run the summarization chain. `invoke` replaces the deprecated `run`; the
# summary text is returned under the chain's "output_text" key.
summary_result = chain.invoke({"input_documents": docs_to_summarize})
final_summary = summary_result["output_text"]

print("üè∑ –ü—ñ–¥—Å—É–º–æ–∫ –¥–æ–∫—É–º–µ–Ω—Ç–∞:")
print(final_summary)
