In [None]:
import os
import faiss
import torch
print(torch.cuda.is_available())

from uuid import uuid4
from pymongo import MongoClient
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

In [None]:
from dotenv import load_dotenv

# Read the local env file so that no secret has to be written in the notebook.
load_dotenv("./keys.env")

# API keys for the external services; each one is None when its variable is unset.
OPENAI_API_KEY, ANTHROPIC_API_KEY, LANGSMITH_API_KEY = (
    os.environ.get(variable)
    for variable in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "LANGSMITH_API_KEY")
)

# Connection string handed to MongoClient when a collection is opened.
MONGO_DB_URL = os.environ.get("MONGO_DB_URL")

In [24]:
# Turn LangSmith tracing on only when an API key is actually available.
# os.environ accepts strings only, so assigning a missing (None) key raised
# TypeError here; tracing is now switched off explicitly in that case.
if LANGSMITH_API_KEY:
    os.environ["LANGSMITH_TRACING"] = "true"
    os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY
else:
    os.environ["LANGSMITH_TRACING"] = "false"
    print("LANGSMITH_API_KEY is not set; LangSmith tracing is disabled.")

In [3]:
# Model id passed as `model_name` when the HuggingFaceEmbeddings instance is created below.
embed_model_name = "intfloat/multilingual-e5-large-instruct"

## Data

### DB에서 데이터 꺼내기

In [None]:
def connect_to_mongo(db_name, collection_name):
    """Return a handle to a MongoDB collection, or None on failure.

    Args:
        db_name: Name of the database to select on the client.
        collection_name: Name of the collection inside that database.

    Returns:
        The selected collection object, or None when creating the client or
        selecting the database/collection raised an exception.

    NOTE(review): pymongo appears to connect lazily, so an unreachable server
    may only show up on the first query rather than here - confirm.
    """
    import logging

    try:
        client = MongoClient(MONGO_DB_URL)
        return client[db_name][collection_name]
    except Exception as e:
        # Keep the best-effort contract (callers get None) but report the
        # cause instead of discarding it silently.
        logging.getLogger(__name__).error(
            "MongoDB connection failed for %s.%s: %s", db_name, collection_name, e
        )
        return None

In [4]:
# company_collection = connect_to_mongo("culture_db", "company")
# hompage_collection = connect_to_mongo("culture_db", "company_homepage")

# # 기업 데이터 가져오기
# print("기업 데이터 불러오는 중...")
# companies = company_collection.find()
# total_data = []
# for company in companies:
#     id = company["_id"]
#     name = company["company_name"]
#     biz_no = company["biz_no"]
#     total_data.append((id, name, biz_no))

# print(f"총 {len(total_data)}개 기업 데이터를 불러왔습니다.")

### 길이 분석

In [5]:
# doc_results = analyze_document_lengths(total_data, hompage_collection)

In [6]:
# token_results = analyze_token_lengths(total_data, hompage_collection, embed_model_name)

In [7]:
# print("\n분석이 완료되었습니다.")
# print(f"분석 결과는 'analysis_results' 디렉토리에 저장되었습니다.")

# # 요약 정보 출력
# print("\n=== 문서 길이 분석 요약 ===")
# print(f"총 문서 수: {doc_results['total_stats']['total_documents']}")
# print(f"평균 문서 길이: {doc_results['total_stats']['avg_length']:.1f} 글자")
# print(f"중앙값 문서 길이: {doc_results['total_stats']['median_length']:.1f} 글자")
# print(f"최소 문서 길이: {doc_results['total_stats']['min_length']} 글자")
# print(f"최대 문서 길이: {doc_results['total_stats']['max_length']} 글자")

# print("\n=== 토큰 길이 분석 요약 ===")
# print(f"모델: {embed_model_name}")
# print(f"평균 토큰 길이: {token_results['total_token_stats']['avg_token_length']:.1f} 토큰")
# print(f"중앙값 토큰 길이: {token_results['total_token_stats']['median_token_length']:.1f} 토큰")
# print(f"최소 토큰 길이: {token_results['total_token_stats']['min_token_length']} 토큰")
# print(f"최대 토큰 길이: {token_results['total_token_stats']['max_token_length']} 토큰")
# print(f"평균 토큰/문자 비율: {token_results['correlation_data']['token_to_char_ratio'].mean():.4f}")

In [8]:
# homepage_collection = connect_to_mongo("culture_db", "company_homepage")
# total_pages = []
# for id, name, biz_no in total_data:
#     id = convert_objectid_to_str(id)
#     print(f"{id}, {name} {biz_no}")
#     homepage = homepage_collection.find_one({"company_id": id})

#     if homepage is None:
#         print(f"Warning: No homepage data found for {name}")
#         continue

#     pages = homepage["pages"]
#     print(f"{name} : {len(pages)}")
#     for page in homepage["pages"]:
#         total_pages.append((name, page))

# print(len(total_pages))

In [9]:
# total_docs = []
# for company_name, page in total_pages:
#     doc_id = str(uuid4())
#     total_docs.append(
#         Document(
#             page_content=page["text"], 
#             metadata={
#                 "company_name": company_name, 
#                 "url": page["url"],
#                 "original_doc_id": doc_id
#             }, 
#         id=doc_id)
#     )

# print(len(total_docs))

In [None]:
# Build one Document per crawled page stored in the company_homepage collection.
homepage_collection = connect_to_mongo("culture_db", "company_homepage")
if homepage_collection is None:
    # connect_to_mongo returns None on failure; stop here with a clear message
    # instead of an AttributeError on `.find()`.
    raise RuntimeError(
        "MongoDB collection culture_db.company_homepage is unavailable; "
        "check MONGO_DB_URL and the connection."
    )
total_data = homepage_collection.find()

total_docs = []
for data in total_data:
    # A record with no (or an empty) "pages" field simply contributes nothing.
    pages = data.get('pages') or []

    for page in pages:
        # The same id goes into both the Document id and its metadata, so a
        # chunk that inherits the metadata can be traced back to its page.
        doc_id = str(uuid4())
        total_docs.append(
            Document(
                page_content=page["text"],
                metadata={
                    "company_name": data["company_name"],
                    "url": page["url"],
                    "original_doc_id": doc_id
                },
                id=doc_id
            )
        )

print(len(total_docs))

# Preview the first document only when something was actually loaded.
if total_docs:
    sample = total_docs[0]
    print(f"{sample.metadata['company_name']} : {sample.metadata['url']}\n{sample.page_content[:100]}")
else:
    print("No documents were loaded from culture_db.company_homepage.")

### 문자열 청킹

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Character-based chunking: sizes are measured with len(), so the chunk size
# and the overlap below are both counted in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=250,
    length_function=len,
    separators=["\n\n", "\n", ". ", " "],
)

# Split every loaded document and report how many chunks came out.
chunks = text_splitter.split_documents(total_docs)
print(len(chunks))

In [None]:
import random

# sample_idx is used to index `chunks` in the following cells, so it has to be
# drawn from that list's range. randrange excludes the upper bound, whereas
# randint(0, n) could return n itself and run past the end of the list.
sample_idx = random.randrange(len(chunks))
print(sample_idx)

In [13]:
# Index the source documents by the id stored in their metadata.
original_docs_by_id = {
    source.metadata['original_doc_id']: source for source in total_docs
}

# Helper that maps a chunk back to the document it came from.
def find_original_document(chunk):
    """Return the source document for a chunk, or None if it cannot be found.

    The chunk's metadata is expected to carry 'original_doc_id'; when the key
    is missing, or no document is registered under that id, None is returned.
    """
    try:
        source_id = chunk.metadata['original_doc_id']
    except KeyError:
        return None
    return original_docs_by_id.get(source_id)

In [None]:
# Print the sampled chunk followed by the document it was split from.
sample_chunk = chunks[sample_idx]

# Chunk: id of its source document, company and URL, length, and a 100-character preview.
print(" ".join(["=" * 50, "chunk", "=" * 50]))
print(sample_chunk.metadata['original_doc_id'])
print(*(sample_chunk.metadata[field] for field in ('company_name', 'url')))
print("=" * 100)
print(len(sample_chunk.page_content), sample_chunk.page_content[:100], sep="\n")

# Source document: the same fields, for comparison with the chunk.
print("\n\n")
print(" ".join(["=" * 50, "original_doc", "=" * 50]))
original_doc = find_original_document(sample_chunk)
print(*(original_doc.metadata[field] for field in ('company_name', 'url')))
print("=" * 100)
print(len(original_doc.page_content), original_doc.page_content[:100], sep="\n")

### 토큰 청킹

In [32]:
# from transformers import AutoTokenizer

# def hf_tokenizer_len(text):
#     # 원하는 모델에 맞는 토크나이저 선택 (예: BERT, RoBERTa 등)
#     tokenizer = AutoTokenizer.from_pretrained(embed_model_name)
#     tokens = tokenizer.encode(text)
#     return len(tokens)

# text_splitter = RecursiveCharacterTextSplitter(
#     separators=["\n\n", "\n", ". ", " ", ""],
#     chunk_size=900,
#     chunk_overlap=150,
#     length_function=hf_tokenizer_len  # 토큰 길이 계산 함수
# )

In [33]:
# token_chunks = text_splitter.split_documents(total_docs)
# print(len(token_chunks))

In [34]:
# from langchain_experimental.text_splitter import SemanticChunker
# from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# embeddings = HuggingFaceEmbeddings(
#     model_name=embed_model_name,
#     model_kwargs=cfg["model_kwargs"],
#     encode_kwargs=cfg["encode_kwargs"]
# )

# semantic_splitter = SemanticChunker(
#     embeddings=embeddings,
#     breakpoint_threshold_type="percentile",
#     breakpoint_threshold_amount=70,
# )

# # 시맨틱 청킹 적용
# semantic_chunks = semantic_splitter.split_documents(total_docs)
# print(f"시맨틱 청킹 후 청크 개수: {len(semantic_chunks)}")

# # 샘플 시맨틱 청크 확인
# sample_semantic_idx = random.randint(0, len(semantic_chunks) - 1)
# sample_semantic_chunk = semantic_chunks[sample_semantic_idx]
# print(f"샘플 시맨틱 청크 {sample_semantic_idx}:")
# print("=" * 100)
# print(f"회사명: {sample_semantic_chunk.metadata['company_name']}")
# print(f"URL: {sample_semantic_chunk.metadata['url']}")
# print(f"길이: {len(sample_semantic_chunk.page_content)} 글자")
# print("=" * 100)
# print(sample_semantic_chunk.page_content)

## Vector Retriever

In [16]:
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_openai import OpenAIEmbeddings
# from langchain_upstage import UpstageEmbeddings

# embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
# embedding_model = UpstageEmbeddings(model="upstage-e5-large")

# Keep using the second GPU when there is one, but fall back to the first GPU
# or the CPU so the notebook also runs on hosts without a "cuda:1" device.
if torch.cuda.device_count() > 1:
    embed_device = "cuda:1"
elif torch.cuda.is_available():
    embed_device = "cuda:0"
else:
    embed_device = "cpu"

model_kwargs = {
    "device": embed_device,
    "trust_remote_code": True
}
# normalize_embeddings=True requests unit-length vectors from the encoder.
# NOTE(review): "multi_process" and "show_progress" look like options of the
# HuggingFaceEmbeddings constructor rather than encode() arguments - confirm
# that they take effect (and are accepted) when passed through encode_kwargs.
encode_kwargs = {
    "normalize_embeddings": True,
    "batch_size": 128,
    "multi_process": True,
    "show_progress": True
}

# Embedding model shared by index construction and query-time search.
embedding_model = HuggingFaceEmbeddings(
    model_name=embed_model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [None]:
"""
벡터 저장소에 만들어질 index 타입을 설정.
 - IP : 내적을 기반으로 코사인 유사도 계산으로 쿼리 벡터와 벡터 저장소 내 모든 벡터를 비교하여 가장 유사한 벡터를 찾는다.
 - L2 : 유클리드 거리를 기반으로 유사도 계산으로 쿼리 벡터와 벡터 저장소 내 모든 벡터를 비교하여 가장 유사한 벡터를 찾는다.
 - HNSW : 비정형 데이터에 적합한 효율적인 검색을 위한 인덱스 타입으로 모든 벡터를 비교하지 않고 적절한 후보 벡터를 찾아 비교하여 가장 유사한 벡터를 찾는다.
"""

from langchain_community.vectorstores.utils import DistanceStrategy

# Embed a single chunk only to learn the embedding dimension for the index.
sample_chunk = chunks[sample_idx].page_content
index = faiss.IndexFlatIP(len(embedding_model.embed_query(sample_chunk)))
## index = faiss.IndexFlatL2(len(embedding_model.embed_query(sample_chunk)))
## index = faiss.IndexHNSWFlat(len(embedding_model.embed_query(sample_chunk)), 32)

vector_store = FAISS(
    embedding_function=embedding_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    # The index yields inner products (larger = more similar). The wrapper
    # assumes Euclidean distances unless told otherwise, which turned the
    # relevance scores upside down for this index.
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
    # The embeddings are requested normalized (normalize_embeddings=True), so
    # the inner product is the cosine similarity; clamp it into [0, 1] and use
    # it directly as the relevance score.
    relevance_score_fn=lambda inner_product: max(0.0, min(1.0, float(inner_product))),
    # normalize_L2=True
)

# Embed every chunk and add it to the index and the in-memory docstore.
vector_store.add_documents(chunks)

In [18]:
import os
from datetime import datetime

# Every save goes into its own timestamped directory under ./indexes.
now = datetime.strftime(datetime.now(), "%Y-%m-%d-%H-%M-%S")
save_path = f"./indexes/{now}"
os.makedirs(name=save_path, exist_ok=True)

# Write the vector store to that directory.
vector_store.save_local(folder_path=save_path)

In [None]:
# Run the query against the vector store, keeping the raw score of each hit.
ret_results = vector_store.similarity_search_with_score("코리아교육그룹의 비전, 목표")

for document, score in ret_results:
    # Header line: company name, homepage URL, score.
    print(*(document.metadata[field] for field in ('company_name', 'url')), score)
    # Rule, a 50-character preview of the body, then blank lines as spacing.
    print("=" * 100, document.page_content[:50], "\n\n", sep="\n")

In [None]:
# Same query, but asking the vector store for relevance scores instead of raw scores.
ret_results = vector_store.similarity_search_with_relevance_scores("코리아교육그룹의 비전, 목표")

for document, score in ret_results:
    # Header line: company name, homepage URL, relevance score.
    print(*(document.metadata[field] for field in ('company_name', 'url')), score)
    # Rule, a 50-character preview of the body, then blank lines as spacing.
    print("=" * 100, document.page_content[:50], "\n\n", sep="\n")

In [None]:
from langchain import hub

# Pull the RAG prompt template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# Fill it with placeholder values to inspect the text the model will receive.
example_messages = prompt.invoke(
    dict(context="(context goes here)", question="(question goes here)")
).to_messages()

print(example_messages[0].content)

In [26]:
from typing import Tuple
from typing_extensions import TypedDict, List
from langchain_core.documents import Document

class State(TypedDict):
    """
    State of the app: the question, the retrieved documents, and the generated answer.
    """
    # (document, relevance score) pairs: `retrieve` stores the result of
    # similarity_search_with_relevance_scores and `generate` unpacks each
    # entry as a two-item tuple, so the annotation describes pairs.
    context: List[Tuple[Document, float]]
    # The user's question.
    question: str
    # The answer text produced by the language model.
    answer: str

In [33]:
from langchain_openai import ChatOpenAI

# Chat model used by the generation step; temperature 0 is requested.
llm = ChatOpenAI(
    api_key=OPENAI_API_KEY,
    model="gpt-4o-mini",
    temperature=0,
)

def retrieve(state: State):
    """Search the vector store with the question and return the hits as context.

    Returns a partial state update whose "context" entry is whatever
    similarity_search_with_relevance_scores returned for the question.
    """
    question = state["question"]
    scored_docs = vector_store.similarity_search_with_relevance_scores(question)
    return dict(context=scored_docs)

def generate(state: State):
    """
    Produce an answer from the retrieved documents and the question.

    Each context entry is unpacked as a (document, score) pair; the document
    texts are joined with blank lines, placed into the prompt together with
    the question, and the model's reply text is returned under "answer".
    """
    passages = [document.page_content for document, _score in state["context"]]
    joined_context = "\n\n".join(passages)
    filled_prompt = prompt.invoke(
        {"question": state["question"], "context": joined_context}
    )
    reply = llm.invoke(filled_prompt)
    return dict(answer=reply.content)

In [None]:
from langgraph.graph import START, StateGraph
from IPython.display import Image, display

# Define the graph and add the two steps as a sequence.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])

# Connect the entry point to the retrieve node, then compile.
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

# Render the compiled graph as a Mermaid PNG.
display(Image(data=graph.get_graph().draw_mermaid_png()))

In [None]:
# Build the same two-step graph, this time registering nodes and edges one by one.
graph_builder = StateGraph(State)
for node_name, node_fn in (("retrieve", retrieve), ("generate", generate)):
    graph_builder.add_node(node_name, node_fn)

# START -> retrieve -> generate
for edge_start, edge_end in ((START, "retrieve"), ("retrieve", "generate")):
    graph_builder.add_edge(edge_start, edge_end)
graph = graph_builder.compile()

# Render the compiled graph as a Mermaid PNG.
display(Image(data=graph.get_graph().draw_mermaid_png()))

In [None]:
# Run the whole graph for one question, then show the retrieved context and the answer.
result = graph.invoke(dict(question="코리아교육그룹의 비전, 목표는 무엇인가?"))
print(
    f"검색된 문서 :\n {result['context']}\n\n",
    f"생성된 답변 :\n {result['answer']}",
    sep="\n",
)