### pdf_indexing.ipynb
- 미국 국립암센터에서 가져온 pdf 파일들(lung cancer, breast cancer)을 전처리하고 미리 인덱싱해서 저장해놓기

In [1]:
import os
from dotenv import load_dotenv
import pickle
import faiss
import numpy as np
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import re
import os
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_community.vectorstores import FAISS

In [2]:
load_dotenv()  # Set up environment variables (python-dotenv's load_dotenv)

True

In [3]:
# 함수 정의
def remove_newlines_except_after_period(text):
    """Replace every line break with a space unless it directly follows a period.

    Both LF and CRLF line endings are recognised. A line break that comes
    right after a period is left untouched, so sentence-final breaks survive
    while breaks in the middle of a sentence are joined with a single space.

    Args:
        text: The string to normalise.

    Returns:
        A copy of ``text`` with the mid-sentence line breaks replaced by spaces.
    """
    # Two lookbehinds are needed: the first rejects a break that starts right
    # after a period; the second rejects the LF of a CRLF pair whose CR
    # follows a period (otherwise ".CRLF" would be turned into ".CR ").
    return re.sub(r'(?<!\.)(?<!\.\r)\r?\n', ' ', text)


def save_vectorstore_with_documents(file_paths, vectorstore_path):
    """Preprocess PDF files, index them in FAISS, and save the index and chunks locally.

    Each existing path in ``file_paths`` is loaded with ``PDFPlumberLoader`` and
    the line breaks of every loaded document are normalised with
    ``remove_newlines_except_after_period``. The documents are then split into
    chunks (``chunk_size=1000``, ``chunk_overlap=50``), embedded with
    ``OpenAIEmbeddings`` and stored in a FAISS vector store.

    Args:
        file_paths: PDF file paths to index. Paths that do not exist are
            reported on stdout and skipped.
        vectorstore_path: Directory handed to ``save_local``; the split
            documents are additionally pickled there as ``documents.pkl``.

    Returns:
        The FAISS vector store built from the split documents.

    Raises:
        ValueError: If no document could be loaded, or if splitting produced
            no chunks, so there is nothing to embed or save.
    """
    all_docs = []
    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"파일을 찾을 수 없습니다: {file_path}")
            continue

        # 1. Load the PDF and normalise the line breaks of each loaded document.
        docs = PDFPlumberLoader(file_path).load()
        for doc in docs:
            doc.page_content = remove_newlines_except_after_period(doc.page_content)
        all_docs.extend(docs)

    # Fail early with a clear message instead of letting the embedding/FAISS
    # step fail on an empty input further down.
    if not all_docs:
        raise ValueError("No documents were loaded; check that the PDF paths exist.")

    # 2. Split the documents into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    split_documents = text_splitter.split_documents(all_docs)
    if not split_documents:
        raise ValueError("The loaded PDFs produced no text chunks to index.")

    # 3. Create the embedding model.
    embeddings = OpenAIEmbeddings()

    # 4. Build the vector store and save it to disk.
    vectorstore = FAISS.from_documents(documents=split_documents, embedding=embeddings)
    vectorstore.save_local(vectorstore_path)

    # 5. Store the split documents in the same directory, in the order they
    #    were passed to FAISS.from_documents.
    document_paths = os.path.join(vectorstore_path, "documents.pkl")
    with open(document_paths, 'wb') as f:
        pickle.dump(split_documents, f)

    return vectorstore

In [5]:
# PDF files to preprocess and index.
file_paths = ["./NCI_breast_removed.pdf", "./NCI_lung_removed.pdf"] 
# Output location; later cells join "documents.pkl" onto it, so it is used as a directory.
vectorstore_path = ".cache/embeddings/faiss_index.index"  # path to save to
vectorstore = save_vectorstore_with_documents(file_paths,vectorstore_path)

In [6]:
# Reload the saved index from disk and embed a test query.
# One embeddings client is shared by the loader and the query embedding.
embeddings = OpenAIEmbeddings()
# allow_dangerous_deserialization=True opts in to loading pickled data; only
# use it on an index directory that this notebook wrote itself.
vectorstore = FAISS.load_local(vectorstore_path, embeddings, allow_dangerous_deserialization=True)
# NOTE(review): "페암" looks like a typo for "폐암" (lung cancer) — confirm before changing,
# since the recorded outputs below were produced with this exact string.
query = "페암인가요?"
# The raw FAISS index is queried with a float32 matrix of shape (n_queries, dim).
query_embedding = np.asarray(embeddings.embed_query(query), dtype=np.float32).reshape(1, -1)

In [12]:
# Query the underlying FAISS index directly for the 4 nearest stored vectors.
# D receives the distances and I the matching ids, one row per query.
D, I = vectorstore.index.search(query_embedding, k=4)

In [13]:
# Display the distance array returned by the search.
D

array([[0.60662174, 0.6164737 , 0.61733055, 0.6183497 ]], dtype=float32)

In [14]:
# Display the id array returned by the search.
I

array([[ 62, 371, 376,  30]], dtype=int64)

In [15]:
# Load the pickled list of split documents from inside the vector store directory.
# NOTE: pickle.load can execute arbitrary code; only load a file this notebook wrote.
document_paths = os.path.join(vectorstore_path, "documents.pkl")
with open(document_paths, 'rb') as f:
    documents = pickle.load(f)

In [16]:
# 4. Print the content of each retrieved document.
# FAISS fills the id array with -1 when fewer than k results are available;
# a negative id would otherwise silently index from the end of `documents`.
for idx in I[0]:
    if idx < 0:
        continue
    print(f"문서: {documents[idx].page_content}")

문서: New types of treatment are being tested in clinical trials.
문서: Genetic counseling for children with pleuropulmonary blastoma It may not be clear from the family medical history whether your child's pleuropulmonary blastoma is part of an inherited condition related to a change in the DICER1 gene. Genetic counseling can assess the likelihood that your child's cancer is inherited and whether
문서: PET scan (positron emission tomography scan) A PET scan uses a small amount of radioactive sugar (also called radioactive glucose) injected into a vein. Then the PET scanner rotates around the body to make detailed, computerized pictures of areas inside the body where the glucose is taken up. Because cancer cells often take up more glucose than normal cells, the pictures can be used to  nd cancer cells in the body.
Positron emission tomography (PET) scan. The child lies on a table that slides through the PET scanner. The head rest and white strap help the child lie still. A small amount of ra