In [None]:
import os
import boto3
import fitz  # PyMuPDF
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_aws.embeddings import BedrockEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.indexes import VectorstoreIndexCreator
from langchain.docstore.document import Document
from preprocess import PDFProcessor

# Download the PDF files stored under an S3 prefix.
def download_pdfs_from_s3(bucket_name, prefix, download_path='/tmp/pdfs'):
    """Download every PDF object under ``prefix`` in ``bucket_name``.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        prefix: Key prefix used to filter the listed objects.
        download_path: Local directory the files are written to; created
            if it does not exist.

    Returns:
        A list of local file paths, one per downloaded PDF.
    """
    s3_client = boto3.client('s3')
    os.makedirs(download_path, exist_ok=True)

    # A single list_objects_v2 call returns at most 1000 keys, so walk all
    # result pages instead of looking at the first response only.
    paginator = s3_client.get_paginator('list_objects_v2')
    pdf_keys = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for obj in page.get('Contents', []):
            # Case-insensitive so that "REPORT.PDF" is picked up as well.
            if obj['Key'].lower().endswith('.pdf'):
                pdf_keys.append(obj['Key'])

    pdf_files = []
    for key in tqdm(pdf_keys, desc="Downloading PDFs"):
        # NOTE(review): only the basename is kept, so two keys in different
        # "folders" with the same file name overwrite each other locally.
        file_name = os.path.join(download_path, key.split('/')[-1])
        s3_client.download_file(bucket_name, key, file_name)
        pdf_files.append(file_name)

    return pdf_files

# PyMuPDF를 사용하여 PDF에서 텍스트 추출하는 함수 (오류 처리 추가)
def _extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        return text
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""
    
def _extract_preprocessed_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        return text
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""
# Build an index with Bedrock embeddings and store the vector DB.
def create_and_save_bedrock_index(pdf_paths, bucket_name, preprocess=True, vector_db_s3_path='vectorDB/'):
    """Embed PDF text into a FAISS index, save it locally and upload it to S3.

    Args:
        pdf_paths: Local PDF paths to index.
        bucket_name: S3 bucket that receives the saved index files.
        preprocess: When true, delegate to ``PDFProcessor(pdf_paths).hr_index()``
            and return its result; when false, build the index here from
            raw page text.
        vector_db_s3_path: Key prefix under which index files are uploaded.

    Returns:
        The result of ``hr_index()`` when ``preprocess`` is true, otherwise
        ``None``.

    Raises:
        ValueError: If ``preprocess`` is false and no text could be
            extracted from any of the given PDFs.
    """
    import posixpath  # S3 keys always use "/" regardless of the host OS

    if preprocess:
        # NOTE(review): this branch returns immediately, so nothing below
        # (local save, S3 upload) runs for preprocessed input. Confirm that
        # hr_index() handles persistence itself or that this is intended.
        processor = PDFProcessor(pdf_paths)
        return processor.hr_index()

    # Load the documents and extract their text.
    documents = []
    for path in tqdm(pdf_paths, desc="Processing PDFs"):
        text = _extract_text_from_pdf(path)
        if text:  # keep only PDFs that produced some text
            documents.append(Document(page_content=text))

    if not documents:
        # Fail early with a clear message instead of an opaque error from
        # building a FAISS index over an empty list.
        raise ValueError("No text could be extracted from the given PDFs; nothing to index.")

    # Split each document into overlapping chunks. The splitter used to be
    # constructed but never applied, so whole PDFs were embedded as single
    # vectors.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""]
    )
    chunks = text_splitter.split_documents(documents)

    # Bedrock embedding model used for every chunk.
    embeddings = BedrockEmbeddings(
        credentials_profile_name='default',
        region_name='us-east-1',
        model_id='amazon.titan-embed-text-v1'
    )

    # Build the FAISS vector store.
    vector_index = FAISS.from_documents(chunks, embeddings)

    # Persist the index to a local directory first.
    local_index_path = '/tmp/vectordb_index'
    os.makedirs(local_index_path, exist_ok=True)
    print("Saving vector DB to local...")
    vector_index.save_local(local_index_path)

    # Upload every file in the local index directory to S3.
    s3_client = boto3.client('s3')
    print("Uploading vector DB to S3...")
    for file_name in tqdm(os.listdir(local_index_path), desc="Uploading to S3"):
        s3_client.upload_file(
            os.path.join(local_index_path, file_name),
            bucket_name,
            posixpath.join(vector_db_s3_path, file_name)
        )

    print(f"Vector DB files uploaded to s3://{bucket_name}/{vector_db_s3_path}")

if __name__ == "__main__":
    # Source bucket and the key prefix that holds the PDFs.
    bucket_name, prefix = 'snuh-data-team2', 'data/'

    # Step 1: pull the PDFs from S3 onto local disk.
    pdf_paths = download_pdfs_from_s3(bucket_name=bucket_name, prefix=prefix)
    print("S3 -> Backend Transport Done.")

    # Step 2: build the vector DB from raw text and push it back to S3.
    create_and_save_bedrock_index(
        pdf_paths=pdf_paths,
        bucket_name=bucket_name,
        preprocess=False,
    )

In [3]:
# Build a PDFProcessor for a single local PDF and call its hr_index() method.
# NOTE(review): `pp` is not imported in any visible cell; presumably
# `import preprocess as pp` was run earlier in the session. Confirm.
processor = pp.PDFProcessor('./data/pdf_1.pdf')
processor.hr_index()

  warn_deprecated(


ValueError: Error raised by inference endpoint: An error occurred (ValidationException) when calling the InvokeModel operation: Malformed input request: #: extraneous key [texts] is not permitted#: extraneous key [input_type] is not permitted, please reformat your input and try again.

In [1]:
from extract_tables_and_images import extract_tables
from IPython.display import display
# Call extract_tables on ./data/pdf_1.pdf with 83 as the second argument
# (presumably a page number; confirm against extract_tables).
loop = extract_tables('./data/pdf_1.pdf', 83)

# Render each returned item with IPython's display().
for item in loop:
    display(item)

'dataframe: {0:{{0:{성분명}, 1:{흡입속효성베타작용제 Salbutamol(Albuterol)}, 2:{}, 3:{}, 4:{}, 5:{}, 6:{}, 7:{}, 8:{}, 9:{}}, 1:{{0:{상품명}, 1:{벤토린 에보할러}, 2:{}, 3:{}, 4:{벤토린 네뷸}, 5:{}, 6:{}, 7:{벤토린 흡입액}, 8:{}, 9:{}}, 2:{{0:{제형}, 1:{흡입제(MDI)}, 2:{}, 3:{}, 4:{흡입액}, 5:{}, 6:{}, 7:{흡입액}, 8:{}, 9:{}}, 3:{{0:{용량/단위}, 1:{100 μg/puff200 puffs/ea}, 2:{}, 3:{}, 4:{2.5 mg/2.5 mL2.5 mL/Amp}, 5:{}, 6:{}, 7:{5 mg/mL20 mL/병}, 8:{}, 9:{}}, 4:{{0:{용법}, 1:{1회 2 puffs씩1일 4회 흡입(1일 최대 8 puffs)}, 2:{}, 3:{}, 4:{1회 2.5~5 mL를 4~6 시간 간격으로 분무기를  이용하여 흡입}, 5:{}, 6:{}, 7:{용액 0.5~1 mL를 최종 용적이 2~4 mL가 되도록 생리 식염수로 희석한 후, 1일 4회 분무기를  이용하여 흡입}, 8:{}, 9:{}}, 5:{{0:{급여구분}, 1:{급여}, 2:{}, 3:{}, 4:{급여}, 5:{}, 6:{}, 7:{급여}, 8:{}, 9:{}}, 6:{{0:{사진}, 1:{}, 2:{}, 3:{}, 4:{}, 5:{}, 6:{}, 7:{}, 8:{}, 9:{}}, 7:{{0:{}, 1:{}, 2:{}, 3:{}, 4:{}, 5:{}, 6:{}, 7:{}, 8:{}, 9:{}}, 8:{{0:{}, 1:{}, 2:{}, 3:{}, 4:{}, 5:{}, 6:{}, 7:{}, 8:{}, 9:{}}, 9:{{0:{사용법}, 1:{}, 2:{}, 3:{}, 4:{}, 5:{}, 6:{}, 7:{}, 8:{}, 9:{}}, 10:{{0:{부작용}, 1:{구강·인후 자극감, 기침, 폐질환, 기관지염,