In [19]:
import google.generativeai as genai
import os
from dotenv import load_dotenv
import chromadb
import pandas as pd
from chromadb import Documents, EmbeddingFunction, Embeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [20]:
# Pull the Gemini API key from the local env file instead of hard-coding it.
load_dotenv(dotenv_path='key.env')
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')

# Hand the key to the SDK; the embedding and generation calls below use it.
genai.configure(api_key=GOOGLE_API_KEY)

### 1. PDF 문서 가져오기

In [21]:
# Raw string: every backslash in the Windows path is taken literally, so the
# path no longer depends on Python tolerating the invalid escape "\c" (which
# emits a warning on recent versions) nor on doubling the backslash before
# "a" to avoid the BEL escape. The resulting path is unchanged.
loader = PyPDFLoader(r'D:\chatbot_project\about_airline_meal.pdf')
data_nyc = loader.load()
print(data_nyc)

[Document(metadata={'producer': 'Pdftools SDK', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-11-25T14:14:35+00:00', 'source': 'D:\\chatbot_project\\about_airline_meal.pdf', 'total_pages': 23, 'page': 0, 'page_label': '1'}, page_content="[\n진\n에\n어 \n기\n내\n식\n] \n \n[{'menu_name': '\n승무원용 \n낙지덮밥\n', \n  \n 'menu_price': 'KRW 13,000', \n  \n 'flight route': '\n인천\n,\n부산 \n출발편 \n/ \n중\n·\n장거리 \n노선만 \n주문 \n가능\n'}, \n \n{'menu_name': '\n승무원용 \n비빔밥\n', \n  \n 'menu_price': 'KRW 12,000', \n  \n 'flight route': '\n인천\n,\n부산 \n출발편 \n/ \n중\n·\n장거리 \n노선만 \n주문 \n가능\n'}, \n \n{'menu_name': '\n승무원용 \n소불고기덮밥\n', \n  \n 'menu_price': 'KRW 12,000', \n  \n 'flight route': '\n인천\n,\n부산 \n출발편 \n/ \n중\n·\n장거리 \n노선만 \n주문 \n가능\n'}, \n \n{'menu_name': '\n승무원용 \n제육덮밥\n', \n  \n 'menu_price': 'KRW 12,000', \n  \n 'flight route': '\n인천\n,\n부산 \n출발편 \n/ \n중\n·\n장거리 \n노선만 \n주문 \n가능\n'}, \n \n{'menu_name': '\n승무원용 \n김치볶음밥\n', \n  \n 'menu_price': 'KRW 12,000', \n  \n 'flight route': '\n인천\n,\n부산 \n출발편 \

In [22]:
# Cut the loaded pages into overlapping chunks (size 1000, overlap 100)
# so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
all_splits = text_splitter.split_documents(data_nyc)

In [23]:
# https://github.com/google-gemini/cookbook/blob/main/examples/chromadb/Vectordb_with_chroma.ipynb

In [24]:
def gemini_embedding(text):
    """Embed ``text`` with the ``gemini-embedding-001`` model.

    Args:
        text: The sentence to embed.

    Returns:
        The value stored under the ``'embedding'`` key of the API response.
    """
    request = {"model": "gemini-embedding-001", "content": text}
    return genai.embed_content(**request)['embedding']

In [25]:
class GeminiEmbeddingFunction(EmbeddingFunction):
  """Chroma embedding function that delegates to the Gemini embedding API."""

  def __call__(self, input: Documents) -> Embeddings:
    """Embed ``input`` and return the ``'embedding'`` entry of the response.

    The request always uses ``task_type="retrieval_document"`` with the fixed
    title ``"Custom query"``.
    """
    # NOTE(review): this notebook also attaches this function to the
    # collection it queries, so query text is embedded with the document task
    # type as well - confirm that is intended.
    request_options = dict(
        model='gemini-embedding-001',
        content=input,
        task_type="retrieval_document",
        title="Custom query",
    )
    return genai.embed_content(**request_options)['embedding']

In [26]:
def create_chroma_db(documents, name, path=r'D:\chatbot_project\chroma_db'):
  """Create a new persistent Chroma collection and fill it with ``documents``.

  Args:
    documents: Iterable of text chunks. Each chunk is stored with its
      position in the iterable (``"0"``, ``"1"``, ...) as its id.
    name: Name of the collection to create.
    path: Directory of the persistent Chroma store. The default is the
      location this notebook has always used; it is a raw string so the
      backslashes are taken literally instead of relying on Python tolerating
      the invalid escape sequence "\c".

  Returns:
    The newly created collection.

  Raises:
    Any exception raised by ``create_collection`` (the calling cell relies on
    creation failing when the collection already exists) or by ``add``.
  """
  chroma_client = chromadb.PersistentClient(path=path)
  db = chroma_client.create_collection(
      name=name,
      embedding_function=GeminiEmbeddingFunction()
  )

  # Chunks are added one at a time; ids are the stringified positions.
  for i, d in enumerate(documents):
    db.add(
      documents=d,
      ids=str(i)
    )
  return db

In [27]:
# Keep only the raw text of every chunk; metadata is not stored in the DB.
documents = [chunk.page_content for chunk in all_splits]

In [28]:
db_pos = 'chroma_db'
try:
    # First run: create the collection and embed every chunk.
    db = create_chroma_db(documents, db_pos)
except Exception as exc:
    # Creation is expected to fail once the collection already exists, but an
    # API, network or credential error lands here too. Report the reason
    # instead of hiding it; the notebook still continues with whatever
    # collection is on disk.
    print(
        f"create_chroma_db did not complete ({type(exc).__name__}: {exc}); "
        "loading the existing collection instead."
    )

# Raw string so the backslashes of the Windows path are taken literally.
chroma_client = chromadb.PersistentClient(path=r"D:\chatbot_project\chroma_db")

# The collection was already created, so load it instead of building it again.
db = chroma_client.get_collection(
    name=db_pos,
    embedding_function=GeminiEmbeddingFunction()
)

# A creation run that failed half-way leaves a partially filled collection;
# make that visible instead of silently answering from incomplete data.
stored_count = db.count()
if stored_count != len(documents):
    print(
        f"Warning: collection '{db_pos}' holds {stored_count} chunks but "
        f"{len(documents)} were prepared; it may be stale or incomplete."
    )

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


In [66]:
# Fetch the stored chunks together with their embedding vectors.
sample_data = db.get(include=['documents', 'embeddings'])

# Each embedding is converted to its string form so it fits in one cell.
df = pd.DataFrame(dict(
    IDs=sample_data['ids'],
    Documents=sample_data['documents'],
    Embeddings=[str(vector) for vector in sample_data['embeddings']],
))

df.head()

Unnamed: 0,IDs,Documents,Embeddings
0,0,[\n진\n에\n어 \n기\n내\n식\n] \n \n[{'menu_name': '\...,[-0.00232023 -0.012616 0.01797053 ... 0.01...
1,1,"'flight route': '\n인천\n,\n부산 \n출발편 \n/ \n중거리\n...",[-0.00978631 -0.01229042 0.0155403 ... 0.00...
2,2,"{'menu_name': '\n소시지 \n오므라이스\n', \n \n 'menu_...",[ 0.00159121 -0.00355688 0.01607749 ... 0.00...
3,3,"떡갈비 \n김치볶음밥 \nKRW 12,000 / USD 12 / JPY 1,200 ...",[-0.00226122 0.00913556 0.00846783 ... 0.00...
4,4,치킨너겟 \n오므라이스 \n에어부산 \n키즈 \n기내식 \n메뉴 \n오므라이스와 \...,[-0.00766604 0.02529271 0.00641076 ... 0.00...


In [90]:
def get_relevant_passage(query, db, n):
  """Return the top ``n`` stored chunks for ``query`` as a single string.

  Args:
    query: The question text to search with.
    db: Object exposing ``query(query_texts=..., n_results=...)`` whose result
      holds the matched texts under ``['documents'][0]``.
    n: Number of results to request.

  Returns:
    The retrieved chunks joined with a newline. A separator is required:
    without one, the end of one chunk is fused to the start of the next
    (e.g. a price or phone number running into the following text).
  """
  passages = db.query(query_texts=[query], n_results=n)['documents'][0]
  return '\n'.join(passages)

In [91]:
def make_prompt(query, relevant_passage):
  """Build the Korean instruction prompt sent to the generative model.

  Args:
    query: The user's question, inserted as-is.
    relevant_passage: Retrieved text. Single and double quotes are removed
      and newlines become spaces before it is inserted.

  Returns:
    The formatted prompt string.
  """
  # Drop both quote characters and flatten newlines in a single pass.
  cleanup_table = str.maketrans({"'": None, '"': None, "\n": " "})
  escaped = relevant_passage.translate(cleanup_table)

  # Written as explicit pieces so the trailing spaces stay visible.
  template = (
      "\n"
      "    너는 지금부터 항공사별 기내식 관련 질문에 답하는 전문가야. \n"
      "    다음 문서를 기반으로 정확하게 응답해줘. \n"
      "    QUESTION: '{query}'\n"
      "    PASSAGE: '{relevant_passage}'"
  )
  return template.format(query=query, relevant_passage=escaped)

In [98]:
# Retrieval: fetch the top 5 chunks for the question and wrap them in a prompt.
query = "에어프레미아에 특별식을 주문하고 싶은데 관련 전화번호 있어?"
passage = get_relevant_passage(query, db, n=5)
prompt = make_prompt(query=query, relevant_passage=passage)

# Generation: ask Gemini to answer from the retrieved text.
model = genai.GenerativeModel("gemini-2.5-flash")
response = model.generate_content(prompt)
response.text

'네, 에어프레미아에 특별식을 주문하시려면 **예약 센터(1800-2626)**로 연락하시거나, 예약 조회 > 부가서비스 관리 메뉴 내에서 신청하실 수 있습니다. 특별식은 영업일 기준 항공기 출발 48시간 전까지 신청해야 합니다.'