In [10]:
!pip install langchain langchain_community langchain_openai pymupdf sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-4.1.0


In [None]:
from google.colab import userdata
# Read the OpenAI API key from Colab's user secrets (secret name: 'openai-api').
# `key` is passed as `openai_api_key` to the OpenAI clients created in later cells.
key = userdata.get('openai-api')


In [3]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Load the README file as LangChain documents and keep only the raw text.
loader = TextLoader("./readme_file.txt")
documents = loader.load()

texts = [doc.page_content for doc in documents]
print(texts)

# Embedding client; `key` is the OpenAI API key read in the earlier cell.
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=key)

# Embed the texts once and reuse those vectors to build the index.
# FAISS.from_texts would call the embedding API again for the same texts.
document_embeddings = embedding_model.embed_documents(texts)
vector_db = FAISS.from_embeddings(list(zip(texts, document_embeddings)), embedding_model)

print(f"total = {len(texts)} in database")


['# LLM, RAG and Agent Tutorial\nThis repository contains AI tool, lib installation link for LLM & Agent, focusing on creative LLM coding, modeling, and computing as the viewpoint of media project. \nPlease download and read the below docuemnt to understand this development environment. \n- [LLM development environment document(word file)](https://github.com/mac999/LLM-RAG-Agent-Tutorial/blob/main/1-1.prepare/dev-env.docx)\n\n## Overview\n- **Huggingface**: For uisng LLM, Stable Diffusion-based model, You need to sign up Huggingface. In example, [Single Image-to-3D model](https://huggingface.co/spaces/stabilityai/stable-point-aware-3d)\n- **Ollama**: For using AI tools in interactive art projects. You need to install NVIDIA cuda for run it.\n\nThe repository includes examples to experiment with generative media art.</br>\n- [Gen AI for Media Art](https://github.com/mac999/llm-media-art-demo)\nIn addition, you can find Text-to-3D model tool the below link. \n- [Text-to-3D model code](ht

  embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=key)


total = 1 in database


In [4]:
from langchain.agents import initialize_agent, Tool, AgentType
from langchain_openai import ChatOpenAI
from langchain.callbacks import get_openai_callback
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Chat model that drives the agent.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0, openai_api_key=key)

# Single demo tool: returns its input prefixed with "Echoing: ".
tools = [
    Tool(
        name="Echo",
        func=lambda x: f"Echoing: {x}",  # simple echo function
        description="Echo the input text"
    )
]

# initialize_agent selects the agent implementation through its `agent`
# parameter; `agent_type` is not a parameter of that function.
agent_executor = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

query = "Hello, I am Tom"
print("Question:", query)

# The executor takes a dict with an "input" key and returns a dict with "output".
result = agent_executor.invoke({"input": query})

print("Answer:", result['output'])


  agent_executor = initialize_agent(


Question: Hello, I am Tom


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to acknowledge the introduction.  
Action: Echo  
Action Input: Hello, I am Tom  [0m
Observation: [36;1m[1;3mEchoing: Hello, I am Tom[0m
Thought:[32;1m[1;3mI have acknowledged Tom's introduction.  
Final Answer: Hello, I am Tom[0m

[1m> Finished chain.[0m
Answer: Hello, I am Tom


In [3]:
import torch
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_core.prompts import PromptTemplate
from typing import List
from langchain_core.output_parsers import BaseOutputParser

# Splitter configuration used to chunk the PDF text.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# Read the PDF into LangChain documents.
loader = PyMuPDFLoader("./files/mama-mia.pdf")
documents = loader.load()

# Split the documents into chunks, then keep just the text of each chunk.
texts = text_splitter.split_documents(documents)
text_contents = [chunk.page_content for chunk in texts]



In [None]:
# Embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Build the FAISS vector store from the chunk texts
vectorstore = FAISS.from_texts(text_contents, embeddings)

# Local LLM used to rewrite the user's question
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)  # tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")
llm = HuggingFacePipeline(
    pipeline=pipeline(
        "text-generation",  # text generation task
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,  # maximum number of tokens to generate
        do_sample=True,  # probability-based sampling
        temperature=0.7,
        top_p=0.95,  # nucleus sampling: only tokens within the top 95% cumulative probability
        # Return only the newly generated text. With the pipeline default the
        # prompt is prepended to the output, and the line-splitting parser would
        # then treat the prompt itself as a generated query.
        # NOTE(review): confirm the installed LangChain wrapper does not also strip the prompt.
        return_full_text=False,
        device=0 if torch.cuda.is_available() else -1
    )
)

# Prompt template (Korean): asks the model to rewrite the user's question into
# three alternative versions, one per line, written in Korean.
custom_prompt = PromptTemplate(
    input_variables=["question"],
    template="""당신은 AI 언어 모델 어시스턴트입니다. 사용자가 제공한 질문에 대해 벡터 데이터베이스에서 관련 문서를 검색할 수 있도록 질문을 3가지 다른 버전으로 생성하는 것이 당신의 임무입니다. 사용자의 질문을 다양한 관점에서 재구성하여 거리 기반 유사도 검색의 한계를 극복할 수 있도록 돕는 것이 목표입니다. 각 버전의 질문은 줄바꿈으로 구분하여 작성하세요. 한국어로 작성하세요. 원본 질문: {question}"""
)

# Output parser definition
class LineListOutputParser(BaseOutputParser):
    """Turn raw LLM output into a list of lines, one entry per non-blank line."""

    def parse(self, text: str) -> List[str]:
        """Split ``text`` on newlines and return the trimmed, non-empty lines.

        Args:
            text: Raw text produced by the LLM.

        Returns:
            The lines of ``text`` with surrounding whitespace removed. Blank or
            whitespace-only lines are dropped so they are never used as queries;
            an empty or all-blank input yields an empty list.
        """
        stripped_lines = (line.strip() for line in text.strip().split("\n"))
        return [line for line in stripped_lines if line]

# Build the LLM chain: prompt -> LLM -> list of query lines
output_parser = LineListOutputParser()
llm_chain = custom_prompt | llm | output_parser

# Multi-query retriever: rewrites the user's question from several viewpoints
# and retrieves with each generated query. The deprecated, unused `parser_key`
# argument is not passed.
retriever_from_llm = MultiQueryRetriever(
    retriever=vectorstore.as_retriever(),
    llm_chain=llm_chain,
)
retriever_from_llm.verbose = True

# Run the query; `invoke` is the replacement for the deprecated
# `get_relevant_documents` method.
query = "mama mia?"
results = retriever_from_llm.invoke(query)



In [5]:
# Print the results: show at most the first five retrieved documents.
top_results = results[:5]
for rank, document in enumerate(top_results, start=1):
    print(f"문서 {rank}:")
    print(document.page_content + "\n")

문서 1:
1
Archbishop Rummel Genesian Players Audition Packet 
Information, Audition Sides, and Music 
 
Mamma Mia! 
Based upon the hit songs of ABBA 
Music and Lyrics by BENNY ANDERSSON & BJÖRN ULVAEUS 
And some songs with STIG ANDERSON 
Book by CATHERINE JOHNSON  
Stage & Music Direction by Brandt Blocker – Choreography by Karen Hebert  
Performance Dates and Times  
April 21, 22*, 23, 28, 29, 30, 2022 
*Champagne Performance 
Rehearsal Schedule  
Anyone cast in the show MUST be available March 2 – April 20 (Mondays through Fridays 
6PM-9PM and Saturdays 12PM-4PM. Exceptions may be made for school related activities.) 
You must also be available for all performances. 
Auditions 
By online submission at https://airtable.com/shrRx2KndqbqPhsuY    (Deadline to submit has 
been extended to Monday, January 31.) 
In-person callbacks will be held Saturday, February 19 at 12:00PM. 
Location for Callbacks 
Archbishop Rummel Genesian Theatre 
1901 Severn Avenue, Metairie  
Audition Meeting (Option