<a href="https://colab.research.google.com/github/nhyha/CodeMind_RAG/blob/main/RAG/CodeMind_RAG_V5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.llms import HuggingFaceHub

import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain_community.document_loaders import HuggingFaceDatasetLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.document_loaders.hugging_face_dataset import (
    HuggingFaceDatasetLoader,
)

import streamlit as st
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.llms import HuggingFaceHub
import os

from langchain.callbacks.base import BaseCallbackHandler


# Streamlit

In [None]:
%%writefile app.py
import faiss

import streamlit as st
from langchain_community.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub
from langchain.callbacks.base import BaseCallbackHandler

from langchain.llms import HuggingFaceHub
from langchain_openai import ChatOpenAI
import os



# Environment variables: API credentials plus LangSmith settings.
# Every "your token" value is a placeholder to be replaced with a real key.
os.environ.update(
    {
        # Provider credentials
        "HUGGINGFACEHUB_API_TOKEN": "your token",
        "OPENAI_API_KEY": "your token",
        # LangSmith
        "LANGCHAIN_API_KEY": "your token",
        "LANGCHAIN_PROJECT": "streamlit_v2",
        "LANGCHAIN_TRACING_V2": "true",
    }
)

# Load data
dataset_name = "greengerong/leetcode"


@st.cache_resource
def build_search_index(name: str):
    """Load the dataset and build the ensemble retriever, cached across reruns.

    The load / split / embed / index pipeline is wrapped in
    ``st.cache_resource`` so it runs once and the result is reused. Without
    caching, the whole pipeline would run again each time Streamlit
    re-executes this script (for example after a text input or button click).

    Args:
        name: Hugging Face dataset identifier handed to the loader.

    Returns:
        A ``(docs, split_docs, ensemble_retriever)`` tuple: the loaded
        documents with the metadata header prepended, the chunks produced by
        the text splitter, and the BM25 + FAISS ensemble retriever.
    """
    loader = HuggingFaceDatasetLoader(name, page_content_column="content")
    docs = loader.load()

    # Prepend 'title', 'id' and 'slug' metadata to each page_content so that
    # the header text ends up in the indexed chunks.
    for doc in docs:
        title = doc.metadata.get("title", "No Title")
        slug = doc.metadata.get("slug", "No Slug")
        # Named problem_id (not id) to avoid shadowing the builtin id().
        problem_id = doc.metadata.get("id", "No ID")
        doc.page_content = (
            f"Title: {title}\n\nID: LeetCode_number_{problem_id}_\n\n"
            f"Slug: {slug}\n\n{doc.page_content}"
        )

    # Split after the rewrite so the chunks reflect the updated page_content.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = text_splitter.split_documents(docs)

    # Build the vector store and both retrievers from the split documents.
    vectorstore = FAISS.from_documents(
        documents=split_docs, embedding=HuggingFaceBgeEmbeddings()
    )
    bm25_retriever = BM25Retriever.from_documents(split_docs)
    bm25_retriever.k = 20
    faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 20})
    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, faiss_retriever],
        weights=[0.1, 0.9],
    )
    return docs, split_docs, ensemble_retriever


docs, split_docs, ensemble_retriever = build_search_index(dataset_name)




class StreamCallback(BaseCallbackHandler):
    """Callback handler that prints each new LLM token to stdout."""

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Print ``token`` right away, with no trailing newline.

        Any extra keyword arguments are accepted and ignored.
        """
        print(token, flush=True, end="")


# Chat model: gpt-3.5-turbo at temperature 0, with streaming enabled and a
# StreamCallback instance registered as its callback.
stream_handler = StreamCallback()
llm = ChatOpenAI(
    callbacks=[stream_handler],
    streaming=True,
    temperature=0,
    model_name="gpt-3.5-turbo",
)


def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined by a blank line, in input order.
    """
    separator = "\n\n"
    contents = [item.page_content for item in docs]
    return separator.join(contents)

# Pull the "nhyha/rag-prompt" prompt from the LangChain hub.
prompt = hub.pull("nhyha/rag-prompt")


# Streamlit interface
st.title("문서 검색 및 응답 생성")
query = st.text_input("질문을 입력하세요:")

# Optional preview of the combined documents.
show_samples = st.button("Show combined document samples")
if show_samples:
    st.subheader("Combined Document Samples:")
    # Show the first five combined documents, with a separator after each.
    for preview in docs[:5]:
        st.text(preview.page_content)
        st.write("---")

if query:
    # The retriever output is formatted into the "context" slot, while the
    # raw query is passed through unchanged as "question".
    chain_inputs = {
        "context": ensemble_retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    rag_chain = chain_inputs | prompt | llm | StrOutputParser()
    response = rag_chain.invoke(query)
    st.write("Response:", response)


In [None]:
!ngrok config add-authtoken 2f9zvJxKCwgBCOHmyWDKw9FSRwQ_ufQqGCV7pvwgcKbA8zfQ

In [None]:
!ngrok http --domain=intimate-neutral-skink.ngrok-free.app

In [None]:
from pyngrok import ngrok

# 포트 8501에 대한 ngrok 터널을 설정합니다
http_tunnel = ngrok.connect(addr="8501", proto="http", bind_tls=True)
print('Streamlit이 다음 URL에서 실행 중입니다:', http_tunnel.public_url)

!streamlit run app.py &>/dev/null&

# Retriever 결과 확인

In [None]:
import faiss

import streamlit as st
from langchain_community.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub
from langchain.callbacks.base import BaseCallbackHandler

from langchain.llms import HuggingFaceHub
from langchain_openai import ChatOpenAI
import os



# Environment variables: API credentials plus LangSmith settings.
# Every "your token" value is a placeholder to be replaced with a real key.
os.environ.update(
    {
        # Provider credentials
        "HUGGINGFACEHUB_API_TOKEN": "your token",
        "OPENAI_API_KEY": "your token",
        # LangSmith
        "LANGCHAIN_API_KEY": "your token",
        "LANGCHAIN_PROJECT": "streamlit_v2",
        "LANGCHAIN_TRACING_V2": "true",
    }
)

# Load data
dataset_name = "greengerong/leetcode"
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column="content")
docs = loader.load()


# Prepend the 'title', 'id' and 'slug' metadata to each page_content.
for doc in docs:
    # dict.get already returns the default for a missing key, so no separate
    # membership check is needed.
    title = doc.metadata.get("title", "No Title")
    slug = doc.metadata.get("slug", "No Slug")
    # Named problem_id (not id): a module-level `id` would shadow the builtin
    # id() for the rest of the notebook session.
    problem_id = doc.metadata.get("id", "No ID")
    doc.page_content = (
        f"Title: {title}\n\nID: LeetCode_number_{problem_id}_\n\n"
        f"Slug: {slug}\n\n{doc.page_content}"
    )

# Split the documents now, so the chunks reflect the updated page_content.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
split_docs = text_splitter.split_documents(docs)

# FAISS vector store over the chunks, exposed as a retriever with k=20.
embedding_model = HuggingFaceBgeEmbeddings()
vectorstore = FAISS.from_documents(documents=split_docs, embedding=embedding_model)
faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 20})

# BM25 retriever over the same chunks, also with k=20.
bm25_retriever = BM25Retriever.from_documents(split_docs)
bm25_retriever.k = 20

# Combine both retrievers: weight 0.1 for BM25, 0.9 for FAISS.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.1, 0.9],
)




class StreamCallback(BaseCallbackHandler):
    """Callback handler that prints each new LLM token to stdout."""

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Print ``token`` right away, with no trailing newline.

        Any extra keyword arguments are accepted and ignored.
        """
        print(token, flush=True, end="")


# Chat model: gpt-3.5-turbo at temperature 0, with streaming enabled and a
# StreamCallback instance registered as its callback.
stream_handler = StreamCallback()
llm = ChatOpenAI(
    callbacks=[stream_handler],
    streaming=True,
    temperature=0,
    model_name="gpt-3.5-turbo",
)

def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined by a blank line, in input order.
    """
    separator = "\n\n"
    contents = [item.page_content for item in docs]
    return separator.join(contents)

# Pull the "nhyha/rag-prompt" prompt from the LangChain hub.
prompt = hub.pull("nhyha/rag-prompt")


In [None]:

# Step 8: create the chain.
# The retriever output is formatted into the "context" slot, while the raw
# question is passed through unchanged as "question".
chain_inputs = {
    "context": ensemble_retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# Alternative questions to try:
# question = "Please write example code to solve the problem of minimum-window-substring."
# question = "Please write example code to solve the problem of Integer to Roman"
question = "Please write example code to solve the problem of Find the Index of the First Occurrence in a String"

response = rag_chain.invoke(question)

# Print the results, one report line per print call.
report_lines = [
    f"Dataset Path: {dataset_name}",
    f"문서의 수: {len(split_docs)}",
    "===" * 20,
    f"[HUMAN]\n{question}\n",
    f"[AI]\n{response}",
]
for report_line in report_lines:
    print(report_line)

In [None]:
# Query the ensemble retriever directly (no LLM involved) to inspect what it
# returns for a word-break question.
ensemble_retriever.invoke("i dont know about word break problem. please give me an approch about this problem in python.")

In [None]:
# Query the ensemble retriever directly (no LLM involved) with the same
# question text that the RAG chain cell uses, to inspect the retrieval result.
ensemble_retriever.invoke("Please write example code to solve the problem of Find the Index of the First Occurrence in a String")