# **LangChain ile Kişiye Özel Chatbot - PDF'lerinizle konuşun!**


**Notebook'u hazırlayan: [Lokman Baturay Efe](https://www.linkedin.com/in/lokmanefe/)**


0. İndirmeler, Kütüphaneler ve API Anahtarları
1. PDF'leri yükleme ve LangChain ile parçalara ayırma
2. Metinleri gömme (embedding) ve gömme sonuçlarını depolama
3. Erişim (retrieval) fonksiyonu oluşturma
4. Sohbet hafızalı bir sohbet botu oluşturma
5. Tüm işlemi Gradio ile daha kullanılabilir hale getirme

**Notebook hazırlanırken [Liam Ottley](https://youtube.com/@LiamOttley)'in hazırladığı [notebook](https://colab.research.google.com/drive/1OZpmLgd5D_qmjTnL5AsD1_ZJDdb7LQZI?usp=sharing) referans alınmıştır.**








# 0. İndirmeler, Kütüphaneler ve API Anahtarları

---



In [None]:
!pip install pip==24.0

In [None]:
!pip install -q pypdf pandas matplotlib tiktoken transformers faiss-cpu langchain-community langchain-google-genai textract==1.6.5 gradio google-cloud-aiplatform[tokenization]

In [5]:
import os
import textract
import pandas as pd
import matplotlib.pyplot as plt
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import ConversationalRetrievalChain
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from vertexai.preview import tokenization
import gradio as gr

In [6]:
# Gemini API key; later cells read it back with os.environ.get("GEMINI_API_KEY").
# Replace the placeholder with your own key before running them.
os.environ["GEMINI_API_KEY"] = "API anahtarÄ±nÄ±zÄ± girin."

# Base name of the PDF to process. Later cells build "./{file_name}.pdf" and
# "./{file_name}.txt" from it, so give the name without the extension.
file_name = "PDF dosya ismini girin."

# 1. PDF'leri yükleme ve LangChain ile parçalara ayırma

---



In [None]:
# First, the PDF files to be used must be added to Colab's Files panel!

# Create a loader for the PDF that sits next to the notebook.
loader = PyPDFLoader(f"./{file_name}.pdf")

# Read the PDF and split it into documents (the original author describes
# these as pages), then print the first one as a quick sanity check.
pages = loader.load_and_split()
print(pages[0])

In [10]:
# Provisional value: the documents produced by the PDF loader. It is replaced
# further down once the token-based splitter has produced its own chunks.
chunks = pages
# Step 1: Convert the PDF file to text (the result is bytes, decoded below).
doc = textract.process(f"./{file_name}.pdf")

# Step 2: To avoid errors, save the text as .txt and read it back.
# The encoding is stated explicitly on both sides: without it open() falls
# back to the platform's locale codec, which can fail or corrupt characters
# when that codec is not UTF-8.
with open(f"./{file_name}.txt", 'w', encoding='utf-8') as f:
    f.write(doc.decode('utf-8'))

with open(f"./{file_name}.txt", 'r', encoding='utf-8') as f:
    text = f.read()

# Step 3: Build a helper that counts tokens.
tokenizer = tokenization.get_tokenizer_for_model("gemini-1.5-flash")


def count_tokens(text: str) -> int:
    """Return how many tokens the "gemini-1.5-flash" tokenizer finds in ``text``."""
    return tokenizer.count_tokens(text).total_tokens

# Step 4: Split the text into chunks.
# length_function=count_tokens means chunk_size and chunk_overlap are measured
# with the token counter defined above rather than in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 512,
    chunk_overlap  = 24,
    length_function = count_tokens,
)

# Wrap every chunk of the extracted text in a document object.
chunks = text_splitter.create_documents([text])

In [None]:
# The resulting items are LangChain Document objects. The splitter was
# configured with chunk_size=512 measured in tokens; the original author notes
# that RecursiveCharacterTextSplitter may still emit a chunk above the limit
# when splitting would break the context - check this against the histogram
# drawn in the next cell.
type(chunks[0])

In [None]:
# Visualise the chunking result to judge whether the split worked as intended.

# Token count of every chunk, in chunk order.
token_counts = list(map(count_tokens, (chunk.page_content for chunk in chunks)))

# Single-column table holding those counts.
df = pd.DataFrame({'Token Sayisi': token_counts})

# Histogram of the token-count distribution, then render the figure.
df.hist(bins=40)
plt.show()

# 2. Metinleri gömme (embedding) ve gömme sonuçlarını depolama

---



In [13]:
# Instantiate the embedding model, authenticating with the key stored in the
# GEMINI_API_KEY environment variable.
embeddings = GoogleGenerativeAIEmbeddings(google_api_key=os.environ.get("GEMINI_API_KEY"), model="models/embedding-001")

# Build a FAISS vector database from the chunks using that embedding model.
db = FAISS.from_documents(chunks, embeddings)

# 3. Erişim (retrieval) fonksiyonu oluşturma

---



In [None]:
# Check whether the similarity search returns sensible results.
query = "RAG ile KiÅŸisel Asistan eÄŸitimi"
docs = db.similarity_search(query)
# Bare expression: the notebook shows the first match as the cell output.
docs[0]

In [None]:
# Build a question-answering chain. Combined with the similarity search, the
# model answers the user's input by looking at the documents handed to it as
# context.

chain = load_qa_chain(
    ChatGoogleGenerativeAI(
        temperature=0.7,
        api_key=os.environ.get("GEMINI_API_KEY"),
        model="gemini-1.5-flash"),
    chain_type="stuff"
)

In [None]:
# Try the question-answering chain on the documents: retrieve the chunks most
# similar to the question, then pass them to the chain together with it.
query = "RAG ile KiÅŸisel Asistan eÄŸitimini kim veriyor?"
docs = db.similarity_search(query)

chain.run(input_documents=docs, question=query)

# 4. Sohbet hafızalı bir sohbet botu oluşturma

---



In [17]:
from IPython.display import display
import ipywidgets as widgets

# Using the FAISS vector database built earlier as the retrieval mechanism for
# our documents, create a conversational chain. The previous messages are
# supplied by the caller through the "chat_history" input on each call.
qa = ConversationalRetrievalChain.from_llm(ChatGoogleGenerativeAI(temperature=0.7, api_key=os.environ.get("GEMINI_API_KEY"), model="gemini-1.5-flash"), db.as_retriever())

In [None]:
# (question, answer) pairs of the conversation so far; passed to the chain on
# every turn so it can take earlier messages into account.
chat_history = []

def on_submit(_):
    """Handle a question submitted through the text box.

    Reads and clears the input box, ignores blank submissions, prints a
    goodbye message when the user types "exit", and otherwise asks the
    conversational chain, records the exchange in ``chat_history`` and
    displays both the question and the answer.

    Args:
        _: The widget that fired the event (unused).
    """
    import html

    # Surrounding whitespace is irrelevant for both the exit check and the question.
    query = input_box.value.strip()
    input_box.value = ""

    # Nothing to ask: do not spend a model call on an empty question.
    if not query:
        return

    if query.lower() == 'exit':
        print("GDG On Campus Trakya Chatbot'unu kullandığınız için teşekkürler!")
        return

    result = qa({"question": query, "chat_history": chat_history})
    answer = result['answer']
    chat_history.append((query, answer))

    # Both texts are arbitrary strings placed inside HTML markup; escape them
    # so characters such as "<" or "&" are shown literally instead of being
    # interpreted as markup.
    display(widgets.HTML(f'<b>User:</b> {html.escape(query)}'))
    display(widgets.HTML(f'<b><font color="blue">Chatbot:</font></b> {html.escape(answer)}'))

print("GDG On Campus Trakya Chatbot'una hoşgeldiniz!")

input_box = widgets.Text(placeholder='Lütfen sorunuzu girin:')
# Call on_submit whenever the user submits the text box.
input_box.on_submit(on_submit)

display(input_box)

# 5. Tüm işlemi Gradio ile daha kullanılabilir hale getirme

---



In [None]:
# Token counting helper
tokenizer = tokenization.get_tokenizer_for_model("gemini-1.5-flash")


def count_tokens(text: str) -> int:
    """Return how many tokens the "gemini-1.5-flash" tokenizer finds in ``text``."""
    return tokenizer.count_tokens(text).total_tokens


# Chatbot function
import functools


@functools.lru_cache(maxsize=4)
def _build_qa_chain(api_key, pdf_path):
    """Build the conversational retrieval chain for one PDF.

    Extracts the PDF's text, splits it into token-sized chunks, embeds the
    chunks into a FAISS index and wraps the index in a
    ConversationalRetrievalChain. The result is cached per
    ``(api_key, pdf_path)`` so follow-up questions about the same upload
    reuse the index instead of re-extracting and re-embedding the whole
    document on every question.

    Args:
        api_key: Gemini API key used for both the embeddings and the chat model.
        pdf_path: Filesystem path of the PDF.

    Returns:
        The chain answering questions against the PDF.

    Raises:
        gr.Error: If no text could be extracted from the PDF.
    """
    # Extract the document text.
    text = textract.process(pdf_path).decode('utf-8')

    # Split the text into chunks measured in tokens.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=24,
        length_function=count_tokens,
    )
    chunks = text_splitter.create_documents([text])
    if not chunks:
        # An index cannot be built from zero documents; report it clearly.
        raise gr.Error("PDF dosyasından metin çıkarılamadı.")

    # Embed the chunks and store them in a vector database.
    embeddings = GoogleGenerativeAIEmbeddings(
        google_api_key=api_key,
        model="models/embedding-001"
    )
    db = FAISS.from_documents(chunks, embeddings)

    # Conversational chain that retrieves from the vector database.
    return ConversationalRetrievalChain.from_llm(
        ChatGoogleGenerativeAI(
            temperature=0.7,
            api_key=api_key,
            model="gemini-1.5-flash"
        ),
        db.as_retriever()
    )


def chatbot(api_key, uploaded_file, chat_history, query):
    """Answer ``query`` about the uploaded PDF and record the exchange.

    Args:
        api_key: Gemini API key entered by the user.
        uploaded_file: The uploaded PDF, either an object exposing the path
            as ``.name`` or the path itself.
        chat_history: List of ``(question, answer)`` tuples so far; the new
            exchange is appended to it in place. ``None`` is treated as an
            empty history.
        query: The user's question.

    Returns:
        A ``(chat_history, chat_history)`` tuple containing the updated
        history twice. A blank query returns the history unchanged without
        calling the model.

    Raises:
        gr.Error: If the API key or the PDF is missing, or the PDF yields no text.
    """
    if chat_history is None:
        chat_history = []

    # A blank question has nothing to answer; skip the model call entirely.
    query = (query or "").strip()
    if not query:
        return chat_history, chat_history

    if not api_key:
        raise gr.Error("Lütfen GEMINI API anahtarınızı girin.")
    if uploaded_file is None:
        raise gr.Error("Lütfen bir PDF dosyası yükleyin.")

    # Kept from the original implementation: the key is also published
    # through the environment variable the rest of the notebook reads.
    os.environ["GEMINI_API_KEY"] = api_key

    # Accept both an object carrying the path in `.name` and a plain path.
    pdf_path = getattr(uploaded_file, "name", uploaded_file)

    qa = _build_qa_chain(api_key, pdf_path)

    result = qa({"question": query, "chat_history": chat_history})
    answer = result['answer']
    chat_history.append((query, answer))
    return chat_history, chat_history

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🦜🔗LangChain ile Kişiye Özel Chatbot - PDF'lerinizle Konuşun!")

    # The component variables carry their own names so they do not overwrite
    # the notebook-level `chat_history` list and `query` string that the
    # widget-based chat in the previous section still uses.
    with gr.Column():
        api_key_box = gr.Textbox(label="Lütfen GEMINI API anahtarınızı girin:", type="password")
        file_box = gr.File(label="Lütfen bir PDF dosyası yükleyin", file_types=[".pdf"])
        chatbot_interface = gr.Chatbot()
        query_box = gr.Textbox(label="Lütfen sorunuzu girin:")

    # Conversation history kept between submissions.
    history_state = gr.State([])

    def respond(api_key, uploaded_file, chat_history, query):
        """Answer one submitted question.

        Returns the updated history twice (once for the chat display, once
        for the stored state) followed by an empty string that clears the
        question box.
        """
        chat_history, _ = chatbot(api_key, uploaded_file, chat_history, query)
        return chat_history, chat_history, ""

    # The state is listed as an output too, so the updated history is stored
    # explicitly instead of relying on the list being mutated in place.
    query_box.submit(
        respond,
        [api_key_box, file_box, history_state, query_box],
        [chatbot_interface, history_state, query_box],
    )

demo.launch()
