In [9]:
# ============================================
# 1️⃣ Gerekli kütüphaneler
# ============================================
!pip install -q pdfplumber tqdm chromadb google-generativeai numpy langchain gradio

import os
import pdfplumber
import numpy as np
import chromadb
import google.generativeai as genai
import time
import gradio as gr

# ============================================
# 2️⃣ API key setup
# ============================================
from google.colab import userdata
# Read the value stored under the name 'API_KEY' through Colab's userdata
# helper and hand it to the google.generativeai client.
API_KEY = userdata.get('API_KEY')
genai.configure(api_key=API_KEY)

# Model identifiers: EMBED_MODEL is passed to genai.embed_content(),
# LLM_MODEL is passed to genai.GenerativeModel().
EMBED_MODEL = "models/text-embedding-004"
LLM_MODEL = "models/gemini-2.5-flash"

# ============================================
# 3️⃣ Read the PDF
# ============================================
# 1️⃣ Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 2️⃣ Path of the PDF file
# NOTE(review): this path points at /content, not at the mounted
# /content/drive tree — confirm the file is really uploaded there.
pdf_path = "/content/The Ultimate Guide to 5S and 5S Training _ KAIZEN™ Article.pdf"

# 3️⃣ Extract the text of every page and join the pages with newlines
import pdfplumber

with pdfplumber.open(pdf_path) as pdf:
    # Extract each page exactly once (extraction is the expensive step); the
    # generator is consumed inside the `with` block while the PDF is still open.
    page_texts = (page.extract_text() for page in pdf.pages)
    # Pages that yield no text (None or "") are skipped.
    text = "\n".join(page_text for page_text in page_texts if page_text)

def chunk_text(text, size=500, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``size - overlap`` words apart, so each chunk
    repeats the last ``overlap`` words of the previous one.

    Args:
        text: The text to split. Words are whatever ``str.split()`` yields.
        size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is outside
            ``[0, size)``. An overlap >= size would give a non-positive step
            and the chunking loop would never advance.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    if not 0 <= overlap < size:
        raise ValueError("overlap must satisfy 0 <= overlap < size")

    tokens = text.split()
    step = size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + size]))
        # Once a window reaches the end of the text, any further window would
        # only repeat words already contained in this chunk, so stop here.
        if start + size >= len(tokens):
            break
    return chunks

chunks = chunk_text(text)
print(f"📄 PDF {len(chunks)} chunk'a bölündü.")


# ============================================
# 4️⃣ Create embeddings and store them in Chroma
# ============================================
collection_name = "kaizen_5s_rag"
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection(name=collection_name)

# Embeddings are only generated when the persisted collection is empty.
# NOTE(review): a collection that was only partially filled by an earlier run
# is treated as complete here; delete ./chroma_db to force a full rebuild.
if collection.count() == 0:
    print("⏳ Embeddingler oluşturuluyor ve Chroma'ya ekleniyor...")
    failed_chunk_ids = []  # indices of chunks that could not be embedded/stored
    for i, chunk in enumerate(chunks):
        try:
            result = genai.embed_content(model=EMBED_MODEL, content=chunk)
            emb = np.array(result["embedding"], dtype="float32")
            collection.add(
                documents=[chunk],
                embeddings=[emb.tolist()],
                metadatas=[{"chunk_id": i}],
                ids=[str(i)]
            )
            time.sleep(0.1)  # short pause between requests to avoid rate limiting
        except Exception as e:
            # Best effort: keep going with the remaining chunks, but remember
            # the failure so the final status message is accurate.
            failed_chunk_ids.append(i)
            print(f"⚠️ Chunk {i} embedding hatası:", e)
    if failed_chunk_ids:
        print(f"⚠️ {len(failed_chunk_ids)} chunk eklenemedi: {failed_chunk_ids}")
    else:
        print("✅ Embeddingler başarıyla oluşturuldu.")
else:
    print("✅ Chroma veritabanı zaten dolu, yükleniyor.")

# ============================================
# 5️⃣ Retrieval fonksiyonu
# ============================================
def retrieve(query, top_k=2):
    """Return the stored chunks most similar to *query*.

    The query is embedded with EMBED_MODEL and the resulting vector is used
    to query the Chroma collection.

    Args:
        query: The user's question.
        top_k: Number of results requested from the collection.

    Returns:
        The list of matching document strings, or an empty list when any
        step fails (the error is printed, not raised).
    """
    try:
        embedding_response = genai.embed_content(model=EMBED_MODEL, content=query)
        query_vector = np.array(embedding_response["embedding"], dtype="float32").tolist()
        matches = collection.query(query_embeddings=[query_vector], n_results=top_k)
        # Chroma returns one result list per query vector; only one was sent.
        return matches.get("documents", [[]])[0]
    except Exception as err:
        print("⚠️ Retrieval hatası:", err)
        return []

# ============================================
# 6️⃣ Answer generation with the LLM
# ============================================
from google.generativeai import GenerativeModel
# Module-level model instance built from LLM_MODEL; answer_query() calls
# generate_content() on it.
# NOTE(review): the GenerativeModel name imported above is not used on this
# line — the class is reached through the `genai` module instead.
model = genai.GenerativeModel(LLM_MODEL)

def answer_query(query, top_k=2):
    """Answer *query* from retrieved context using the LLM.

    Args:
        query: The user's question.
        top_k: Number of context chunks to request from retrieve().

    Returns:
        The stripped model answer; a fixed warning string when retrieval
        returns nothing; or an error string when the model call raises.
    """
    passages = retrieve(query, top_k)
    if not passages:
        return "⚠️ İlgili bilgi bulunamadı."

    context = "\n\n".join(passages)
    # Prompt layout: instruction, context section, question section, answer cue.
    prompt = (
        "\n"
        "Aşağıdaki bağlamı kullanarak kullanıcı sorusuna Türkçe olarak net bir cevap ver.\n"
        "\n"
        "Bağlam:\n"
        f"{context}\n"
        "\n"
        "Soru:\n"
        f"{query}\n"
        "\n"
        "Cevap:\n"
    )

    try:
        llm_response = model.generate_content(prompt)
        # Fall back to the response's string form when it has no `text` attribute.
        fallback_text = str(llm_response)
        answer = getattr(llm_response, "text", fallback_text)
        return answer.strip()
    except Exception as err:
        return f"⚠️ LLM hatası: {err}"

# ============================================
# 7️⃣ Gradio arayüzü
# ============================================
import gradio as gr

# Chat fonksiyonu
def chat_fn(message, history):
    """Produce a reply for *message* and append the exchange to *history*.

    Args:
        message: Text entered by the user.
        history: List of ``(user_message, bot_reply)`` tuples shown in the
            chatbot; ``None`` is treated as an empty conversation.

    Returns:
        The history list with the new ``(message, reply)`` pair appended.
    """
    reply = answer_query(message)  # RAG + LLM pipeline
    # The marker must match answer_query()'s "⚠️ İlgili bilgi bulunamadı."
    # exactly, including the capital dotted "İ": a lowercase "i" is a
    # different character and would never be found in that message.
    if "İlgili bilgi bulunamadı" in reply:
        reply += "\n\n💡 Genel Bilgi: 5S uygulamalarında sık yapılan hatalar; standartları güncel tutmamak, görsel yönetimi ihmal etmek, çalışanları eğitmemek, düzeni sürdürememek gibi durumlar olabilir."
    if history is None:
        history = []
    history.append((message, reply))
    return history

# Interface
with gr.Blocks(theme=gr.themes.Soft()) as chat_demo:
    gr.Markdown("## 💬 5S & Kaizen Öğretici Chatbot")
    gr.Markdown("Sorunu yaz ve gönder!")

    # Conversation display, question input, and send button.
    chat_box = gr.Chatbot(label="Sohbet")
    user_input = gr.Textbox(placeholder="Sorunu yaz...")
    submit_btn = gr.Button("🚀 Gönder")

    # On click, chat_fn receives the textbox value and the chatbot's current
    # value; whatever it returns is written back to the chatbot.
    submit_btn.click(chat_fn, inputs=[user_input, chat_box], outputs=chat_box)

# debug=True keeps the cell running so errors and logs stay visible;
# share=True requests a public URL for the app.
chat_demo.launch(debug=True, share=True)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




📄 PDF 6 chunk'a bölündü.
✅ Chroma veritabanı zaten dolu, yükleniyor.


  chat_box = gr.Chatbot(label="Sohbet")


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://cadfd4663b9f61b3fd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://cadfd4663b9f61b3fd.gradio.live


