In [1]:
!pip install pyPDF2




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import re
import math
import json
import uuid
import time
import openai
import PyPDF2  # PDF okuma işlemleri için
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance
from langchain.embeddings.openai import OpenAIEmbeddings

class SemanticChunker:
    def __init__(self, embeddings, window_size=6, threshold=0.7):
        self.embeddings = embeddings
        self.window_size = window_size
        self.threshold = threshold

    def split_text_into_sentences(self, text: str) -> list:
        """
        Metni cümlelere böler. Regex deseni, 
          - Bir alfanümerik karakterden sonra gelen nokta ve 
          - Noktanın sonrasında (boşluklar dahil) büyük harf gelme durumu veya
          - Yeni satır karakterini baz alır.
        Sonrasında cümleler temizlenir ve 1000 karakteri aşmayan parçalara birleştirilir.
        """
        pattern = r'(?<=[a-zA-Z0-9])\.(?=\s*[A-Z])|(?<=\n)'
        sentences = re.split(pattern, text)
        sentences = [s.strip() for s in sentences if s.strip()]
        merged_sentences = []
        buffer = ""
        for sentence in sentences:
            if len(buffer) + len(sentence) < 500:
                buffer = f"{buffer} {sentence}" if buffer else sentence
            else:
                if buffer:
                    merged_sentences.append(buffer)
                buffer = sentence
        if buffer:
            merged_sentences.append(buffer)
        return merged_sentences

    def split_sentence_by_word_boundary(self, sentence: str, max_length: int) -> list:
        """
        Bir cümleyi kelime sınırlarına göre böler.
        Eğer mevcut parçaya yeni bir kelime eklenince max_length'i aşarsa,
        o parçayı listeye ekler ve yeni parçaya kelime eklemeye başlar.
        """
        words = sentence.split()
        parts = []
        current_part = ""
        for word in words:
            if len(current_part + " " + word) > max_length:
                parts.append(current_part.strip())
                current_part = word
            else:
                current_part += " " + word
        if current_part.strip():
            parts.append(current_part.strip())
        return parts

    def preprocess_sentences(self, sentences: list) -> list:
        """
        Cümleleri ön işlemden geçirir:
          - Uzunluğu 3 karakterden kısa olanları atlar.
          - 10000 karakterden uzun cümleleri kelime bazında bölerek parçalar.
          - Diğer cümleleri olduğu gibi ekler.
        """
        processed_sentences = []
        for sentence in sentences:
            if len(sentence) < 3:
                continue
            if len(sentence) > 10000:
                splits = self.split_sentence_by_word_boundary(sentence, 10000)
                processed_sentences.extend(splits)
            else:
                processed_sentences.append(sentence)
        return processed_sentences

    def create_embeddings(self, texts: list) -> list:
        """
        Embeddings sağlayıcısını kullanarak verilen metinler için embedding'leri döndürür.
        """
        return self.embeddings.embed_documents(texts)

    def get_embeddings_batch(self, sentences: list, agent_id="live") -> list:
        """
        Cümleleri 2048'lik partiler halinde işleyerek embedding'leri toplar.
        """
        batch_size = 2048
        embeddings_result = []
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i:i+batch_size]
            batch_result = self.create_embeddings(batch)
            if not batch_result:
                print(f"Error: Embedding API returned no data for batch {i // batch_size} and agent {agent_id}")
            embeddings_result.extend(batch_result)
        return embeddings_result

    def calculate_divergence(self, embedding1: list, embedding2: list) -> float:
        """
        İki embedding arasındaki farkı (divergence) hesaplar.
        Hesaplama, kosinüs benzerliği esasına göre yapılır; sonuç 1 - cosine_similarity olarak döner.
        """
        if len(embedding1) != len(embedding2):
            return 0.0
        dot_product = 0.0
        magnitude1 = 0.0
        magnitude2 = 0.0
        for v1, v2 in zip(embedding1, embedding2):
            dot_product += v1 * v2
            magnitude1 += v1 ** 2
            magnitude2 += v2 ** 2
        magnitude1 = math.sqrt(magnitude1)
        magnitude2 = math.sqrt(magnitude2)
        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0
        cosine_similarity = dot_product / (magnitude1 * magnitude2)
        return 1 - cosine_similarity

    def calculate_average_embedding(self, embeddings: list) -> list:
        """
        Bir grup embedding'in ortalamasını hesaplar.
        """
        if not embeddings:
            return []
        num_embeddings = len(embeddings)
        vector_length = len(embeddings[0])
        sum_vector = [0.0] * vector_length
        for embedding in embeddings:
            for i, value in enumerate(embedding):
                sum_vector[i] += value
        average_vector = [s / num_embeddings for s in sum_vector]
        return average_vector

    def sliding_window_divergence(self, sentences: list, window_size: int, agent_id="live") -> list:
        """
        Kayan pencere yöntemiyle, belirli pencere boyutundaki cümle gruplarının divergence değerlerini hesaplar.
        """
        divergences = []
        embeddings = self.get_embeddings_batch(sentences, agent_id)
        half_window = window_size // 2
        for i in range(0, len(sentences) - window_size + 1):
            window1_embeddings = embeddings[i:i+half_window]
            window2_embeddings = embeddings[i+half_window:i+half_window+half_window]
            avg_embedding1 = self.calculate_average_embedding(window1_embeddings)
            avg_embedding2 = self.calculate_average_embedding(window2_embeddings)
            divergence = self.calculate_divergence(avg_embedding1, avg_embedding2)
            divergences.append(divergence)
        return divergences

    def detect_peaks(self, divergences: list, threshold: float) -> list:
        """
        Hesaplanan divergence değerleri arasında, belirlenen eşik değerinin üzerinde ve komşularına göre zirve olan noktaları tespit eder.
        """
        peaks = []
        if not divergences:
            return peaks
        max_divergence = max(divergences)
        threshold_value = threshold * max_divergence
        for i, value in enumerate(divergences):
            prev_val = divergences[i-1] if i - 1 >= 0 else 0
            next_val = divergences[i+1] if i + 1 < len(divergences) else 0
            if value > threshold_value and value > prev_val and value > next_val:
                peaks.append(i)
        return peaks

    def semantic_chunking(self, text: str, window_size: int, threshold: float, agent_id="live") -> list:
        """
        Metni anlamsal parçalara böler:
          1. Metin cümlelere ayrılır.
          2. Kayan pencere yöntemiyle divergence değerleri hesaplanır.
          3. Tespit edilen peak noktaları, metni bölecek yerler olarak kullanılır.
          4. Cümle grupları (chunk) oluşturulur.
        """
        sentences = self.split_text_into_sentences(text)

        sentences = self.preprocess_sentences(sentences)
        divergences = self.sliding_window_divergence(sentences, window_size, agent_id)
        split_points = self.detect_peaks(divergences, threshold)
        chunks = []
        last_split = 0
        for point in split_points:
            chunk = ". ".join(sentences[last_split:point])
            chunks.append(chunk)
            last_split = point
        
        chunk = ". ".join(sentences[last_split:])
        chunks.append(chunk)
        return chunks

    def create_documents(self, docs: list) -> list:
        """
        Verilen metin (doküman) listesini semantic chunking algoritması ile parçalara ayırır 
        ve her parçayı basit bir doküman olarak döndürür.
        """
        all_chunks = []
        for doc in docs:
            chunks = self.semantic_chunking(doc, self.window_size, self.threshold, agent_id="live")
            for chunk in chunks:
                all_chunks.append({"page_content": chunk})
        return all_chunks

def integrate_pipeline(texts: list, material_ids: list, chunker: SemanticChunker,
                       window_size: int = 6, threshold: float = 0.7, long_text_limit: int = 10000) -> tuple:
    """
    Verilen metinleri ve ilgili material ID'lerini kontrol eder.
    Eğer metin uzunluğu long_text_limit (default: 10000) karakterden fazlaysa,
    semantic_chunking ile anlamsal parçalara ayrılır ve her parçaya orijinal material ID atanır.
    Aksi halde, metin doğrudan eklenir.
    Son olarak, tüm işlenmiş metinler preprocess_sentences ile tekrar işlenir.
    
    """
    processed_data = []
    processed_material_ids = []
    
    for text, mat_id in zip(texts, material_ids):
        if len(text) >= long_text_limit:
           
            chunked_data = chunker.semantic_chunking(text, window_size, threshold, agent_id="live")
            processed_data.extend(chunked_data)
           
            processed_material_ids.extend([mat_id] * len(chunked_data))
        else:
            processed_data.append(text)
            processed_material_ids.append(mat_id)
    
    processed_data = chunker.preprocess_sentences(processed_data)
    
    return processed_data, processed_material_ids



In [3]:
# RAG ve Qdrant İşlemleri

with open("config.json", "r") as config_file:
    config = json.load(config_file)
OPENAI_API_KEY = config["OPENAI_API_KEY"]

collection_name = "jotform-pdf-LLM_data_collection"  # Koleksiyon ismi
qdrant_client = QdrantClient(host="localhost", port=6333)

embedding_model_name = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=embedding_model_name)

chunker = SemanticChunker(embeddings, window_size=6, threshold=0.7)

pdf_path = "Foundations of LLM.pdf"
documents = []
with open(pdf_path, "rb") as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    full_text = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text.append(page_text)
 
    documents = ["\n".join(full_text)]

material_ids = list(range(len(documents)))
processed_texts, processed_material_ids = integrate_pipeline(documents, material_ids, chunker)

all_chunks = [{"page_content": text, "material_id": mat_id} for text, mat_id in zip(processed_texts, processed_material_ids)]

vector_size = 1536

existing_collections = [c.name for c in qdrant_client.get_collections().collections]
if collection_name in existing_collections:
    qdrant_client.delete_collection(collection_name)
    print(f"{collection_name} koleksiyonu silindi, yeniden oluşturuluyor...")


qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
)
print(f"{collection_name} koleksiyonu başarıyla oluşturuldu.")

points = []
for chunk in all_chunks:
    chunk_text = chunk["page_content"].strip()
    if not chunk_text:
        continue
    vector = embeddings.embed_query(chunk_text)
    point = {
        "id": str(uuid.uuid4()),
        "vector": vector,
        "payload": {"text": chunk_text, "material_id": chunk["material_id"]}
    }
    points.append(point)

qdrant_client.upsert(
    collection_name=collection_name,
    points=points
)
print(f"{len(points)} adet semantic chunk Qdrant koleksiyonuna eklendi.")


def test_rag_direct_qdrant(qdrant_client, collection_name, query):
    print(f"\nQuery: {query}")
    start_time = time.time()
    
    query_vector = embeddings.embed_query(query)
    
    search_results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=5,
        with_payload=True
    )
    
    retrieved_docs = [result.payload["text"] for result in search_results]
    
    if not retrieved_docs:
        print("Qdrant'ta eşleşen belge bulunamadı.")
        return

    context = "\n".join(retrieved_docs)
    print("\nQdrant'tan Kullanılan Kaynaklar:")
    for idx, doc in enumerate(retrieved_docs, start=1):
        print(f"{idx}. (Uzunluk: {len(doc)}) {doc[:100]}...")

    input_text = f"Context: {context}\nQuestion: {query}\nAnswer: "

   
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are an AI assistant that provides precise answers based on the given context."},
            {"role": "user", "content": input_text}
        ],
        temperature=0.5,
        max_tokens=300
    )

    end_time = time.time()
    print(f"\nInference zamanı: {round(end_time - start_time, 3)} saniye")
    print(f"\nModelin cevabı:\n{response.choices[0].message.content}")

jotform-pdf-LLM_data_collection koleksiyonu silindi, yeniden oluşturuluyor...
jotform-pdf-LLM_data_collection koleksiyonu başarıyla oluşturuldu.
16 adet semantic chunk Qdrant koleksiyonuna eklendi.


In [4]:
query1 = "What is pre-training?"
test_rag_direct_qdrant(qdrant_client, collection_name, query1)


Query: What is pre-training?

Qdrant'tan Kullanılan Kaynaklar:
1. (Uzunluk: 10000) . . . . 172 4.3.1 Basics of Reinforcement Learning . . . . . . . . . . . . . . . . . . . . . 173 4.3...
2. (Uzunluk: 9993) entire model is traine d from scratch A well-known example of this is training sequence models by su...
3. (Uzunluk: 9996) architectures Moreover, we have presented and com pared a variety of pre-training tasks for these ar...
4. (Uzunluk: 9995) source and target sides, respect ively5 Likewise, we can express other tasks in the same way For exa...
5. (Uzunluk: 9994) corresponding output The notation of this equation seems a bit complicated, but the training/tuning ...

Inference zamanı: 2.792 saniye

Modelin cevabı:
Pre-training refers to the process of optimizing a neural network on a large dataset before it is further trained or fine-tuned for specific tasks. This approach leverages self-supervised, unsupervised, or supervised learning methods to create a model that can generalize

In [5]:
query2 = "Which types of models are widely used in NLP pre-training?"
test_rag_direct_qdrant(qdrant_client, collection_name, query2)


Query: Which types of models are widely used in NLP pre-training?

Qdrant'tan Kullanılan Kaynaklar:
1. (Uzunluk: 9996) architectures Moreover, we have presented and com pared a variety of pre-training tasks for these ar...
2. (Uzunluk: 9993) entire model is traine d from scratch A well-known example of this is training sequence models by su...
3. (Uzunluk: 10000) . . . . 172 4.3.1 Basics of Reinforcement Learning . . . . . . . . . . . . . . . . . . . . . 173 4.3...
4. (Uzunluk: 9999) important roles in t he recent rise of large language models We will discuss these issues more deepl...
5. (Uzunluk: 9998) arXiv:2501.09223v1 [cs CL] 16 Jan 2025Foundations of Large Language Models Tong Xiao and Jingbo Zhu ...

Inference zamanı: 4.448 saniye

Modelin cevabı:
The two major types of models widely used in NLP pre-training are:

1. **Sequence Encoding Models**: These models represent a sequence of words or tokens as either a real-valued vector or a sequence of vectors, which is typically used

In [6]:
query3 = "How do we implement permuted language modelling?"
test_rag_direct_qdrant(qdrant_client, collection_name, query3)


Query: How do we implement permuted language modelling?

Qdrant'tan Kullanılan Kaynaklar:
1. (Uzunluk: 10000) right-to-left), permuted language modeling allows for predictions in any order The approach i s stra...
2. (Uzunluk: 9999) important roles in t he recent rise of large language models We will discuss these issues more deepl...
3. (Uzunluk: 9998) seen as sub- sequences from the same sequence By adopting such notation , we see that the form of th...
4. (Uzunluk: 9996) i−1)is the value of the i-th entry of. Pr(·|x0,...,x i−1). When applying a trained language model, a...
5. (Uzunluk: 9996) architectures Moreover, we have presented and com pared a variety of pre-training tasks for these ar...

Inference zamanı: 3.711 saniye

Modelin cevabı:
Permuted language modeling can be implemented by determining a specific order for token predictions that differs from the standard left-to-right sequence while keeping the original order of tokens in the text unchanged. Here’s a step-by-step ou

In [7]:
query4 = "What is the large-scale pre-training of the document?"
test_rag_direct_qdrant(qdrant_client, collection_name, query4)



Query: What is the large-scale pre-training of the document?

Qdrant'tan Kullanılan Kaynaklar:
1. (Uzunluk: 9996) architectures Moreover, we have presented and com pared a variety of pre-training tasks for these ar...
2. (Uzunluk: 9998) arXiv:2501.09223v1 [cs CL] 16 Jan 2025Foundations of Large Language Models Tong Xiao and Jingbo Zhu ...
3. (Uzunluk: 10000) . . . . 172 4.3.1 Basics of Reinforcement Learning . . . . . . . . . . . . . . . . . . . . . 173 4.3...
4. (Uzunluk: 9993) entire model is traine d from scratch A well-known example of this is training sequence models by su...
5. (Uzunluk: 9999) important roles in t he recent rise of large language models We will discuss these issues more deepl...

Inference zamanı: 2.457 saniye

Modelin cevabı:
Large-scale pre-training refers to the process of training AI models, particularly in natural language processing (NLP), on vast amounts of unlabeled data using self-supervised learning techniques. This approach allows models to learn gene