In [1]:
import re
import math
from sklearn.metrics.pairwise import cosine_similarity
import time

class SemanticChunking:
    """Split raw text into chunks whose boundaries follow embedding divergence.

    The pipeline run by :meth:`create_documents` is:

    1. rule-based sentence splitting and merging into short segments,
    2. sliding-window comparison of adjacent segment embeddings to pick
       split points,
    3. an optional transfer of the leading sentences of each chunk to the
       previous chunk when they are more similar to it,
    4. a final hard cap on chunk length.

    ``embeddings`` must expose ``embed_documents(list[str]) -> list[vector]``.
    ``similarity_threshold`` is stored but not read by any method of this
    class; thresholds are computed per window instead.
    """

    # Segments shorter than this (after stripping) are dropped as noise.
    MIN_SEGMENT_LENGTH = 3
    # Hard upper bound, in characters, for a segment and for a final chunk.
    MAX_SEGMENT_LENGTH = 10000
    # Consecutive sentences are merged while their combined length stays
    # below this value; it can be tuned depending on the PDF length.
    MERGE_BUFFER_LENGTH = 1000

    # Split after "." or "!" that follows an ASCII alphanumeric character and
    # precedes (optional whitespace and) an uppercase ASCII letter, and also
    # right after every newline. The punctuation is captured so that it can
    # be re-attached to the sentence it terminates.
    _SENTENCE_SPLIT_RE = re.compile(r'(?<=[a-zA-Z0-9])([.!])(?=\s*[A-Z])|(?<=\n)')
    _DELIMITER_RE = re.compile(r'^[.!]$')

    def __init__(self, embeddings, similarity_threshold, window_size, transfer_sentence_count):
        self.embeddings = embeddings
        self.similarity_threshold = similarity_threshold
        self.window_size = window_size
        self.transfer_sentence_count = transfer_sentence_count

    def split_segment_by_word_boundary(self, segment, max_length):
        """Split ``segment`` into parts of at most ``max_length`` characters.

        Parts are cut at whitespace. A single word longer than ``max_length``
        is cut into ``max_length``-sized pieces so the limit always holds.
        No empty parts are produced.

        Raises:
            ValueError: if ``max_length`` is smaller than 1.
        """
        if max_length < 1:
            raise ValueError("max_length must be a positive integer")

        parts = []
        current_part = ""
        for word in segment.split():
            # A word that cannot fit in any part on its own is hard-split.
            while len(word) > max_length:
                if current_part:
                    parts.append(current_part)
                    current_part = ""
                parts.append(word[:max_length])
                word = word[max_length:]

            if not current_part:
                current_part = word
            elif len(current_part) + 1 + len(word) > max_length:
                # "+ 1" accounts for the joining space.
                parts.append(current_part)
                current_part = word
            else:
                current_part = f"{current_part} {word}"
        if current_part:
            parts.append(current_part)
        return parts

    def preprocess_segments(self, segments):
        """Strip segments, drop very short ones and split oversize ones."""
        processed = []
        for seg in segments:
            seg = seg.strip()
            if len(seg) < self.MIN_SEGMENT_LENGTH:
                continue
            if len(seg) > self.MAX_SEGMENT_LENGTH:
                processed.extend(
                    self.split_segment_by_word_boundary(seg, self.MAX_SEGMENT_LENGTH)
                )
            else:
                processed.append(seg)
        return processed

    def split_text_into_sentences(self, text: str) -> list:
        """Split ``text`` into sentences, then merge them into short segments.

        Sentences are detected with ``_SENTENCE_SPLIT_RE``. Consecutive
        sentences are then joined with a single space for as long as the
        combined length stays below ``MERGE_BUFFER_LENGTH``.
        """
        # The newline alternative has no capture, which yields None entries.
        temp_parts = [
            part if part is not None else ""
            for part in self._SENTENCE_SPLIT_RE.split(text)
        ]

        reattached_sentences = []
        i = 0
        while i < len(temp_parts):
            chunk = temp_parts[i]
            # Re-attach the captured terminator to the sentence before it.
            if i + 1 < len(temp_parts) and self._DELIMITER_RE.match(temp_parts[i + 1]):
                chunk += temp_parts[i + 1]
                i += 1
            chunk = chunk.strip()
            if chunk:
                reattached_sentences.append(chunk)
            i += 1

        merged_sentences = []
        buffer = ""
        for sentence in reattached_sentences:
            if len(buffer) + len(sentence) < self.MERGE_BUFFER_LENGTH:
                buffer = f"{buffer} {sentence}" if buffer else sentence
            else:
                if buffer:
                    merged_sentences.append(buffer)
                buffer = sentence
        if buffer:
            merged_sentences.append(buffer)
        return merged_sentences

    def rule_based_segmentation(self, text):
        """Return the cleaned, length-bounded segments of ``text``."""
        segments = self.split_text_into_sentences(text)
        return self.preprocess_segments(segments)

    def create_embeddings(self, texts: list) -> list:
        """Embed ``texts`` with the configured embedding backend."""
        return self.embeddings.embed_documents(texts)

    def calculate_dynamic_threshold_from_divergences(self, divergences):
        """Compute a dynamic split threshold for one window of divergences.

        The mean and (population) standard deviation of the divergences are
        computed, and the threshold is ``mean + std * factor`` where the
        factor depends on whether the standard deviation is small, medium or
        large. An empty input yields ``math.inf`` so that nothing can exceed
        the threshold.
        """
        if not divergences:
            return math.inf

        mean_div = sum(divergences) / len(divergences)
        variance = sum((d - mean_div) ** 2 for d in divergences) / len(divergences)
        std_div = math.sqrt(variance)
        # The multipliers below are tunable.
        if std_div < 0.1:
            factor = 1.5
        elif std_div > 0.3:
            factor = 1.0
        else:
            factor = 1.25
        return mean_div + std_div * factor

    def semantic_merging(self, segments):
        """Group ``segments`` into chunks, splitting where divergence peaks.

        A window of ``window_size`` segments slides over the input. Inside
        each window, a boundary between two adjacent segments becomes a split
        point when its divergence (``1 - cosine similarity``) exceeds that
        window's dynamic threshold. Split points from all windows are united.

        Returns an empty list for empty input, and a single joined chunk when
        there are fewer segments than ``window_size`` or when the window is
        too small to contain a pair of segments.
        """
        n = len(segments)
        if n == 0:
            return []
        if self.window_size < 2 or n < self.window_size:
            return [" ".join(segments)]

        embeddings = self.create_embeddings(segments)

        # Each adjacent pair appears in several overlapping windows, so its
        # divergence is computed once here and sliced per window below.
        divergences = [
            1 - cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
            for i in range(n - 1)
        ]

        split_points = set()
        pairs_per_window = self.window_size - 1
        for window_start in range(n - self.window_size + 1):
            window_divergences = divergences[window_start:window_start + pairs_per_window]
            local_threshold = self.calculate_dynamic_threshold_from_divergences(
                window_divergences
            )
            for offset, div in enumerate(window_divergences):
                if div > local_threshold:
                    # Split before the second segment of the diverging pair.
                    split_points.add(window_start + offset + 1)

        chunks = []
        last_split = 0
        for point in sorted(split_points):
            chunk = " ".join(segments[last_split:point])
            if chunk:
                chunks.append(chunk)
            last_split = point
        tail = " ".join(segments[last_split:])
        if tail:
            chunks.append(tail)
        return chunks

    def adjust_boundaries(self, chunks):
        """Fine-tune the boundaries between consecutive chunks.

        For each pair of neighbouring chunks, the first
        ``transfer_sentence_count`` sentences of the later chunk are taken as
        a candidate. The candidate is compared with the earlier chunk and
        with the remainder of the later chunk; if it is more similar to the
        earlier chunk, it is moved there.

        Chunks with no more than ``transfer_sentence_count`` sentences are
        left untouched. The input list is not modified.

        NOTE: all embeddings are computed from the chunks as they were before
        any transfer, so a chunk that already gave away its leading sentences
        is still represented by its original text when it acts as the
        "earlier" chunk.
        """
        adjusted_chunks = list(chunks)
        count = self.transfer_sentence_count
        if count < 1 or len(adjusted_chunks) < 2:
            return adjusted_chunks

        indices = []
        candidate_texts = []
        previous_texts = []
        remainder_texts = []
        for i in range(len(adjusted_chunks) - 1):
            next_sentences = self.split_text_into_sentences(adjusted_chunks[i + 1])
            if len(next_sentences) <= count:
                continue
            indices.append(i)
            candidate_texts.append(" ".join(next_sentences[:count]))
            previous_texts.append(adjusted_chunks[i])
            remainder_texts.append(" ".join(next_sentences[count:]))

        if not candidate_texts:
            return adjusted_chunks

        candidate_embeddings = self.create_embeddings(candidate_texts)
        previous_embeddings = self.create_embeddings(previous_texts)
        remainder_embeddings = self.create_embeddings(remainder_texts)

        for idx, i in enumerate(indices):
            candidate_emb = candidate_embeddings[idx]
            sim_prev = cosine_similarity([previous_embeddings[idx]], [candidate_emb])[0][0]
            sim_next = cosine_similarity([remainder_embeddings[idx]], [candidate_emb])[0][0]

            if sim_prev > sim_next:
                # Chunk i + 1 has not been modified yet at this point, so the
                # candidate/remainder texts collected above are still valid.
                adjusted_chunks[i] = adjusted_chunks[i].strip() + " " + candidate_texts[idx]
                adjusted_chunks[i + 1] = remainder_texts[idx]
        return adjusted_chunks

    def create_documents(self, texts):
        """Run the full chunking pipeline on every text in ``texts``.

        Returns one flat list with the chunks of all texts, in order. Chunks
        longer than ``MAX_SEGMENT_LENGTH`` are split at word boundaries.
        """
        all_chunks = []
        for text in texts:
            segments = self.rule_based_segmentation(text)
            initial_chunks = self.semantic_merging(segments)
            adjusted_chunks = self.adjust_boundaries(initial_chunks)
            for chunk in adjusted_chunks:
                if len(chunk) > self.MAX_SEGMENT_LENGTH:
                    all_chunks.extend(
                        self.split_segment_by_word_boundary(chunk, self.MAX_SEGMENT_LENGTH)
                    )
                else:
                    all_chunks.append(chunk)
        return all_chunks


In [None]:
# RAG ve Qdrant işlemleri
import json
import uuid
import openai
import PyPDF2
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance
from langchain.embeddings.openai import OpenAIEmbeddings

# Load the OpenAI API key from the local configuration file.
with open("config.json", "r", encoding="utf-8") as config_file:
    config = json.load(config_file)
OPENAI_API_KEY = config["OPENAI_API_KEY"]
# Configure the `openai` module as well: the chat-completion call made by
# test_rag_direct_qdrant goes through it and is not given the key otherwise.
openai.api_key = OPENAI_API_KEY

collection_name = "hybrid-2-pdf-LLM-scipy_data_collection"
qdrant_client = QdrantClient(host="localhost", port=6333)

embedding_model_name = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=embedding_model_name)

chunker = SemanticChunking(
    embeddings,
    similarity_threshold=None,
    window_size=6,
    transfer_sentence_count=2,  # tunable
)

# Extract the text of every page and treat the whole PDF as one document.
pdf_path = "Foundations of LLM.pdf"
with open(pdf_path, "rb") as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    full_text = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text.append(page_text)
documents = ["\n".join(full_text)]

all_chunks = []
for doc in documents:
    chunks = chunker.create_documents([doc])
    all_chunks.extend(chunks)

# Drop empty chunks: they carry no content to index.
all_chunks = [chunk for chunk in all_chunks if chunk and chunk.strip()]
if not all_chunks:
    raise ValueError(f"No text chunks could be extracted from {pdf_path!r}")

# Embed everything BEFORE touching the collection, in one batched call, so a
# failing embedding request cannot leave the existing collection deleted.
vectors = embeddings.embed_documents(all_chunks)

# Size the collection from the vectors actually returned instead of a
# hard-coded number, so it always matches the configured embedding model.
vector_size = len(vectors[0])

existing_collections = [c.name for c in qdrant_client.get_collections().collections]
if collection_name in existing_collections:
    qdrant_client.delete_collection(collection_name)
    print(f"{collection_name} koleksiyonu silindi, yeniden oluşturuluyor.")

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
)
print(f"{collection_name} koleksiyonu başarıyla oluşturuldu.")

points = [
    {
        "id": str(uuid.uuid4()),
        "vector": vector,
        "payload": {"text": chunk},
    }
    for chunk, vector in zip(all_chunks, vectors)
]

qdrant_client.upsert(
    collection_name=collection_name,
    points=points
)
print(f"{len(points)} adet semantic chunk başarıyla Qdrant koleksiyonuna eklendi.")

def test_rag_direct_qdrant(qdrant_client, collection_name, query):
    """Answer ``query`` with context retrieved from a Qdrant collection.

    The query is embedded with the module-level ``embeddings`` object, the
    five nearest points of ``collection_name`` are fetched, and their
    ``"text"`` payloads are sent as context to the chat model. The sources,
    the elapsed time and the model's answer are printed; nothing is returned.
    """
    print(f"\nSorgu: {query}")
    started_at = time.time()

    hits = qdrant_client.search(
        collection_name=collection_name,
        query_vector=embeddings.embed_query(query),
        limit=5,
        with_payload=True,
    )

    retrieved_docs = []
    for hit in hits:
        retrieved_docs.append(hit.payload["text"])

    # Nothing retrieved: report it and stop before calling the chat model.
    if len(retrieved_docs) == 0:
        print("Qdrant'tan eşleşen belge bulunamadı.")
        return

    print("\nQdrant'tan Kullanılan Kaynaklar:")
    rank = 0
    for source_text in retrieved_docs:
        rank += 1
        print(f"{rank}. (Uzunluk: {len(source_text)}) {source_text[:100]}...")

    context = "\n".join(retrieved_docs)
    system_message = {
        "role": "system",
        "content": "You are an AI assistant that provides precise answers based on the given context.",
    }
    user_message = {
        "role": "user",
        "content": f"Context: {context}\nQuestion: {query}\nAnswer: ",
    }
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        temperature=0.5,
        max_tokens=300,
    )

    elapsed = time.time() - started_at
    print(f"\nInference zamanı: {round(elapsed, 3)} saniye")
    print(f"\nModelin cevabı:\n{response.choices[0].message.content}")


hybrid-2-pdf-LLM-scipy_data_collection koleksiyonu silindi, yeniden oluşturuluyor...
hybrid-2-pdf-LLM-scipy_data_collection koleksiyonu başarıyla oluşturuldu.
136 adet semantic chunk başarıyla Qdrant koleksiyonuna eklendi.


In [4]:
# Sample question 1: retrieve context from the collection and print the answer.
query1 = "What is pre-training?"
test_rag_direct_qdrant(qdrant_client, collection_name, query1)


Sorgu: What is pre-training?

Qdrant'tan Kullanılan Kaynaklar:
1. (Uzunluk: 2574) neural network with parameters θ, and odenotes the output of the neural network. Different problem s...
2. (Uzunluk: 2710) forgetting problem in continual training, where a neural network forge ts previously learned in- for...
3. (Uzunluk: 5110) The training objective can be deﬁned as (ˆθ,ˆω) = arg max θ,ωLoss(Model θ,ω(xnoise),x) (1.16) Here t...
4. (Uzunluk: 4839) The ﬁne-tuned model is then employed to classify new sequences for this task. An advantage of superv...
5. (Uzunluk: 965) 1Here we assume that tokens are basic units of text that are sep arated through tokenization. Someti...

Inference zamanı: 2.391 saniye

Modelin cevabı:
Pre-training refers to the process of optimizing a neural network on a task or set of tasks before it is further trained or fine-tuned for specific downstream tasks. The goal of pre-training is to create a model that can generalize across various tasks, reducing the relian

In [5]:
# Sample question 2: retrieve context from the collection and print the answer.
query2 = "Which types of models are widely used in NLP pre-training?"
test_rag_direct_qdrant(qdrant_client, collection_name, query2)


Sorgu: Which types of models are widely used in NLP pre-training?

Qdrant'tan Kullanılan Kaynaklar:
1. (Uzunluk: 965) 1Here we assume that tokens are basic units of text that are sep arated through tokenization. Someti...
2. (Uzunluk: 3190) Parameter-efﬁcient transfer learning for NLP. In Proceedings of the 36th International Conference on...
3. (Uzunluk: 5918) the discussion of these topics to the following chapters. CHAPTER 2 Generative Models One of the mos...
4. (Uzunluk: 3861) The use of these ex- amples does not distinguish between models, but we mark the m odel architecture...
5. (Uzunluk: 8279) in work using shared vocabularies, specifying the language to which a token belongs is not necessary...

Inference zamanı: 2.428 saniye

Modelin cevabı:
The types of models widely used in NLP pre-training include:

1. **BERT (Bidirectional Encoder Representations from Transformers)** - A Transformer encoder trained using masked language modeling and next sentence prediction tasks.
2. **

In [6]:
# Sample question 3: retrieve context from the collection and print the answer.
query3 = "How do we implement permuted language modelling?"
test_rag_direct_qdrant(qdrant_client, collection_name, query3)


Sorgu: How do we implement permuted language modelling?

Qdrant'tan Kullanılan Kaynaklar:
1. (Uzunluk: 5918) the discussion of these topics to the following chapters. CHAPTER 2 Generative Models One of the mos...
2. (Uzunluk: 8453) The approach described above provides a new framework of uni versal language understanding and gener...
3. (Uzunluk: 7725) sense to predict any of the tokens in this sequence. 1.2.2.1 Masked Language Modeling One of the mos...
4. (Uzunluk: 5839) architecture to adapt LLMs to large-scale training. In Sect ion2.2we will present more discussions o...
5. (Uzunluk: 5843) ↓ Is Next or Not? 1.2 Self-supervised Pre-training Tasks 13 x0 x0x1 x1x2 x2x3 x3x4 x4Pr(x0) = 1 Pr(x...

Inference zamanı: 3.813 saniye

Modelin cevabı:
Permuted language modeling involves making sequential predictions of tokens in a non-linear order, rather than following the natural left-to-right or right-to-left order of the text. Here’s how to implement permuted language modeling:

1. **Toke

In [7]:
# Sample question 4: retrieve context from the collection and print the answer.
query4 = "What is the large-scale pre-training of the document?"
test_rag_direct_qdrant(qdrant_client, collection_name, query4)


Sorgu: What is the large-scale pre-training of the document?

Qdrant'tan Kullanılan Kaynaklar:
1. (Uzunluk: 2710) forgetting problem in continual training, where a neural network forge ts previously learned in- for...
2. (Uzunluk: 965) 1Here we assume that tokens are basic units of text that are sep arated through tokenization. Someti...
3. (Uzunluk: 4869) scaling laws for LLMs, which help us understand their traini ng efﬁciency and effectiveness. 2.2.1 D...
4. (Uzunluk: 9996) D dataset used for training or ﬁne-tuning a model ∂L ∂θgradient of the loss function Lwith respect t...
5. (Uzunluk: 9182) example, in He et al. [2021 ]’s work, a 1.5 billion-parameter BERT-like model is built b y increasin...

Inference zamanı: 2.315 saniye

Modelin cevabı:
The large-scale pre-training discussed in the document refers to the process of training neural sequence models, particularly in natural language processing (NLP), on vast amounts of unlabeled data using self-supervised learning techniques. 