In [8]:
import re
import math
from sklearn.metrics.pairwise import cosine_similarity
import time
import json
import uuid
import openai
import PyPDF2
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance

def get_embedding(text, model):
    """
    Produce an embedding for a single text via the OpenAI API.

    Args:
        text: The text to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``"embedding"`` entry of the first item in the response's
        ``"data"`` list.
    """
    # The text is wrapped in a one-element list, so only the first result
    # of the response is relevant.
    api_response = openai.Embedding.create(model=model, input=[text])
    first_result = api_response["data"][0]
    return first_result["embedding"]

class SemanticChunking:
    """
    Split raw text into chunks in three stages: rule-based segmentation,
    embedding-driven merging of segments, and boundary adjustment between
    neighbouring chunks.

    Args:
        embedding_model: Name of the embedding model passed to the OpenAI API.
        window_size: Number of consecutive segments examined together when
            looking for split points.
        transfer_sentence_count: Number of leading sentence groups of a chunk
            that may be moved to the previous chunk during boundary adjustment.
        merge_char_limit: Sentences are merged into one segment while the
            merged length stays below this many characters.
        max_segment_chars: Segments and final chunks longer than this are
            split on word boundaries.
        min_segment_chars: Segments shorter than this (after stripping) are
            dropped.
        embedding_batch_size: Maximum number of texts sent in one embedding
            request.
    """

    # A sentence boundary is either "." / "!" preceded by an alphanumeric
    # character and followed (after optional whitespace) by an uppercase
    # letter, or the empty position right after a newline.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[a-zA-Z0-9])([.!])(?=\s*[A-Z])|(?<=\n)')

    def __init__(self, embedding_model, window_size, transfer_sentence_count,
                 *, merge_char_limit=750, max_segment_chars=10000,
                 min_segment_chars=3, embedding_batch_size=2048):
        self.embedding_model = embedding_model
        self.window_size = window_size
        self.transfer_sentence_count = transfer_sentence_count
        self.merge_char_limit = merge_char_limit
        self.max_segment_chars = max_segment_chars
        self.min_segment_chars = min_segment_chars
        self.embedding_batch_size = embedding_batch_size

    def split_segment_by_word_boundary(self, segment, max_length):
        """
        Split a segment on whitespace into parts of at most ``max_length``
        characters.

        A single word longer than ``max_length`` cannot be split on a word
        boundary and is returned as its own (over-long) part. Empty parts are
        never produced.

        Args:
            segment: Text to split.
            max_length: Maximum number of characters per part.

        Returns:
            List of non-empty parts, in original order.
        """
        parts = []
        current_part = ""
        for word in segment.split():
            if not current_part:
                # Starting a new part: always accept the word, otherwise an
                # over-long first word would flush an empty part.
                current_part = word
            elif len(current_part) + 1 + len(word) > max_length:
                parts.append(current_part)
                current_part = word
            else:
                current_part = f"{current_part} {word}"
        if current_part:
            parts.append(current_part)
        return parts

    def preprocess_segments(self, segments):
        """
        Clean up segments: strip whitespace, drop segments shorter than
        ``min_segment_chars`` and split segments longer than
        ``max_segment_chars`` on word boundaries.

        Args:
            segments: Iterable of segment strings.

        Returns:
            List of cleaned segments.
        """
        processed = []
        for seg in segments:
            seg = seg.strip()
            if len(seg) < self.min_segment_chars:
                continue
            if len(seg) > self.max_segment_chars:
                processed.extend(
                    self.split_segment_by_word_boundary(seg, self.max_segment_chars)
                )
            else:
                processed.append(seg)
        return processed

    def split_text_into_sentences(self, text: str) -> list:
        """
        Split text into sentences on punctuation and newlines, then merge
        consecutive sentences into groups of roughly ``merge_char_limit``
        characters.

        Args:
            text: Text to split.

        Returns:
            List of merged sentence groups. A single sentence longer than the
            limit is kept as its own group.
        """
        # With one capture group, re.split alternates text (even indices) and
        # the captured delimiter (odd indices; None for the newline branch).
        parts = self._SENTENCE_BOUNDARY.split(text)
        sentences = []
        for idx in range(0, len(parts), 2):
            delimiter = parts[idx + 1] if idx + 1 < len(parts) else None
            # Re-attach the punctuation mark that terminated this sentence.
            sentence = (parts[idx] + (delimiter or "")).strip()
            if sentence:
                sentences.append(sentence)

        merged_sentences = []
        buffer = ""
        for sentence in sentences:
            # The limit can be tuned to the document via merge_char_limit.
            if len(buffer) + len(sentence) < self.merge_char_limit:
                buffer = f"{buffer} {sentence}" if buffer else sentence
            else:
                if buffer:
                    merged_sentences.append(buffer)
                buffer = sentence
        if buffer:
            merged_sentences.append(buffer)
        return merged_sentences

    def rule_based_segmentation(self, text):
        """
        Split text into rule-based segments: sentence splitting followed by
        segment preprocessing.

        Args:
            text: Text to segment.

        Returns:
            List of preprocessed segments.
        """
        return self.preprocess_segments(self.split_text_into_sentences(text))

    def create_embeddings(self, texts: list) -> list:
        """
        Embed a list of texts with the configured OpenAI embedding model.

        Texts are sent in batches of at most ``embedding_batch_size`` items so
        that long documents do not exceed the per-request input limit. No
        request is made for an empty list.

        Args:
            texts: List of strings to embed.

        Returns:
            List of embeddings, in the order the API returned them for each
            batch (assumed to match input order).
        """
        batch_size = max(1, self.embedding_batch_size)
        embeddings = []
        for start in range(0, len(texts), batch_size):
            response = openai.Embedding.create(
                input=texts[start:start + batch_size],
                model=self.embedding_model,
            )
            embeddings.extend(item["embedding"] for item in response["data"])
        return embeddings

    def calculate_dynamic_threshold_from_divergences(self, divergences):
        """
        Compute the dynamic split threshold for one window.

        The threshold is ``mean + std * factor`` where the factor depends on
        whether the (population) standard deviation is small, medium or large.

        Args:
            divergences: Non-empty sequence of divergence values.

        Returns:
            The threshold as a float.

        Raises:
            ZeroDivisionError: If ``divergences`` is empty.
        """
        mean_div = sum(divergences) / len(divergences)
        variance = sum((d - mean_div) ** 2 for d in divergences) / len(divergences)
        std_div = math.sqrt(variance)
        if std_div < 0.1:
            factor = 1.4  # these multipliers are tunable
        elif std_div > 0.3:
            factor = 1.0
        else:
            factor = 1.2
        return mean_div + std_div * factor

    def semantic_merging(self, segments):
        """
        Merge segments into chunks based on the divergence (1 - cosine
        similarity) between embeddings of adjacent segments.

        A window of ``window_size`` segments slides over the input; in each
        window a dynamic threshold is computed and every adjacent pair whose
        divergence exceeds it marks a split point. Segments between split
        points are joined with a space.

        Args:
            segments: List of segment strings.

        Returns:
            List of chunks. Empty input yields an empty list. If there are
            fewer segments than ``window_size``, or the window is too small to
            contain a pair, all segments are joined into a single chunk.
        """
        n = len(segments)
        if n == 0:
            return []
        if self.window_size < 2 or n < self.window_size:
            return [" ".join(segments)]

        embeddings = self.create_embeddings(segments)
        # Each adjacent pair appears in several overlapping windows; compute
        # its divergence once and slice per window.
        divergences = [
            1 - cosine_similarity([left], [right])[0][0]
            for left, right in zip(embeddings, embeddings[1:])
        ]

        pairs_per_window = self.window_size - 1
        split_points = set()
        for window_start in range(n - self.window_size + 1):
            window = divergences[window_start:window_start + pairs_per_window]
            local_threshold = self.calculate_dynamic_threshold_from_divergences(window)
            for offset, divergence in enumerate(window):
                if divergence > local_threshold:
                    # The split falls before the second segment of the pair.
                    split_points.add(window_start + offset + 1)

        chunks = []
        last_split = 0
        for point in sorted(split_points):
            chunk = " ".join(segments[last_split:point])
            if chunk:
                chunks.append(chunk)
            last_split = point
        tail = " ".join(segments[last_split:])
        if tail:
            chunks.append(tail)
        return chunks

    def adjust_boundaries(self, chunks):
        """
        Adjust the boundaries between consecutive chunks.

        For every pair of neighbouring chunks, the first
        ``transfer_sentence_count`` sentence groups of the later chunk are
        taken as a candidate. If the candidate is more similar to the earlier
        chunk than to the remainder of its own chunk, it is moved to the end
        of the earlier chunk. Similarities are computed from the chunks as
        they were before any transfer.

        Args:
            chunks: List of chunk strings.

        Returns:
            A new list of adjusted chunks; the input list is not modified.
        """
        adjusted_chunks = list(chunks)
        count = self.transfer_sentence_count
        if count < 1:
            # Nothing to transfer; avoids embedding an empty candidate.
            return adjusted_chunks

        # (index of earlier chunk, candidate text, remainder of later chunk)
        proposals = []
        for i in range(len(adjusted_chunks) - 1):
            next_sentences = self.split_text_into_sentences(adjusted_chunks[i + 1])
            if len(next_sentences) <= count:
                # The later chunk would be left empty; skip this boundary.
                continue
            proposals.append((
                i,
                " ".join(next_sentences[:count]),
                " ".join(next_sentences[count:]),
            ))

        if not proposals:
            return adjusted_chunks

        candidate_embeddings = self.create_embeddings([c for _, c, _ in proposals])
        previous_embeddings = self.create_embeddings(
            [adjusted_chunks[i] for i, _, _ in proposals]
        )
        remainder_embeddings = self.create_embeddings([r for _, _, r in proposals])

        for (i, candidate_text, remainder), candidate_emb, prev_emb, next_emb in zip(
            proposals, candidate_embeddings, previous_embeddings, remainder_embeddings
        ):
            sim_prev = cosine_similarity([prev_emb], [candidate_emb])[0][0]
            sim_next = cosine_similarity([next_emb], [candidate_emb])[0][0]
            if sim_prev > sim_next:
                adjusted_chunks[i] = adjusted_chunks[i].strip() + " " + candidate_text
                adjusted_chunks[i + 1] = remainder
        return adjusted_chunks

    def create_documents(self, texts):
        """
        Turn texts into final chunks by running rule-based segmentation,
        semantic merging and boundary adjustment, then splitting any chunk
        longer than ``max_segment_chars`` on word boundaries.

        Args:
            texts: Iterable of document strings.

        Returns:
            Flat list of chunks from all texts, in order.
        """
        all_chunks = []
        for text in texts:
            segments = self.rule_based_segmentation(text)
            initial_chunks = self.semantic_merging(segments)
            for chunk in self.adjust_boundaries(initial_chunks):
                if len(chunk) > self.max_segment_chars:
                    all_chunks.extend(
                        self.split_segment_by_word_boundary(chunk, self.max_segment_chars)
                    )
                else:
                    all_chunks.append(chunk)
        return all_chunks


In [None]:
# RAG and Qdrant setup: load the API key, chunk the PDF, recreate the
# collection and upload one embedded point per chunk.

with open("config.json", "r") as config_handle:
    config = json.load(config_handle)
OPENAI_API_KEY = config["OPENAI_API_KEY"]
openai.api_key = OPENAI_API_KEY

collection_name = "hybrid-2-pdf-LLM-scipy_data_collection"
qdrant_client = QdrantClient(host="localhost", port=6333)

embedding_model_name = "text-embedding-3-large"

chunker = SemanticChunking(
    embedding_model=embedding_model_name,
    window_size=6,
    transfer_sentence_count=2,  # tunable
)

# Read the whole PDF into a single document, skipping pages that yield no text.
pdf_path = "Foundations of LLM.pdf"
with open(pdf_path, "rb") as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    extracted_pages = (page.extract_text() for page in reader.pages)
    full_text = [page_text for page_text in extracted_pages if page_text]
documents = ["\n".join(full_text)]

all_chunks = [
    chunk
    for doc in documents
    for chunk in chunker.create_documents([doc])
]

# Must match the dimensionality of the vectors produced by the embedding
# model above (presumably 3072 for this model -- verify if the model changes).
vector_size = 3072

# Start from a clean collection: drop it first if it already exists.
existing_collections = [
    collection.name for collection in qdrant_client.get_collections().collections
]
collection_exists = collection_name in existing_collections
if collection_exists:
    qdrant_client.delete_collection(collection_name)
    print(f"{collection_name} koleksiyonu silindi, yeniden oluşturuluyor.")

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
)
print(f"{collection_name} koleksiyonu başarıyla oluşturuldu.")

# One point per chunk: random UUID id, embedding vector, chunk text as payload.
points = [
    {
        "id": str(uuid.uuid4()),
        "vector": get_embedding(chunk, embedding_model_name),
        "payload": {"text": chunk},
    }
    for chunk in all_chunks
]

qdrant_client.upsert(
    collection_name=collection_name,
    points=points
)
print(f"{len(points)} adet semantic chunk başarıyla Qdrant koleksiyonuna eklendi.")

hybrid-2-pdf-LLM-scipy_data_collection koleksiyonu silindi, yeniden oluşturuluyor.
hybrid-2-pdf-LLM-scipy_data_collection koleksiyonu başarıyla oluşturuldu.
217 adet semantic chunk başarıyla Qdrant koleksiyonuna eklendi.


In [None]:
def test_rag_direct_qdrant(qdrant_client, collection_name, query):
    """
    Run one end-to-end RAG query and print the results.

    Steps:
    - Embed the query with the OpenAI API (model taken from the module-level
      ``embedding_model_name``).
    - Search the Qdrant collection for the 5 most similar points.
    - Join the retrieved texts into a context and send it, together with the
      question, to the chat completion model.
    - Print the sources used, the elapsed time and the model's answer.

    Args:
        qdrant_client: Client used to search the collection.
        collection_name: Name of the Qdrant collection to search.
        query: The question to ask.

    Returns:
        None. Returns early (after printing a message) when nothing is
        retrieved.
    """
    print(f"\nQuery: {query}")
    started_at = time.time()

    hits = qdrant_client.search(
        collection_name=collection_name,
        query_vector=get_embedding(query, embedding_model_name),
        limit=5,
        with_payload=True
    )
    sources = [hit.payload["text"] for hit in hits]

    # Guard clause: nothing to build a context from.
    if len(sources) == 0:
        print("Qdrant'tan eşleşen belge bulunamadı.")
        return

    print("\nQdrant'tan Kullanılan Kaynaklar:")
    for rank, source in enumerate(sources, 1):
        print(f"{rank}. (Lenghth: {len(source)}) {source[:100]}...")

    context = "\n".join(sources)
    chat_messages = [
        {"role": "system", "content": "You are an AI assistant that provides precise answers based on the given context."},
        {"role": "user", "content": f"Context: {context}\nQuestion: {query}\nAnswer: "},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0.3,
        max_tokens=300
    )

    # The timer covers embedding, retrieval and generation together.
    elapsed = round(time.time() - started_at, 3)
    print(f"\nInference zamanı: {elapsed} saniye")
    print(f"\nModelin cevabı:\n{response.choices[0].message.content}")

In [10]:
# Run the RAG test helper against the collection with a definition-style question.
query1 = "What is pre-training?"
test_rag_direct_qdrant(qdrant_client, collection_name, query1)


Query: What is pre-training?

Qdrant'tan Kullanılan Kaynaklar:
1. (Lenghth: 4031) In this section, we discuss the basic ideas in addressing the se issues. 1.1.1 Unsupervised, Supervi...
2. (Lenghth: 3053) Such a met hod has been successfully used in sev- eral NLP areas, such as word sense disambiguation ...
3. (Lenghth: 4191) 4.4.4 Step-by-step Alignment . . . . . . . . . . . . . . . . . . . . . . . . . . 198 4.4.5 Inference...
4. (Lenghth: 2126) ample, we randomly choose one of them for each training sampl e. In practice, the outcome of encoder...
5. (Lenghth: 4906) different problems together, with the beneﬁt of training a s ingle model that can perform many tasks...

Inference zamanı: 3.477 saniye

Modelin cevabı:
Pre-training refers to the process of optimizing a neural network before it is further trained or tuned for specific tasks. It is based on the assumption that a model pre-trained on one task can be adapted to perform another task, reducing the need for extensive labeled d

In [11]:
# Run the RAG test helper with a question about model types used in NLP pre-training.
query2 = "Which types of models are widely used in NLP pre-training?"
test_rag_direct_qdrant(qdrant_client, collection_name, query2)


Query: Which types of models are widely used in NLP pre-training?

Qdrant'tan Kullanılan Kaynaklar:
1. (Lenghth: 3053) Such a met hod has been successfully used in sev- eral NLP areas, such as word sense disambiguation ...
2. (Lenghth: 2186) As an example, BERT is used to illus trate how sequence models are pre- trained via masked language ...
3. (Lenghth: 4834) This is often achieved via dynamic programming, which, in th e context of path ﬁnding over a lattice...
4. (Lenghth: 3459) goals and methods of language modeling have remained largel y unchanged over the decades since then....
5. (Lenghth: 4191) 4.4.4 Step-by-step Alignment . . . . . . . . . . . . . . . . . . . . . . . . . . 198 4.4.5 Inference...

Inference zamanı: 3.25 saniye

Modelin cevabı:
The two major types of models widely used in NLP pre-training are:

1. **Sequence Encoding Models**: These models represent a sequence of words or tokens as either a real-valued vector or a sequence of vectors, which is typically used a

In [12]:
# Run the RAG test helper with a "how to" question about permuted language modelling.
query3 = "How do we implement permuted language modelling?"
test_rag_direct_qdrant(qdrant_client, collection_name, query3)


Query: How do we implement permuted language modelling?

Qdrant'tan Kullanılan Kaynaklar:
1. (Lenghth: 5429) sense to predict any of the tokens in this sequence. 1.2.2.1 Masked Language Modeling One of the mos...
2. (Lenghth: 2895) ceive a sequence of tokens x0,...,x i−1and produce a distribution over the vocabulary V(de- noted by...
3. (Lenghth: 1439) ↓ ↓ ↓ ↓ ↓ ↓ ↓ ↓ ↓ ↓ ↓ ↓ encoding: h0h1h2 h3h4h5h6h7h8 h9 h10h11 ↓ Softmax ↓ Is Next or Not? 1.2 Self...
4. (Lenghth: 3459) goals and methods of language modeling have remained largel y unchanged over the decades since then....
5. (Lenghth: 3090) number of tasks and generalize to perform new tasks with a sma ll adaptation effort [ Bubeck et al. ...

Inference zamanı: 7.857 saniye

Modelin cevabı:
Permuted language modeling involves making sequential predictions of tokens in a non-linear order, allowing for predictions to occur in any sequence rather than strictly following the natural order of the text. Here’s how to implement permuted l

In [15]:
# Run the RAG test helper with a question about large-scale pre-training.
query4 = "What is the large-scale pre-training of the document?"
test_rag_direct_qdrant(qdrant_client, collection_name, query4)


Query: What is the large-scale pre-training of the document?

Qdrant'tan Kullanılan Kaynaklar:
1. (Lenghth: 4031) In this section, we discuss the basic ideas in addressing the se issues. 1.1.1 Unsupervised, Supervi...
2. (Lenghth: 6359) The training task is itself standard: the objective is to max imize the likelihood, which can be ach...
3. (Lenghth: 3053) Such a met hod has been successfully used in sev- eral NLP areas, such as word sense disambiguation ...
4. (Lenghth: 2186) As an example, BERT is used to illus trate how sequence models are pre- trained via masked language ...
5. (Lenghth: 4191) 4.4.4 Step-by-step Alignment . . . . . . . . . . . . . . . . . . . . . . . . . . 198 4.4.5 Inference...

Inference zamanı: 6.226 saniye

Modelin cevabı:
Large-scale pre-training refers to the process of training neural networks, particularly in natural language processing (NLP), on vast amounts of unlabeled data using self-supervised learning techniques. This approach allows models to learn