In [None]:
import json
import uuid
import time
import openai
import PyPDF2
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance
from langchain.embeddings.openai import OpenAIEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer

# Helper function that extracts keywords using TF-IDF
def extract_keywords_tfidf(text, top_n: int = 5):
    """Return up to ``top_n`` highest-scoring TF-IDF terms of ``text``.

    The vectorizer is fitted on ``text`` alone (a one-document corpus) with
    English stop words removed.

    Args:
        text: Text to extract keywords from.
        top_n: Maximum number of keywords to return.

    Returns:
        A list of terms ordered from highest to lowest TF-IDF score. The list
        is empty when ``text`` is blank, when ``top_n`` is not positive, or
        when no term is left to score.
    """
    # Nothing to extract: return before fitting a vectorizer at all.
    if top_n <= 0 or not text or not text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn reports an empty vocabulary (e.g. text made only of
        # stop words) by raising ValueError instead of returning an empty
        # matrix, so a length check on the scores cannot catch this case.
        return []

    terms = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]
    # Indices of the highest-scoring terms, best first
    best_idx = scores.argsort()[::-1][:top_n]
    return [terms[i] for i in best_idx]


# Load configuration
with open("config.json", "r", encoding="utf-8") as config_file:
    config = json.load(config_file)
OPENAI_API_KEY = config["OPENAI_API_KEY"]
# test_rag_hybrid_qdrant calls openai.ChatCompletion.create through the
# module-level client, which does not see the key handed to OpenAIEmbeddings
# below; without this line it only works if the key is configured elsewhere.
openai.api_key = OPENAI_API_KEY

# Qdrant and embedding settings
collection_name = "semantic-tfidf-LLM-data_collection"
qdrant_client = QdrantClient(host="localhost", port=6333)
embedding_model_name = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY, model=embedding_model_name
)

# Example SemanticChunking class (your own implementation)
chunker = SemanticChunking()

# Pull data from PDF: concatenate the text of every page that yields any
pdf_path = "Foundations of LLM.pdf"
documents = []
with open(pdf_path, "rb") as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    full_text = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            full_text.append(page_text)
    documents = ["\n".join(full_text)]

# Split into chunks
all_chunks = []
for doc in documents:
    chunks = chunker.create_documents([doc])
    all_chunks.extend(chunks)

# Ask the model for its dimensionality instead of hard-coding it: the
# collection size must equal the length of the vectors that get upserted, and
# text-embedding-3-large returns 3072 values by default (not 1536).
vector_size = len(embeddings.embed_query("vector size probe"))

# If the collection already exists, delete it and recreate
existing_collections = [c.name for c in qdrant_client.get_collections().collections]
if collection_name in existing_collections:
    qdrant_client.delete_collection(collection_name)
    print(f"{collection_name} collection deleted, recreating...")

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
)
print(f"{collection_name} collection successfully created.")

# While inserting chunks into Qdrant, also attach TF-IDF keywords to the payload
points = []
for chunk in all_chunks:
    vector = embeddings.embed_query(chunk)

    # Extract keywords from the chunk content using TF-IDF
    chunk_keywords = extract_keywords_tfidf(chunk, top_n=5)

    point = {
        "id": str(uuid.uuid4()),
        "vector": vector,
        "payload": {"text": chunk, "keywords": chunk_keywords},
    }
    points.append(point)

# Skip the request entirely when the chunker produced nothing to store
if points:
    qdrant_client.upsert(collection_name=collection_name, points=points)
print(f"{len(points)} semantic chunks successfully added to Qdrant collection.")


def test_rag_hybrid_qdrant(qdrant_client, collection_name, query: str):
    """Answer ``query`` with RAG over Qdrant using hybrid ranking.

    The query is embedded with the module-level ``embeddings`` object, the 10
    nearest points are fetched, re-ranked by
    ``0.7 * vector_score + 0.3 * keyword_match`` and the texts of the best 5
    are passed as context to ``gpt-4o-mini``. Sources, elapsed time and the
    model's answer are printed.

    Args:
        qdrant_client: Client used to run the vector search.
        collection_name: Name of the collection to search.
        query: Natural-language question.

    Returns:
        None. Everything is reported via ``print``.
    """
    print(f"\nQuery: {query}")
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    # Compute the query embedding
    query_vector = embeddings.embed_query(query)

    # Extract keywords from the query with TF-IDF (top 3 words); lowercase
    # once here instead of on every comparison below
    query_keywords = [kw.lower() for kw in extract_keywords_tfidf(query, top_n=3)]

    # Retrieve 10 candidate results from Qdrant by vector similarity
    search_results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=10,
        with_payload=True,
    )

    # Post-processing: compute a keyword-match score for each document
    re_ranked = []
    for result in search_results:
        # Similarity score returned by Qdrant (cosine for the collection
        # created in this file)
        cosine_score = result.score
        # A point may carry no payload at all, or one without these keys
        payload = result.payload or {}
        text = payload.get("text")
        if not text:
            # Without text the point cannot contribute to the context
            continue

        doc_keywords = [kw.lower() for kw in (payload.get("keywords") or [])]

        # Share of query keywords that occur inside any document keyword (0-1)
        match_count = sum(
            any(kw in doc_kw for doc_kw in doc_keywords) for kw in query_keywords
        )
        keyword_score = match_count / len(query_keywords) if query_keywords else 0

        # Hybrid score: 0.7 cosine, 0.3 keyword match
        # Additionally keep the raw cosine score
        final_score = 0.7 * cosine_score + 0.3 * keyword_score
        re_ranked.append((final_score, cosine_score, text))

    # Sort results in descending order of final_score
    re_ranked.sort(key=lambda x: x[0], reverse=True)
    top_results = re_ranked[:5]

    if not top_results:
        print("No matching document found in Qdrant.")
        return

    # The context variable is built only from the document texts
    context = "\n".join([doc for _, _, doc in top_results])
    print("\nSources used from Qdrant (hybrid ranking):")
    for idx, (final_score, cosine_score, doc) in enumerate(top_results, start=1):
        print(
            f"{idx}. (Hybrid Score: {final_score:.3f}, Cosine: {cosine_score:.3f}, "
            f"Length: {len(doc)}) {doc[:100]}..."
        )

    input_text = f"Context: {context}\nQuestion: {query}\nAnswer: "
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are an AI assistant that provides precise answers based on the given context.",
            },
            {"role": "user", "content": input_text},
        ],
        temperature=0.5,
        max_tokens=300,
    )

    end_time = time.perf_counter()
    print(f"\nInference time: {round(end_time - start_time, 3)} seconds")
    print(f"\nModel's answer:\n{response.choices[0].message.content}")


semantic-tfidf-LLM-data_collection koleksiyonu silindi, yeniden oluşturuluyor...
semantic-tfidf-LLM-data_collection koleksiyonu başarıyla oluşturuldu.
136 adet semantic chunk başarıyla Qdrant koleksiyonuna eklendi.


In [3]:
# Transformer question, asked before the hand-written chunks further down in
# this file are added to the collection.
query7 = "What are the key features of the Transformer architecture?"
test_rag_hybrid_qdrant(qdrant_client, collection_name, query7)


Sorgu: What are the key features of the Transformer architecture?

Qdrant'tan Kullanılan Kaynaklar (hybrid sıralama ile):
1. (Hybrid Score: 0.554, Cosine: 0.792, Uzunluk: 6730) verge at certain points during optimization. The training o f LLMs is generally inﬂuenced by many fa...
2. (Hybrid Score: 0.549, Cosine: 0.784, Uzunluk: 3153) This can be expressed as Output = Merge(head 1,...,head τ)Whead(2.70) where head j∈Rdhis computed us...
3. (Hybrid Score: 0.547, Cosine: 0.781, Uzunluk: 4147) An example 74 Generative Models of this approach is compressive Transformer [ Rae et al. ,2019 ]. It...
4. (Hybrid Score: 0.545, Cosine: 0.779, Uzunluk: 1907) [Elsken et al., 2019] Thomas Elsken, Jan Hendrik Metzen, and Frank Hutter. Neural architecture searc...
5. (Hybrid Score: 0.544, Cosine: 0.778, Uzunluk: 4824) So they cannot cover functions with inﬂection points, such as double descent cur ves. In response, r...

Inference zamanı: 7.497 saniye

Modelin cevabı:
The key features of the Transform

In [4]:
# Positional-encoding question, asked before the hand-written chunks further
# down in this file are added to the collection.
query10 = "What is positional encoding?"
test_rag_hybrid_qdrant(qdrant_client, collection_name, query10)


Sorgu: What is positional encoding?

Qdrant'tan Kullanılan Kaynaklar (hybrid sıralama ile):
1. (Hybrid Score: 0.744, Cosine: 0.849, Uzunluk: 5334) In this case, the embedding at position ican be expressed as ei=xi+ PE(i) (2.74) where xi∈Rddenotes ...
2. (Hybrid Score: 0.718, Cosine: 0.811, Uzunluk: 1108) However, Press et al. [2022 ] found that setting βto values decreasing geometrically by a factor of1...
3. (Hybrid Score: 0.711, Cosine: 0.802, Uzunluk: 7737) is the rotation matrix. If two or more rotations are performe d on the same vector, we can rotate th...
4. (Hybrid Score: 0.571, Cosine: 0.816, Uzunluk: 2879) positions are represented as combinations of sine and cosin e functions with different frequencies. ...
5. (Hybrid Score: 0.557, Cosine: 0.796, Uzunluk: 3860) i+1for short). Suppose we have the gold- standard distribution at the same position, denoted by pgol...

Inference zamanı: 8.732 saniye

Modelin cevabı:
Positional encoding is a technique used in neural network model

In [5]:
# Semantic-chunking question, asked before the hand-written chunks further
# down in this file are added to the collection.
query12 = "What is semantic chunking and how does it improve search efficiency?"
test_rag_hybrid_qdrant(qdrant_client, collection_name, query12)


Sorgu: What is semantic chunking and how does it improve search efficiency?

Qdrant'tan Kullanılan Kaynaklar (hybrid sıralama ile):
1. (Hybrid Score: 0.637, Cosine: 0.767, Uzunluk: 3191) For example, LLMs can access real-time data from ﬁnancial markets to prov ide up-to-date investment ...
2. (Hybrid Score: 0.546, Cosine: 0.780, Uzunluk: 3190) Parameter-efﬁcient transfer learning for NLP. In Proceedings of the 36th International Conference on...
3. (Hybrid Score: 0.544, Cosine: 0.777, Uzunluk: 3393) Long Papers) , pages 86–96, 2016. [Seo et al., 2017] Minjoon Seo, Aniruddha Kembhavi, Ali Farh adi, ...
4. (Hybrid Score: 0.543, Cosine: 0.776, Uzunluk: 5572) This moti vates researchers to develop new evaluation benchmarks and metrics for long-context LLMs. ...
5. (Hybrid Score: 0.541, Cosine: 0.773, Uzunluk: 3900) it remains challenging to effectively prompt LLMs. Note tha t if we face a very difﬁcult classiﬁca- ...

Inference zamanı: 5.441 saniye

Modelin cevabı:
Semantic chunking is th

In [6]:
import uuid

# Add three hand-written chunks to the existing collection, using the same
# point layout (id, vector, payload with text and TF-IDF keywords) as the PDF
# chunks.
# Define three separate texts (chunks):
custom_texts = [
    "The Transformer architecture is a fundamental building block in language models. Its self-attention mechanism enables the model to capture relationships between words.",
    "Positional encoding assists models in understanding the order of words, ensuring that the sentence structure is maintained.",
    "Semantic chunking splits long texts into meaningful segments, facilitating efficient search and information retrieval. This method helps in grouping similar content together."
]

# NOTE: this rebinds the module-level `points` list built during PDF ingestion.
points = []
for text in custom_texts:
    # Compute the embedding for each chunk
    vector = embeddings.embed_query(text)
    
    # Extract keywords with TF-IDF (top 5 keywords)
    custom_keywords = extract_keywords_tfidf(text, top_n=5)
    
    # Build a point for each chunk
    point = {
        "id": str(uuid.uuid4()),
        "vector": vector,
        "payload": {
            "text": text,
            "keywords": custom_keywords
        }
    }
    points.append(point)

# Add all points to the Qdrant collection
qdrant_client.upsert(
    collection_name=collection_name,
    points=points
)

print("Kendi point'leriniz başarıyla koleksiyona eklendi!")


Kendi point'leriniz başarıyla koleksiyona eklendi!


In [7]:
# Transformer question again (note the doubled "?" in the string), asked
# after the hand-written chunks were upserted.
query8 = "What are the key features of the Transformer architecture??"
test_rag_hybrid_qdrant(qdrant_client, collection_name, query8)




Sorgu: What are the key features of the Transformer architecture??

Qdrant'tan Kullanılan Kaynaklar (hybrid sıralama ile):
1. (Hybrid Score: 0.706, Cosine: 0.866, Uzunluk: 167) The Transformer architecture is a fundamental building block in language models. Its self-attention ...
2. (Hybrid Score: 0.556, Cosine: 0.794, Uzunluk: 6730) verge at certain points during optimization. The training o f LLMs is generally inﬂuenced by many fa...
3. (Hybrid Score: 0.547, Cosine: 0.782, Uzunluk: 3153) This can be expressed as Output = Merge(head 1,...,head τ)Whead(2.70) where head j∈Rdhis computed us...
4. (Hybrid Score: 0.546, Cosine: 0.780, Uzunluk: 4147) An example 74 Generative Models of this approach is compressive Transformer [ Rae et al. ,2019 ]. It...
5. (Hybrid Score: 0.546, Cosine: 0.780, Uzunluk: 1925) [Kahneman, 2011] Daniel Kahneman. Thinking, fast and slow . macmillan, 2011. [Kaplan et al., 2020] J...

Inference zamanı: 9.256 saniye

Modelin cevabı:
The key features of the Transform

In [8]:
# Positional-encoding question again, asked after the hand-written chunks
# were upserted.
query9 = "What is positional encoding?"
test_rag_hybrid_qdrant(qdrant_client, collection_name, query9)


Sorgu: What is positional encoding?

Qdrant'tan Kullanılan Kaynaklar (hybrid sıralama ile):
1. (Hybrid Score: 0.768, Cosine: 0.883, Uzunluk: 123) Positional encoding assists models in understanding the order of words, ensuring that the sentence s...
2. (Hybrid Score: 0.744, Cosine: 0.849, Uzunluk: 5334) In this case, the embedding at position ican be expressed as ei=xi+ PE(i) (2.74) where xi∈Rddenotes ...
3. (Hybrid Score: 0.718, Cosine: 0.811, Uzunluk: 1108) However, Press et al. [2022 ] found that setting βto values decreasing geometrically by a factor of1...
4. (Hybrid Score: 0.711, Cosine: 0.802, Uzunluk: 7737) is the rotation matrix. If two or more rotations are performe d on the same vector, we can rotate th...
5. (Hybrid Score: 0.571, Cosine: 0.816, Uzunluk: 2879) positions are represented as combinations of sine and cosin e functions with different frequencies. ...

Inference zamanı: 3.307 saniye

Modelin cevabı:
Positional encoding is a technique used in models, particularly 

In [9]:
# Semantic-chunking question again, asked after the hand-written chunks were
# upserted.
query11 = "What is semantic chunking and how does it improve search efficiency?"
test_rag_hybrid_qdrant(qdrant_client, collection_name, query11)


Sorgu: What is semantic chunking and how does it improve search efficiency?

Qdrant'tan Kullanılan Kaynaklar (hybrid sıralama ile):
1. (Hybrid Score: 0.633, Cosine: 0.904, Uzunluk: 174) Semantic chunking splits long texts into meaningful segments, facilitating efficient search and info...
2. (Hybrid Score: 0.564, Cosine: 0.806, Uzunluk: 123) Positional encoding assists models in understanding the order of words, ensuring that the sentence s...
3. (Hybrid Score: 0.546, Cosine: 0.780, Uzunluk: 3190) Parameter-efﬁcient transfer learning for NLP. In Proceedings of the 36th International Conference on...
4. (Hybrid Score: 0.544, Cosine: 0.777, Uzunluk: 3393) Long Papers) , pages 86–96, 2016. [Seo et al., 2017] Minjoon Seo, Aniruddha Kembhavi, Ali Farh adi, ...
5. (Hybrid Score: 0.543, Cosine: 0.776, Uzunluk: 5572) This moti vates researchers to develop new evaluation benchmarks and metrics for long-context LLMs. ...

Inference zamanı: 2.347 saniye

Modelin cevabı:
Semantic chunking is a me

In [10]:
# Definition-style question about pre-training, run through the hybrid RAG
# pipeline.
query1 = "What is pre-training?"
test_rag_hybrid_qdrant(qdrant_client, collection_name, query1)


Sorgu: What is pre-training?

Qdrant'tan Kullanılan Kaynaklar (hybrid sıralama ile):
1. (Hybrid Score: 0.875, Cosine: 0.821, Uzunluk: 2710) forgetting problem in continual training, where a neural network forge ts previously learned in- for...
2. (Hybrid Score: 0.872, Cosine: 0.818, Uzunluk: 5110) The training objective can be deﬁned as (ˆθ,ˆω) = arg max θ,ωLoss(Model θ,ω(xnoise),x) (1.16) Here t...
3. (Hybrid Score: 0.871, Cosine: 0.816, Uzunluk: 4839) The ﬁne-tuned model is then employed to classify new sequences for this task. An advantage of superv...
4. (Hybrid Score: 0.866, Cosine: 0.809, Uzunluk: 9996) D dataset used for training or ﬁne-tuning a model ∂L ∂θgradient of the loss function Lwith respect t...
5. (Hybrid Score: 0.857, Cosine: 0.796, Uzunluk: 3860) i+1for short). Suppose we have the gold- standard distribution at the same position, denoted by pgol...

Inference zamanı: 3.397 saniye

Modelin cevabı:
Pre-training is a machine learning approach where a model is initially

In [11]:
# Question about model types used in NLP pre-training, run through the hybrid
# RAG pipeline.
query2 = "Which types of models are widely used in NLP pre-training?"
test_rag_hybrid_qdrant(qdrant_client, collection_name, query2)


Sorgu: Which types of models are widely used in NLP pre-training?

Qdrant'tan Kullanılan Kaynaklar (hybrid sıralama ile):
1. (Hybrid Score: 0.604, Cosine: 0.863, Uzunluk: 965) 1Here we assume that tokens are basic units of text that are sep arated through tokenization. Someti...
2. (Hybrid Score: 0.592, Cosine: 0.846, Uzunluk: 3190) Parameter-efﬁcient transfer learning for NLP. In Proceedings of the 36th International Conference on...
3. (Hybrid Score: 0.590, Cosine: 0.843, Uzunluk: 5918) the discussion of these topics to the following chapters. CHAPTER 2 Generative Models One of the mos...
4. (Hybrid Score: 0.589, Cosine: 0.842, Uzunluk: 3861) The use of these ex- amples does not distinguish between models, but we mark the m odel architecture...
5. (Hybrid Score: 0.583, Cosine: 0.833, Uzunluk: 8279) in work using shared vocabularies, specifying the language to which a token belongs is not necessary...

Inference zamanı: 5.488 saniye

Modelin cevabı:
The types of models widely used in

In [12]:
# How-to question about permuted language modelling, run through the hybrid
# RAG pipeline.
query3 = "How do we implement permuted language modelling?"
test_rag_hybrid_qdrant(qdrant_client, collection_name, query3)


Sorgu: How do we implement permuted language modelling?

Qdrant'tan Kullanılan Kaynaklar (hybrid sıralama ile):
1. (Hybrid Score: 0.682, Cosine: 0.831, Uzunluk: 5918) the discussion of these topics to the following chapters. CHAPTER 2 Generative Models One of the mos...
2. (Hybrid Score: 0.681, Cosine: 0.830, Uzunluk: 7725) sense to predict any of the tokens in this sequence. 1.2.2.1 Masked Language Modeling One of the mos...
3. (Hybrid Score: 0.673, Cosine: 0.818, Uzunluk: 3860) i+1for short). Suppose we have the gold- standard distribution at the same position, denoted by pgol...
4. (Hybrid Score: 0.582, Cosine: 0.831, Uzunluk: 8453) The approach described above provides a new framework of uni versal language understanding and gener...
5. (Hybrid Score: 0.578, Cosine: 0.826, Uzunluk: 5839) architecture to adapt LLMs to large-scale training. In Sect ion2.2we will present more discussions o...

Inference zamanı: 9.554 saniye

Modelin cevabı:
Permuted language modeling is implemented b

In [13]:
# Question about large-scale pre-training, run through the hybrid RAG
# pipeline.
query4 = "What is the large-scale pre-training of the document?"
test_rag_hybrid_qdrant(qdrant_client, collection_name, query4)


Sorgu: What is the large-scale pre-training of the document?

Qdrant'tan Kullanılan Kaynaklar (hybrid sıralama ile):
1. (Hybrid Score: 0.784, Cosine: 0.835, Uzunluk: 2710) forgetting problem in continual training, where a neural network forge ts previously learned in- for...
2. (Hybrid Score: 0.778, Cosine: 0.826, Uzunluk: 9996) D dataset used for training or ﬁne-tuning a model ∂L ∂θgradient of the loss function Lwith respect t...
3. (Hybrid Score: 0.777, Cosine: 0.825, Uzunluk: 9182) example, in He et al. [2021 ]’s work, a 1.5 billion-parameter BERT-like model is built b y increasin...
4. (Hybrid Score: 0.683, Cosine: 0.833, Uzunluk: 965) 1Here we assume that tokens are basic units of text that are sep arated through tokenization. Someti...
5. (Hybrid Score: 0.682, Cosine: 0.832, Uzunluk: 4869) scaling laws for LLMs, which help us understand their traini ng efﬁciency and effectiveness. 2.2.1 D...

Inference zamanı: 4.935 saniye

Modelin cevabı:
The large-scale pre-training discussed 