In [1]:
import json
import pandas as pd
from tqdm.auto import tqdm
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-built documents from JSON; later cells read each record's
# 'question' and 'text' fields. The encoding is pinned to UTF-8 so the load
# does not depend on the platform's default locale encoding.
with open('document_with_ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [3]:
# Sentence-Transformers checkpoint used to embed both the documents and the queries.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [4]:
# Attach three embeddings to every document (mutates `documents` in place):
# one for the question, one for the text, and one for "question + ' ' + text".
for doc in tqdm(documents):
    question, text = doc['question'], doc['text']
    doc.update(
        question_vector=model.encode(question),
        text_vector=model.encode(text),
        question_text_vector=model.encode(question + ' ' + text),
    )
    

100%|████████████████████████████████████████████████████████████████████████████| 948/948 [01:55<00:00,  8.19it/s]


In [5]:
# Client for the Elasticsearch node at localhost:9200 (plain HTTP, no credentials passed).
es_client = Elasticsearch("http://localhost:9200")

In [6]:

index_name = "course_questions"

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            # Full-text fields used by the keyword part of the search.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # Exact-match fields (used for filtering / identification).
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            # The three embedding fields share one dense_vector definition.
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in ("question_vector", "text_vector", "question_text_vector")
            },
        }
    },
}

# Drop any existing copy of the index first so this cell can be re-run, then recreate it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course_questions'})

In [7]:
# Send every document (including the embedding vectors added above) to the
# index, one request per document.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|███████████████████████████████████████████████████████████████████████████| 948/948 [00:07<00:00, 119.62it/s]


### Vector Search

In [18]:
# Course used as the `term` filter in the example searches below.
course = "data-engineering-zoomcamp"

In [19]:
# Example user question used for both the keyword and the vector part of the search.
query = "Can i still join the course?"

In [20]:
# Embed the query with the same model that produced the document vectors.
query_vec = model.encode(query)

In [25]:
# kNN clause: 5 nearest neighbours on the `text_vector` field, restricted to one course.
knn_query = {
    "field": "text_vector",
    "query_vector": query_vec,
    "k": 5,
    "num_candidates": 10000,
    "boost": 0.5,
    "filter": {"term": {"course": course}},
}

In [26]:
# Keyword clause: multi_match over question/text/section (question weighted x3),
# restricted to the same course as the kNN clause.
keyword_search = {
    "bool": {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["question^3", "text", "section"],
                "type": "best_fields",
                "boost": 0.5,
            }
        },
        "filter": {"term": {"course": course}},
    }
}

In [27]:
# Hybrid request: the keyword `query` and the `knn` clause are sent together in one search call.
response_result = es_client.search(
    index=index_name,
    query=keyword_search,
    knn=knn_query,
    size=5,
)

In [28]:
# Show the raw hits; each `_source` here still includes the stored embedding vectors.
response_result['hits']['hits']

[{'_index': 'course_questions',
  '_id': 'GwdS2pUBGGbfuhAvCd_r',
  '_score': 33.394363,
  '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
   'section': 'General course-related questions',
   'question': 'Course - Can I still join the course after the start date?',
   'course': 'data-engineering-zoomcamp',
   'id': '7842b56a',
   'question_vector': [0.0030358924996107817,
    -0.002387200016528368,
    0.035881660878658295,
    0.02099882811307907,
    -0.018282320350408554,
    0.06715093553066254,
    -0.10277318954467773,
    -0.11509547382593155,
    -0.06606752425432205,
    -0.004973369650542736,
    -0.002861724467948079,
    0.10543154180049896,
    -0.000814331229776144,
    0.08418365567922592,
    0.027047153562307358,
    -0.03135377913713455,
    -0.05154325067996979,
    -0.0494899675250053

### Hybrid search pipeline

In [19]:
# Ground-truth records for evaluation; later cells read each record's
# 'question', 'course' and 'document' fields.
df_ground_truth = pd.read_csv('ground_truth_data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [17]:
def hit_rate(relavance_total):
    """Return the fraction of queries whose result list contains a relevant hit.

    Args:
        relavance_total: One list of booleans per query; an entry is True when
            the result at that position is the expected document.

    Returns:
        The share of queries with at least one True entry, or 0.0 when
        ``relavance_total`` is empty.
    """
    if len(relavance_total) == 0:
        # Nothing was evaluated; report 0.0 instead of raising ZeroDivisionError.
        return 0.0

    hits = sum(1 for line in relavance_total if True in line)
    return hits / len(relavance_total)

In [18]:
def mrr_score(relavance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query only the first relevant result counts: it contributes
    ``1 / rank`` (rank starting at 1). Queries with no relevant result
    contribute 0.

    Args:
        relavance_total: One list of booleans per query; an entry is True when
            the result at that position is the expected document.

    Returns:
        The average reciprocal rank, or 0.0 when ``relavance_total`` is empty.
    """
    if len(relavance_total) == 0:
        # Nothing was evaluated; report 0.0 instead of raising ZeroDivisionError.
        return 0.0

    score = 0.0
    for line in relavance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                score += 1 / rank
                # Stop at the first relevant hit so that duplicates further
                # down the list cannot push a query's contribution above 1.
                break

    return score / len(relavance_total)

In [67]:
def elastic_search_hybrid(field, query, vector,course,
                          vector_boost=0.1, keyword_boost=0.9, size=5):
    """Run one hybrid (kNN + keyword) search and return the matching documents.

    Both parts are sent in a single Elasticsearch request and are restricted
    to ``course`` with a ``term`` filter.

    Args:
        field: Name of the dense_vector field to search with kNN.
        query: Query text for the ``multi_match`` over question/text/section.
        vector: Query embedding passed as ``query_vector``.
        course: Value the ``course`` field must equal.
        vector_boost: Boost applied to the kNN clause (default 0.1).
        keyword_boost: Boost applied to the ``multi_match`` clause (default 0.9).
        size: Number of hits to return; also used as the kNN ``k`` (default 5).

    Returns:
        A list with the ``_source`` of each hit, limited to the fields
        text, section, question, course and id.
    """
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": size,
        "num_candidates": 10000,
        "boost": vector_boost,
        "filter": {"term": {"course": course}},
    }

    keyword_search = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question", "text", "section"],
                    "type": "best_fields",
                    "boost": keyword_boost,
                }
            },
            "filter": {"term": {"course": course}},
        }
    }

    search_query = {
        "knn": knn_query,
        "query": keyword_search,
        "size": size,
        # Leave the embedding vectors out of the response.
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_result = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in es_result['hits']['hits']]

In [42]:
def question_hybrid(q):
    """Hybrid search for one ground-truth record, with kNN on `question_vector`.

    Reads ``q["question"]`` and ``q["course"]``, embeds the question and
    returns whatever ``elastic_search_hybrid`` returns.
    """
    text, course_name = q["question"], q["course"]
    embedding = model.encode(text)
    return elastic_search_hybrid("question_vector", text, embedding, course_name)

In [15]:
def evaluation_function(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    For every record, ``search_function(record)`` is called and each returned
    document's ``'id'`` is compared with the record's ``'document'`` value.

    Returns:
        A dict with the keys ``'hit_rate'`` and ``'mrr_score'``.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        found = search_function(record)
        relevance_total.append([hit['id'] == expected_id for hit in found])

    return dict(
        hit_rate=hit_rate(relevance_total),
        mrr_score=mrr_score(relevance_total),
    )

In [45]:
# Evaluate hybrid search with the kNN part on `question_vector`.
evaluation_function(ground_truth, question_hybrid)

  0%|          | 0/4735 [00:00<?, ?it/s]

{'hit_rate': 0.8992608236536431, 'mrr_score': 0.7927877507919756}

ES on question: `{'hit_rate': 0.737909186906019, 'mrr_score': 0.6191622668074627}`

In [49]:
def text_hybrid(q):
    """Hybrid search for one ground-truth record, with kNN on `text_vector`.

    Reads ``q["question"]`` and ``q["course"]``, embeds the question and
    returns whatever ``elastic_search_hybrid`` returns.
    """
    text, course_name = q["question"], q["course"]
    embedding = model.encode(text)
    return elastic_search_hybrid("text_vector", text, embedding, course_name)

In [48]:
# Evaluate hybrid search with the kNN part on `text_vector`.
evaluation_function(ground_truth, text_hybrid)

  0%|          | 0/4735 [00:00<?, ?it/s]

{'hit_rate': 0.8986272439281943, 'mrr_score': 0.79202041534671}

ES on text : `{'hit_rate': 0.8270327349524815, 'mrr_score': 0.6902358324533618}`

In [68]:
def question_text_hybrid(q):
    """Hybrid search for one ground-truth record, with kNN on `question_text_vector`.

    Reads ``q["question"]`` and ``q["course"]``, embeds the question and
    returns whatever ``elastic_search_hybrid`` returns.
    """
    text, course_name = q["question"], q["course"]
    embedding = model.encode(text)
    return elastic_search_hybrid("question_text_vector", text, embedding, course_name)

# Evaluate hybrid search with the kNN part on `question_text_vector`.
evaluation_function(ground_truth, question_text_hybrid)

  0%|          | 0/4735 [00:00<?, ?it/s]

{'hit_rate': 0.8891235480464625, 'mrr_score': 0.7806546990496314}

- ES vector boost:0.4 :  `{'hit_rate': 0.8977824709609292, 'mrr_score': 0.7900527983104552}`
- ES vector boost:0.1 :  `{'hit_rate': 0.8891235480464625, 'mrr_score': 0.7806546990496314}`
- ES vector boost:0.2 :  `{'hit_rate': 0.8914466737064414, 'mrr_score': 0.7830306230200645}`
- ES vector boost:0.6 :  `{'hit_rate': 0.9083421330517424, 'mrr_score': 0.8043083421330526}`
- ES vector boost:0.7 :  `{'hit_rate': 0.916156282998944, 'mrr_score': 0.8148785638859566}`
- ES vector boost:0.8 :  `{'hit_rate': 0.9336853220696938, 'mrr_score': 0.8329250263991561}`
- ES vector boost:0.9 :  `{'hit_rate': 0.9526927138331573, 'mrr_score': 0.8553326293558616}`
- ES vector boost:0.95 :  `{'hit_rate': 0.9414994720168954, 'mrr_score': 0.851883139739529}`

ES on both text_ques:`{'hit_rate': 0.9102428722280888, 'mrr_score': 0.7961457233368543}`

### Reranking

In [8]:
def elastic_search_hybrid_reranker(field, query, vector,course):
    """Hybrid search that asks Elasticsearch itself to fuse results via ``rank: rrf``.

    The request combines a kNN clause on ``field`` with a ``multi_match``
    keyword clause, both filtered to ``course``, and returns the ``_source``
    (text, section, question, course, id) of up to 5 hits.

    NOTE(review): this name is defined again in a later cell of this
    notebook; once that cell runs, its definition replaces this one.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "boost": 0.1,
            "filter": {"term": {"course": course}},
        },
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question", "text", "section"],
                        "type": "best_fields",
                        "boost": 0.9,
                    }
                },
                "filter": {"term": {"course": course}},
            }
        },
        "size": 5,
        # Server-side reciprocal rank fusion with default settings.
        "rank": {"rrf": {}},
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_result = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in es_result['hits']['hits']]

In [9]:
# Example course filter and question for trying out the reranking search.
course = "data-engineering-zoomcamp"
query = "Can i still join the course?"

In [10]:
# Embedding of the example query.
v_q = model.encode(query)

##### Note: Elasticsearch's built-in RRF is license-restricted and cannot be used with the open-source license

In [None]:
# Try the reranking search on the example query (this cell has no execution
# count in the saved notebook).
elastic_search_hybrid_reranker("question_text_vector", query, v_q, course)

### RRF implementation

In [32]:
def compute_rrf(rank, k=60):
    """Return the reciprocal-rank-fusion term for one rank: ``1 / (k + rank)``."""
    denominator = k + rank
    return 1 / denominator

def elastic_search_hybrid_reranker(field, query, vector,course, k=60,
                                   top_n=5, candidates=10):
    """Hybrid search with client-side reciprocal rank fusion (RRF).

    Runs a kNN search and a keyword search as two separate requests (both
    filtered to ``course``), scores every returned document with
    ``compute_rrf`` based on its rank in each list, and returns the documents
    with the highest summed score.

    Args:
        field: Name of the dense_vector field to search with kNN.
        query: Query text for the ``multi_match`` over question/text/section.
        vector: Query embedding passed as ``query_vector``.
        course: Value the ``course`` field must equal.
        k: RRF constant forwarded to ``compute_rrf`` (default 60).
        top_n: Number of fused documents to return (default 5).
        candidates: Number of hits requested from each search (default 10).

    Returns:
        A list with the ``_source`` of the ``top_n`` best documents, ordered
        by descending RRF score.
    """
    # No "boost" is set on either clause: each search runs on its own and only
    # the resulting ranks feed the fusion, so a constant score factor has no effect.
    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": candidates,
        "num_candidates": 10000,
        "filter": {"term": {"course": course}},
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question", "text", "section"],
                    "type": "best_fields",
                }
            },
            "filter": {"term": {"course": course}},
        }
    }

    knn_results = es_client.search(
        index=index_name,
        body={"knn": knn_query, "size": candidates},
    )['hits']['hits']

    keyword_results = es_client.search(
        index=index_name,
        body={"query": keyword_query, "size": candidates},
    )['hits']['hits']

    rrf_scores = {}
    # Keep each hit's `_source` so the final documents do not need one extra
    # GET request per result.
    sources = {}
    for hits in (knn_results, keyword_results):
        for rank, hit in enumerate(hits, start=1):
            doc_id = hit['_id']
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0.0) + compute_rrf(rank, k)
            sources.setdefault(doc_id, hit['_source'])

    # Highest fused score first; the sort is stable, so ties keep insertion order.
    reranked_docs = sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)

    return [sources[doc_id] for doc_id, _ in reranked_docs[:top_n]]

In [33]:
def question_text_hybrid_rrf(q):
    """RRF-reranked hybrid search for one ground-truth record on `question_text_vector`.

    Reads ``q["question"]`` and ``q["course"]``, embeds the question and
    returns whatever ``elastic_search_hybrid_reranker`` returns.
    """
    text, course_name = q["question"], q["course"]
    embedding = model.encode(text)
    return elastic_search_hybrid_reranker("question_text_vector", text, embedding, course_name)

# Evaluate the RRF-reranked hybrid search on `question_text_vector`.
evaluation_function(ground_truth, question_text_hybrid_rrf)

100%|██████████████████████████████████████████████████████████████████████████| 4735/4735 [03:08<00:00, 25.10it/s]


{'hit_rate': 0.950791974656811, 'mrr_score': 0.8447905667018667}

- ES hybrid search with RRF reranker (vector=0.7): `{'hit_rate': 0.950791974656811, 'mrr_score': 0.8447905667018667}`