Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Azure Cognitive Search and the OpenAI RAG pattern in Azure Synapse/ Microsoft Fabric using the BBC Sports Dataset

# To run this notebook, ensure you have run the deployment script in the Data Discovery Toolkit - [here](https://github.com/microsoft/Data-Discovery-Toolkit#if-you-do-not-have-a-synapse-workspace)

## This notebook will showcase the following:

- For a given dataset generate a query per record using a HuggingFace Question generator
- Run the queries using Standard Search
- Run the queries using Semantic Search
- Run the queries using Standard Search and the OpenAI RAG pattern
- Run the queries using Semantic Search and the OpenAI RAG pattern
- Run the queries using Standard Hybrid Vector Search
- Run the queries using Semantic Hybrid Vector Search
- Run the queries using Standard Hybrid Vector Search and the OpenAI RAG pattern
- Run the queries using Semantic Hybrid Vector Search and the OpenAI RAG pattern
- Evaluate the search results and use BART for OpenAI answer Entailment evaluation
- Evaluate using a normalised score for both semantic and standard search



## This cell configures the spark session - Do not change (not needed for Trident)

In [1]:
%%configure -f
{
"conf": {
     "spark.rpc.message.maxSize": 1024,
     "spark.kryoserializer.buffer.max": "1024m"
   }
}

StatementMeta(, 92, -1, Finished, Available)

In [2]:
from pyspark import SparkContext, SparkConf
import sys
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession
import ntpath
import os
import numpy as np
import openai
from math import sqrt
from scipy.stats import spearmanr
import torch
import torch.nn.functional as F
from requests import post, put


from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf,col

from itertools import combinations
from operator import itemgetter

from graphframes import *
from pyspark.sql.functions import monotonically_increasing_id, lit


import os
import openai
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import QueryType
from azure.search.documents.indexes import SearchIndexClient  
from azure.core.credentials import AzureKeyCredential

import spacy
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from fuzzywuzzy import fuzz
from IPython.display import display, Markdown

from pyspark.sql.functions import udf

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType, StructType
from pyspark.sql.functions import udf,col
from objdict import ObjDict
from tenacity import retry, wait_random_exponential, stop_after_attempt
 
import requests
from pprint import pprint
import json
import ntpath



StatementMeta(DataDiscovery, 92, 2, Finished, Available)

2023-06-29 08:32:28.753724: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Update all your parameters here

In [19]:
# OpenAI deployment values
global openai_chatgpt_deployment
openai_chatgpt_deployment = "chat"

global openai_gpt_deployment
openai_gpt_deployment = "davinci"

global openai_service
openai_service = ""

global openai_api_key
openai_api_key = ""

global openai_api_type
openai_api_type = "azure"

global openai_api_base
openai_api_base = f"https://{openai_service}.openai.azure.com"

global openai_api_version
openai_api_version = "2022-12-01"

global openai_embeddings_deployment_id
openai_embeddings_deployment_id = "te" 

# If you have a RPM rate limit on your OpenAI account - sleep time in seconds
openai_sleep_time = 120

# The input file name that you want to query and egenerate questions from
input_filename = 'abfss://share@datadiscoverypipeline2.dfs.core.windows.net/bbcsports/csv/sport_articles.csv'

# The vector file - pre-extracted
input_vector_file = 'abfss://share@datadiscoverypipeline2.dfs.core.windows.net/bbcsports/csv/bbcVectors.json'

# How many records to query and evaluate
global top
top = 3

# Azure Search Admin Key
global search_admin_key
search_admin_key = "" 
# The name of the search service
global search_service_name
search_service_name = "dd28" 
# The Azure Search Query Key
global search_query_key
search_query_key = "" 

global search_index
search_index = "bbcvector"
# This is the name of the semantic configuration on the search index
global semantic_configuration_name
semantic_configuration_name = "bbc-semantic-config2"
global vector_search_api_version
vector_search_api_version = "2023-07-01-Preview"


search_client = SearchClient(
    endpoint=f"https://{search_service_name}.search.windows.net",
    index_name=search_index,
    credential=AzureKeyCredential(search_admin_key))

StatementMeta(DataDiscovery, 92, 19, Finished, Available)

## Test OpenAI

In [21]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text):
    response = openai.Embedding.create(
        input=text, deployment_id=openai_embeddings_deployment_id)
    embeddings = response['data'][0]['embedding']
    return embeddings

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_completion(engine, prompt, temperature, max_tokens, stop):
    completion = openai.Completion.create(
                    engine=openai_gpt_deployment,
                    prompt=prompt,
                    temperature=temperature,
                    max_tokens=max_tokens,
                    stop=stop)
    return completion.choices[0].text    

openai_embeddings_deployment_id = openai_embeddings_deployment_id

openai.api_key = openai_api_key
openai.api_type = openai_api_type
openai.api_base = f"https://{openai_service}.openai.azure.com"
openai.api_version = openai_api_version

search = generate_completion(engine=openai_chatgpt_deployment, prompt="Tell me about yourself", temperature=0.7, max_tokens=1024, stop=["<|im_end|>", "<|im_start|>"])
assert len(search) != 0
print(search)

StatementMeta(DataDiscovery, 92, 21, Finished, Available)



I am a recent college graduate with a bachelor's degree in marketing. I am a hard-working, dedicated, and reliable individual who enjoys taking on new challenges and learning new skills. I am also an organized and detail-oriented person, with excellent communication and interpersonal skills, which I believe will be a great asset in any role. I am passionate about marketing and am eager to use my knowledge and skills to help businesses reach their goals.



# Let's automatically generate a query or each record

In [22]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_name = "voidful/context-only-question-generator" #"allenai/t5-small-squad2-question-generation"

qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
qa_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


def generate_question(input_string):
    input_ids = qa_tokenizer.encode(input_string, return_tensors="pt", truncation=True)
    res = qa_model.generate(input_ids)
    output = qa_tokenizer.batch_decode(res, skip_special_tokens=True)
    return str(output[0])

udf_generate_question = udf(generate_question, StringType()) 


StatementMeta(DataDiscovery, 92, 22, Finished, Available)

Downloading (…)okenizer_config.json:   0%|          | 0.00/331 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

StatementMeta(DataDiscovery, 92, 23, Finished, Available)

# Test the question generation 

In [14]:
context = """Eddie Jones says New Zealand are vulnerable as his England team prepares to face the three-time world champions for the first time since 2019.

England's victory in the Rugby World Cup semi-final three years ago is a highlight of Jones' seven-year reign.

"We showed that if you've got the right attitude and the right game plan then history can be broken," Jones said.

"If we go after them then they're there for the taking - and we're going to go after them."

New Zealand, who play Scotland on Sunday, visit Twickenham next Saturday, with England coming into the game on the back of a 52-13 win over Japan.

New Zealand have won their last five matches, but lost six of the previous eight. That run came close to costing Jones"""
question = generate_question(context)

assert question == "Who won the Rugby World Cup semi-final three years ago?"
print(question)

StatementMeta(DataDiscovery, 84, 15, Finished, Available)

Who won the Rugby World Cup semi-final three years ago?


# Run the Question generation process

In [25]:
df_questions = df.withColumn("question", udf_generate_question(col("text")))

StatementMeta(DataDiscovery, 40, 29, Finished, Available)

# Test the Search client

In [9]:
r = search_client.search("Whose thumb was fractured?",
                                 #filter=filter,
                                 query_type="semantic",
                                 query_language="en-us",
                                 query_speller="lexicon",
                                 semantic_configuration_name=semantic_configuration_name,
                                 top=top)



results = [doc for doc in r]
for i, doc in enumerate(results):

    break

assert doc['file'] == "rugby008.txt"
print(doc['file'])

StatementMeta(DataDiscovery, 90, 9, Finished, Available)

rugby008.txt


# Load the vectors file

In [3]:
def read_batch_config(batch_root: str):
    """
    We read the config file using the Java File System API as we do not need to let multiple nodes read individual lines and join it
    all back together again
    """
    # Change our file system from 'synapse' to 'input'
    sc._jsc.hadoopConfiguration().set("fs.defaultFS", ntpath.dirname(input_vector_file))

    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
    config_path = sc._jvm.org.apache.hadoop.fs.Path(f'{batch_root}')

    # If we don't have a batch config, copy the global one.
    if fs.exists(config_path) != True:
        logger.error(f'{config_path} not found.')

    # Open our file directly rather than through spark
    input_stream = fs.open(config_path)  # FSDataInputStream

    config_string = sc._jvm.java.io.BufferedReader(
        sc._jvm.java.io.InputStreamReader(input_stream, sc._jvm.java.nio.charset.StandardCharsets.UTF_8)
        ).lines().collect(sc._jvm.java.util.stream.Collectors.joining("\n"))

    # Load it into json    
    return json.loads(config_string)

vectors = read_batch_config(input_vector_file)
assert vectors is not None


StatementMeta(DataDiscovery, 92, 3, Finished, Available)

# Create the Vector Index - use REST to avoid SDK dependencies

In [17]:
headers = {
    "api-key": search_admin_key,     
    "Content-Type": "application/json",

}


body = {
        "name": search_index,
        "fields": [
        {
            "name": "id",
            "type": "Edm.String",
            "key": True,
            "filterable": True
        },
        {
            "name": "file",
            "type": "Edm.String",
            "searchable": True,
            "retrievable": True
        },
        {
            "name": "content",
            "type": "Edm.String",
            "searchable": True,
            "retrievable": True
        },
        {
            "name": "category",
            "type": "Edm.String",
            "filterable": True,
            "searchable": True,
            "retrievable": True
        },
        {
            "name": "contentVector",
            "type": "Collection(Edm.Single)",
            "searchable": True,
            "retrievable": True,
            "dimensions": 1536,
            "vectorSearchConfiguration": "bbc-vector-config2"
        }
    ],
    "corsOptions": {
        "allowedOrigins": [
            "*"
        ],
        "maxAgeInSeconds": 60
    },
    "vectorSearch": {
        "algorithmConfigurations": [
            {
                "name": "bbc-vector-config2",
                "kind": "hnsw",
                "hnswParameters": {
                    "m": 4,
                    "efConstruction": 400,
                    "efSearch": 1000,
                    "metric": "cosine"
                }
            }
        ]
    },
    "semantic": {
        "configurations": [
            {
                "name": "bbc-semantic-config2",
                "prioritizedFields": {
                    "prioritizedContentFields": [
                        {
                            "fieldName": "content"
                        }
                    ],
                    "prioritizedKeywordsFields": [
                        {
                            "fieldName": "category"
                        }
                    ]
                }
            }
        ]
    }
}

# Create the index
try:
    url = f"https://{search_service_name}.search.windows.net/indexes?api-version={vector_search_api_version}"
    resp = post(url=url, json=body, headers=headers)
    
    result_response = resp.json()
    if resp.status_code == 403:
        print("Authorisation Failed: Check that your API KEY value is correct")
        
    if resp.status_code == 400:
        print(f"Error", resp.text)    
            
    if resp.status_code == 201:
        print("Success creating index")
        
except Exception as e:
    print('Exception creating index', e)

assert resp.status_code == 201

# Upload the vectors

vector_data = {
                "value": vectors,
                "@search.action": "upload"
               }
                


try:
    url = f"https://{search_service_name}.search.windows.net/indexes/{search_index}/docs/index?api-version={vector_search_api_version}"
    resp = post(url=url, json=vector_data, headers=headers)
    
    result_response = resp.json()
    if resp.status_code == 403:
        print("Authorisation Failed: Check that your API KEY value is correct")
        
    if resp.status_code == 400:
        print(f"Error", resp.text)    
            
    if resp.status_code == 200:
        print("Success uploading vectors and data to index")
        
except Exception as e:
    print('Exception creating index', e)

assert resp.status_code == 200

StatementMeta(DataDiscovery, 92, 17, Finished, Available)

Success creating index
Success uploading vectors and data to index


# Step 1 Run Semantic search

In [17]:
#1

def evaluateSemantic(input_string):

    r = search_client.search(input_string,
                                    #filter=filter,
                                    query_type="semantic",
                                    query_language="en-us",
                                    query_speller="lexicon",
                                    semantic_configuration_name=semantic_configuration_name,
                                    top=top)
    output = []
    values = {}

    for i, doc in enumerate(r):
        values['"'+doc['file']+'"'] = ['"'+doc['id']+'"', '"'+str(doc['@search.score'])+'"', '"'+str(doc['@search.reranker_score'])+'"', '"'+str(i)+'"']
    output.append(values)
    return output 

udf_evaluateSemantic = udf(evaluateSemantic, ArrayType(StringType()))

df_questions = df_questions.withColumn("SemanticSearch", udf_evaluateSemantic(col("question")))
df_questions.write.saveAsTable("2eval_step1")


StatementMeta(DataDiscovery, 90, 17, Finished, Available)

# Step 2 Run Standard Search

In [19]:
#2
def evaluateSimpleSearch(input_string):

    r = search_client.search(input_string, top=top)

    output = []
    values = {}
    
    for i, doc in enumerate(r):
        values['"'+doc['file']+'"'] = ['"'+doc['id']+'"', '"'+str(doc['@search.score'])+'"', '"'+str(doc['@search.reranker_score'])+'"', '"'+str(i)+'"']
    output.append(values)
    return output 



udf_evaluateSimpleSearch = udf(evaluateSimpleSearch, ArrayType(StringType()))


df_questions = df_questions.withColumn("SimpleSearch", udf_evaluateSimpleSearch(col("question")))
df_questions.write.saveAsTable("2eval_step2")


StatementMeta(DataDiscovery, 90, 19, Finished, Available)

# Step 3 Standard Search RAG

In [24]:
#3

def evaluateSimpleSearchRAG(user_input):

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def generate_completion(engine, prompt, temperature, max_tokens, stop):
        completion = openai.Completion.create(
                        engine=engine,
                        prompt=prompt,
                        temperature=temperature,
                        max_tokens=max_tokens,
                        stop=stop)
        return completion.choices[0].text    


    error_string = ""
    has_error = False

    openai.api_key = openai_api_key
    openai.api_type = openai_api_type
    openai.api_base = f"https://{openai_service}.openai.azure.com"
    openai.api_version = openai_api_version

    prompt_prefix = """<|im_start|>system
    Let's work this out it a step by step to be sure we have the right answer

    Sources:
    {sources}
    

    <|im_end|>"""

    turn_prefix = """
    <|im_start|>user
    """

    turn_suffix = """
    <|im_end|>
    <|im_start|>assistant
    """

    prompt_history = turn_prefix

    history = []

    summary_prompt_template = """Below is a summary of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base. Generate a search query based on the conversation and the new question. Source names are not good search terms to include in the search query.

    Summary:
    {summary}

    Question:
    {question}

    Search query:
    """

    content = ""

    # Exclude category, to simulate scenarios where there's a set of docs you can't see
    exclude_category = None

    if len(history) > 0:

        try:
            search = generate_completion(engine=openai_gpt_deployment, prompt=summary_prompt_template.format(summary="\n".join(history), question=user_input), temperature=0.9, max_tokens=320, stop=["\n"])
        except Exception as e:
            has_error = True
            error_string = e
    else:
        search = user_input

        # Alternatively simply use search_client.search(q, top=3) if not using semantic search
        filter = "category ne '{}'".format(exclude_category.replace("'", "''")) if exclude_category else None
        r = search_client.search(search,
                                 filter=filter,
                                 top=top)

        search_results = [doc for doc in r]
        results = [doc['content'][:2000].replace("\n", "").replace("\r", "") for doc in search_results]

    prompt = prompt_prefix.format(sources=results) + prompt_history + user_input + turn_suffix

    try:
        completion = ""
        completion = generate_completion(engine=openai_gpt_deployment, prompt=prompt, temperature=0.7, max_tokens=1024, stop=["<|im_end|>", "<|im_start|>"])
        completion = completion.replace('"', "").replace(",", "").replace("'", "")
    except Exception as e:
        has_error = True
        error_string = e

    # Build the search output
    output = []
    values = {}

    for i, doc in enumerate(search_results):
        values['"'+doc['file']+'"'] = ['"'+doc['id']+'"', '"'+str(doc['@search.score'])+'"', '"'+str(doc['@search.reranker_score'])+'"', '"'+str(i)+'"', '"'+completion+'"']
    output.append(values)

    if has_error == False:
        prompt_history += user_input + turn_suffix + completion + "\n<|im_end|>" + turn_prefix
        history.append("user: " + user_input)
        history.append("assistant: " + completion)

    if has_error == True:
        return str(error_string)
    else:

        return output

udf_evaluateSimpleSearchRAG = udf(evaluateSimpleSearchRAG, ArrayType(StringType()))
df_questions = df_questions.withColumn("SimpleSearchRAG", udf_evaluateSimpleSearchRAG(col("question")))
df_questions.write.saveAsTable("2eval_step3")


StatementMeta(DataDiscovery, 90, 24, Finished, Available)

# Step 4 Semantic Search RAG

In [26]:
#4

def evaluateSemanticSearchRAG(user_input):

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def generate_completion(engine, prompt, temperature, max_tokens, stop):
        completion = openai.Completion.create(
                        engine=engine,
                        prompt=prompt,
                        temperature=temperature,
                        max_tokens=max_tokens,
                        stop=stop)
        return completion.choices[0].text    


    error_string = ""
    has_error = False

    openai.api_key = openai_api_key
    openai.api_type = openai_api_type
    openai.api_base = f"https://{openai_service}.openai.azure.com"
    openai.api_version = openai_api_version

    prompt_prefix = """<|im_start|>system
    Let's work this out it a step by step to be sure we have the right answer

    Sources:
    {sources}
    

    <|im_end|>"""

    turn_prefix = """
    <|im_start|>user
    """

    turn_suffix = """
    <|im_end|>
    <|im_start|>assistant
    """

    prompt_history = turn_prefix

    history = []

    summary_prompt_template = """Below is a summary of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base. Generate a search query based on the conversation and the new question. Source names are not good search terms to include in the search query.

    Summary:
    {summary}

    Question:
    {question}

    Search query:
    """

    content = ""

    # Exclude category, to simulate scenarios where there's a set of docs you can't see
    exclude_category = None

    if len(history) > 0:

        try:
            search = generate_completion(engine=openai_gpt_deployment, prompt=summary_prompt_template.format(summary="\n".join(history), question=user_input), temperature=0.9, max_tokens=320, stop=["\n"])
        except Exception as e:
            has_error = True
            error_string = e

        print(f"search {search}")
    else:
        search = user_input

        # Alternatively simply use search_client.search(q, top=3) if not using semantic search
        filter = "category ne '{}'".format(exclude_category.replace("'", "''")) if exclude_category else None
        r = search_client.search(search,
                            filter=filter,
                            query_type="semantic",
                            query_language="en-us",
                            query_speller="lexicon",
                            semantic_configuration_name=semantic_configuration_name,
                            top=top)

        search_results = [doc for doc in r]
        results = [doc['content'][:2000].replace("\n", "").replace("\r", "") for doc in search_results]

    prompt = prompt_prefix.format(sources=results) + prompt_history + user_input + turn_suffix

    try:
        completion = ""
        completion = generate_completion(engine=openai_gpt_deployment, prompt=prompt, temperature=0.7, max_tokens=1024, stop=["<|im_end|>", "<|im_start|>"])
        completion = completion.replace('"', "").replace(",", "").replace("'", "")
    except Exception as e:
        has_error = True
        error_string = e

    # Build the search output
    output = []
    values = {}

    for i, doc in enumerate(search_results):
        values['"'+doc['file']+'"'] = ['"'+doc['id']+'"', '"'+str(doc['@search.score'])+'"', '"'+str(doc['@search.reranker_score'])+'"', '"'+str(i)+'"', '"'+completion+'"']
    output.append(values)


    if has_error == False:
        prompt_history += user_input + turn_suffix + completion + "\n<|im_end|>" + turn_prefix
        history.append("user: " + user_input)
        history.append("assistant: " + completion)

    if has_error == True:
        return str(error_string)
    else:

        return output


udf_evaluateSemanticSearchRAG = udf(evaluateSemanticSearchRAG, StringType())
df_questions = df_questions.withColumn("SemanticSearchRAG", udf_evaluateSemanticSearchRAG(col("question")))
df_questions.write.saveAsTable("2eval_step4")


StatementMeta(DataDiscovery, 90, 26, Finished, Available)

# Vector search - use requests to avoid dev SDK version 

# Test the vector search

In [28]:
searchFields = 'content, file'
selectFields = 'content, file'
scoringProfile = None

serviceName = search_service_name
service_endpoint = (f"https://{serviceName}.search.windows.net")
indexName = search_index
apiKey = search_admin_key
apiVersion = "2023-07-01-Preview"

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text):
    response = openai.Embedding.create(
        input=text, deployment_id=openai_embeddings_deployment_id)
    embeddings = response['data'][0]['embedding']
    return embeddings

def getServiceUrl():
    return 'https://' + serviceName + '.search.windows.net'

def getMethod(servicePath):
    headers = {'Content-type': 'application/json', 'api-key': apiKey}
    r = requests.get(getServiceUrl() + servicePath, headers=headers)
    return r

def postMethod(servicePath, body):
    headers = {'Content-type': 'application/json', 'api-key': apiKey}
    r = requests.post(getServiceUrl() + servicePath, headers=headers, data=body)
    return r

def simpleHybridSearch(servicePath, query, top):
    show = False
    query_embedding = generate_embeddings(query)

    values = ObjDict()
    values.values = {}
    vector = {}

    vector['value'] = query_embedding
    vector['fields'] = "contentVector"
    vector['k'] = top
    values.values['vector'] = vector
    values.values['search'] = query
    values.values['top'] = top

    body_json = json.dumps(values.values)
    servicePath = '/indexes/' + indexName + '/docs/search?api-version=%s' % (apiVersion)
    r = postMethod(servicePath, body_json)
    return r
        
def submitQuery(query, fields=None, select=None, scoring=None, top=top, fuzzy=False, method="GET"):
    servicePath = '/indexes/' + indexName + '/docs?api-version=%s&search=%s&$top=%d' % \
        (apiVersion, query, top)
    if fields is not None:
        servicePath += '&searchFields=%s' % fields
    if select is not None:
        servicePath += '&$select=%s' % select
    if scoring is not None:
        servicePath += '&scoringProfile=%s' % scoring
    if fuzzy:
        servicePath += '&queryType=full'

    if method == "GET":
        r = getMethod(servicePath)
    elif method == "simpleHybridSearch":
        r = simpleHybridSearch(servicePath, query, top)
    elif method == "semanticHybridSearch":
        r = semanticHybridSearch(servicePath, query, top)
    if r.status_code != 200:
        print('Failed to retrieve search results')
        print(query, r, r.text)
        return {}
    docs = json.loads(r.text)['value']
    return docs

results = submitQuery("Who broke their thumb?", fields=searchFields, select=selectFields, scoring=scoringProfile, top=10, fuzzy=False, method="simpleHybridSearch")
print(results[0]['file'])
assert results[0]['file'] == "rugby008.txt"

StatementMeta(DataDiscovery, 92, 29, Finished, Available)

'rugby008.txt'

# Step 5 Simple vector search

In [29]:
def simpleHybridSearch(user_input):

    serviceName = search_service_name
    service_endpoint = (f"https://{serviceName}.search.windows.net")
    indexName = search_index
    apiKey = search_admin_key
    apiVersion = "2023-07-01-Preview"

    openai.api_key = openai_api_key
    openai.api_type = openai_api_type
    openai.api_base = f"https://{openai_service}.openai.azure.com"
    openai.api_version = openai_api_version

    searchFields = 'content, file'
    selectFields = 'content, file'
    scoringProfile = None

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def generate_completion(engine, prompt, temperature, max_tokens, stop):
        completion = openai.Completion.create(
                        engine=engine,
                        prompt=prompt,
                        temperature=temperature,
                        max_tokens=max_tokens,
                        stop=stop)
        return completion.choices[0].text

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def generate_embeddings(text):
        response = openai.Embedding.create(
            input=text, deployment_id=openai_embeddings_deployment_id)
        embeddings = response['data'][0]['embedding']
        return embeddings

    def getServiceUrl():
        return 'https://' + serviceName + '.search.windows.net'

    def getMethod(servicePath):
        headers = {'Content-type': 'application/json', 'api-key': apiKey}
        r = requests.get(getServiceUrl() + servicePath, headers=headers)
        return r

    def postMethod(servicePath, body):
        headers = {'Content-type': 'application/json', 'api-key': apiKey}
        r = requests.post(getServiceUrl() + servicePath, headers=headers, data=body)
        return r

    def simpleHybridSearch(servicePath, query, top):
        show = False
        query_embedding = generate_embeddings(query)

        values = ObjDict()
        values.values = {}
        vector = {}

        vector['value'] = query_embedding
        vector['fields'] = "contentVector"
        vector['k'] = top
        values.values['vector'] = vector
        values.values['search'] = query
        values.values['top'] = top

        body_json = json.dumps(values.values)
        servicePath = '/indexes/' + indexName + '/docs/search?api-version=%s' % (apiVersion)
        r = postMethod(servicePath, body_json)
        return r
            
    def submitQuery(query, fields=None, select=None, scoring=None, top=top, fuzzy=False, method="GET"):
        servicePath = '/indexes/' + indexName + '/docs?api-version=%s&search=%s&$top=%d' % \
            (apiVersion, query, top)
        if fields is not None:
            servicePath += '&searchFields=%s' % fields
        if select is not None:
            servicePath += '&$select=%s' % select
        if scoring is not None:
            servicePath += '&scoringProfile=%s' % scoring
        if fuzzy:
            servicePath += '&queryType=full'

        if method == "GET":
            r = getMethod(servicePath)
        elif method == "simpleHybridSearch":
            r = simpleHybridSearch(servicePath, query, top)
        elif method == "semanticHybridSearch":
            r = semanticHybridSearch(servicePath, query, top)
        if r.status_code != 200:
            print('Failed to retrieve search results')
            print(query, r, r.text)
            return {}
        docs = json.loads(r.text)['value']
        return docs
  
    search_results = submitQuery(user_input, fields=searchFields, select=selectFields, scoring=scoringProfile, top=top, fuzzy=False, method="simpleHybridSearch")

    # Build the search output
    output = []
    values = {}

    for i, doc in enumerate(search_results):
        values['"'+doc['file']+'"'] = ['"'+doc['id']+'"', '"'+str(doc['@search.score'])+'"', '"'+"None"+'"', '"'+str(i)+'"']
    output.append(values)

    return output

udf_simpleHybridSearch = udf(simpleHybridSearch, ArrayType(StringType()))
df_questions = df_questions.withColumn("SimpleHybridSearch", udf_simpleHybridSearch(col("question")))
df_questions.write.saveAsTable("2eval_step5")



StatementMeta(DataDiscovery, 90, 29, Finished, Available)

# Step 6 Semantic Hybrid Search

In [31]:
def semanticHybridSearch(user_input):

    serviceName = search_service_name
    service_endpoint = (f"https://{serviceName}.search.windows.net")
    indexName = search_index
    apiKey = search_admin_key
    apiVersion = "2023-07-01-Preview"

    openai.api_key = openai_api_key
    openai.api_type = openai_api_type
    openai.api_base = f"https://{openai_service}.openai.azure.com"
    openai.api_version = openai_api_version

    searchFields = 'content, file'
    selectFields = 'content, file'
    scoringProfile = None

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def generate_completion(engine, prompt, temperature, max_tokens, stop):
        completion = openai.Completion.create(
                        engine=engine,
                        prompt=prompt,
                        temperature=temperature,
                        max_tokens=max_tokens,
                        stop=stop)
        return completion.choices[0].text

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def generate_embeddings(text):
        response = openai.Embedding.create(
            input=text, deployment_id=openai_embeddings_deployment_id)
        embeddings = response['data'][0]['embedding']
        return embeddings

    def getServiceUrl():
        return 'https://' + serviceName + '.search.windows.net'

    def getMethod(servicePath):
        headers = {'Content-type': 'application/json', 'api-key': apiKey}
        r = requests.get(getServiceUrl() + servicePath, headers=headers)
        return r

    def postMethod(servicePath, body):
        headers = {'Content-type': 'application/json', 'api-key': apiKey}
        r = requests.post(getServiceUrl() + servicePath, headers=headers, data=body)
        return r

    def semanticHybridSearch(servicePath, query, ntop):

        query_embedding = generate_embeddings(query)

        values = ObjDict()
        values.values = {}
        vector = {}

        vector['value'] = query_embedding
        vector['fields'] = "contentVector"
        vector['k'] = top
        values.values['vector'] = vector
        values.values['search'] = query
        values.values['queryType'] = "semantic"
        values.values['semanticConfiguration'] = "bbc-semantic-config2"
        values.values['queryLanguage'] = "en-us"
        values.values['captions'] = "extractive"
        values.values['answers'] = "extractive"
        values.values['top'] = top

        body_json = json.dumps(values.values)
        servicePath = '/indexes/' + indexName + '/docs/search?api-version=%s' % (apiVersion)
        r = postMethod(servicePath, body_json)
        return r
            
    def submitQuery(query, fields=None, select=None, scoring=None, top=top, fuzzy=False, method="GET"):
        servicePath = '/indexes/' + indexName + '/docs?api-version=%s&search=%s&$top=%d' % \
            (apiVersion, query, top)
        if fields is not None:
            servicePath += '&searchFields=%s' % fields
        if select is not None:
            servicePath += '&$select=%s' % select
        if scoring is not None:
            servicePath += '&scoringProfile=%s' % scoring
        if fuzzy:
            servicePath += '&queryType=full'

        if method == "GET":
            r = getMethod(servicePath)
        elif method == "simpleHybridSearch":
            r = simpleHybridSearch(servicePath, query, top)
        elif method == "semanticHybridSearch":
            r = semanticHybridSearch(servicePath, query, top)
        if r.status_code != 200:
            print('Failed to retrieve search results')
            print(query, r, r.text)
            return {}
        docs = json.loads(r.text)['value']
        return docs
  
    search_results = submitQuery(user_input, fields=searchFields, select=selectFields, scoring=scoringProfile, top=top, fuzzy=False, method="semanticHybridSearch")

    # Build the search output
    output = []
    values = {}

    for i, doc in enumerate(search_results):
        values['"'+doc['file']+'"'] = ['"'+doc['id']+'"', '"'+str(doc['@search.score'])+'"', '"'+str(doc['@search.rerankerScore'])+'"', '"'+str(i)+'"']
    output.append(values)


    return output


udf_semanticHybridSearch = udf(semanticHybridSearch, ArrayType(StringType()))
df_questions = df_questions.withColumn("SemanticHybridSearch", udf_semanticHybridSearch(col("question")))

df_questions.write.saveAsTable("2eval_step6")

StatementMeta(DataDiscovery, 90, 31, Finished, Available)

# Step 7 Simple Hybrid Search RAG

In [34]:
#7

def evaluateHybridSimpleSearchRAG(user_input):
    
    serviceName = search_service_name
    service_endpoint = (f"https://{serviceName}.search.windows.net")
    indexName = search_index
    apiKey = search_admin_key
    apiVersion = "2023-07-01-Preview"

    openai.api_key = openai_api_key
    openai.api_type = openai_api_type
    openai.api_base = f"https://{openai_service}.openai.azure.com"
    openai.api_version = openai_api_version

    searchFields = 'content, file'
    selectFields = 'content, file'
    scoringProfile = None


    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def generate_completion(engine, prompt, temperature, max_tokens, stop):
        completion = openai.Completion.create(
                        engine=engine,
                        prompt=prompt,
                        temperature=temperature,
                        max_tokens=max_tokens,
                        stop=stop)
        return completion.choices[0].text

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def generate_embeddings(text):
        response = openai.Embedding.create(
            input=text, deployment_id=openai_embeddings_deployment_id)
        embeddings = response['data'][0]['embedding']
        return embeddings

    def getServiceUrl():
        return 'https://' + serviceName + '.search.windows.net'

    def getMethod(servicePath):
        headers = {'Content-type': 'application/json', 'api-key': apiKey}
        r = requests.get(getServiceUrl() + servicePath, headers=headers)
        return r

    def postMethod(servicePath, body):
        headers = {'Content-type': 'application/json', 'api-key': apiKey}
        r = requests.post(getServiceUrl() + servicePath, headers=headers, data=body)
        return r

    def simpleHybridSearch(servicePath, query, top):
        
        query_embedding = generate_embeddings(query)

        values = ObjDict()
        values.values = {}
        vector = {}

        vector['value'] = query_embedding
        vector['fields'] = "contentVector"
        vector['k'] = top
        values.values['vector'] = vector
        values.values['search'] = query
        values.values['top'] = top
        body_json = json.dumps(values.values)
        servicePath = '/indexes/' + indexName + '/docs/search?api-version=%s' % (apiVersion)
        r = postMethod(servicePath, body_json)

        return r


    def submitQuery(query, fields=None, select=None, scoring=None, top=top, fuzzy=False, method="GET"):
            servicePath = '/indexes/' + indexName + '/docs?api-version=%s&search=%s&$top=%d' % \
                (apiVersion, query, top)
            if fields is not None:
                servicePath += '&searchFields=%s' % fields
            if select is not None:
                servicePath += '&$select=%s' % select
            if scoring is not None:
                servicePath += '&scoringProfile=%s' % scoring
            if fuzzy:
                servicePath += '&queryType=full'

            if method == "GET":
                r = getMethod(servicePath)
            elif method == "simpleHybridSearch":
                r = simpleHybridSearch(servicePath, query, top)
            elif method == "semanticHybridSearch":
                r = semanticHybridSearch(servicePath, query, top)
            if r.status_code != 200:
                print('Failed to retrieve search results')
                print(query, r, r.text)
                return {}
            docs = json.loads(r.text)['value']
            return docs


    error_string = ""
    has_error = False

    openai.api_key = openai_api_key
    openai.api_type = openai_api_type
    openai.api_base = f"https://{openai_service}.openai.azure.com"
    openai.api_version = openai_api_version

    prompt_prefix = """<|im_start|>system
    Let's work this out it a step by step to be sure we have the right answer

    Sources:
    {sources}
    

    <|im_end|>"""

    turn_prefix = """
    <|im_start|>user
    """

    turn_suffix = """
    <|im_end|>
    <|im_start|>assistant
    """

    prompt_history = turn_prefix

    history = []

    summary_prompt_template = """Below is a summary of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base. Generate a search query based on the conversation and the new question. Source names are not good search terms to include in the search query.

    Summary:
    {summary}

    Question:
    {question}

    Search query:
    """

    content = ""

    # Exclude category, to simulate scenarios where there's a set of docs you can't see
    exclude_category = None

    if len(history) > 0:

        try:
            search = generate_completion(engine=openai_gpt_deployment, prompt=summary_prompt_template.format(summary="\n".join(history), question=user_input), temperature=0.9, max_tokens=320, stop=["\n"])
        except Exception as e:
            has_error = True
            error_string = e
    else:
        search = user_input

        # Alternatively simply use search_client.search(q, top=3) if not using semantic search
        filter = "category ne '{}'".format(exclude_category.replace("'", "''")) if exclude_category else None
        r = submitQuery(user_input, fields=searchFields, select=selectFields, scoring=scoringProfile, top=top, fuzzy=False, method="simpleHybridSearch")
        search_results = [doc for doc in r]
        results = [doc['content'][:2000].replace("\n", "").replace("\r", "") for doc in search_results]


    prompt = prompt_prefix.format(sources=results) + prompt_history + user_input + turn_suffix

    try:
        completion = ""
        completion = generate_completion(engine=openai_gpt_deployment, prompt=prompt, temperature=0.7, max_tokens=1024, stop=["<|im_end|>", "<|im_start|>"])
        completion = completion.replace('"', "").replace(",", "").replace("'", "")
    except Exception as e:
        has_error = True
        error_string = e


    if has_error == False:
        prompt_history += user_input + turn_suffix + completion + "\n<|im_end|>" + turn_prefix
        history.append("user: " + user_input)
        history.append("assistant: " + completion)

    # Build the search output
    output = []
    values = {}

    for i, doc in enumerate(search_results):
        values['"'+doc['file']+'"'] = ['"'+doc['id']+'"', '"'+str(doc['@search.score'])+'"', '"'+"None"+'"', '"'+str(i)+'"', '"'+completion+'"']
    output.append(values)

    if has_error == True:
        return str(error_string)
    else:
        return output

udf_evaluateHybridSimpleSearchRAG = udf(evaluateHybridSimpleSearchRAG, ArrayType(StringType()))
df_questions = df_questions.withColumn("SimpleHybridSearchRAG", udf_evaluateHybridSimpleSearchRAG(col("question")))
df_questions.write.saveAsTable("2eval_step7")


StatementMeta(DataDiscovery, 90, 34, Finished, Available)

# Step 8 Semantic Hybrid Search

In [37]:
#8

def evaluateHybridSemanticSearchRAG(user_input):
    #sleep(2)

    serviceName = search_service_name
    service_endpoint = (f"https://{serviceName}.search.windows.net")
    indexName = search_index
    apiKey = search_admin_key
    apiVersion = "2023-07-01-Preview"
    searchFields = 'content, file'
    selectFields = 'content, file'
    scoringProfile = None

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def generate_completion(engine, prompt, temperature, max_tokens, stop):
        completion = openai.Completion.create(
                        engine=engine,
                        prompt=prompt,
                        temperature=temperature,
                        max_tokens=max_tokens,
                        stop=stop)
        return completion.choices[0].text

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def generate_embeddings(text):
        response = openai.Embedding.create(
            input=text, deployment_id=openai_embeddings_deployment_id)
        embeddings = response['data'][0]['embedding']
        return embeddings

    def getServiceUrl():
        return 'https://' + serviceName + '.search.windows.net'

    def getMethod(servicePath):
        headers = {'Content-type': 'application/json', 'api-key': apiKey}
        r = requests.get(getServiceUrl() + servicePath, headers=headers)
        return r

    def postMethod(servicePath, body):
        headers = {'Content-type': 'application/json', 'api-key': apiKey}
        r = requests.post(getServiceUrl() + servicePath, headers=headers, data=body)
        return r

    def semanticHybridSearch(servicePath, query, ntop):

        query_embedding = generate_embeddings(query)

        values = ObjDict()
        values.values = {}
        vector = {}

        vector['value'] = query_embedding
        vector['fields'] = "contentVector"
        vector['k'] = top
        values.values['vector'] = vector
        values.values['search'] = query
        values.values['queryType'] = "semantic"
        values.values['semanticConfiguration'] = "bbc-semantic-config2"
        values.values['queryLanguage'] = "en-us"
        values.values['captions'] = "extractive"
        values.values['answers'] = "extractive"
        values.values['top'] = top

        body_json = json.dumps(values.values)
        servicePath = '/indexes/' + indexName + '/docs/search?api-version=%s' % (apiVersion)
        r = postMethod(servicePath, body_json)
        return r

    
    def submitQuery(query, fields=None, select=None, scoring=None, top=top, fuzzy=False, method="GET"):
            servicePath = '/indexes/' + indexName + '/docs?api-version=%s&search=%s&$top=%d' % \
                (apiVersion, query, top)
            if fields is not None:
                servicePath += '&searchFields=%s' % fields
            if select is not None:
                servicePath += '&$select=%s' % select
            if scoring is not None:
                servicePath += '&scoringProfile=%s' % scoring
            if fuzzy:
                servicePath += '&queryType=full'

            if method == "GET":
                r = getMethod(servicePath)
            elif method == "simpleHybridSearch":
                r = simpleHybridSearch(servicePath, query, top)
            elif method == "semanticHybridSearch":
                r = semanticHybridSearch(servicePath, query, top)
            if r.status_code != 200:
                print('Failed to retrieve search results')
                print(query, r, r.text)
                return {}
            docs = json.loads(r.text)['value']
            return docs


    error_string = ""
    has_error = False


    openai.api_key = openai_api_key
    openai.api_type = openai_api_type
    openai.api_base = f"https://{openai_service}.openai.azure.com"
    openai.api_version = openai_api_version

    prompt_prefix = """<|im_start|>system
    Let's work this out it a step by step to be sure we have the right answer

    Sources:
    {sources}
    
    <|im_end|>"""

    turn_prefix = """
    <|im_start|>user
    """

    turn_suffix = """
    <|im_end|>
    <|im_start|>assistant
    """

    prompt_history = turn_prefix

    history = []

    summary_prompt_template = """Below is a summary of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base. Generate a search query based on the conversation and the new question. Source names are not good search terms to include in the search query.

    Summary:
    {summary}

    Question:
    {question}

    Search query:
    """

    content = ""

    # Exclude category, to simulate scenarios where there's a set of docs you can't see
    exclude_category = None

    if len(history) > 0:

        try:
            search = generate_completion(engine=openai_gpt_deployment, prompt=summary_prompt_template.format(summary="\n".join(history), question=user_input), temperature=0.9, max_tokens=320, stop=["\n"])
        except Exception as e:
            has_error = True
            error_string = e

        print(f"search {search}")
    else:
        search = user_input

        # Alternatively simply use search_client.search(q, top=3) if not using semantic search
        filter = "category ne '{}'".format(exclude_category.replace("'", "''")) if exclude_category else None
        r = submitQuery(user_input, fields=searchFields, select=selectFields, scoring=scoringProfile, top=top, fuzzy=False, method="semanticHybridSearch")
        search_results = [doc for doc in r]
        results = [doc['content'][:2000].replace("\n", "").replace("\r", "") for doc in search_results]


    prompt = prompt_prefix.format(sources=results) + prompt_history + user_input + turn_suffix

    try:
        completion = ""
        completion = generate_completion(engine=openai_gpt_deployment, prompt=prompt, temperature=0.7, max_tokens=1024, stop=["<|im_end|>", "<|im_start|>"])
        completion = completion.replace('"', "").replace(",", "").replace("'", "")
    except Exception as e:
        has_error = True
        error_string = e

    if has_error == False:
        prompt_history += user_input + turn_suffix + completion + "\n<|im_end|>" + turn_prefix
        history.append("user: " + user_input)
        history.append("assistant: " + completion)

    # Build the search output
    output = []
    values = {}
    
    for i, doc in enumerate(search_results):
        values['"'+doc['file']+'"'] = ['"'+doc['id']+'"', '"'+str(doc['@search.score'])+'"', '"'+str(doc['@search.rerankerScore'])+'"', '"'+str(i)+'"','"'+completion+'"']
    output.append(values)

    if has_error == True:
        return str(error_string)
    else:
        return output

udf_evaluateHybridSemanticSearchRAG = udf(evaluateHybridSemanticSearchRAG, ArrayType(StringType()))
df_questions = df_questions.withColumn("SemanticHybridSearchRAG", udf_evaluateHybridSemanticSearchRAG(col("question")))

df_questions.write.saveAsTable("2eval_step8")

StatementMeta(DataDiscovery, 90, 37, Finished, Available)

# Evaluation

In [38]:
import pandas as pd
dfq = df_questions.toPandas()
dfq.to_csv("abfss://share@datadiscoverypipeline2.dfs.core.windows.net/bbcsports/csv/dfq_questions2.csv", sep=',')

StatementMeta(DataDiscovery, 90, 38, Finished, Available)

In [3]:
from evaluate import load
from transformers import AutoModelForSequenceClassification, AutoTokenizer
exact_match_metric = load("exact_match")
trec_eval = load("trec_eval")

# We use entailment to do a simple check on the OpenAI answer
global nli_model
nli_model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli')

global nli_tokenizer
nli_tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')


StatementMeta(DataDiscovery, 86, 3, Finished, Available)

2023-06-24 14:48:10.873767: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading (…)lve/main/config.json: 0.00B [00:00, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json: 0.00B [00:00, ?B/s]

Downloading (…)olve/main/merges.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

StatementMeta(DataDiscovery, 86, 4, Finished, Available)

StatementMeta(DataDiscovery, 86, 5, Finished, Available)

StatementMeta(DataDiscovery, 86, 6, Finished, Available)

StatementMeta(DataDiscovery, 86, 7, Finished, Available)

# Test the entailment

In [None]:
hyp = "I like ice cream"

# roBERTA Entailment
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# run through model pre-trained on MNLI
x = nli_tokenizer.encode("Do you like ice cream?", hyp, return_tensors='pt',
                        truncation_strategy='only_first')
logits = nli_model(x.to(device))[0]

# we throw away "neutral" (dim 1) and take the probability of
# "entailment" (2) as the probability of the label being true
entail_contradiction_logits = logits[:, [0, 2]]
probs = entail_contradiction_logits.softmax(dim=1)
prob_label_is_true = probs[:, 1]
assert prob_label_is_true[0].item() > 0

In [None]:
def build_trec_eval(references, predictions, eval_field, filename, index, question):

    def is_float(v):
        
        is_float = True
        try:
            f = float(v)
        except Exception as notFloat:
            is_float = False
        return is_float

    try:
        entailment_score = 0
        bm25_score = 0
        semantic_score = 0
        references["query"] = [index]
        references["q0"] = ["q0"]
        references["docid"] = [filename]
        references["rel"] = [100]
        eval_field = str(eval_field).replace("=", ":")
        sind = eval_field.find("{")
        ind = eval_field.find("}")
        eval_field = eval_field[sind:ind + 1]
        eval_field = eval_field.strip('\n')
        eval_field = eval_field.replace('\n', '')
        eval_field = eval_field.replace('""', '"')

        if '"' not in eval_field:
            eval_field = eval_field.replace(':[', '":[')
            eval_field = eval_field.replace('{', '{"')
            eval_field = eval_field.replace('], ', '], "')

        eval_field = json.loads(eval_field, strict=False)

    except Exception as nan:
        print(f"Exception {nan} {eval_field} {filename} {index}" )
        predictions["query"] = [index]
        predictions["q0"] = ["q0"]
        predictions["docid"] = ["None"]
        predictions["rank"] = [0]
        predictions["score"] = [0]
        predictions["system"] = [question]

        return references, predictions, entailment_score, bm25_score, semantic_score

    lst_query = []
    lst_q0 = []
    lst_docid = []
    lst_rank = []
    lst_score = []
    lst_system = []

    for key in eval_field:
        lst_query.append(index)
        lst_q0.append("q0")
        lst_docid.append(str(key))
        lst_rank.append(eval_field[key][3])

        # Check if a semantic score
        if is_float(eval_field[key][2]) == True:
            lst_score.append(float(eval_field[key][2]))
            semantic_score = float(eval_field[key][2])
            bm25_score = float(eval_field[key][1])
        else:
            lst_score.append(float(eval_field[key][1]))
            bm25_score = float(eval_field[key][1])
        lst_system.append(question)

        if len(eval_field[key]) > 4:
            hyp = eval_field[key][4].strip()

            # roBERTA Entailment
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # run through model pre-trained on MNLI
            x = nli_tokenizer.encode(question, hyp, return_tensors="pt",
                                        truncation=True)
            logits = nli_model(x.to(device))[0]

            # we throw away "neutral" (dim 1) and take the probability of
            # "entailment" (2) as the probability of the label being true
            entail_contradiction_logits = logits[:, [0, 2]]
            probs = entail_contradiction_logits.softmax(dim=1)
            prob_label_is_true = probs[:, 1]
            entailment_score = prob_label_is_true[0].item()

    predictions["query"] = lst_query
    predictions["q0"] = lst_q0
    predictions["docid"] = lst_docid
    predictions["rank"] = lst_rank
    predictions["score"] = lst_score
    predictions["system"] = lst_system

    return references, predictions, entailment_score, bm25_score, semantic_score

def calculate_confidence_score(bm25_scores, semantic_scores):

    try:

        bm25_weight = 0.6
        semantic_weight = 0.4

        # Normalize BM25 scores using min-max normalization
        min_bm25 = float(min(bm25_scores))
        max_bm25 = float(max(bm25_scores))

        if min_bm25 == 0 and max_bm25 == 0:
            return [0]


        normalized_bm25_scores = [(float(score) - min_bm25) / (max_bm25 - min_bm25) for score in bm25_scores]

        # Normalize semantic ranking scores using min-max normalization
        min_semantic = float(min(semantic_scores))
        max_semantic = float(max(semantic_scores))
        normalized_semantic_scores = [(float(score) - min_semantic) / (max_semantic - min_semantic) for score in
                                        semantic_scores]

        # Combine the normalized scores using weights
        confidence_scores = [(bm25 * bm25_weight) + (semantic * semantic_weight)
                                for bm25, semantic in zip(normalized_bm25_scores, normalized_semantic_scores)]

    except Exception as ConfidenceError:
        print(f" Confidence Error {ConfidenceError}")
        return [0]

    return confidence_scores

semantic_search_references = {}
semantic_search_predictions = {}
semantic_search_map = []
semantic_search_geo_map = []
semantic_search_geo_rprec = []
semantic_search_recip_rank = []
semantic_search_entailment_score = []
semantic_search_bm25_score = []
semantic_search_semantic_score = []

simple_search_references = {}
simple_search_predictions = {}
simple_search_map = []
simple_search_geo_map = []
simple_search_geo_rprec = []
simple_search_recip_rank = []
simple_search_entailment_score = []

simple_search_rag_references = {}
simple_search_rag_predictions = {}
simple_search_rag_map = []
simple_search_rag_geo_map = []
simple_search_rag_geo_rprec = []
simple_search_rag_recip_rank = []
simple_search_rag_entailment_score = []

semantic_search_rag_references = {}
semantic_search_rag_predictions = {}
semantic_search_rag_map = []
semantic_search_rag_geo_map = []
semantic_search_rag_geo_rprec = []
semantic_search_rag_recip_rank = []
semantic_search_rag_entailment_score = []
semantic_search_rag_bm25_score = []
semantic_search_rag_semantic_score = []

simple_search_hybrid_references = {}
simple_search_hybrid_predictions = {}
simple_search_hybrid_map = []
simple_search_hybrid_geo_map = []
simple_search_hybrid_geo_rprec = []
simple_search_hybrid_recip_rank = []
simple_search_hybrid_entailment_score = []

semantic_search_hybrid_references = {}
semantic_search_hybrid_predictions = {}
semantic_search_hybrid_map = []
semantic_search_hybrid_geo_map = []
semantic_search_hybrid_geo_rprec = []
semantic_search_hybrid_recip_rank = []
semantic_search_hybrid_entailment_score = []
semantic_search_hybrid_bm25_score = []
semantic_search_hybrid_semantic_score = []

simple_search_hybrid_rag_references = {}
simple_search_hybrid_rag_predictions = {}
simple_search_hybrid_rag_map = []
simple_search_hybrid_rag_geo_map = []
simple_search_hybrid_rag_geo_rprec = []
simple_search_hybrid_rag_recip_rank = []
simple_search_hybrid_rag_entailment_score = []

semantic_search_hybrid_rag_references = {}
semantic_search_hybrid_rag_predictions = {}
semantic_search_hybrid_rag_map = []
semantic_search_hybrid_rag_geo_map = []
semantic_search_hybrid_rag_geo_rprec = []
semantic_search_hybrid_rag_recip_rank = []
semantic_search_hybrid_rag_entailment_score = []
semantic_search_hybrid_rag_bm25_score = []
semantic_search_hybrid_rag_semantic_score = []

for index, row in enumerate(dfq.itertuples()):
    print(f"Processing {index} of {len(dfq)}")

    # Semantic Search Step 1
    semantic_search_references, semantic_search_predictions, entailment_score, bm25_score, semantic_score = build_trec_eval(
        semantic_search_references, semantic_search_predictions, row.SemanticSearch, row.filename, index,
        row.question)
    results = trec_eval.compute(predictions=[semantic_search_predictions], references=[semantic_search_references])
    # confidence score
    semantic_search_bm25_score.append(bm25_score)
    semantic_search_semantic_score.append(semantic_score)

    semantic_search_map.append(float(results['map']))
    semantic_search_geo_map.append(float(results['gm_map']))
    semantic_search_geo_rprec.append(float(results['bpref']))
    semantic_search_recip_rank.append(float(results['recip_rank']))
    semantic_search_entailment_score.append(float(entailment_score))
    # confidence score

    # Simple Search Step 2
    simple_search_references, simple_search_predictions, entailment_score, _, _ = build_trec_eval(
        simple_search_references, simple_search_predictions, row.SimpleSearch, row.filename, index, row.question)
    results = trec_eval.compute(predictions=[simple_search_predictions], references=[simple_search_references])

    simple_search_map.append(float(results['map']))
    simple_search_geo_map.append(float(results['gm_map']))
    simple_search_geo_rprec.append(float(results['bpref']))
    simple_search_recip_rank.append(float(results['recip_rank']))
    simple_search_entailment_score.append(float(entailment_score))

    # Simple Search RAG Step 3
    simple_search_rag_references, simple_search_rag_predictions, entailment_score, _, _ = build_trec_eval(
        simple_search_rag_references, simple_search_rag_predictions, row.SimpleSearchRAG, row.filename, index,
        row.question)
    results = trec_eval.compute(predictions=[simple_search_rag_predictions],
                                references=[simple_search_rag_references])

    simple_search_rag_map.append(float(results['map']))
    simple_search_rag_geo_map.append(float(results['gm_map']))
    simple_search_rag_geo_rprec.append(float(results['bpref']))
    simple_search_rag_recip_rank.append(float(results['recip_rank']))
    simple_search_rag_entailment_score.append(float(entailment_score))

    # Semantic Search RAG Step 4
    semantic_search_rag_references, semantic_search_rag_predictions, entailment_score, bm25_score, semantic_score = build_trec_eval(
        semantic_search_rag_references, semantic_search_rag_predictions, row.SemanticSearchRAG, row.filename, index,
        row.question)
    results = trec_eval.compute(predictions=[simple_search_rag_predictions],
                                references=[semantic_search_rag_references])
    # #confidence score
    semantic_search_rag_bm25_score.append(bm25_score)
    semantic_search_rag_semantic_score.append(semantic_score)

    semantic_search_rag_map.append(float(results['map']))
    semantic_search_rag_geo_map.append(float(results['gm_map']))
    semantic_search_rag_geo_rprec.append(float(results['bpref']))
    semantic_search_rag_recip_rank.append(float(results['recip_rank']))
    semantic_search_rag_entailment_score.append(float(entailment_score))

    # Simple Hybrid Search Step 5
    simple_search_hybrid_references, simple_search_hybrid_predictions, entailment_score, _, _ = build_trec_eval(
        simple_search_hybrid_references, simple_search_hybrid_predictions, row.SimpleHybridSearch, row.filename,
        index, row.question)
    results = trec_eval.compute(predictions=[simple_search_hybrid_predictions],
                                references=[simple_search_hybrid_references])

    simple_search_hybrid_map.append(float(results['map']))
    simple_search_hybrid_geo_map.append(float(results['gm_map']))
    simple_search_hybrid_geo_rprec.append(float(results['bpref']))
    simple_search_hybrid_recip_rank.append(float(results['recip_rank']))
    simple_search_hybrid_entailment_score.append(float(entailment_score))

    # Semantic Hybrid Search Step 6
    semantic_search_hybrid_references, semantic_search_hybrid_predictions, entailment_score, bm25_score, semantic_score = build_trec_eval(
        semantic_search_hybrid_references, semantic_search_hybrid_predictions, row.SemanticHybridSearch,
        row.filename, index, row.question)
    results = trec_eval.compute(predictions=[semantic_search_hybrid_predictions],
                                references=[semantic_search_hybrid_references])
    # confidence score
    semantic_search_hybrid_bm25_score.append(bm25_score)
    semantic_search_hybrid_semantic_score.append(semantic_score)

    semantic_search_hybrid_map.append(float(results['map']))
    semantic_search_hybrid_geo_map.append(float(results['gm_map']))
    semantic_search_hybrid_geo_rprec.append(float(results['bpref']))
    semantic_search_hybrid_recip_rank.append(float(results['recip_rank']))
    semantic_search_hybrid_entailment_score.append(float(entailment_score))

    # Simple Hybrid Search RAG Step 7
    simple_search_hybrid_rag_references, simple_search_hybrid_rag_predictions, entailment_score, _, _ = build_trec_eval(
        simple_search_hybrid_rag_references, simple_search_hybrid_rag_predictions, row.SimpleHybridSearchRAG,
        row.filename, index, row.question)
    results = trec_eval.compute(predictions=[simple_search_hybrid_rag_predictions],
                                references=[simple_search_hybrid_rag_references])

    simple_search_hybrid_rag_map.append(float(results['map']))
    simple_search_hybrid_rag_geo_map.append(float(results['gm_map']))
    simple_search_hybrid_rag_geo_rprec.append(float(results['bpref']))
    simple_search_hybrid_rag_recip_rank.append(float(results['recip_rank']))
    simple_search_hybrid_rag_entailment_score.append(float(entailment_score))

    # Semantic Hybrid Search RAG Step 8
    semantic_search_hybrid_rag_references, semantic_search_hybrid_rag_predictions, entailment_score, bm25_score, semantic_score = build_trec_eval(
        semantic_search_hybrid_rag_references, semantic_search_hybrid_rag_predictions, row.SemanticHybridSearchRAG,
        row.filename, index, row.question)
    results = trec_eval.compute(predictions=[semantic_search_hybrid_rag_predictions],
                                references=[semantic_search_hybrid_rag_references])
    # confidence score
    semantic_search_hybrid_rag_bm25_score.append(bm25_score)
    semantic_search_hybrid_rag_semantic_score.append(semantic_score)

    semantic_search_hybrid_rag_map.append(float(results['map']))
    semantic_search_hybrid_rag_geo_map.append(float(results['gm_map']))
    semantic_search_hybrid_rag_geo_rprec.append(float(results['bpref']))
    semantic_search_hybrid_rag_recip_rank.append(float(results['recip_rank']))
    semantic_search_hybrid_rag_entailment_score.append(float(entailment_score))

print(f"Semantic Search Results")
print(f"Mean average precision: {np.mean(semantic_search_map)}")
print(f"Geometric mean average precision: {np.mean(semantic_search_geo_map)}")
print(f"Binary preference score: {np.mean(semantic_search_geo_rprec)}")
print(f"Reciprocal rank: {np.mean(semantic_search_recip_rank)}")
semantic_search_confidence_score = 0
semantic_search_confidence_score = calculate_confidence_score(semantic_search_bm25_score,
                                                                semantic_search_semantic_score)
print(f"Confidence score: {np.mean(semantic_search_confidence_score)}")

print(f"-----------------------------------------")

print(f"Simple Search Results")
print(f"Mean average precision: {np.mean(simple_search_map)}")
print(f"Geometric mean average precision: {np.mean(simple_search_geo_map)}")
print(f"Binary preference score: {np.mean(simple_search_geo_rprec)}")
print(f"Reciprocal rank: {np.mean(simple_search_recip_rank)}")
print(f"-----------------------------------------")

print(f"Simple Search RAG Results")
print(f"Mean average precision: {np.mean(simple_search_rag_map)}")
print(f"Geometric mean average precision: {np.mean(simple_search_rag_geo_map)}")
print(f"Binary preference score: {np.mean(simple_search_rag_geo_rprec)}")
print(f"Reciprocal rank: {np.mean(simple_search_rag_recip_rank)}")
print(f"Entailment score: {np.mean(simple_search_rag_entailment_score)}")
print(f"-----------------------------------------")

print(f"Semantic Search RAG Results")
print(f"Mean average precision: {np.mean(semantic_search_rag_map)}")
print(f"Geometric mean average precision: {np.mean(semantic_search_rag_geo_map)}")
print(f"Binary preference score: {np.mean(semantic_search_rag_geo_rprec)}")
print(f"Reciprocal rank: {np.mean(semantic_search_rag_recip_rank)}")
print(f"Entailment score: {np.mean(semantic_search_rag_entailment_score)}")
semantic_search_rag_confidence_score = 0
semantic_search_rag_confidence_score = calculate_confidence_score(semantic_search_rag_bm25_score,
                                                                    semantic_search_rag_semantic_score)
print(f"Confidence score: {np.mean(semantic_search_rag_confidence_score)}")
print(f"-----------------------------------------")

print(f"Simple Hybrid Search Results")
print(f"Mean average precision: {np.mean(simple_search_hybrid_map)}")
print(f"Geometric mean average precision: {np.mean(simple_search_hybrid_geo_map)}")
print(f"Binary preference score: {np.mean(simple_search_hybrid_geo_rprec)}")
print(f"Reciprocal rank: {np.mean(simple_search_hybrid_recip_rank)}")
print(f"-----------------------------------------")

print(f"Semantic Hybrid Search Results")
print(f"Mean average precision: {np.mean(semantic_search_hybrid_map)}")
print(f"Geometric mean average precision: {np.mean(semantic_search_hybrid_geo_map)}")
print(f"Binary preference score: {np.mean(semantic_search_hybrid_geo_rprec)}")
print(f"Reciprocal rank: {np.mean(semantic_search_hybrid_recip_rank)}")
semantic_search_hybrid_confidence_score = 0
semantic_search_hybrid_confidence_score = calculate_confidence_score(semantic_search_hybrid_bm25_score,
                                                                        semantic_search_hybrid_semantic_score)
print(f"Confidence score: {np.mean(semantic_search_hybrid_confidence_score)}")
print(f"-----------------------------------------")

print(f"Simple Hybrid Search RAG Results")
print(f"Mean average precision: {np.mean(simple_search_hybrid_rag_map)}")
print(f"Geometric mean average precision: {np.mean(simple_search_hybrid_rag_geo_map)}")
print(f"Binary preference score: {np.mean(simple_search_hybrid_rag_geo_rprec)}")
print(f"Reciprocal rank: {np.mean(simple_search_hybrid_rag_recip_rank)}")
print(f"Entailment score: {np.mean(simple_search_hybrid_rag_entailment_score)}")
print(f"-----------------------------------------")

print(f"Semantic Hybrid Search RAG Results")
print(f"Mean average precision: {np.mean(semantic_search_hybrid_rag_map)}")
print(f"Geometric mean average precision: {np.mean(semantic_search_hybrid_rag_geo_map)}")
print(f"Binary preference score: {np.mean(semantic_search_hybrid_rag_geo_rprec)}")
print(f"Reciprocal rank: {np.mean(semantic_search_hybrid_rag_recip_rank)}")
print(f"Entailment score: {np.mean(semantic_search_hybrid_rag_entailment_score)}")
semantic_search_hybrid_rag_confidence_score = 0
semantic_search_hybrid_rag_confidence_score = calculate_confidence_score(semantic_search_hybrid_rag_bm25_score,
                                                                            semantic_search_hybrid_rag_semantic_score)
print(f"Confidence score: {np.mean(semantic_search_hybrid_rag_confidence_score)}")
print(f"-----------------------------------------")




Semantic Search Results
Mean average precision: 0.6289009497964722
Geometric mean average precision: 0.6289036635006784
Binary preference score: 0.0
Reciprocal rank: 0.6289009497964722
Confidence score: 0.32117506105155313
-----------------------------------------
Simple Search Results
Mean average precision: 0.5637720488466758
Geometric mean average precision: 0.5637752374491181
Binary preference score: 0.0
Reciprocal rank: 0.5637720488466758
-----------------------------------------
Simple Search RAG Results
Mean average precision: 0.5569877883310719
Geometric mean average precision: 0.5569910583446404
Binary preference score: 0.0
Reciprocal rank: 0.5569877883310719
Entailment score: 0.15832776706789012
-----------------------------------------
Semantic Search RAG Results
Mean average precision: 0.5569877883310719
Geometric mean average precision: 0.5569910583446404
Binary preference score: 0.0
Reciprocal rank: 0.5569877883310719
Entailment score: 0.1549185791918322
Confidence score: 0.34570918379938587
-----------------------------------------
Simple Hybrid Search Results
Mean average precision: 0.5936227951153324
Geometric mean average precision: 0.5936259701492538
Binary preference score: 0.0
Reciprocal rank: 0.5936227951153324
-----------------------------------------
Semantic Hybrid Search Results
Mean average precision: 0.6264133876074174
Geometric mean average precision: 0.6264161148801447
Binary preference score: 0.0
Reciprocal rank: 0.6264133876074174
Confidence score: 0.49504462748491823
-----------------------------------------
Simple Hybrid Search RAG Results
Mean average precision: 0.586386250565355
Geometric mean average precision: 0.5863895070104025
Binary preference score: 0.0
Reciprocal rank: 0.586386250565355
Entailment score: 0.1562549940426834
-----------------------------------------
Semantic Hybrid Search RAG Results
Mean average precision: 0.6169154228855721
Geometric mean average precision: 0.6169182451379466
Binary preference score: 0.0
Reciprocal rank: 0.6169154228855721
Entailment score: 0.15223597163569244
Confidence score: 0.5658654374165991
-----------------------------------------