# Set up

In [None]:
from langchain_ollama import OllamaLLM

import sys, os
# Put the parent of the working directory on sys.path (once) so the
# project-local imports that follow in this cell can be resolved.
root_path = os.path.abspath(os.pardir)
already_importable = root_path in sys.path
if not already_importable:
    sys.path.append(root_path)
    
from retrieval.retrievers.adaptative_retriever import AdaptativeRetriever
from generation.summarization import Summarization
from generation.generator import Generation
from evaluation.test_bench.scorer import Scorer

# Instantiate embedding model
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding model that is handed to each Chroma collection as its
# embedding function in the next cell.
embedding_model = HuggingFaceEmbeddings(
    model_name="Alibaba-NLP/gte-multilingual-base",
    # NOTE(review): trust_remote_code presumably allows code shipped with the
    # model repository to run locally — confirm this is intended.
    model_kwargs=dict(device="cpu", trust_remote_code=True),
    # Ask the encoder for normalized embedding vectors.
    encode_kwargs=dict(normalize_embeddings=True),
)

  from .autonotebook import tqdm as notebook_tqdm





Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
from langchain_chroma import Chroma
from pymongo import MongoClient
def _open_store(collection_name, persist_directory):
    """Return the Chroma collection `collection_name` stored under `persist_directory`.

    Every store uses the notebook-wide `embedding_model` as its embedding function.
    """
    return Chroma(
        collection_name=collection_name,
        embedding_function=embedding_model,
        persist_directory=persist_directory,
    )


community_db = _open_store("community_db", "../../data/db/rag")
items_db = _open_store("items_db", "../../data/db/testbench")  # All items in collection
chunked_db = _open_store("chunked_db", "../../data/db/chunked")  # For fine grained similarity search

# Service endpoints and evaluation settings.
# The MongoDB and Ollama endpoints default to the original hosts but can be
# overridden with the MONGO_URI, OLLAMA_MODEL and OLLAMA_BASE_URL environment
# variables, so the notebook can run on another network without editing code.
# (`os` is imported in the set-up cell.)
client = MongoClient(os.environ.get("MONGO_URI", "mongodb://192.168.211.96:27017/"))
collection = client["metadata_db"]["metadata_collection"] # metadata collection, used for the pre filtering step
fr_collection = client["metadata_db"]["filter_metadata_collection"]

model = os.environ.get("OLLAMA_MODEL", 'mistral-small3.1:24b')
base_url = os.environ.get("OLLAMA_BASE_URL", 'http://192.168.249.7:11434')
llm = OllamaLLM(base_url=base_url, model=model)

top_k = 20  # presumably the number of items the retriever should return — TODO confirm
bleurt_checkpoint = "../../data/BLEURT-20"  # passed to Scorer as bleurt_checkpoint
bert_model_type = "bert-base-uncased"  # passed to Scorer as bert_model_type

# Sanity check: report how many entries each Chroma collection holds.
# NOTE(review): this reads the private `_collection` attribute of the stores.
for label, store in (
    ('community db', community_db),
    ('items db', items_db),
    ("chunked db : ", chunked_db),
):
    print(label, store._collection.count())

community db 174
items db 2520
chunked db :  7032


In [4]:
# Build the pipeline stages around the shared LLM: retrieval, summarization,
# answer generation and scoring.
# top_k is taken from the configuration cell instead of repeating the literal
# 20 here, so changing the constant actually changes what the retriever gets.
retriever = AdaptativeRetriever(llm, items_db, community_db, chunked_db, collection, top_k=top_k)
summarizer = Summarization(llm)
generator = Generation(llm)
scorer = Scorer(bleurt_checkpoint=bleurt_checkpoint, llm=llm, bert_model_type=bert_model_type)


INFO:tensorflow:Reading checkpoint ../../data/BLEURT-20.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint BLEURT-20
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:BLEURT-20
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:... vocab_file:None
INFO:tensorflow:... do_lower_case:None
INFO:tensorflow:... sp_model:sent_piece
INFO:tensorflow:... dynamic_seq_length:True
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Will load model: ../../data/BLEURT-20\sent_piece.model.
INFO:tensorflow:SentencePiece tokenizer created.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


In [5]:
# Load the question set and the target contexts through the scorer's loaders.
# Both paths are relative, so they resolve against the kernel's working directory.
# `questions` and `refs` are indexed with the same position in the cells below.
questions_path = "test_set/question_set.json"
ref_path = "test_set/target_context.json"

questions = scorer.load_questions(questions_path)
refs = scorer.load_references(ref_path)

# Test

In [40]:
# Pick one question and its reference for a single end-to-end run.
# The index is named question_idx rather than `id` so the builtin id() is not
# shadowed for the rest of the kernel session.
question_idx = 34
question = questions[question_idx]
target = refs[question_idx]
print(question)

How does the border inspection system communicate with the freight equipment object?


In [41]:
# Run the full pipeline on the selected question:
# retrieve -> summarize -> generate -> score against the reference.
# NOTE(review): `diag` is not checked here, whereas the batch loop further down
# skips a question when `diag` is truthy — confirm this cell should proceed regardless.
diag, context, ids = retriever.retrieve(question)
summary = summarizer.summarize(question, context)
response = generator.generate(question, summary)
score = scorer.evaluate(target, response, question)

Extracted filter is : {'name': 'freight equipment'}


In [42]:
# Show each metric of the single-question score on its own line, in the order:
# judge grade, BLEURT, METEOR, then the three BERT values.
# NOTE(review): score['bert'] is presumably (precision, recall, F1) — confirm.
print(
    score['judge']['grade'],
    score['bleurt'],
    score['meteor'],
    score['bert'][0],
    score['bert'][1],
    score['bert'][2],
    sep="\n",
)

0.75
0.57
0.42
0.62
0.63
0.62


In [None]:
# Evaluate every question in the test set and collect the scores.
# `scores` maps the question index (as a string) to the dict returned by
# scorer.evaluate; questions that are skipped or fail get no entry.
scores = {}
for i, question in enumerate(questions):
    print(f"question {i} : {question}")
    try:
        diag, context, ids = retriever.retrieve(question)
        if diag:
            # The retriever flagged this question: skip it without scoring.
            continue
        summary = summarizer.summarize(question, context)
        response = generator.generate(question, summary)
        score = scorer.evaluate(refs[i], response, question)
        scores[f"{i}"] = score
    except Exception as exc:
        # Catch Exception rather than using a bare `except` so that
        # KeyboardInterrupt / SystemExit still stop this long-running loop,
        # and report the failure instead of dropping it silently.
        print(f"question {i} failed: {type(exc).__name__}: {exc}")
        continue
    print('--'*50)

question 0 : What are the elements covered by the sustainable travel area?
Extracted filter is : {'name': 'sustainable travel'}
----------------------------------------------------------------------------------------------------
question 1 : What does the weather domain encompass?
Extracted filter is : {'name': 'weather'}
----------------------------------------------------------------------------------------------------
question 2 : What does the traveler information and personal mobility area focus on? 
Extracted filter is : {'name': 'local traveler information'}
----------------------------------------------------------------------------------------------------
question 3 : Which factors are addressed by the sustainable travel area?
Extracted filter is : {'name': 'sustainable travel'}
----------------------------------------------------------------------------------------------------
question 4 : Describe briefly the roadway maintenance and construction service.
Extracted filter is 

In [15]:
# Dump the whole score mapping (keys are the question indices as strings).
print(scores)



In [29]:
# Print the score of every question, or "None" when it has no entry.
# `scores` is keyed by the original question index and skipped/failed questions
# leave gaps, so iterate over the number of questions: range(len(scores)) would
# stop early and never show the highest-indexed entries.
for i in range(len(questions)):
    print(i)
    # dict.get returns None for a missing key, which prints as "None".
    print(scores.get(f"{i}"))

0
{'bleurt': 0.62, 'bert': [0.65, 0.39, 0.51], 'meteor': 0.4, 'judge': {'grade': 1.0, 'explanation': 'The answer is highly accurate and addresses the question comprehensively. It includes all the relevant elements covered by the sustainable travel area as mentioned in the reference answer, providing a detailed and effective response.'}}
1
{'bleurt': 0.79, 'bert': [0.81, 0.8, 0.81], 'meteor': 0.74, 'judge': {'grade': 1.0, 'explanation': 'The answer is highly accurate and relevant. It addresses the question thoroughly by providing the same information as the reference answer, including the definition of the weather domain and the specific services related to it.'}}
2
{'bleurt': 0.39, 'bert': [0.38, 0.21, 0.29], 'meteor': 0.12, 'judge': {'grade': 0.5, 'explanation': 'The answer is somewhat relevant and provides some accurate information, such as the focus on location-specific traveler information and time-sensitive data. However, it lacks details about multi-modal options, transfers, pers