In [1]:
import json
import os
import pandas
import numpy
import math
import dotenv

from typing import List, Dict, Literal

In [2]:
env = dotenv.dotenv_values()

Part = Literal["sentence", "paragraph", "page"]
Language = Literal["en", "sk", "de"]
Model = Literal["e5", "labse", "gte"]
Id = str

PARTS: List[Part] = ["paragraph", "sentence", "page"][:1]
LANGUAGES: List[Language] = ["en", "sk", "de"]
MODELS: List[Model] = ["e5", "labse", "gte"]
PG_TGRM = "tgrm"
CHAT_GPT = "chat-gpt"

ALL_MODELS = [PG_TGRM, *MODELS]

DATA_DIR = "../../data"

SPACE = " "


ALL_RETRIEVAL_MODELS = [PG_TGRM, *MODELS, CHAT_GPT]
MODEL_COMBINATIONS = []
for idx in range(len(ALL_RETRIEVAL_MODELS)):
    for j in range(idx + 1, len(ALL_RETRIEVAL_MODELS)):
        m1 = ALL_RETRIEVAL_MODELS[idx]
        m2 = ALL_RETRIEVAL_MODELS[j]
        MODEL_COMBINATIONS.append((m1, m2))


with open(f"{DATA_DIR}/dataset/02_queries-EN.json", "r") as file:
    en_queries: List[str] = json.load(file)
with open(f"{DATA_DIR}/dataset/02_queries-SK.json", "r") as file:
    sk_queries: List[str] = json.load(file)
with open(f"{DATA_DIR}/dataset/02_queries-DE.json", "r") as file:
    de_queries: List[str] = json.load(file)

queries_by_language = {
    "en": en_queries,
    "sk": sk_queries,
    "de": de_queries
}

COEFFS = [0.05, 0.10, 0.20, 0.25, 0.30, 1.00]

In [3]:
def set_attr(object: dict, path: str, value, *, append: bool = False):
    keys = path.split(".")
    last_key = keys[-1]

    for key in keys[:-1]:
        if key not in object:
            object[key] = {}

        object = object[key]

    if last_key in object:
        if append:
            object[last_key].append(value)
        else:
            object[last_key] = value
    else:
        if append:
            object[last_key] = [value]
        else:
            object[last_key] = value


def get_attr(object: dict, path: str):
    keys = path.split(".")
    last_key = keys[-1]

    for key in keys[:-1]:
        if key not in object:
            return None

        object = object[key]

    if last_key not in object:
        return None

    return object[last_key]


def plm():
    result = []

    for part in PARTS:
        for lang in LANGUAGES:
            for model in ALL_MODELS:
                result.append((part, lang, model))

    return result


def plmi():
    result = []

    for part, lang, model in plm():
        for idx in range(len(queries_by_language[lang])):
            result.append((part, lang, model, idx))

    return result


def get_id(part: Part, lang: Language, idx: int) -> Id:
    return f"{part}-{lang}-{idx + 1}"


def get_retrieval_path(model: Model, id: Id):
    return f"{DATA_DIR}/retrieval/{model}/{id}.json"


def normalize(maxx, minn, similarity):
    return (similarity - minn) / (maxx - minn)


def get_top_docs(docs, coeff: float):
    if len(docs) == 0:
        return []

    top_doc = docs[0]
    top_similarity = top_doc["similarity"]
    similarity_threshold = top_similarity * (1 - coeff)

    return list(filter(
        lambda doc: doc["similarity"] >= similarity_threshold,
        docs
    ))


def get_ordering_score(docs_1, docs_2):
    match_count = 0

    l1 = len(docs_1)
    l2 = len(docs_2)

    if min(l1, l2) == 0:
        return 0

    for i in range(min(l1, l2)):
        doc_1 = docs_1[i]
        doc_2 = docs_2[i]

        if doc_1["id"] == doc_2["id"]:
            match_count += 1

    match_score = match_count / max(l1, l2)
    return match_score


def get_intersection_score(docs_1, docs_2):
    def get_doc_id(doc):
        return doc["id"]

    doc_ids_1 = list(map(get_doc_id, docs_1))
    doc_ids_2 = list(map(get_doc_id, docs_2))

    l1 = len(doc_ids_1)
    l2 = len(doc_ids_2)

    if min(l1, l2) == 0:
        return 0

    intersection = set(doc_ids_1).intersection(set(doc_ids_2))
    return len(intersection) / max(l1, l2)

In [4]:
with open(f"{DATA_DIR}/dataset/03_judgements.json", "r") as file:
    judgements_by_query: Dict[str, List[str]] = json.load(file)

JUDGEMENTS = []

for idx, query in enumerate(en_queries):
    relevant_document_ids = judgements_by_query[query]

    JUDGEMENTS.append({
        "model_id": CHAT_GPT,
        "query_id": idx + 1,
        "documents": list(map(
            lambda id: {"id": id},
            relevant_document_ids
        )),
    })

In [5]:
RETRIEVALS = {}

for part, lang, model, idx in plmi():
    RETRIEVALS[(
        part, lang, model, idx
    )] = json.load(open(get_retrieval_path(
        model, get_id(part, lang, idx)
    ), "r"))

In [6]:
def stats(retrievals, get_group_stats_path, group_iterator):
    results = {}

    for key, retrieval in retrievals.items():
        path = get_group_stats_path(key)

        similarities = list(map(
            lambda doc: doc["similarity"],
            retrieval["documents"]
        ))

        if len(similarities) == 0:
            continue

        set_attr(
            results,
            f"{path}.avg",
            numpy.average(similarities),
            append=True
        )

        set_attr(
            results,
            f"{path}.max",
            numpy.max(similarities),
            append=True
        )

        set_attr(
            results,
            f"{path}.min",
            numpy.min(similarities),
            append=True
        )

        set_attr(
            results,
            f"{path}.std",
            numpy.std(similarities),
            append=True
        )

    average = []
    total_maxs = []
    avg_maxs = []
    total_mins = []
    avg_mins = []
    avg_stds = []
    coeffs_from_total = []
    coeffs_from_avg = []

    for group in group_iterator:
        path = get_group_stats_path(group)

        avg_path = f"{path}.avg"
        max_path = f"{path}.max"
        min_path = f"{path}.min"
        std_path = f"{path}.std"

        avg_avg = numpy.max(get_attr(results, avg_path))
        total_max = numpy.max(get_attr(results, max_path))
        avg_max = numpy.average(get_attr(results, max_path))
        total_min = numpy.min(get_attr(results, min_path))
        avg_min = numpy.average(get_attr(results, min_path))
        avg_std = numpy.average(get_attr(results, std_path))
        coeff_from_total = avg_std / total_max
        coeff_from_avg = avg_std / avg_max

        average.append(avg_avg)
        total_maxs.append(total_max)
        avg_maxs.append(avg_max)
        total_mins.append(total_min)
        avg_mins.append(avg_min)
        avg_stds.append(avg_std)
        coeffs_from_total.append(coeff_from_total)
        coeffs_from_avg.append(coeff_from_avg)

    df = pandas.DataFrame({
        "average": average,
        "total_max": total_maxs,
        "avg_max": avg_maxs,
        "total_min": total_mins,
        "avg_min": avg_mins,
        "avg_std": avg_stds,
        "coeff_from_total": coeffs_from_total,
        "coeff_from_avg": coeffs_from_avg
    })

    df.index = pandas.MultiIndex.from_tuples(group_iterator)
    df.columns = pandas.MultiIndex.from_tuples([
        ("", "avg"),
        ("max", "total"), ("max", "average"),
        ("min", "total"), ("min", "average"),
        ("std", "average"),
        ("coeff", "from_total"), ("coeff", "from_average")
    ])

    return df

# Without merging by language

In [7]:
stats(
    RETRIEVALS,
    lambda key: f"{key[0]}.{key[1]}.{key[2]}",
    plm()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,max,max,min,min,std,coeff,coeff
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,avg,total,average,total,average,average,from_total,from_average
paragraph,en,tgrm,0.27238,1.0,0.516208,0.006098,0.030572,0.0595,0.0595,0.115264
paragraph,en,e5,0.940128,0.996055,0.970051,0.0,0.197145,0.075511,0.07581,0.077842
paragraph,en,labse,0.720384,0.993652,0.846169,0.0,0.309248,0.059178,0.059556,0.069936
paragraph,en,gte,0.843834,0.990857,0.919496,0.0,0.328108,0.053838,0.054334,0.058551
paragraph,sk,tgrm,0.13061,0.828571,0.268057,0.011111,0.025774,0.03057,0.036895,0.114042
paragraph,sk,e5,0.93334,0.991958,0.959784,0.0,0.355945,0.066007,0.066542,0.068772
paragraph,sk,labse,0.736134,0.97268,0.837534,0.0,0.400441,0.046961,0.04828,0.05607
paragraph,sk,gte,0.852503,0.9773,0.903436,0.0,0.387746,0.045455,0.046511,0.050313
paragraph,de,tgrm,0.210203,0.785714,0.313692,0.006897,0.028651,0.036078,0.045918,0.115012
paragraph,de,e5,0.935851,0.99359,0.960118,0.0,0.183805,0.084219,0.084762,0.087717


In [8]:
normalized_retrievals = {}

for part, lang, model in plm():
    queries = queries_by_language[lang]
    retrievals = []

    for idx in range(len(queries)):
        retrieval = RETRIEVALS[(part, lang, model, idx)]
        retrievals.append(retrieval.copy())

    for retrieval in retrievals:
        retrieval["documents"] = list(filter(
            lambda doc: doc["similarity"] > 0,
            retrieval["documents"]
        ))

    max_similarity = None
    min_similarity = None

    for retrieval in retrievals:
        for doc in retrieval["documents"]:
            similarity = doc["similarity"]

            if max_similarity is None:
                max_similarity = similarity

            if min_similarity is None:
                min_similarity = similarity

            if similarity > max_similarity:
                max_similarity = similarity

            if similarity < min_similarity:
                min_similarity = similarity

    for retrieval in retrievals:
        normalized_retrievals[(
            retrieval["part"],
            retrieval["lang"],
            retrieval["model_id"] if "model_id" in retrieval else PG_TGRM,
            retrieval["query_id"] - 1
        )] = {
            **retrieval,
            "documents": list(map(
                lambda doc: {
                    **doc,
                    "similarity": normalize(
                        max_similarity,
                        min_similarity,
                        doc["similarity"]
                    )
                },
                retrieval["documents"])
            )
        }

In [9]:
stats(
    normalized_retrievals,
    lambda key: f"{key[0]}.{key[1]}.{key[2]}",
    plm()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,max,max,min,min,std,coeff,coeff
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,avg,total,average,total,average,average,from_total,from_average
paragraph,en,tgrm,0.267916,1.0,0.51324,0.0,0.024624,0.059865,0.059865,0.116642
paragraph,en,e5,0.551343,1.0,0.787855,0.0,0.208048,0.08085,0.08085,0.10262
paragraph,en,labse,0.455094,1.0,0.710521,0.0,0.138169,0.08235,0.08235,0.115901
paragraph,en,gte,0.532008,1.0,0.808398,0.0,0.155101,0.082801,0.082801,0.102425
paragraph,sk,tgrm,0.146183,1.0,0.314323,0.0,0.017937,0.037396,0.037396,0.118974
paragraph,sk,e5,0.479848,1.0,0.714494,0.0,0.174609,0.089495,0.089495,0.125256
paragraph,sk,labse,0.466278,1.0,0.695069,0.0,0.163408,0.077548,0.077548,0.111569
paragraph,sk,gte,0.574962,1.0,0.748432,0.0,0.166693,0.081275,0.081275,0.108594
paragraph,de,tgrm,0.261045,1.0,0.393925,0.0,0.027932,0.046324,0.046324,0.117597
paragraph,de,e5,0.578455,1.0,0.707201,0.0,0.17822,0.081467,0.081467,0.115197


In [10]:
def get_retrieval(params, coeff):
    part, lang, model, idx = params

    if model == CHAT_GPT:
        retrieval = JUDGEMENTS[idx]
        top_docs = retrieval["documents"]
    else:
        retrieval = normalized_retrievals[(part, lang, model, idx)]
        top_docs = get_top_docs(retrieval["documents"], coeff)

    length = len(top_docs)

    return top_docs, length

In [11]:
print("part, coeff, pair".ljust(26) +
      "counts".ljust(11) +
      "ordering".ljust(14) +
      "intersection"
      )

print()

for part in PARTS:
    print(f"[{part}]")
    for lang in LANGUAGES:
        print(2*SPACE + f"[{lang}]")

        for coeff in COEFFS:
            print(4*SPACE + "[{:.2f}]".format(coeff))
            queries = queries_by_language[lang]

            for model1, model2 in MODEL_COMBINATIONS:
                lengths_1 = []
                lengths_2 = []
                ordering_scores = []
                intersection_scores = []

                for idx, query in enumerate(queries):
                    docs_1, l1 = get_retrieval(
                        (part, lang, model1, idx), coeff)

                    docs_2, l2 = get_retrieval(
                        (part, lang, model2, idx), coeff)

                    lengths_1.append(l1)
                    lengths_2.append(l2)

                    if not CHAT_GPT in [model1, model2]:
                        ordering_scores.append(
                            get_ordering_score(docs_1, docs_2)
                        )

                    intersection_scores.append(
                        get_intersection_score(docs_1, docs_2)
                    )

                avg_length_1 = numpy.average(lengths_1)
                avg_length_2 = numpy.average(lengths_2)

                if len(ordering_scores):
                    avg_ordering_score = numpy.average(
                        ordering_scores
                    ).round(5)
                else:
                    avg_ordering_score = None

                avg_intersection_score = numpy.average(
                    intersection_scores
                ).round(5)

                print(
                    6*SPACE + f"{model1}/{model2}".ljust(15),
                    "{:.0f}".format(avg_length_1).rjust(5)
                    + "/" +
                    "{:.0f}".format(avg_length_2).ljust(5),
                    3*SPACE +
                    ("{:.5f}".format(
                        avg_ordering_score) if avg_ordering_score is not None else "-".ljust(7)),
                    6*SPACE + "{:.5f}".format(avg_intersection_score)
                )

            print()

    print()

part, coeff, pair         counts     ordering      intersection

[paragraph]
  [en]
    [0.05]
      tgrm/e5             2/2        0.51896       0.56336
      tgrm/labse          2/2        0.53145       0.57792
      tgrm/gte            2/1        0.57815       0.61526
      tgrm/chat-gpt       2/5        -             0.41167
      e5/labse            2/2        0.52309       0.58501
      e5/gte              2/1        0.63541       0.67858
      e5/chat-gpt         2/5        -             0.45485
      labse/gte           2/1        0.61293       0.65610
      labse/chat-gpt      2/5        -             0.43990
      gte/chat-gpt        1/5        -             0.55481

    [0.10]
      tgrm/e5             2/3        0.45118       0.52713
      tgrm/labse          2/3        0.47729       0.54482
      tgrm/gte            2/2        0.53954       0.59601
      tgrm/chat-gpt       2/5        -             0.40982
      e5/labse            3/3        0.44740       0.54508
      e5

# With merging by language

In [12]:
merged_retrievals = {}

for part in PARTS:
    for model in ALL_MODELS:
        for idx in range(len(queries_by_language["en"])):
            documents = []

            for lang in LANGUAGES:
                retrieval = RETRIEVALS[(part, lang, model, idx)]                
                documents.extend(retrieval["documents"].copy())

            # Sort by similarity, descending
            documents.sort(key=lambda doc: doc["similarity"], reverse=True)

            # Remove duplicates
            duplicate_ids = set()

            def is_duplicate(doc):
                if doc["id"] in duplicate_ids:
                    return True

                duplicate_ids.add(doc["id"])
                return False

            documents = list(filter(
                lambda doc: not is_duplicate(doc),
                documents
            ))

            merged_retrievals[(part, model, idx)] = {
                "part": part,
                "model_id": model,
                "query_id": idx + 1,
                "documents": documents
            }

In [13]:
stats(
    merged_retrievals,
    lambda key: f"{key[0]}.{key[1]}",
    [(part, model) for part in PARTS for model in ALL_MODELS]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,max,max,min,min,std,coeff,coeff
Unnamed: 0_level_1,Unnamed: 1_level_1,avg,total,average,total,average,average,from_total,from_average
paragraph,tgrm,0.27549,1.0,0.527923,0.020833,0.043162,0.059096,0.059096,0.111941
paragraph,e5,0.94281,0.996055,0.972647,0.0,0.72937,0.019244,0.01932,0.019785
paragraph,labse,0.745886,0.993652,0.862686,0.0,0.545362,0.038425,0.038671,0.044541
paragraph,gte,0.854049,0.990857,0.933446,0.0,0.589032,0.031176,0.031463,0.033399


In [14]:
merged_normalized_retrievals = {}
merged_normalization_params = {}

for part in PARTS:
    for model in ALL_MODELS:
        tmp_retrievals = []

        max_similarity = None
        min_similarity = None

        for idx in range(len(queries_by_language["en"])):
            retrieval = merged_retrievals[(part, model, idx)]

            tmp_retrievals.append({
                **retrieval,
                "documents": list(filter(
                    lambda doc: doc["similarity"] > 0,
                    retrieval["documents"]
                ))
            })

        for retrieval in tmp_retrievals:
            for doc in retrieval["documents"]:
                similarity = doc["similarity"]

                if max_similarity is None:
                    max_similarity = similarity

                if min_similarity is None:
                    min_similarity = similarity

                if similarity > max_similarity:
                    max_similarity = similarity

                if similarity < min_similarity:
                    min_similarity = similarity

        merged_normalization_params[(part, model)] = (
            max_similarity,
            min_similarity
        )

        for retrieval in tmp_retrievals:
            merged_normalized_retrievals[(
                part, model, retrieval["query_id"] - 1
            )] = {
                **retrieval,
                "documents": list(map(
                    lambda doc: {
                        **doc,
                        "similarity": normalize(
                            max_similarity,
                            min_similarity,
                            doc["similarity"]
                        )
                    },
                    retrieval["documents"]
                ))
            }

In [15]:
stats(
    merged_normalized_retrievals,
    lambda key: f"{key[0]}.{key[1]}",
    [(part, model) for part in PARTS for model in ALL_MODELS]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,max,max,min,min,std,coeff,coeff
Unnamed: 0_level_1,Unnamed: 1_level_1,avg,total,average,total,average,average,from_total,from_average
paragraph,tgrm,0.260074,1.0,0.517879,0.0,0.022803,0.060354,0.060354,0.11654
paragraph,e5,0.585043,1.0,0.799554,0.0,0.188035,0.090005,0.090005,0.112569
paragraph,labse,0.490092,1.0,0.730469,0.0,0.194362,0.074566,0.074566,0.102079
paragraph,gte,0.546133,1.0,0.809538,0.0,0.138055,0.083493,0.083493,0.103137


In [16]:
def get_retrieval(params, coeff):
    part, model, idx = params

    if model == CHAT_GPT:
        retrieval = JUDGEMENTS[idx]
        top_docs = retrieval["documents"]
    else:
        retrieval = merged_normalized_retrievals[(part, model, idx)]
        top_docs = get_top_docs(retrieval["documents"], coeff)

    length = len(top_docs)

    return top_docs, length

In [17]:
print("part, coeff, pair".ljust(26) +
      "counts".ljust(11) +
      "ordering".ljust(14) +
      "intersection"
      )

print()

for part in PARTS:
    print(f"[{part}]")

    for coeff in COEFFS:
        print(2*SPACE + "[{:.2f}]".format(coeff))
        queries = queries_by_language[lang]

        for model1, model2 in MODEL_COMBINATIONS:
            lengths_1 = []
            lengths_2 = []
            ordering_scores = []
            intersection_scores = []

            for idx, query in enumerate(queries):
                docs_1, l1 = get_retrieval(
                    (part, model1, idx), coeff)

                docs_2, l2 = get_retrieval(
                    (part, model2, idx), coeff)

                lengths_1.append(l1)
                lengths_2.append(l2)

                if not CHAT_GPT in [model1, model2]:
                    ordering_scores.append(
                        get_ordering_score(docs_1, docs_2)
                    )

                intersection_scores.append(
                    get_intersection_score(docs_1, docs_2)
                )

            avg_length_1 = numpy.average(lengths_1)
            avg_length_2 = numpy.average(lengths_2)

            if len(ordering_scores):
                avg_ordering_score = numpy.average(
                    ordering_scores
                ).round(5)
            else:
                avg_ordering_score = None

            avg_intersection_score = numpy.average(
                intersection_scores
            ).round(5)

            print(
                6*SPACE + f"{model1}/{model2}".ljust(15),
                "{:.0f}".format(avg_length_1).rjust(5)
                + "/" +
                "{:.0f}".format(avg_length_2).ljust(5),
                3*SPACE +
                ("{:.5f}".format(
                    avg_ordering_score) if avg_ordering_score is not None else "-".ljust(7)),
                6*SPACE + "{:.5f}".format(avg_intersection_score)
            )

        print()

print()

part, coeff, pair         counts     ordering      intersection

[paragraph]
  [0.05]
      tgrm/e5             1/2        0.46451       0.51850
      tgrm/labse          1/2        0.51878       0.56697
      tgrm/gte            1/1        0.60107       0.64054
      tgrm/chat-gpt       1/5        -             0.42002
      e5/labse            2/2        0.49147       0.56915
      e5/gte              2/1        0.59083       0.65048
      e5/chat-gpt         2/5        -             0.44021
      labse/gte           2/1        0.57231       0.63018
      labse/chat-gpt      2/5        -             0.41093
      gte/chat-gpt        1/5        -             0.56716

  [0.10]
      tgrm/e5             2/4        0.38428       0.45784
      tgrm/labse          2/3        0.45002       0.52032
      tgrm/gte            2/2        0.55122       0.60677
      tgrm/chat-gpt       2/5        -             0.42026
      e5/labse            4/3        0.40277       0.51249
      e5/gte       