In [1]:
import json
import pandas
import numpy
import dotenv

from typing import List, Dict, Literal

In [2]:
env = dotenv.dotenv_values()

Part = Literal["sentence", "paragraph", "page"]
Language = Literal["en", "sk", "de"]
Model = Literal["e5", "labse", "gte"]
Id = str

PARTS: List[Part] = ["page", "paragraph", "sentence"]
LANGUAGES: List[Language] = ["en", "sk", "de"]
MODELS: List[Model] = ["e5", "labse", "gte"]
PG_TGRM = "tgrm"
CHAT_GPT = "gpt"

ALL_MODELS = [PG_TGRM, *MODELS]

DATA_DIR = "../../data"

SPACE = " "


ALL_RETRIEVAL_MODELS = [PG_TGRM, *MODELS, CHAT_GPT]
MODEL_COMBINATIONS = []
for idx in range(len(ALL_RETRIEVAL_MODELS)):
    for j in range(idx + 1, len(ALL_RETRIEVAL_MODELS)):
        m1 = ALL_RETRIEVAL_MODELS[idx]
        m2 = ALL_RETRIEVAL_MODELS[j]
        MODEL_COMBINATIONS.append((m1, m2))


with open(f"{DATA_DIR}/dataset/02_queries-EN.json", "r") as file:
    en_queries: List[str] = json.load(file)
with open(f"{DATA_DIR}/dataset/02_queries-SK.json", "r") as file:
    sk_queries: List[str] = json.load(file)
with open(f"{DATA_DIR}/dataset/02_queries-DE.json", "r") as file:
    de_queries: List[str] = json.load(file)

queries_by_language = {
    "en": en_queries,
    "sk": sk_queries,
    "de": de_queries
}

COEFFS = [0.05, 0.10, 0.20, 0.25, 0.30, 1.00]

In [3]:
def set_attr(object: dict, path: str, value, *, append: bool = False):
    keys = path.split(".")
    last_key = keys[-1]

    for key in keys[:-1]:
        if key not in object:
            object[key] = {}

        object = object[key]

    if last_key in object:
        if append:
            object[last_key].append(value)
        else:
            object[last_key] = value
    else:
        if append:
            object[last_key] = [value]
        else:
            object[last_key] = value


def get_attr(object: dict, path: str):
    keys = path.split(".")
    last_key = keys[-1]

    for key in keys[:-1]:
        if key not in object:
            return None

        object = object[key]

    if last_key not in object:
        return None

    return object[last_key]


def plm():
    result = []

    for part in PARTS:
        for lang in LANGUAGES:
            for model in ALL_MODELS:
                result.append((part, lang, model))

    return result


def plmi():
    result = []

    for part, lang, model in plm():
        for idx in range(len(queries_by_language[lang])):
            result.append((part, lang, model, idx))

    return result


def get_id(part: Part, lang: Language, idx: int) -> Id:
    return f"{part}-{lang}-{idx + 1}"


def get_retrieval_path(model: Model, id: Id):
    return f"{DATA_DIR}/retrieval/{model}/{id}.json"


def normalize(maxx, minn, similarity):
    return (similarity - minn) / (maxx - minn)


def get_top_docs(docs, coeff: float):
    if len(docs) == 0:
        return []

    top_doc = docs[0]
    top_similarity = top_doc["similarity"]
    similarity_threshold = top_similarity * (1 - coeff)

    return list(filter(
        lambda doc: doc["similarity"] >= similarity_threshold,
        docs
    ))


def get_ordering_score(docs_1, docs_2):
    score = 0

    l1 = len(docs_1)
    l2 = len(docs_2)

    if min(l1, l2) == 0:
        return 0

    for i in range(min(l1, l2)):
        doc_1 = docs_1[i]
        doc_2 = docs_2[i]

        if doc_1["id"] == doc_2["id"]:
            score += 1 / (i + 1)

    n = sum([1 / (i + 1) for i in range(min(l1, l2))])

    match_score = score / n
    return match_score


def get_intersection_score(docs_1, docs_2):
    def get_doc_id(doc):
        return doc["id"]

    doc_ids_1 = list(map(get_doc_id, docs_1))
    doc_ids_2 = list(map(get_doc_id, docs_2))

    l1 = len(doc_ids_1)
    l2 = len(doc_ids_2)

    if min(l1, l2) == 0:
        return 0

    intersection = set(doc_ids_1).intersection(set(doc_ids_2))
    union = set(doc_ids_1).union(set(doc_ids_2))
    n = len(union)

    return len(intersection) / n

In [4]:
with open(f"{DATA_DIR}/dataset/03_judgements.json", "r") as file:
    judgements_by_query: Dict[str, List[str]] = json.load(file)

JUDGEMENTS = []

for idx, query in enumerate(en_queries):
    relevant_document_ids = judgements_by_query[query]

    JUDGEMENTS.append({
        "model_id": CHAT_GPT,
        "query_id": idx + 1,
        "documents": list(map(
            lambda id: {"id": id},
            relevant_document_ids
        )),
    })

In [5]:
RETRIEVALS = {}

for part, lang, model, idx in plmi():
    RETRIEVALS[(
        part, lang, model, idx
    )] = json.load(open(get_retrieval_path(
        model, get_id(part, lang, idx)
    ), "r"))

In [6]:
def stats(retrievals, get_group_stats_path, group_iterator):
    results = {}

    for key, retrieval in retrievals.items():
        path = get_group_stats_path(key)

        similarities = list(map(
            lambda doc: doc["similarity"],
            retrieval["documents"]
        ))

        if len(similarities) == 0:
            continue

        set_attr(
            results,
            f"{path}.avg",
            numpy.average(similarities),
            append=True
        )

        set_attr(
            results,
            f"{path}.max",
            numpy.max(similarities),
            append=True
        )

        set_attr(
            results,
            f"{path}.min",
            numpy.min(similarities),
            append=True
        )

        set_attr(
            results,
            f"{path}.std",
            numpy.std(similarities),
            append=True
        )

    average = []
    total_maxs = []
    avg_maxs = []
    total_mins = []
    avg_mins = []
    avg_stds = []
    coeffs_from_total = []
    coeffs_from_avg = []

    for group in group_iterator:
        path = get_group_stats_path(group)

        avg_path = f"{path}.avg"
        max_path = f"{path}.max"
        min_path = f"{path}.min"
        std_path = f"{path}.std"

        avg_avg = numpy.max(get_attr(results, avg_path))
        total_max = numpy.max(get_attr(results, max_path))
        avg_max = numpy.average(get_attr(results, max_path))
        total_min = numpy.min(get_attr(results, min_path))
        avg_min = numpy.average(get_attr(results, min_path))
        avg_std = numpy.average(get_attr(results, std_path))
        coeff_from_total = avg_std / total_max
        coeff_from_avg = avg_std / avg_max

        average.append(avg_avg)
        total_maxs.append(total_max)
        avg_maxs.append(avg_max)
        total_mins.append(total_min)
        avg_mins.append(avg_min)
        avg_stds.append(avg_std)
        coeffs_from_total.append(coeff_from_total)
        coeffs_from_avg.append(coeff_from_avg)

    df = pandas.DataFrame({
        "average": average,
        "total_max": total_maxs,
        "avg_max": avg_maxs,
        "total_min": total_mins,
        "avg_min": avg_mins,
        "avg_std": avg_stds,
        "coeff_from_total": coeffs_from_total,
        "coeff_from_avg": coeffs_from_avg
    })

    df.index = pandas.MultiIndex.from_tuples(group_iterator)
    df.columns = pandas.MultiIndex.from_tuples([
        ("", "avg"),
        ("max", "total"), ("max", "average"),
        ("min", "total"), ("min", "average"),
        ("std", "average"),
        ("coeff", "from_total"), ("coeff", "from_average")
    ])

    return df

# Without merging by language

In [7]:
stats(
    RETRIEVALS,
    lambda key: f"{key[0]}.{key[1]}.{key[2]}",
    plm()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,max,max,min,min,std,coeff,coeff
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,avg,total,average,total,average,average,from_total,from_average
page,en,tgrm,0.106532,0.83871,0.302822,0.003448,0.010793,0.029307,0.034943,0.096778
page,en,e5,0.91819,0.988731,0.950456,0.830742,0.857204,0.012766,0.012912,0.013432
page,en,labse,0.676937,0.969995,0.767191,0.44834,0.520494,0.030709,0.031659,0.040028
page,en,gte,0.810895,0.983127,0.906754,0.625348,0.686875,0.026325,0.026776,0.029032
page,sk,tgrm,0.06863,0.592593,0.141298,0.003441,0.009854,0.01547,0.026105,0.109484
page,sk,e5,0.906789,0.982381,0.940184,0.831875,0.850546,0.013728,0.013974,0.014601
page,sk,labse,0.690159,0.951895,0.764757,0.465737,0.547602,0.027838,0.029245,0.036401
page,sk,gte,0.811391,0.960849,0.883686,0.658134,0.698387,0.024193,0.025178,0.027377
page,de,tgrm,0.099177,0.621622,0.180482,0.002967,0.010116,0.018421,0.029634,0.102066
page,de,e5,0.909897,0.983006,0.945173,0.828883,0.850876,0.013456,0.013689,0.014237


In [8]:
normalized_retrievals = {}

for part, lang, model in plm():
    queries = queries_by_language[lang]
    retrievals = []

    for idx in range(len(queries)):
        retrieval = RETRIEVALS[(part, lang, model, idx)]
        retrievals.append(retrieval.copy())

    for retrieval in retrievals:
        retrieval["documents"] = list(filter(
            lambda doc: doc["similarity"] > 0,
            retrieval["documents"]
        ))

    max_similarity = None
    min_similarity = None

    for retrieval in retrievals:
        for doc in retrieval["documents"]:
            similarity = doc["similarity"]

            if max_similarity is None:
                max_similarity = similarity

            if min_similarity is None:
                min_similarity = similarity

            if similarity > max_similarity:
                max_similarity = similarity

            if similarity < min_similarity:
                min_similarity = similarity

    for retrieval in retrievals:
        normalized_retrievals[(
            retrieval["part"],
            retrieval["lang"],
            retrieval["model_id"] if "model_id" in retrieval else PG_TGRM,
            retrieval["query_id"] - 1
        )] = {
            **retrieval,
            "documents": list(map(
                lambda doc: {
                    **doc,
                    "similarity": normalize(
                        max_similarity,
                        min_similarity,
                        doc["similarity"]
                    )
                },
                retrieval["documents"])
            )
        }

In [9]:
stats(
    normalized_retrievals,
    lambda key: f"{key[0]}.{key[1]}.{key[2]}",
    plm()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,max,max,min,min,std,coeff,coeff
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,avg,total,average,total,average,average,from_total,from_average
page,en,tgrm,0.123415,1.0,0.358419,0.0,0.008793,0.035087,0.035087,0.097893
page,en,e5,0.553509,1.0,0.757732,0.0,0.167493,0.080806,0.080806,0.106642
page,en,labse,0.438214,1.0,0.61123,0.0,0.138319,0.058868,0.058868,0.096311
page,en,gte,0.518608,1.0,0.786537,0.0,0.171969,0.073578,0.073578,0.093546
page,sk,tgrm,0.110649,1.0,0.233991,0.0,0.010886,0.026258,0.026258,0.112217
page,sk,e5,0.497748,1.0,0.719629,0.0,0.124056,0.091212,0.091212,0.126749
page,sk,labse,0.461623,1.0,0.615067,0.0,0.168392,0.057262,0.057262,0.093098
page,sk,gte,0.506275,1.0,0.745097,0.0,0.132971,0.079919,0.079919,0.10726
page,de,tgrm,0.155515,1.0,0.286937,0.0,0.011555,0.029776,0.029776,0.103772
page,de,e5,0.525646,1.0,0.754528,0.0,0.142695,0.087309,0.087309,0.115714


In [10]:
def get_retrieval(params, coeff):
    part, lang, model, idx = params

    if model == CHAT_GPT:
        retrieval = JUDGEMENTS[idx]
        top_docs = retrieval["documents"]
    else:
        retrieval = normalized_retrievals[(part, lang, model, idx)]
        top_docs = get_top_docs(retrieval["documents"], coeff)

    length = len(top_docs)

    return top_docs, length

In [11]:
print("part, coeff, pair".ljust(26) +
      "counts".ljust(11) +
      "ordering".ljust(14) +
      "intersection"
      )

print()

for part in PARTS:
    print(f"[{part}]")
    for lang in LANGUAGES:
        print(2*SPACE + f"[{lang}]")

        for coeff in COEFFS:
            print(4*SPACE + "[{:.2f}]".format(coeff))
            queries = queries_by_language[lang]

            for model1, model2 in MODEL_COMBINATIONS:
                lengths_1 = []
                lengths_2 = []
                ordering_scores = []
                intersection_scores = []

                for idx, query in enumerate(queries):
                    docs_1, l1 = get_retrieval(
                        (part, lang, model1, idx), coeff)

                    docs_2, l2 = get_retrieval(
                        (part, lang, model2, idx), coeff)

                    lengths_1.append(l1)
                    lengths_2.append(l2)

                    if not CHAT_GPT in [model1, model2]:
                        ordering_scores.append(
                            get_ordering_score(docs_1, docs_2)
                        )

                    intersection_scores.append(
                        get_intersection_score(docs_1, docs_2)
                    )

                avg_length_1 = numpy.average(lengths_1)
                avg_length_2 = numpy.average(lengths_2)

                if len(ordering_scores):
                    avg_ordering_score = numpy.average(
                        ordering_scores
                    ).round(5)
                else:
                    avg_ordering_score = None

                avg_intersection_score = numpy.average(
                    intersection_scores
                ).round(5)

                print(
                    6*SPACE + f"{model1}/{model2}".ljust(15),
                    "{:.0f}".format(avg_length_1).rjust(5)
                    + "/" +
                    "{:.0f}".format(avg_length_2).ljust(5),
                    3*SPACE +
                    ("{:.5f}".format(
                        avg_ordering_score) if avg_ordering_score is not None else "-".ljust(7)),
                    6*SPACE + "{:.5f}".format(avg_intersection_score)
                )

            print()

    print()

part, coeff, pair         counts     ordering      intersection

[page]
  [en]
    [0.05]
      tgrm/e5             1/2        0.48733       0.46501
      tgrm/labse          1/2        0.46400       0.44802
      tgrm/gte            1/1        0.46333       0.46667
      tgrm/gpt            1/5        -             0.28970
      e5/labse            2/2        0.52964       0.50599
      e5/gte              2/1        0.65303       0.61514
      e5/gpt              2/5        -             0.43197
      labse/gte           2/1        0.52800       0.50696
      labse/gpt           2/5        -             0.36122
      gte/gpt             1/5        -             0.57678

    [0.10]
      tgrm/e5             1/4        0.48163       0.39577
      tgrm/labse          1/3        0.46242       0.41692
      tgrm/gte            1/2        0.46467       0.45109
      tgrm/gpt            1/5        -             0.29675
      e5/labse            4/3        0.50017       0.43139
      e5/gte 

# With merging by language

In [12]:
merged_retrievals = {}

for part in PARTS:
    for model in ALL_MODELS:
        for idx in range(len(queries_by_language["en"])):
            documents = []

            for lang in LANGUAGES:
                retrieval = RETRIEVALS[(part, lang, model, idx)]                
                documents.extend(retrieval["documents"].copy())

            # Sort by similarity, descending
            documents.sort(key=lambda doc: doc["similarity"], reverse=True)

            # Remove duplicates
            duplicate_ids = set()

            def is_duplicate(doc):
                if doc["id"] in duplicate_ids:
                    return True

                duplicate_ids.add(doc["id"])
                return False

            documents = list(filter(
                lambda doc: not is_duplicate(doc),
                documents
            ))

            merged_retrievals[(part, model, idx)] = {
                "part": part,
                "model_id": model,
                "query_id": idx + 1,
                "documents": documents
            }

In [13]:
stats(
    merged_retrievals,
    lambda key: f"{key[0]}.{key[1]}",
    [(part, model) for part in PARTS for model in ALL_MODELS]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,max,max,min,min,std,coeff,coeff
Unnamed: 0_level_1,Unnamed: 1_level_1,avg,total,average,total,average,average,from_total,from_average
page,tgrm,0.108411,0.83871,0.311394,0.006147,0.012706,0.029076,0.034668,0.093375
page,e5,0.92191,0.988731,0.954471,0.848138,0.865144,0.012949,0.013096,0.013566
page,labse,0.694903,0.969995,0.783225,0.492377,0.555844,0.028371,0.029249,0.036224
page,gte,0.813864,0.983127,0.909428,0.659103,0.704108,0.024892,0.025319,0.027371
paragraph,tgrm,0.27549,1.0,0.527923,0.020833,0.043162,0.059096,0.059096,0.111941
paragraph,e5,0.94281,0.996055,0.972647,0.0,0.72937,0.019244,0.01932,0.019785
paragraph,labse,0.745886,0.993652,0.862686,0.0,0.545362,0.038425,0.038671,0.044541
paragraph,gte,0.854049,0.990857,0.933446,0.0,0.589032,0.031176,0.031463,0.033399
sentence,tgrm,0.290764,1.0,0.540548,0.024242,0.053092,0.056295,0.056295,0.104145
sentence,e5,0.946311,0.996055,0.972508,0.0,0.519587,0.031917,0.032043,0.032819


In [14]:
merged_normalized_retrievals = {}
merged_normalization_params = {}

for part in PARTS:
    for model in ALL_MODELS:
        tmp_retrievals = []

        max_similarity = None
        min_similarity = None

        for idx in range(len(queries_by_language["en"])):
            retrieval = merged_retrievals[(part, model, idx)]

            tmp_retrievals.append({
                **retrieval,
                "documents": list(filter(
                    lambda doc: doc["similarity"] > 0,
                    retrieval["documents"]
                ))
            })

        for retrieval in tmp_retrievals:
            for doc in retrieval["documents"]:
                similarity = doc["similarity"]

                if max_similarity is None:
                    max_similarity = similarity

                if min_similarity is None:
                    min_similarity = similarity

                if similarity > max_similarity:
                    max_similarity = similarity

                if similarity < min_similarity:
                    min_similarity = similarity

        merged_normalization_params[(part, model)] = (
            max_similarity,
            min_similarity
        )

        for retrieval in tmp_retrievals:
            merged_normalized_retrievals[(
                part, model, retrieval["query_id"] - 1
            )] = {
                **retrieval,
                "documents": list(map(
                    lambda doc: {
                        **doc,
                        "similarity": normalize(
                            max_similarity,
                            min_similarity,
                            doc["similarity"]
                        )
                    },
                    retrieval["documents"]
                ))
            }

In [15]:
stats(
    merged_normalized_retrievals,
    lambda key: f"{key[0]}.{key[1]}",
    [(part, model) for part in PARTS for model in ALL_MODELS]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,max,max,min,min,std,coeff,coeff
Unnamed: 0_level_1,Unnamed: 1_level_1,avg,total,average,total,average,average,from_total,from_average
page,tgrm,0.122831,1.0,0.366636,0.0,0.007879,0.034924,0.034924,0.095256
page,e5,0.524716,1.0,0.756316,0.0,0.120957,0.092101,0.092101,0.121776
page,labse,0.424033,1.0,0.608955,0.0,0.132882,0.059402,0.059402,0.097547
page,gte,0.477621,1.0,0.772553,0.0,0.138893,0.076822,0.076822,0.099439
paragraph,tgrm,0.260074,1.0,0.517879,0.0,0.022803,0.060354,0.060354,0.11654
paragraph,e5,0.585043,1.0,0.799554,0.0,0.188035,0.090005,0.090005,0.112569
paragraph,labse,0.490092,1.0,0.730469,0.0,0.194362,0.074566,0.074566,0.102079
paragraph,gte,0.546133,1.0,0.809538,0.0,0.138055,0.083493,0.083493,0.103137
sentence,tgrm,0.273144,1.0,0.529133,0.0,0.029567,0.057694,0.057694,0.109035
sentence,e5,0.589744,1.0,0.798041,0.0,0.207331,0.086916,0.086916,0.108912


In [16]:
def get_retrieval(params, coeff):
    part, model, idx = params

    if model == CHAT_GPT:
        retrieval = JUDGEMENTS[idx]
        top_docs = retrieval["documents"]
    else:
        retrieval = merged_normalized_retrievals[(part, model, idx)]
        top_docs = get_top_docs(retrieval["documents"], coeff)

    length = len(top_docs)

    return top_docs, length

In [17]:
print("part, coeff, pair".ljust(26) +
      "counts".ljust(11) +
      "ordering".ljust(14) +
      "intersection"
      )

print()

for part in PARTS:
    print(f"[{part}]")

    for coeff in COEFFS:
        print(2*SPACE + "[{:.2f}]".format(coeff))
        queries = queries_by_language[lang]

        for model1, model2 in MODEL_COMBINATIONS:
            lengths_1 = []
            lengths_2 = []
            ordering_scores = []
            intersection_scores = []

            for idx, query in enumerate(queries):
                docs_1, l1 = get_retrieval(
                    (part, model1, idx), coeff)

                docs_2, l2 = get_retrieval(
                    (part, model2, idx), coeff)

                lengths_1.append(l1)
                lengths_2.append(l2)

                if not CHAT_GPT in [model1, model2]:
                    ordering_scores.append(
                        get_ordering_score(docs_1, docs_2)
                    )

                intersection_scores.append(
                    get_intersection_score(docs_1, docs_2)
                )

            avg_length_1 = numpy.average(lengths_1)
            avg_length_2 = numpy.average(lengths_2)

            if len(ordering_scores):
                avg_ordering_score = numpy.average(
                    ordering_scores
                ).round(5)
            else:
                avg_ordering_score = None

            avg_intersection_score = numpy.average(
                intersection_scores
            ).round(5)

            print(
                6*SPACE + f"{model1}/{model2}".ljust(15),
                "{:.0f}".format(avg_length_1).rjust(5)
                + "/" +
                "{:.0f}".format(avg_length_2).ljust(5),
                3*SPACE +
                ("{:.5f}".format(
                    avg_ordering_score) if avg_ordering_score is not None else "-".ljust(7)),
                6*SPACE + "{:.5f}".format(avg_intersection_score)
            )

        print()

print()

part, coeff, pair         counts     ordering      intersection

[page]
  [0.05]
      tgrm/e5             1/2        0.39133       0.38393
      tgrm/labse          1/2        0.42667       0.42977
      tgrm/gte            1/1        0.49200       0.48097
      tgrm/gpt            1/5        -             0.29809
      e5/labse            2/2        0.48673       0.46954
      e5/gte              2/1        0.53255       0.49511
      e5/gpt              2/5        -             0.37134
      labse/gte           2/1        0.46576       0.47212
      labse/gpt           2/5        -             0.33135
      gte/gpt             1/5        -             0.57855

  [0.10]
      tgrm/e5             2/4        0.38519       0.34828
      tgrm/labse          2/3        0.42352       0.39379
      tgrm/gte            2/1        0.49400       0.46255
      tgrm/gpt            2/5        -             0.30539
      e5/labse            4/3        0.45935       0.41686
      e5/gte            