In [1]:
import pandas as pd

In [2]:
from calculate_alignment import calculate_alignment_score

In [3]:
# Organisms to compare. Each name is also used as the file stem when the
# data paths are built (data/<name>.txt) and as a row/column label of the
# similarity matrix, so the order here fixes the matrix layout.
# NOTE(review): "moscito" looks like a misspelling of "mosquito", but it has to
# match the file name on disk -- rename the data file together with this entry.
ORGANISM_NAMES = [
    "cat",
    "cattle",
    "fly",
    "krill",
    "moscito",
    "scorpion",
]

In [4]:
# Input path for every organism, in the same order as ORGANISM_NAMES, so that
# ORGANISM_FILES[i] always belongs to ORGANISM_NAMES[i].
ORGANISM_FILES = [f"data/{name}.txt" for name in ORGANISM_NAMES]

# Gzip

In [5]:
# For each pair of organisms, calculate the alignment score.
# Only the lower triangle of the matrix is filled.

def fill_similarity_matrix(method: str = "gzip") -> pd.DataFrame:
    """Fill the similarity matrix with the pairwise alignment scores.

    Every unordered pair of distinct organisms from ``ORGANISM_NAMES`` is
    scored exactly once with ``calculate_alignment_score`` and the score is
    written to the strictly lower triangle of the matrix (row = the organism
    that appears later in ``ORGANISM_NAMES``, column = the earlier one).

    The diagonal and the upper triangle are intentionally left as NaN: the
    matrix is treated as symmetric, so the mirrored cells would only repeat
    the lower triangle.
    NOTE(review): symmetry of ``calculate_alignment_score`` is assumed here,
    it is not checked -- confirm against its implementation.

    Args:
        method (str, optional): Passed through unchanged as the ``method``
            keyword of ``calculate_alignment_score``. Defaults to "gzip".

    Returns:
        pd.DataFrame: Square frame whose index and columns are both
            ``ORGANISM_NAMES``; only cells below the diagonal hold scores.
    """
    df = pd.DataFrame(index=ORGANISM_NAMES, columns=ORGANISM_NAMES)
    organism_count = len(ORGANISM_NAMES)
    for i in range(organism_count):
        # Starting j at i + 1 is what restricts the work to pairs with i < j;
        # no extra diagonal / upper-triangle guard is needed inside the loop.
        for j in range(i + 1, organism_count):
            # Row is the later organism (j), column the earlier one (i), which
            # places the score below the diagonal.
            df.loc[ORGANISM_NAMES[j], ORGANISM_NAMES[i]] = calculate_alignment_score(
                ORGANISM_FILES[i], ORGANISM_FILES[j], method=method
            )
    return df

In [6]:
# Score every organism pair with the "gzip" method; the bare name on the last
# line makes the notebook render the resulting lower-triangular matrix.
df_gzip = fill_similarity_matrix(method="gzip")
df_gzip

Unnamed: 0,cat,cattle,fly,krill,moscito,scorpion
cat,,,,,,
cattle,0.891014,,,,,
fly,0.979279,0.976492,,,,
krill,0.970512,0.969861,0.94777,,,
moscito,0.9737,0.977296,0.912908,0.946119,,
scorpion,0.969117,0.968455,0.956007,0.963254,0.95343,


In [7]:
# Same pairwise comparison as above, but scored with the "7zip" method, so the
# two result tables can be compared side by side.
df_7zip = fill_similarity_matrix(method="7zip")
df_7zip

Unnamed: 0,cat,cattle,fly,krill,moscito,scorpion
cat,,,,,,
cattle,0.620084,,,,,
fly,0.932218,0.932603,,,,
krill,0.930544,0.930076,0.850346,,,
moscito,0.935565,0.942713,0.774674,0.860727,,
scorpion,0.929707,0.935973,0.913408,0.920415,0.903042,
