In [1]:
import pandas as pd

In [2]:
from calculate_alignment import calculate_alignment_score

In [3]:
# Organisms to compare; this order fixes the row/column order of the
# similarity matrices built from it.
ORGANISM_NAMES = ["cat", "cattle", "mouse", "rat", "wolf", "sheep"]

In [4]:
# One input path per organism, index-aligned with ORGANISM_NAMES
# (ORGANISM_FILES[i] is the file for ORGANISM_NAMES[i]).
ORGANISM_FILES = [f"data/{name}.txt" for name in ORGANISM_NAMES]

# Gzip

In [5]:
# For each pair of organisms, calculate the alignment score.
# Only the lower triangular part of the matrix is filled.


def fill_similarity_matrix(method: str = "gzip") -> pd.DataFrame:
    """Build the pairwise similarity matrix for all organisms in ORGANISM_NAMES.

    The diagonal is set to 1.0 directly (the scorer is not called for an
    organism against itself). Each unordered pair is scored exactly once and
    the result is stored in the lower triangle; the upper triangle is left as
    NaN on purpose, since the matrix is treated as symmetric and the values
    would only be duplicated.

    Args:
        method (str, optional): The method passed through to
            ``calculate_alignment_score``. Defaults to "gzip".

    Returns:
        pd.DataFrame: A float-typed square frame indexed and labelled by
            ORGANISM_NAMES, with scores in the lower triangle, 1.0 on the
            diagonal and NaN in the upper triangle.
    """
    # Explicit float dtype: without it the empty frame is object-typed and
    # stays that way after filling, which breaks numeric operations downstream.
    df = pd.DataFrame(index=ORGANISM_NAMES, columns=ORGANISM_NAMES, dtype=float)

    for i, name1 in enumerate(ORGANISM_NAMES):
        # Self-similarity is fixed at 1.0 without running the scorer.
        df.loc[name1, name1] = 1.0

        # Only pairs with j > i are scored, so every pair is computed once.
        for j in range(i + 1, len(ORGANISM_NAMES)):
            name2 = ORGANISM_NAMES[j]
            result = calculate_alignment_score(
                file_a=ORGANISM_FILES[i], file_b=ORGANISM_FILES[j], method=method
            )
            # Row j, column i with j > i: this is the lower triangle.
            df.loc[name2, name1] = result
    return df

In [6]:
# Similarity matrix computed with the "gzip" method; the bare name on the
# last line renders the frame as the cell output.
df_gzip = fill_similarity_matrix(method="gzip")
df_gzip

16437
16338
32775
16437
16300
32737
16437
16310
32747


16437
16757
33194
16437
16616
33053
16338
16300
32638
16338
16310
32648
16338
16757
33095
16338
16616
32954
16300
16310
32610
16300
16757
33057
16300
16616
32916
16310
16757
33067
16310
16616
32926
16757
16616
33373


Unnamed: 0,cat,cattle,mouse,rat,wolf,sheep
cat,1.0,,,,,
cattle,0.891014,1.0,,,,
mouse,0.907352,0.911593,1.0,,,
rat,0.902371,0.899739,0.832052,1.0,,
wolf,0.867297,0.889939,0.904706,0.900965,1.0,
sheep,0.891014,0.784944,0.903954,0.894968,0.892892,1.0


In [7]:
# Same matrix computed with the "7zip" method, for comparison with df_gzip;
# the bare name on the last line renders the frame as the cell output.
df_7zip = fill_similarity_matrix(method="7zip")
df_7zip

16437
16338
32775
16437
16300
32737
16437
16310
32747
16437
16757


33194
16437
16616
33053
16338
16300
32638
16338
16310
32648
16338
16757
33095
16338
16616
32954
16300
16310
32610
16300
16757
33057
16300
16616
32916
16310
16757
33067
16310
16616
32926
16757
16616
33373


Unnamed: 0,cat,cattle,mouse,rat,wolf,sheep
cat,1.0,,,,,
cattle,0.620084,1.0,,,,
mouse,0.761506,0.771693,1.0,,,
rat,0.743096,0.763269,0.515718,1.0,,
wolf,0.583887,0.651163,0.76412,0.746678,1.0,
sheep,0.654393,0.433978,0.726661,0.727502,0.655316,1.0


In [8]:
# TODO: some discussion here of the results