## Evaluation of Words as Frames

The Frame Representation Hypothesis assumes that words are naturally represented as frames.

One way to verify this claim is to check whether the matrix representation of each word is naturally full-rank. We measure this with the relative rank: the ratio of the matrix rank to the number of vectors in the matrix. A relative rank of 1 means the word's matrix is full-rank.

In this experiment, we also show that words are close to orthogonal. This is only an approximation: the longer a word is (the higher its token count), the less likely its vectors are to be orthogonal. However, since most words have a low token count, the average word orthogonality is quite high, which is a good indicator that words are naturally represented as orthogonal frames.

In [1]:
import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from frames.nlp.synsets import SupportedLanguages
from frames.representations import FrameUnembeddingRepresentation
from frames.utils.memory import gc_cuda
from frames.utils.settings import load_models

# Use seaborn's "whitegrid" style for the figures drawn in this notebook.
sns.set_style("whitegrid")

In [2]:
# Model configurations to evaluate. Each row is later expanded into keyword
# arguments for get_tokenization_data.
MODELS = load_models()

# x-axis: dataframe column holding the token count of each word, and the
# largest token count requested from the dataset.
X, X_MAX = "token count", 16

# Grouping columns: coarse (model family) and fine (individual model).
HUE1, HUE2 = "Model Family", "Model"

# y-axis: column holding the relative rank of each word's matrix.
Y = "Relative Rank (Matrix Rank / Num Tokens)"

In [None]:
def get_tokenization_data(**kwargs):
    """Build the per-word token-count / relative-rank table for one model.

    Args:
        **kwargs: Forwarded unchanged to
            ``FrameUnembeddingRepresentation.from_model_id``.

    Returns:
        A pandas DataFrame of the words with at most ``X_MAX`` tokens,
        with three extra columns: ``HUE1`` (model family), ``HUE2``
        (model name) and ``Y`` (relative rank computed from the
        ``"tokens"`` column).
    """
    # Everything that touches the model runs inside gc_cuda().
    # NOTE(review): presumably this releases GPU memory between models;
    # confirm against frames.utils.memory.
    with gc_cuda():
        representation = FrameUnembeddingRepresentation.from_model_id(
            language_codes=SupportedLanguages.Llama,
            synsets_kwargs=dict(variations=None),
            **kwargs,
        )
        model = representation.model

        # Constant columns identifying which model produced the rows.
        tags = {HUE1: model.family, HUE2: str(model)}

        words = representation.data.get_dataframe(
            model.tokenizer, max_token_count=X_MAX
        )
        frame = words.assign(**tags).to_pandas()

        # Relative rank is computed in batches of 4096 words, then moved to
        # host memory so it can be stored in the pandas frame.
        relative_rank = representation.compute_relative_rank(
            frame["tokens"], batch_size=1 << 12
        )
        frame[Y] = relative_rank.cpu().numpy()
        return frame


# One kwargs dict per row of MODELS; each describes a model to evaluate.
model_configs = list(MODELS.to_dict(orient="index").values())

# Evaluate the models one at a time and stack the per-model tables.
per_model_frames = [get_tokenization_data(**config) for config in model_configs]
df = pd.concat(per_model_frames)

df.shape

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

[32m2024-12-06 14:38:27.821[0m | [1mINFO    [0m | [36mframes.models.hf.base[0m:[36m__init__[0m:[36m88[0m - [1mLoaded model: hugging-quants/Meta-Llama-3.1-70B-BNB-NF4-BF16[0m
'
'
'
'
'
'
'
'
'
'
'
'
'
'
'
'
'
'
'
'
'
'
'
  return bound(*args, **kwds)
  0%|          | 0/136 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.91 GiB. GPU 1 has a total capacity of 23.59 GiB of which 1.76 GiB is free. Including non-PyTorch memory, this process has 21.60 GiB memory in use. Of the allocated memory 21.28 GiB is allocated by PyTorch, and 71.24 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 

In [None]:
# Histogram of token counts per model family, with each family's 75th
# percentile marked by a dashed vertical line in the family's own colour.

# Keep words below the token-count cut-off and count each lemma once per family.
df_hist = df[df[X] < X_MAX - 1].drop_duplicates(["lemma", HUE1])

df_hist.to_pickle("resources/01_token_count_per_model_family.pkl")

# NOTE(review): the percentiles are computed on the full `df`, not on the
# filtered and de-duplicated `df_hist` that is plotted - confirm this is intended.
descriptive_stats = df.groupby(HUE1)[X].describe()
model_families = list(descriptive_stats.index)

# A single explicit family -> colour mapping shared by the bars, the
# percentile lines and the legend, so all three always agree. Previously the
# bars used the "colorblind" palette while the lines and legend used the
# default palette, and the hue order of the bars was not pinned.
family_colors = dict(
    zip(
        model_families,
        sns.color_palette("colorblind", n_colors=len(model_families)),
    )
)

g = sns.displot(
    df_hist,
    x=X,
    hue=HUE1,
    hue_order=model_families,
    kind="hist",
    binwidth=1,
    shrink=1,
    palette=family_colors,
    multiple="dodge",
    height=5,
    aspect=1.1,
    facet_kws=dict(legend_out=False),
)

for model_family, color in family_colors.items():
    percentile_75 = descriptive_stats.loc[model_family, "75%"]

    # Dashed vertical line at this family's 75th percentile.
    g.ax.axvline(percentile_75, color=color, linestyle="--", linewidth=2)

# Legend: one coloured marker per family plus a generic dashed-line entry.
legend_data = {
    model_family: mlines.Line2D([], [], color=color, marker="o", linestyle="")
    for model_family, color in family_colors.items()
}
legend_data["75% Percentile"] = mlines.Line2D(
    [], [], color="black", linestyle="--", linewidth=1.5, label="75% Percentile"
)

g.add_legend(legend_data=legend_data, title=HUE1)

plt.xticks(range(1, X_MAX - 1))

plt.savefig(
    "resources/01_token_count_per_model_family.png", dpi=300, bbox_inches="tight"
)

plt.show()

In [None]:
# Relative rank as a function of token count, one line per model.

# Count each lemma once per model, then average the relative rank within
# every (lemma, model, token count) group.
unique_words = df.drop_duplicates(["lemma", HUE2])
df_lineplot = unique_words.groupby(["lemma", HUE2, X])[Y].mean().reset_index()

df_lineplot.to_pickle("resources/02_tokenization_frames.pkl")

# Colour and marker style both encode the model; lines are kept solid.
lineplot_options = dict(
    x=X,
    y=Y,
    hue=HUE2,
    style=HUE2,
    markers=True,
    dashes=False,
    palette="colorblind",
)
sns.lineplot(df_lineplot, **lineplot_options)

plt.xticks(range(1, X_MAX))

plt.savefig("resources/02_tokenization_frames.png", dpi=300, bbox_inches="tight")

plt.show()