In [None]:
import torch, sys, os, math, numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [None]:
# Directory holding the checkpoint files.
path = "/home/user/llama/llama-2-7b"
# Listing the directory raises early if the path does not exist.
os.listdir(path)
# Full path of the checkpoint shard loaded in the next cell.
pth = f"{path}/consolidated.00.pth"
pth

In [None]:
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
# Every tensor in the checkpoint is mapped onto the CUDA device while loading.
state_dict = torch.load(pth, map_location="cuda")
state_dict.keys()

In [None]:
def angles_in_unit_vectors_of_matrix(matrix: torch.Tensor) -> torch.Tensor:
    """Compute the pairwise angles between the columns of a matrix.

    Every column is scaled to unit length, the Gram matrix of the scaled
    columns is formed, and the arccosine of each entry is taken.

    Args:
        matrix (torch.Tensor): Matrix of shape (N, M); its M columns are the
            vectors being compared.

    Returns:
        torch.Tensor: Tensor of shape (M, M) whose entry (i, j) is the angle in
            radians between column i and column j.

    Note:
        There is no guard for zero-norm columns; the normalisation divides by
        zero for them.
    """
    # Scale each column by its own norm (broadcast across the rows).
    unit_columns = matrix / matrix.norm(dim=0)

    # Gram matrix: entry (i, j) is the dot product of unit columns i and j.
    gram = unit_columns.t() @ unit_columns

    # Rounding can push values slightly outside [-1, 1], where acos is
    # undefined, so clamp before converting cosines to angles.
    return gram.clamp(min=-1.0, max=1.0).acos()


def cosines_multidot(matrix: torch.Tensor) -> torch.Tensor:
    """Compute the pairwise cosine similarities between the columns of a matrix.

    Every column is scaled to unit length and the Gram matrix of the scaled
    columns is returned, clamped to the valid cosine range.

    Args:
        matrix (torch.Tensor): Matrix of shape (N, M); its M columns are the
            vectors being compared.

    Returns:
        torch.Tensor: Tensor of shape (M, M) whose entry (i, j) is the cosine
            of the angle between column i and column j, within [-1, 1].

    Note:
        There is no guard for zero-norm columns; the normalisation divides by
        zero for them.
    """
    # Scale each column by its own norm (broadcast across the rows).
    unit_columns = matrix / matrix.norm(dim=0)

    # Gram matrix: entry (i, j) is the dot product of unit columns i and j.
    gram = unit_columns.t() @ unit_columns

    # Rounding can push values slightly outside [-1, 1]; clamp them back.
    return gram.clamp(min=-1.0, max=1.0)

In [None]:
# Pairwise column angles for a single weight matrix, computed in float32 on
# the CUDA device.
angles = angles_in_unit_vectors_of_matrix(
    state_dict["layers.0.attention.wq.weight"].to(
        device="cuda", dtype=torch.float32
    )
)
angles

In [None]:
# Pairwise column cosines for the same weight matrix, computed in float32 on
# the CUDA device.
cosines = cosines_multidot(
    state_dict["layers.0.attention.wq.weight"].to(
        device="cuda", dtype=torch.float32
    )
)
cosines

In [None]:
import torch


def upper_triangle(symmetric_matrix, diagonal=0):
    """
    Extracts the upper triangle of a matrix as a flat vector.

    Elements are selected by position, so values that happen to be exactly
    zero inside the upper triangle are kept. (Filtering on ``!= 0`` would drop
    them and make the output length depend on the data.)

    Args:
        symmetric_matrix (torch.Tensor): Tensor with at least two dimensions.
            The last two dimensions are treated as the matrix; any leading
            dimensions are treated as a batch.
        diagonal (int): Diagonal to start from, with the same meaning as in
            ``torch.triu``: 0 (default) includes the main diagonal, 1 excludes
            it, negative values include diagonals below the main one.

    Returns:
        torch.Tensor: A 1D tensor with the selected elements in row-major
            order (batch entries, if any, one after another).

    Raises:
        RuntimeError: If the input has fewer than two dimensions.
    """
    if symmetric_matrix.dim() < 2:
        raise RuntimeError(
            "upper_triangle: input tensor must have at least 2 dimensions"
        )

    n_rows, n_cols = symmetric_matrix.shape[-2:]

    # Row/column coordinates of the upper triangle, created on the same device
    # as the input so that they can be used directly for indexing.
    rows, cols = torch.triu_indices(
        n_rows, n_cols, offset=diagonal, device=symmetric_matrix.device
    )

    # Gather by position and flatten any batch dimensions into one vector.
    return symmetric_matrix[..., rows, cols].reshape(-1)

In [None]:
# Flattened upper triangle of the pairwise column angles for every per-layer
# weight matrix, keyed by the parameter name.
angledict = {}
for k, weight in state_dict.items():
    if not (k.startswith("layers.") and k.endswith(".weight")):
        continue
    if weight.dim() != 2:
        # Pairwise column angles only make sense for matrices. A 1-D weight
        # (presumably the normalisation weights in this checkpoint) would
        # produce a 0-dim result that cannot be passed through torch.triu.
        continue
    print(k)
    angledict[k] = upper_triangle(
        angles_in_unit_vectors_of_matrix(
            weight.to("cuda", dtype=torch.float32)
        )
    )

In [None]:
# Display the collected angle vectors, keyed by state_dict parameter name.
angledict

In [None]:
# Float32 CPU copies of the single-matrix results computed earlier.
cpu_angles = angles.to(device="cpu", dtype=torch.float32)
cpu_cosines = cosines.to(device="cpu", dtype=torch.float32)


def plot_single(arr, ax, bins=100):
    """Draw a histogram of the values of a tensor on the given axes.

    Bars are placed at the centre of each bin in data coordinates, so the
    x-axis tick labels correspond to the actual values without any index to
    value conversion.

    Args:
        arr (torch.Tensor): Tensor of any shape; all of its elements are
            histogrammed together. It is converted to float32 on the CPU.
        ax: Matplotlib axes object to draw on (its ``bar`` method and
            ``xaxis.set_major_formatter`` are used).
        bins (int): Number of equal-width bins between the minimum and the
            maximum of ``arr``. Defaults to 100.

    Returns:
        None. The bars and the tick formatter are applied to ``ax``.
    """
    cpu_arr = arr.to("cpu", dtype=torch.float32)

    # Take the range from the converted tensor so that it matches the values
    # that are actually histogrammed.
    a_min, a_max = cpu_arr.min().item(), cpu_arr.max().item()
    if a_min == a_max:
        # A constant tensor has a zero-width range; widen it by one on each
        # side so that the bins have a non-zero width.
        a_min -= 1.0
        a_max += 1.0

    # Pass the range explicitly so the bin positions computed below are
    # guaranteed to agree with the counts.
    hist = torch.histc(cpu_arr, bins=bins, min=a_min, max=a_max)

    bin_width = (a_max - a_min) / bins
    centers = [a_min + (i + 0.5) * bin_width for i in range(bins)]

    # 0.8 of a bin leaves a small gap between neighbouring bars.
    ax.bar(centers, hist.tolist(), width=0.8 * bin_width)

    # Two-decimal tick labels; x is already a data value.
    ax.xaxis.set_major_formatter(
        ticker.FuncFormatter(lambda x, pos: "{:.2f}".format(x))
    )


# plot_single needs both the data and the axes to draw on; plot the two
# CPU copies prepared at the top of this cell side by side.
fig, (ax_angles, ax_cosines) = plt.subplots(1, 2, figsize=(12, 4))

plot_single(cpu_angles, ax_angles)
ax_angles.set_title("layers.0.attention.wq.weight: column angles")
ax_angles.set_xlabel("angle (radians)")
ax_angles.set_ylabel("count")

plot_single(cpu_cosines, ax_cosines)
ax_cosines.set_title("layers.0.attention.wq.weight: column cosines")
ax_cosines.set_xlabel("cosine")
ax_cosines.set_ylabel("count")

plt.show()