In [2]:
import json
import dotenv
from fastcore.all import Path
import os
from collections import defaultdict
import numpy as np

# Load variables from a local .env file (if present) into the process environment.
dotenv.load_dotenv()

# Fail fast with an actionable message: os.getenv returns None for an unset
# variable, and Path(None) would otherwise raise an opaque TypeError.
_results_dir = os.getenv("RESULTS_DIR")
if _results_dir is None:
    raise RuntimeError(
        "RESULTS_DIR is not set; export it or define it in your .env file."
    )
RESULTS_DIR = Path(_results_dir)

# Prefix shared by every result file. The "/" means the files are looked up
# inside a "meta-llama" sub-directory of RESULTS_DIR.
MODEL_NAME = "meta-llama/Llama-3.2-1B"
dataset_names = ["bookcorpus", "pile", "wikitext"]
num_partitions = 5

# results[dataset_name][layer] -> list of intrinsic-dimension values,
# one entry per partition file that was read.
results = defaultdict(lambda: defaultdict(list))

# Collect results across partitions
for dataset_name in dataset_names:
    for partition in range(num_partitions):
        results_current = (
            RESULTS_DIR / f"{MODEL_NAME}_{dataset_name}_{partition}.json"
        ).read_json()
        # Layer keys come straight from the JSON object, so they are strings.
        for layer, id_value in results_current["intrinsic_dimensions"].items():
            results[dataset_name][layer].append(id_value)

# Per-dataset, per-layer summary: stats[dataset_name][layer] -> {"mean", "std"}.
# NOTE(review): np.std defaults to ddof=0 (population std). With only
# `num_partitions` samples per layer, confirm whether ddof=1 is wanted.
stats = defaultdict(dict)
for dataset_name in dataset_names:
    for layer in results[dataset_name]:
        values = np.array(results[dataset_name][layer])
        stats[dataset_name][layer] = {"mean": np.mean(values), "std": np.std(values)}


In [5]:
# Display the nested summary: stats[dataset][layer] -> {"mean": ..., "std": ...}.
stats

defaultdict(dict,
            {'bookcorpus': {'0': {'mean': 10.457163518721302,
               'std': 4.358676151851317},
              '1': {'mean': 14.226724547306048, 'std': 3.891355517920039},
              '2': {'mean': 17.313489401415637, 'std': 6.898752476583827},
              '3': {'mean': 20.0172397732011, 'std': 7.409070500935541},
              '4': {'mean': 21.390270428717105, 'std': 5.37721280703111},
              '5': {'mean': 21.363939477859326, 'std': 5.634595075348991},
              '6': {'mean': 21.72252237713206, 'std': 3.982981788442842},
              '7': {'mean': 20.590806494698718, 'std': 1.9607582071213394},
              '8': {'mean': 19.73455745059297, 'std': 3.258709655098779},
              '9': {'mean': 19.171835935900937, 'std': 2.3373919618395087},
              '10': {'mean': 16.892506643415974, 'std': 0.6167361717659826},
              '11': {'mean': 16.6269253171786, 'std': 0.9062336157305418},
              '12': {'mean': 16.900083652728902, 'std'

In [7]:
import pandas as pd

# Flatten the nested stats mapping into one record per (dataset, layer) pair.
# The stats dict is keyed by layer as a string, while the "layer" column keeps
# the integer index (0-15).
rows = [
    {
        "dataset": dataset,
        "layer": layer,
        "mean": stats[dataset][str(layer)]["mean"],
        "std": stats[dataset][str(layer)]["std"],
    }
    for dataset in ["bookcorpus", "pile", "wikitext"]
    for layer in range(16)
]

# Build the frame in a single call and preview the first rows.
df = pd.DataFrame(rows)
df.head()


Unnamed: 0,dataset,layer,mean,std
0,bookcorpus,0,10.457164,4.358676
1,bookcorpus,1,14.226725,3.891356
2,bookcorpus,2,17.313489,6.898752
3,bookcorpus,3,20.01724,7.409071
4,bookcorpus,4,21.39027,5.377213
