In [158]:
import pandas as pd
import neptune
import plotly.express as px
import plotly.graph_objects as go

In [168]:
from typing import List


# Open the Neptune project in read-only mode (the notebook only fetches run tables).
# SECURITY: never commit an API token to the notebook. With `api_token` omitted,
# the Neptune client reads the NEPTUNE_API_TOKEN environment variable instead,
# so export it before starting the kernel. A token that was previously
# hardcoded in this cell must be treated as leaked and revoked.
project = neptune.init_project(
    project="pmtest/llm-random",
    mode="read-only",
)
# Run-table fields passed as `columns=` to every `fetch_runs_table` call below.
columns = [
    "sys/tags",
    "loss_interval/100",
    "args/learning_rate",
    "sys/name",
    "args/grad_modif_params",
    "step",
    "sys/id",
]

# Tag values recognised as a placement; used to pick the placement out of a
# run's tags and to produce one figure per placement.
placements = {
    "post_attn_and_ff",
    "post_norm",
    "post_add",
    "all",
}

def rename_to_common(df: pd.DataFrame):
    """Return a new frame with the long Neptune field names shortened.

    ``loss_interval/100`` becomes ``loss``, ``args/learning_rate`` becomes
    ``lr`` and ``args/grad_modif_params`` becomes ``grad_modif_params``.
    All other columns keep their names; ``df`` itself is not modified.
    """
    short_names = {
        "loss_interval/100": "loss",
        "args/learning_rate": "lr",
        "args/grad_modif_params": "grad_modif_params",
    }
    return df.rename(columns=short_names)


def infere_layer_type(tags: str):
    """Classify a run by its comma-separated tag string.

    Returns ``"baseline"`` when one of the tags is exactly ``true_baseline``,
    otherwise ``"regular"``.
    """
    tag_list = tags.split(',')
    return "baseline" if "true_baseline" in tag_list else "regular"

def infere_placement_type(tags: str):
    """Return the first tag in the comma-separated ``tags`` that is a known placement.

    A tag counts as a placement when it is a member of the module-level
    ``placements`` collection. Returns ``None`` when no tag matches.
    """
    return next((tag for tag in tags.split(',') if tag in placements), None)


def _infere_float_param(grad_modif_params, name: str):
    """Find ``name`` in a ``key=val,key=val`` string and return its value as a float.

    Returns ``None`` when the input is not a string, or when no entry with
    that key exists. Raises ``ValueError`` if the key is present but its value
    cannot be parsed by ``float``.
    """
    # Missing cells in a DataFrame column arrive as NaN/None rather than str;
    # treat them as "parameter not set" instead of failing on `.split`.
    if not isinstance(grad_modif_params, str):
        return None
    for entry in grad_modif_params.split(','):
        key, sep, val = entry.partition('=')
        # Entries without '=' (including the empty string) are skipped rather
        # than breaking tuple unpacking.
        if sep and key.strip() == name:
            return float(val)
    return None


def infere_c(grad_modif_params: str):
    """Extract the ``c`` value from a ``key=val,...`` parameter string.

    Returns the value as a float, or ``None`` if ``c`` is absent or the input
    is not a string (e.g. NaN).
    """
    return _infere_float_param(grad_modif_params, "c")


def infere_eps(grad_modif_params: str):
    """Extract the ``eps`` value from a ``key=val,...`` parameter string.

    Returns the value as a float, or ``None`` if ``eps`` is absent or the
    input is not a string (e.g. NaN).
    """
    return _infere_float_param(grad_modif_params, "eps")

https://app.neptune.ai/pmtest/llm-random/


#### Baseline loss vs lr categorised by eps, total_steps

In [160]:
# Baseline runs: fetch, shorten column names, keep only runs that logged
# grad_modif_params, then derive `eps` and a per-(eps, step) category label.
# Built as a single non-mutating chain so no column is assigned onto a
# filtered slice (avoids SettingWithCopyWarning) and re-running the cell is
# idempotent.
baseline_df = (
    rename_to_common(
        project.fetch_runs_table(tag="true_baseline", columns=columns).to_pandas()
    )
    .dropna(subset=['grad_modif_params'])
    .assign(eps=lambda d: d['grad_modif_params'].apply(infere_eps))
    .assign(category=lambda d: d.apply(lambda x: f"eps_{x['eps']}, step_{x['step']}", axis=1))
    .sort_values('lr')
)

In [161]:
# One line per (eps, step) category of the baseline runs, on log-log axes.
fig = px.line(
    baseline_df,
    x="lr",
    y="loss",
    title="c vs loss",
    log_x=True,
    log_y=True,
    color='category',
    markers=True,
    range_y=[3.8, 7.2],
)

# The layout title set here replaces the one passed to px.line above.
fig.update_layout(
    title="Baseline Loss vs LR. Categorised by eps, total_steps",
    yaxis={'showexponent': 'all', 'exponentformat': 'power'},
    xaxis={'showexponent': 'all', 'exponentformat': 'power'},
)
fig.show()

# Short Experiments (2k steps)

#### Loss vs LR categorised by baseline and sanity_checks

In [162]:
# Sanity-check runs by owner "szysad" queried with the tags "c_0" and
# "std_v1_c_lr_grid_placement_short"; adds eps, placement and a category
# label, then orders the rows by learning rate.
sanity_check_df = (
    rename_to_common(
        project.fetch_runs_table(
            owner="szysad",
            tag=["c_0", "std_v1_c_lr_grid_placement_short"],
            columns=columns,
        ).to_pandas()
    )
    .assign(
        eps=lambda d: d['grad_modif_params'].apply(infere_eps),
        placement=lambda d: d['sys/tags'].apply(infere_placement_type),
    )
    .assign(category=lambda d: d.apply(lambda x: f"eps_{x['eps']}, step_{x['step']}", axis=1))
    .sort_values('lr')
)

In [163]:
# Baseline runs whose `step` is 2000; drawn as the reference curve here and
# reused by the per-placement figures further down.
baseline_step_2k = baseline_df[baseline_df['step'] == 2000]

# One line per placement for the sanity-check runs, on log-log axes.
fig = px.line(
    sanity_check_df.sort_values('lr'),
    x="lr",
    y="loss",
    title="c vs loss",
    log_x=True,
    log_y=True,
    color='placement',
    markers=True,
)

# Overlay the baseline as a dashed black line.
fig.add_trace(
    go.Scatter(
        x=baseline_step_2k['lr'],
        y=baseline_step_2k['loss'],
        name='baseline',
        line={'color': 'black', 'width': 2, 'dash': 'dash'},
    )
)

# The layout title set here replaces the one passed to px.line above.
fig.update_layout(
    title="Sanity check Loss vs LR categorised by placement. With baseline",
    yaxis={'showexponent': 'all', 'exponentformat': 'power'},
    xaxis={'showexponent': 'all', 'exponentformat': 'power'},
)
fig.show()

#### Loss vs LR categorised by c for each placement (short run, v1 std norm)

In [164]:
# Runs tagged "std_v1_c_lr_grid_placement_short", with layer type, c and
# placement derived from the tags / grad_modif_params, ordered by learning rate.
std_v1_df = (
    rename_to_common(
        project.fetch_runs_table(
            tag="std_v1_c_lr_grid_placement_short",
            columns=columns,
        ).to_pandas()
    )
    .assign(
        layer_type=lambda d: d['sys/tags'].apply(infere_layer_type),
        c=lambda d: d['grad_modif_params'].apply(infere_c),
        placement=lambda d: d['sys/tags'].apply(infere_placement_type),
    )
    .sort_values('lr')
)

In [165]:
# `placements` is a set, so its raw iteration order (and therefore the figure
# order and the "(i/N)" numbering) can differ between kernel restarts.
# Sorting makes the output reproducible; N is derived instead of hardcoded.
ordered_placements = sorted(placements)
for i, p in enumerate(ordered_placements, start=1):
    placement_df = std_v1_df[std_v1_df['placement'] == p]
    # One line per value of c for this placement, on log-log axes.
    fig = px.line(placement_df, x="lr", y="loss", log_x=True, log_y=True, color='c', markers=True)
    # Overlay the 2000-step baseline as a dashed black reference line.
    fig.add_trace(go.Scatter(x=baseline_step_2k['lr'], y=baseline_step_2k['loss'], name='baseline',
                            line=dict(color='black', width=2, dash='dash')))
    fig.update_layout(
        title=f"({i}/{len(ordered_placements)}) Loss vs Learning Rate categorized by c, for '{p}' normalization. With baseline",
        yaxis = dict(
            showexponent = 'all',
            exponentformat = 'power'
        ),
        xaxis = dict(
            showexponent = 'all',
            exponentformat = 'power'
        )
    )
    fig.show()

# Impact of grad normalization on gradient statistics

In [None]:
# y axis is mean of std of norms of raw gradients
# x axis is learning rate
# color is c
# we do this for each placement

# Long Experiments (16k steps)

In [166]:
# Runs tagged "post_add_c_lr_grid_long": derive layer type and c from the raw
# Neptune fields, then shorten the loss / lr / id column names.
table_long = (
    project.fetch_runs_table(tag="post_add_c_lr_grid_long", columns=columns)
    .to_pandas()
    .assign(
        layer_type=lambda d: d['sys/tags'].apply(infere_layer_type),
        c=lambda d: d['args/grad_modif_params'].apply(infere_c),
    )
    .rename(columns={
        'loss_interval/100': 'loss',
        'args/learning_rate': 'lr',
        'sys/id': 'id',
    })
)

# Split into the baseline reference rows and all remaining runs.
baseline_long = table_long[table_long['layer_type'] == 'baseline'].sort_values(by='lr')
rest_long = table_long[table_long['layer_type'] != 'baseline'].sort_values(by=['lr', 'c'])


In [167]:
# One line per value of c for the non-baseline long runs, on log-log axes.
# The title is set once in update_layout below, so no placeholder title is
# passed to px.line.
fig = px.line(rest_long, x="lr", y="loss", log_x=True, log_y=True, color='c', markers=True, range_y=[3.8, 7.2])

# Overlay the long-run baseline as a dashed black reference line.
fig.add_trace(go.Scatter(x=baseline_long['lr'], y=baseline_long['loss'], name='baseline',
                         line=dict(color='black', width=2, dash='dash')))

fig.update_layout(
    # Fixed typo in the user-facing title ("Typles" -> "Types").
    title="Loss vs Learning Rate for All Layer Types and Placements",
    yaxis = dict(
        showexponent = 'all',
        exponentformat = 'power'
    ),
    xaxis = dict(
        showexponent = 'all',
        exponentformat = 'power'
    )
)
fig.show()