# Comparison notebook
Compares the training and test losses of several Transformer neural networks trained on different fractions of the TinyStories dataset.

# Imports

In [None]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import pandas as pd
import matplotlib.colors as mplc

# Load losses

In [40]:
# Training constants shared by the runs compared below.
n_block = 64  # tokens per block
n_batch = 32  # blocks per batch
# total number of tokens in the full TinyStories dataset
n_token_full_ds = 128 * 3_625_009

In [41]:
# Registry of the training runs to compare, keyed by run id.
# Every run uses the same block size, batch size and iteration count; the runs
# differ only in parameter count and in the fraction of TinyStories they use.

# 24M-parameter models
plotted_models_24M = {
    run_id: {
        "description": f"24M params, {percent} TinyStories DS",
        "n_block": 64,
        "n_batch": 32,
        "n_iter": 100000,
        "dataset_fraction": fraction,
    }
    for run_id, percent, fraction in [
        ("6262", "100%", 1.0),
        ("9174", "10%", 0.1),
        ("9263", "5%", 0.05),
        ("9291", "2.5%", 0.025),
        ("8767", "1%", 0.01),
    ]
}

n_plotted_models_24M = len(plotted_models_24M)

# additional 31M-parameter models
plotted_models_31M = {
    run_id: {
        "description": f"31M params, {percent} TinyStories DS",
        "n_block": 64,
        "n_batch": 32,
        "n_iter": 100000,
        "dataset_fraction": fraction,
    }
    for run_id, percent, fraction in [
        ("8643", "100%", 1.0),
        ("3859", "10%", 0.1),
        ("4226", "1%", 0.01),
    ]
}

# Dictionary containing both sizes. The merge is shallow on purpose: the
# per-run dicts are shared with the two registries above, so values added
# through this dictionary are also visible through them.
plotted_model_both_sizes = {**plotted_models_24M, **plotted_models_31M}


In [42]:
# For every run, work out how many tokens were processed during training and
# how many times each available token was seen on average.
for model, cfg in plotted_model_both_sizes.items():
    # tokens processed = block size * batch size * number of iterations
    tokens_seen = cfg["n_block"] * cfg["n_batch"] * cfg["n_iter"]

    # tokens this run could draw from
    tokens_available = n_token_full_ds * cfg["dataset_fraction"]

    # average number of times a token was seen, floored at 1
    token_times_seen = max(tokens_seen / tokens_available, 1)
    print(f"model {model} has seen {tokens_seen} tokens, in average {token_times_seen} times")

    # store both results on the run's own entry (the inner dict is updated in place)
    cfg.update(tokens_seen=tokens_seen, token_times_seen=token_times_seen)





model 6262 has seen 204800000 tokens, in average 1 times
model 9174 has seen 204800000 tokens, in average 4.413782145092605 times
model 9263 has seen 204800000 tokens, in average 8.82756429018521 times
model 9291 has seen 204800000 tokens, in average 17.65512858037042 times
model 8767 has seen 204800000 tokens, in average 44.13782145092605 times
model 8643 has seen 204800000 tokens, in average 1 times
model 3859 has seen 204800000 tokens, in average 4.413782145092605 times
model 4226 has seen 204800000 tokens, in average 44.13782145092605 times


In [43]:
# Read each run's training log ("<run id>_losses.csv") into a pandas
# DataFrame, keyed by run id.
path = "./Training_Results/"
losses_files_dict = {model: model + "_losses.csv" for model in plotted_model_both_sizes}
print(losses_files_dict)
losses_df = {model: pd.read_csv(path + file) for model, file in losses_files_dict.items()}

{'6262': '6262_losses.csv', '9174': '9174_losses.csv', '9263': '9263_losses.csv', '9291': '9291_losses.csv', '8767': '8767_losses.csv', '8643': '8643_losses.csv', '3859': '3859_losses.csv', '4226': '4226_losses.csv'}


# Pre-plot

### Global settings

In [67]:
# Default palette, given as named colours.
# NOTE(review): "gold" and "darkseagreen" each appear twice, so colours start
# repeating from the 11th entry onwards.
colors_distinct_named = [
    "mediumpurple", "deepskyblue", "darkseagreen", "gold",
    "firebrick", "tomato", "navy", "darkmagenta", "limegreen",
    "orange", "gold", "yellowgreen", "darkseagreen",
    "mediumaquamarine", "turquoise",
    "royalblue",
    "orchid", "crimson",
]
# same palette as hex strings, looked up in matplotlib's named-colour table
colors_distinct_hex = [mplc.cnames[color_name] for color_name in colors_distinct_named]
# additional hand-picked palettes
graph_distinct_summer = ["#27187E", "#758BFD", "#AEB8FE", "#CC2936", "#FF8600"]
color_gradient_summer = [
    "#057bb0", "#390099", "#750090", "#9e0059",
    "#880d1e", "#ff5400", "#ff7d10", "#ffbd00",
]

### Utilities

In [45]:

def multiline_plot(df, x_col, startstop=None, colors=colors_distinct_hex, width=800, height=500, x_int_only=True, plot=True, title=None, full_plotly_offline=False):
    """Plot every column of ``df`` except ``x_col`` as a line against ``x_col``.

    Args:
        df: input dataframe.
        x_col: name of the column used for the x axis.
        startstop: ``[start, stop]`` row positions delimiting the slice of
            ``df`` that is plotted. If None, all rows are plotted.
        colors: colours assigned to the plotted columns, in column order.
        width, height: plot dimensions.
        x_int_only: format the x-axis ticks as integers.
        plot: show the figure before returning.
        title: figure title.
        full_plotly_offline: return ``fig.to_html(full_html=False)`` instead
            of the figure object.

    Returns:
        None (after printing a message) when ``x_col`` is not a column of
        ``df`` or when there is no other column to plot. Otherwise the HTML
        string if ``full_plotly_offline`` is true, else the plotly figure.
    """
    if x_col not in df.columns:
        print(f"{x_col} not in dataframe columns")
        return None

    y_cols = [name for name in df.columns if name != x_col]
    if len(y_cols) == 0:
        print("No columns to plot on y axis")
        return None

    # default to the full row range
    if startstop is None:
        startstop = [0, len(df)]
    first_row, last_row = startstop[0], startstop[1]

    fig = px.line(
        df.iloc[first_row:last_row],
        x=x_col,
        y=y_cols,
        template="plotly_white",
        width=width,
        height=height,
        color_discrete_map=dict(zip(y_cols, colors)),
        title=title
    )
    if x_int_only:
        fig.update_xaxes(tickformat='d')
    fig.update_yaxes(title="Loss")

    if plot:
        fig.show()
    return fig.to_html(full_html=False) if full_plotly_offline else fig





In [46]:

def multiline_plot_dashoption(
    df,
    x_col,
    dot_pattern=None,
    startstop=None,
    colors=None,
    width=800,
    height=500,
    title=None,
    x_int_only=True,
    plot=True,
    full_plotly_offline=False):
    """Line plot of every column except ``x_col``, with optional dotted lines.

    Args:
        df: input dataframe.
        x_col: name of the column used for the x axis.
        dot_pattern: substring; columns whose name contains it are drawn
            dotted, every other column is drawn solid. If None or empty, all
            lines are solid.
        startstop: ``[start, stop]`` row positions delimiting the slice of
            ``df`` that is plotted. If None, all rows are plotted.
        colors: colours assigned to the plotted columns, in column order.
            If None or empty, no colour map is passed to plotly.
        width, height: plot dimensions.
        title: figure title.
        x_int_only: format the x-axis ticks as integers.
        plot: show the figure before returning.
        full_plotly_offline: return ``fig.to_html(full_html=True)`` instead
            of the figure object.

    Returns:
        None (after printing a message) when ``x_col`` is not a column of
        ``df`` or when there is no other column to plot. Otherwise the HTML
        string if ``full_plotly_offline`` is true, else the plotly figure.
    """
    if x_col not in df.columns:
        print(f"{x_col} not in dataframe columns")
        return None

    y_cols = [name for name in df.columns if name != x_col]
    if len(y_cols) == 0:
        print("No columns to plot on y axis")
        return None

    # default to the full row range
    if startstop is None:
        startstop = [0, len(df)]
    first_row, last_row = startstop[0], startstop[1]

    # dash style per column: dotted when the pattern occurs in the column name
    line_dash = {
        col: 'dot' if (dot_pattern and dot_pattern in col) else 'solid'
        for col in y_cols
    }

    color_map = dict(zip(y_cols, colors)) if colors else None
    fig = px.line(
        df.iloc[first_row:last_row],
        x=x_col,
        y=y_cols,
        template="plotly_white",
        width=width,
        height=height,
        color_discrete_map=color_map,
        title=title
    )

    # apply the dash style to the trace named after each column
    for col, dash in line_dash.items():
        fig.update_traces(selector={'name': col}, line_dash=dash)

    if x_int_only:
        fig.update_xaxes(tickformat='d')
    fig.update_yaxes(title="Loss")

    if plot:
        fig.show()
    return fig.to_html(full_html=True) if full_plotly_offline else fig

In [47]:
# name model utility

def name_model(model, plotted_model_dict):
    """Build a legend label for ``model`` from its registry entry.

    The label combines the run id, the first 10 characters of the entry's
    description and the dataset fraction expressed as a percentage.
    """
    entry = plotted_model_dict[model]
    description_prefix = entry["description"][:10]
    percent_of_ds = entry["dataset_fraction"] * 100
    return f"T_{model} : {description_prefix}, trained on {percent_of_ds}% of DS"

def name_model_short(model, plotted_model_dict):
    """Build a compact, two-line legend label for ``model``.

    The label combines the run id, an HTML line break, the first 3 characters
    of the entry's description (shown as ``N=``) and the dataset fraction
    expressed as a percentage (shown as ``f=``).

    Args:
        model: run id, a key of ``plotted_model_dict``.
        plotted_model_dict: registry mapping run ids to dicts holding at
            least "description" and "dataset_fraction".

    Returns:
        The label string, e.g. "T_6262 : <br>N=24M, f=100.0% of DS".
    """
    entry = plotted_model_dict[model]
    name = f"T_{model} : "
    # single quotes inside the f-string: reusing the enclosing double quotes
    # is a SyntaxError on Python versions older than 3.12
    name += f"<br>N={entry['description'][:3]}"
    name += f", f={entry['dataset_fraction']*100}% of DS"
    return name

### Compute dataframes

In [48]:
# Gather the test and train losses of every 24M run into two wide frames:
# one column per run, plus the "iteration" column taken from the first run.
iterations_24M = losses_df[list(plotted_models_24M)[0]]["iteration"]
dict_test_losses = {"iteration": iterations_24M}
dict_train_losses = {"iteration": iterations_24M}

for m in plotted_models_24M:
    print(f"Working for model {m}...")
    name = name_model_short(m, plotted_models_24M)
    print(name)
    dict_test_losses["Test Loss "+name] = losses_df[m]["test_loss"].to_numpy()
    dict_train_losses["Train Loss "+name] = losses_df[m]["train_loss"].to_numpy()

df_test_losses_24M = pd.DataFrame(dict_test_losses)
df_train_losses_24M = pd.DataFrame(dict_train_losses)

# Only the last bare expression of a cell is rendered, so show both previews
# explicitly instead of relying on two consecutive `.head()` expressions.
display(df_test_losses_24M.head(), df_train_losses_24M.head())

Working for model 6262...
T_6262 : <br>N=24M, f=100.0% of DS
Working for model 9174...
T_9174 : <br>N=24M, f=10.0% of DS
Working for model 9263...
T_9263 : <br>N=24M, f=5.0% of DS
Working for model 9291...
T_9291 : <br>N=24M, f=2.5% of DS
Working for model 8767...
T_8767 : <br>N=24M, f=1.0% of DS


Unnamed: 0,iteration,"Train Loss T_6262 : <br>N=24M, f=100.0% of DS","Train Loss T_9174 : <br>N=24M, f=10.0% of DS","Train Loss T_9263 : <br>N=24M, f=5.0% of DS","Train Loss T_9291 : <br>N=24M, f=2.5% of DS","Train Loss T_8767 : <br>N=24M, f=1.0% of DS"
0,0.0,10.86224,10.894934,10.953832,10.894541,10.882893
1,500.0,6.317381,6.287441,6.325198,6.274004,6.294331
2,1000.0,4.543505,4.568794,4.578363,4.598042,4.560696
3,1500.0,3.821358,3.845598,3.802483,3.822356,3.777266
4,2000.0,3.421336,3.443779,3.409492,3.439152,3.428401


In [49]:
# Express the x axis in training tokens rather than iterations:
# one iteration processes n_block * n_batch tokens.
iter_to_token_seen = n_block * n_batch

# add the "training tokens" column, then discard the "iteration" column
df_test_losses_24M = df_test_losses_24M.assign(
    **{"training tokens": iter_to_token_seen * df_test_losses_24M["iteration"]}
).drop(columns=["iteration"])
df_train_losses_24M = df_train_losses_24M.assign(
    **{"training tokens": iter_to_token_seen * df_train_losses_24M["iteration"]}
).drop(columns=["iteration"])

Same process for the 31M models.

In [50]:
# Gather the test and train losses of every 31M run into two wide frames:
# one column per run, plus the "iteration" column taken from the first run.
iterations_31M = losses_df[list(plotted_models_31M)[0]]["iteration"]
dict_test_losses = {"iteration": iterations_31M}
dict_train_losses = {"iteration": iterations_31M}

for m in plotted_models_31M:
    print(f"Working for model {m}...")
    name = name_model_short(m, plotted_models_31M)
    print(name)
    dict_test_losses["Test Loss "+name] = losses_df[m]["test_loss"].to_numpy()
    dict_train_losses["Train Loss "+name] = losses_df[m]["train_loss"].to_numpy()

df_test_losses_31M = pd.DataFrame(dict_test_losses)
df_train_losses_31M = pd.DataFrame(dict_train_losses)

# Only the last bare expression of a cell is rendered, so show both previews
# explicitly instead of relying on two consecutive `.head()` expressions.
display(df_test_losses_31M.head(), df_train_losses_31M.head())

Working for model 8643...
T_8643 : <br>N=31M, f=100.0% of DS
Working for model 3859...
T_3859 : <br>N=31M, f=10.0% of DS
Working for model 4226...
T_4226 : <br>N=31M, f=1.0% of DS


Unnamed: 0,iteration,"Train Loss T_8643 : <br>N=31M, f=100.0% of DS","Train Loss T_3859 : <br>N=31M, f=10.0% of DS","Train Loss T_4226 : <br>N=31M, f=1.0% of DS"
0,0.0,10.90925,10.840028,10.867797
1,500.0,6.25856,6.261656,6.246
2,1000.0,4.469713,4.4345,4.453432
3,1500.0,3.79709,3.761068,3.774492
4,2000.0,3.457219,3.438394,3.405945


In [51]:
# Express the x axis in training tokens rather than iterations:
# one iteration processes n_block * n_batch tokens.
iter_to_token_seen = n_block * n_batch

# add the "training tokens" column, then discard the "iteration" column
df_test_losses_31M = df_test_losses_31M.assign(
    **{"training tokens": iter_to_token_seen * df_test_losses_31M["iteration"]}
).drop(columns=["iteration"])
df_train_losses_31M = df_train_losses_31M.assign(
    **{"training tokens": iter_to_token_seen * df_train_losses_31M["iteration"]}
).drop(columns=["iteration"])

# Plot

In [52]:
# Settings shared by every figure below.
save_figs = False  # when true, each figure is also written to Graph_Clean/ as a PNG
image_width, image_height = 1000, 600  # figure dimensions

### Compare test losses

In [53]:

# Test loss of every 24M run against the number of training tokens.
fig = multiline_plot(
    df=df_test_losses_24M,
    x_col="training tokens",
    startstop=None,
    colors=colors_distinct_hex,
    width=image_width,
    height=image_height,
    title="Test Loss for 24M model",
    x_int_only=False,
    plot=True,
    full_plotly_offline=False,
)

if save_figs:
    fig.write_image('Graph_Clean/TestLoss24M.png')

### Compare train losses

In [54]:
# Train loss of every 31M run against the number of training tokens.
fig = multiline_plot(
    df=df_train_losses_31M,
    x_col="training tokens",
    startstop=None,
    colors=colors_distinct_hex,
    width=image_width,
    height=image_height,
    title="Train Loss for 31M model",
    x_int_only=False,
    plot=True,
    full_plotly_offline=False,
)
if save_figs:
    fig.write_image('Graph_Clean/TrainLoss31M.png')

In [55]:
# Train loss of every 24M run against the number of training tokens.
fig = multiline_plot(
    df=df_train_losses_24M,
    x_col="training tokens",
    startstop=None,
    colors=colors_distinct_hex,
    width=image_width,
    height=image_height,
    title="Train Loss for 24M model",
    x_int_only=False,
    plot=True,
    full_plotly_offline=False,
)
if save_figs:
    fig.write_image('Graph_Clean/TrainLoss24M.png')

### Compare test & train losses

In [56]:
# Put the 24M train and test losses side by side: the train frame (which
# carries the "training tokens" column) followed by the test-loss columns.
test_loss_columns = [
    "Test Loss " + name_model_short(m, plotted_models_24M)
    for m in plotted_models_24M
]
df_both_losses = pd.concat(
    [df_train_losses_24M, df_test_losses_24M[test_loss_columns]],
    axis=1,
)
df_both_losses.head()

Unnamed: 0,"Train Loss T_6262 : <br>N=24M, f=100.0% of DS","Train Loss T_9174 : <br>N=24M, f=10.0% of DS","Train Loss T_9263 : <br>N=24M, f=5.0% of DS","Train Loss T_9291 : <br>N=24M, f=2.5% of DS","Train Loss T_8767 : <br>N=24M, f=1.0% of DS",training tokens,"Test Loss T_6262 : <br>N=24M, f=100.0% of DS","Test Loss T_9174 : <br>N=24M, f=10.0% of DS","Test Loss T_9263 : <br>N=24M, f=5.0% of DS","Test Loss T_9291 : <br>N=24M, f=2.5% of DS","Test Loss T_8767 : <br>N=24M, f=1.0% of DS"
0,10.86224,10.894934,10.953832,10.894541,10.882893,0.0,10.862324,10.891705,10.954594,10.894548,10.880738
1,6.317381,6.287441,6.325198,6.274004,6.294331,1024000.0,6.328369,6.296029,6.33323,6.284216,6.321471
2,4.543505,4.568794,4.578363,4.598042,4.560696,2048000.0,4.546401,4.56493,4.570868,4.582667,4.595034
3,3.821358,3.845598,3.802483,3.822356,3.777266,3072000.0,3.801347,3.807614,3.792041,3.780358,3.815633
4,3.421336,3.443779,3.409492,3.439152,3.428401,4096000.0,3.447,3.453033,3.455607,3.423772,3.460869


In [57]:
# Train and test curves of the same run share a colour: the first
# n_plotted_models_24M palette entries are used for the train columns and
# repeated for the test columns. Train curves are the dotted ones.
# (The count must be n_plotted_models_24M; `n_plotted_models` is never defined
# in this notebook and raised a NameError on a fresh kernel.)
colors_compare = colors_distinct_hex[:n_plotted_models_24M] * 2
fig = multiline_plot_dashoption(
    df=df_both_losses,
    x_col="training tokens",
    dot_pattern="Train",
    startstop=None,
    colors=colors_compare,
    title="Train & Test Loss for 24M model",
    width=image_width,
    height=image_height,
    x_int_only=False,
    plot=True,
    full_plotly_offline=False,
)
if save_figs:
    fig.write_image('Graph_Clean/TrainTestLoss24M.png')

### Plot final loss vs. fraction of unique tokens for the 24M models

In [58]:
# Final test loss of each 24M run against the fraction of unique tokens,
# i.e. the inverse of the average number of times a token was seen.
models = list(plotted_models_24M)
final_losses = [
    df_test_losses_24M["Test Loss " + name_model_short(m, plotted_models_24M)].iloc[-1]
    for m in models
]
unicityfract = [1 / plotted_models_24M[m]["token_times_seen"] for m in models]

df_loss_vs_unicityfract = pd.DataFrame({
    "model": models,
    "final_losses": final_losses,
    "fraction of unique tokens": unicityfract,
})

# reference "ideal" loss: the final loss of the run with the most unique tokens
most_unique_token_line = df_loss_vs_unicityfract["fraction of unique tokens"].idxmax()
reference_loss = df_loss_vs_unicityfract.loc[most_unique_token_line, "final_losses"]
df_loss_vs_unicityfract["reference_loss"] = reference_loss

# show head
df_loss_vs_unicityfract.head()

Unnamed: 0,model,final_losses,fraction of unique tokens,reference_loss
0,6262,1.915381,1.0,1.915381
1,9174,1.913745,0.226563,1.915381
2,9263,1.972695,0.113282,1.915381
3,9291,2.094879,0.056641,1.915381
4,8767,2.746964,0.022656,1.915381


In [59]:
# Scatter of final loss against fraction of unique tokens for the 24M runs.
unique_fraction = df_loss_vs_unicityfract["fraction of unique tokens"]
final_loss = df_loss_vs_unicityfract["final_losses"]

fig = go.Figure()

# dotted horizontal line at the reference loss
fig.add_trace(
    go.Scatter(
        x=unique_fraction,
        y=df_loss_vs_unicityfract["reference_loss"],
        mode='lines',
        line=dict(width=2, color='red', dash="dot"),
        name='final loss for 100% unique tokens',
    )
)

# one marker per run, coloured by its position in the palette
for j, m in enumerate(plotted_models_24M):
    fig.add_trace(
        go.Scatter(
            x=[unique_fraction[j]],
            y=[final_loss[j]],
            mode='markers',
            marker=dict(size=16, color=colors_distinct_named[j]),
            name=f'final losses after 200M tokens for {name_model_short(m, plotted_models_24M)}',
        )
    )

fig.update_layout(
    title='Final losses vs fraction of unique tokens',
    xaxis_title='Fraction of unique tokens',
    yaxis_title='Final Losses',
    template="plotly_white",
    showlegend=True,
    yaxis_range=[0,4],
    width=image_width,
    height=image_height*1.1,
)

fig.show()

if save_figs:
    fig.write_image('Graph_Clean/LossVSUniqueTokens.png')


### Plot final loss for both model sizes

In [65]:
# Same table as for the 24M runs, extended to the runs of both model sizes.
models = list(plotted_model_both_sizes)
final_losses = []
for model in models:
    # pick the test-loss frame that holds this run's column
    source = df_test_losses_24M if model in plotted_models_24M else df_test_losses_31M
    column = "Test Loss " + name_model_short(model, plotted_model_both_sizes)
    final_losses.append(source[column].iloc[-1])
unicityfract = [1 / plotted_model_both_sizes[m]["token_times_seen"] for m in models]

df_loss_vs_unicityfract_bothmodels = pd.DataFrame({
    "model": models,
    "final_losses": final_losses,
    "fraction of unique tokens": unicityfract,
})

# reference "ideal" loss: the final loss of the run with the most unique tokens
most_unique_token_line = df_loss_vs_unicityfract_bothmodels["fraction of unique tokens"].idxmax()
reference_loss = df_loss_vs_unicityfract_bothmodels.loc[most_unique_token_line, "final_losses"]
df_loss_vs_unicityfract_bothmodels["reference_loss"] = reference_loss

# show head
df_loss_vs_unicityfract_bothmodels.head()

Unnamed: 0,model,final_losses,fraction of unique tokens,reference_loss
0,6262,1.915381,1.0,1.915381
1,9174,1.913745,0.226563,1.915381
2,9263,1.972695,0.113282,1.915381
3,9291,2.094879,0.056641,1.915381
4,8767,2.746964,0.022656,1.915381


In [73]:
# Marker shape encodes the model size: circles for 24M runs, diamonds for 31M
# runs. The list is built from the run registries, one entry per plotted run,
# instead of from the dataframes' column counts (which also include the
# "training tokens" column and produced one diamond too many).
symbols = [
    "circle" if m in plotted_models_24M else "diamond"
    for m in plotted_model_both_sizes
]

fig = go.Figure()

# dotted horizontal line at the reference loss
# NOTE(review): reference_loss comes from the first row with the largest
# fraction of unique tokens, which is a 24M run in the table above, so this
# line is the 24M reference only. Confirm that is intended for the 31M runs.
fig.add_trace(
    go.Scatter(
        x=df_loss_vs_unicityfract_bothmodels["fraction of unique tokens"],
        y=df_loss_vs_unicityfract_bothmodels["reference_loss"],
        mode='lines',
        line=dict(width=2, color='red', dash="dot"),
        name='final loss for 100% unique tokens',
    )
)

# one marker per run: colour by position in the palette, shape by model size
for j, m in enumerate(plotted_model_both_sizes):
    fig.add_trace(
        go.Scatter(
            x=[df_loss_vs_unicityfract_bothmodels["fraction of unique tokens"][j]],
            y=[df_loss_vs_unicityfract_bothmodels["final_losses"][j]],
            mode='markers',
            marker_symbol=symbols[j],
            marker=dict(size=16, color=colors_distinct_named[j]),
            name=f'final losses after 200M tokens for {name_model_short(m, plotted_model_both_sizes)}',
        )
    )

fig.update_layout(
    title='Final losses vs fraction of unique tokens',
    xaxis_title='Fraction of unique tokens',
    yaxis_title='Final Losses',
    template="plotly_white",
    showlegend=True,
    yaxis_range=[0,4],
    width=image_width,
    height=image_height*1.1,
)

fig.show()

if save_figs:
    fig.write_image('Graph_Clean/LossVSUniqueTokensBothModels.png')