In [35]:
import plpi.models as P
import transformers as T
import copy

ll_A = P.RobertaConfig(
    attention_probs_dropout_prob=0.1,
    classifier_dropout=None,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    hidden_size=8192,
    initializer_range=0.02,
    intermediate_size=28672,
    layer_norm_eps=1e-05,
    max_position_embeddings=4096,
    model_type="roberta",
    num_attention_heads=64,
    num_hidden_layers=1,
    position_embedding_type="absolute",
    bos_token_id=0,
    pad_token_id=1,
    eos_token_id=2,
    type_vocab_size=1,
    use_cache=True,
    vocab_size=32000,
)
ll_B = copy.copy(ll_A)
ll_B.plpi_head_configuration="pairwise"
ll_C = copy.copy(ll_A)
ll_C.plpi_head_configuration="symmetric"

In [36]:
lla = P.RobertaModel(ll_A)
llb = P.RobertaModel(ll_B)
llc = P.RobertaModel(ll_C)

In [37]:
def param(m, layers):
    n_enc = sum(p.numel() for p in m.encoder.parameters())
    n_emb = sum(p.numel() for p in m.embeddings.parameters())
    n_pool = sum(p.numel() for p in m.pooler.parameters())
    total = n_emb + layers * n_enc + n_pool
    print(f"{total=:_}, {n_enc=:_}, {n_pool=:_}, {n_enc=:_}, total_n_enc={n_enc * layers:_}")
    return total

In [38]:
nlla = param(lla, 80)
nllb = param(llb, 80)
nllc = param(llc, 80)

total=59_426_832_384, n_enc=738_299_904, n_pool=67_117_056, n_enc=738_299_904, total_n_enc=59_063_992_320
total=54_141_353_984, n_enc=672_231_424, n_pool=67_117_056, n_enc=672_231_424, total_n_enc=53_778_513_920
total=54_057_467_904, n_enc=671_182_848, n_pool=67_117_056, n_enc=671_182_848, total_n_enc=53_694_627_840


In [39]:
print(100 * (nlla - nllb) / nlla)
print(100 * (nlla - nllc) / nlla)

8.894094111977362
9.035252704207133
