In [1]:
import plpi.models as P
import transformers as T

def _roberta_config(hidden_size, intermediate_size, num_attention_heads, num_hidden_layers):
    """Build a P.RobertaConfig for one model size.

    Only the four size-dependent hyperparameters are taken as arguments;
    every other setting (vocabulary, special token ids, activation,
    position embeddings, "vanilla" PLPI head, ...) is the same for all
    sizes defined in this cell.
    """
    return P.RobertaConfig(
        classifier_dropout=None,
        hidden_act="gelu",
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
        layer_norm_eps=1e-05,
        max_position_embeddings=514,
        num_attention_heads=num_attention_heads,
        num_hidden_layers=num_hidden_layers,
        position_embedding_type="absolute",
        bos_token_id=0,
        pad_token_id=1,
        eos_token_id=2,
        type_vocab_size=1,
        use_cache=True,
        vocab_size=50265,
        plpi_head_configuration="vanilla",
    )


# Three model sizes; in each, intermediate_size is 4 * hidden_size.
small = _roberta_config(
    hidden_size=512,
    intermediate_size=2048,
    num_attention_heads=8,
    num_hidden_layers=4,
)
base = _roberta_config(
    hidden_size=768,
    intermediate_size=3072,
    num_attention_heads=12,
    num_hidden_layers=12,
)
large = _roberta_config(
    hidden_size=1024,
    intermediate_size=4096,
    num_attention_heads=16,
    num_hidden_layers=24,
)

In [3]:
import copy

def _with_head(config, head_configuration):
    """Return a shallow copy of `config` with `plpi_head_configuration` replaced.

    The input config is left untouched: the attribute is rebound on the
    copy only.
    """
    variant = copy.copy(config)
    variant.plpi_head_configuration = head_configuration
    return variant


# For every size: A is the config object itself (not a copy), B switches the
# PLPI head to "pairwise" and C switches it to "symmetric".
small_A, small_B, small_C = small, _with_head(small, "pairwise"), _with_head(small, "symmetric")

base_A, base_B, base_C = base, _with_head(base, "pairwise"), _with_head(base, "symmetric")

large_A, large_B, large_C = large, _with_head(large, "pairwise"), _with_head(large, "symmetric")

In [4]:
def _param_count(config):
    """Instantiate a P.RobertaModel from `config` and return its num_parameters()."""
    return P.RobertaModel(config).num_parameters()


# Models are built one at a time, in A, B, C order for each size; each model
# is dropped as soon as its parameter count has been read.
smA, smB, smC = map(_param_count, (small_A, small_B, small_C))

baA, baB, baC = map(_param_count, (base_A, base_B, base_C))

laA, laB, laC = map(_param_count, (large_A, large_B, large_C))

In [5]:
# One output line per model size (small, base, large); the three columns are
# the A, B and C parameter counts, with "_" as the thousands separator.
for counts in ((smA, smB, smC), (baA, baB, baC), (laA, laB, laC)):
    print("{:_}, {:_}, {:_}".format(*counts))

38_872_576, 37_953_024, 37_821_952
124_645_632, 118_148_352, 117_558_528
355_359_744, 331_742_208, 330_169_344


In [6]:
def _percent_fewer(reference, variant):
    """Percentage by which `variant` is smaller than `reference`, relative to `reference`."""
    return 100 * (reference - variant) / reference


# For each size, print the parameter reduction of B and then C relative to A.
# A blank line separates consecutive sizes (none after the last one).
for position, (count_a, count_b, count_c) in enumerate(
    ((smA, smB, smC), (baA, baB, baC), (laA, laB, laC))
):
    if position > 0:
        print()
    print(_percent_fewer(count_a, count_b))
    print(_percent_fewer(count_a, count_c))

2.3655545750299645
2.702738300646708

5.212601433157321
5.685802130635432

6.646092135861061
7.088703891006856
