In [6]:
from transformers import BartConfig, BartTokenizer, BartForConditionalGeneration
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, RobertaModel

In [7]:
BartConfig()

BartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 1024,
  "model_type": "bart",
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "scale_embedding": false,
  "transformers_version": "4.27.0.dev0",
  "use_cache": true,
  "vocab_size": 50265
}

In [8]:
bart_config = BartConfig(
    vocab_size=50265,
    d_model=768,
    encoder_ffn_dim=3072,
    encoder_layers=12,
    encoder_attention_heads=12,
    decoder_ffn_dim=3072,
    decoder_layers=12,
    decoder_attention_heads=12,
    max_position_embeddings=512,
)
bart_config

BartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "bart",
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "scale_embedding": false,
  "transformers_version": "4.27.0.dev0",
  "use_cache": true,
  "vocab_size": 50265
}

In [9]:
bart = BartForConditionalGeneration(bart_config)
bart

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(514, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, 

In [10]:
pretrained_roberta = RobertaModel.from_pretrained("roberta-base")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
bart.model.encoder.embed_tokens = pretrained_roberta.embeddings.word_embeddings
bart.model.encoder.embed_positions = pretrained_roberta.embeddings.position_embeddings
for bart_layer, roberta_layer in zip(bart.model.encoder.layers, pretrained_roberta.encoder.layer):
    bart_layer.self_attn.k_proj = roberta_layer.attention.self.key
    bart_layer.self_attn.v_proj = roberta_layer.attention.self.value
    bart_layer.self_attn.q_proj = roberta_layer.attention.self.query
    bart_layer.self_attn.out_proj = roberta_layer.attention.output.dense
    bart_layer.self_attn_layer_norm = roberta_layer.attention.output.LayerNorm
    bart_layer.fc1 = roberta_layer.intermediate.dense
    bart_layer.fc2 = roberta_layer.output.dense
    bart_layer.final_layer_norm = roberta_layer.output.LayerNorm

bart.model.decoder.embed_tokens = pretrained_roberta.embeddings.word_embeddings
bart.model.decoder.embed_positions = bart.model.decoder.embed_positions
for bart_layer, roberta_layer in zip(bart.model.decoder.layers, pretrained_roberta.encoder.layer):
    bart_layer.self_attn.k_proj = roberta_layer.attention.self.key
    bart_layer.self_attn.v_proj = roberta_layer.attention.self.value
    bart_layer.self_attn.q_proj = roberta_layer.attention.self.query
    bart_layer.self_attn.out_proj = roberta_layer.attention.output.dense
    bart_layer.self_attn_layer_norm = roberta_layer.attention.output.LayerNorm
    bart_layer.encoder_attn.k_proj = roberta_layer.attention.self.key
    bart_layer.encoder_attn.v_proj = roberta_layer.attention.self.value
    bart_layer.encoder_attn.q_proj = roberta_layer.attention.self.query
    bart_layer.encoder_attn.out_proj = roberta_layer.attention.output.dense
    bart_layer.encoder_attn_layer_norm = roberta_layer.attention.output.LayerNorm
    bart_layer.fc1 = roberta_layer.intermediate.dense
    bart_layer.fc2 = roberta_layer.output.dense
    bart_layer.final_layer_norm = roberta_layer.output.LayerNorm

bart.model.shared = pretrained_roberta.embeddings.word_embeddings

In [16]:
bart

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): Embedding(514, 768, padding_idx=1)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elemen

In [15]:
bart.save_pretrained("./models/pretrained_bart_from_roberta")