In [1]:
import torch
import numpy as np
import transformers

In [2]:
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    Wav2Vec2Config,
    is_apex_available,
    set_seed,
)

In [3]:
# Load the pretrained CTC checkpoint whose weights are copied in later cells.
model = Wav2Vec2ForCTC.from_pretrained('mesolitica/wav2vec2-xls-r-300m-mixed')

In [4]:
# Display the configuration of the loaded checkpoint for inspection.
model.config

Wav2Vec2Config {
  "_name_or_path": "mesolitica/wav2vec2-xls-r-300m-mixed",
  "activation_dropout": 0.05,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.05,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.05,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.0,
  "gradient_ch

In [5]:
# Start from the checkpoint's own configuration, overriding only the number
# of encoder layers and the vocabulary size.
half_config = Wav2Vec2Config.from_pretrained(
    'mesolitica/wav2vec2-xls-r-300m-mixed',
    num_hidden_layers=12,
    vocab_size=40,
)

In [6]:
# Build the smaller model from `half_config` alone (no checkpoint is passed
# here); pretrained weights are copied into it by the cells that follow.
half = Wav2Vec2ForCTC(half_config)

In [7]:
import torch.nn as nn

def copy_layers(src_layers, dest_layers, layers_to_copy):
    """Load the weights of selected source layers into the destination layers.

    The entries of ``src_layers`` at the positions given by ``layers_to_copy``
    are gathered in that order, and their combined state is loaded into
    ``dest_layers``, so the i-th selected layer overwrites ``dest_layers[i]``.

    Args:
        src_layers: Indexable container of modules to copy from.
        dest_layers: Module container to copy into; updated in place.
        layers_to_copy: Iterable of indices into ``src_layers``.

    Raises:
        AssertionError: If the number of selected layers does not equal
            ``len(dest_layers)``.
    """
    # Gather the chosen layers into their own container so that its state
    # dict is keyed by position 0..k-1, matching the destination's keys.
    selected = nn.ModuleList()
    for idx in layers_to_copy:
        selected.append(src_layers[idx])

    n_dest, n_selected = len(dest_layers), len(selected)
    assert n_dest == n_selected, f"{n_dest} != {n_selected}"

    dest_layers.load_state_dict(selected.state_dict())

# Indices 0..11: the first twelve encoder layers of the source model, in order.
layers_to_copy = list(range(12))

copy_layers(model.wav2vec2.encoder.layers, half.wav2vec2.encoder.layers, layers_to_copy)

In [8]:
# Copy the pretrained components that live outside the transformer layer
# stack into `half`. Copying only `encoder.layers`, the feature extractor and
# the feature projection leaves the encoder's positional convolution
# embedding, the encoder's layer norm and the SpecAugment mask embedding at
# their random initialisation, so they are copied here as well.
# The CTC head (`lm_head`) is left untouched: `half_config` sets its own
# vocab_size, so the head shapes are not assumed to match.

# masked_spec_embed is a bare parameter rather than a sub-module, and it is
# not created for every configuration / library version, hence the guard.
if hasattr(model.wav2vec2, "masked_spec_embed") and hasattr(half.wav2vec2, "masked_spec_embed"):
    with torch.no_grad():
        half.wav2vec2.masked_spec_embed.copy_(model.wav2vec2.masked_spec_embed)

half.wav2vec2.encoder.pos_conv_embed.load_state_dict(model.wav2vec2.encoder.pos_conv_embed.state_dict())
half.wav2vec2.encoder.layer_norm.load_state_dict(model.wav2vec2.encoder.layer_norm.state_dict())
half.wav2vec2.feature_extractor.load_state_dict(model.wav2vec2.feature_extractor.state_dict())
# Kept as the last expression so the cell still displays the key-matching result.
half.wav2vec2.feature_projection.load_state_dict(model.wav2vec2.feature_projection.state_dict())

<All keys matched successfully>

In [9]:
# Delete any existing export directory (and its contents) before saving again.
!rm -rf 300m-12-layers

In [10]:
# Write the reduced model to the local "300m-12-layers" directory.
half.save_pretrained("300m-12-layers")

In [11]:
# List the exported files with human-readable sizes to check what was written.
!ls -lh 300m-12-layers

total 627M
-rw-r--r-- 1 husein husein 2.1K Jan  30 13:50 config.json
-rw-r--r-- 1 husein husein 627M Jan  30 13:50 pytorch_model.bin


In [12]:
# Report disk usage per mounted filesystem (human-readable sizes).
!df -h

Filesystem      Size  Used Avail Use% Mounted on
udev             32G     0   32G   0% /dev
tmpfs           6.3G  3.0M  6.3G   1% /run
/dev/nvme0n1p2  916G  736G  133G  85% /
tmpfs            32G  220M   32G   1% /dev/shm
tmpfs           5.0M  4.0K  5.0M   1% /run/lock
tmpfs            32G     0   32G   0% /sys/fs/cgroup
/dev/loop0      128K  128K     0 100% /snap/bare/5
/dev/loop3      6.9M  6.9M     0 100% /snap/ngrok/89
/dev/loop2       66M   66M     0 100% /snap/gtk-common-themes/1519
/dev/loop4       50M   50M     0 100% /snap/snapd/17883
/dev/loop6      347M  347M     0 100% /snap/gnome-3-38-2004/115
/dev/loop8       46M   46M     0 100% /snap/snap-store/599
/dev/loop9       92M   92M     0 100% /snap/gtk-common-themes/1535
/dev/loop11     347M  347M     0 100% /snap/gnome-3-38-2004/119
/dev/loop10      46M   46M     0 100% /snap/snap-store/638
/dev/nvme0n1p1  511M  5.3M  506M   2% /boot/efi
/dev/sda1       880G  499G  337G  60% /home/husein/ssd1
tmpfs          

In [13]:
from transformers import AutoTokenizer, AutoFeatureExtractor, SpeechEncoderDecoderModel

In [14]:
# Load the feature extractor published with the source checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained('mesolitica/wav2vec2-xls-r-300m-mixed')

In [15]:
# Save the feature-extractor configuration into the same directory as the
# exported model (written as preprocessor_config.json, per the output below).
feature_extractor.save_pretrained("300m-12-layers")

['300m-12-layers/preprocessor_config.json']