In [2]:
import pandas as pd
# Load the cleaned dataset; later cells read its 'path' and 'transcript' columns.
# NOTE(review): absolute, machine-specific Windows path — the notebook will not run
# elsewhere without editing this line.
df = pd.read_csv('D:\\Maya\\asr_geo2\\dataset_clean.csv')


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split
# identical between runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Renumber the rows 0..n-1 (a later cell looks rows up by integer label).
# drop=True discards the shuffled index directly instead of first materialising it
# as an 'index' column and dropping it again, and avoids in-place mutation of the
# frames returned by train_test_split.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Tab-separated on purpose: the load_dataset cell reads these files with delimiter="\t".
train_df.to_csv("train.csv", sep="\t", encoding="utf-8", index=False)
test_df.to_csv("test.csv", sep="\t", encoding="utf-8", index=False)

print("Saved train.csv and test.csv")

Saved train.csv and test.csv


In [8]:
import IPython, random

# Pick one training row at random, show its transcript and render an audio player
# for the matching file (the player is the cell's displayed value).
rand_int = random.randrange(len(train_df))
file_name = train_df.loc[rand_int, 'path']
print(train_df.loc[rand_int, 'transcript'])
IPython.display.Audio(file_name)

აქ მრავლადაა მეცხოველეობისა და ფერმერული გაერთიანებები ასევე გეოლოგიური და არქეოლოგიური ძეგლები


In [13]:
from datasets import load_dataset

# Re-read the two tab-separated split files written above, one load_dataset call
# per split, keeping only the split object itself.
common_voice_train, common_voice_test = (
    load_dataset("csv", data_files={split_name: f"{split_name}.csv"}, delimiter="\t")[split_name]
    for split_name in ("train", "test")
)


In [27]:
def extract_all_chars(batch):
    """Join every transcript of a batch and collect its distinct characters.

    Args:
        batch: Mapping with a ``"transcript"`` entry holding a list of strings.

    Returns:
        A dict with ``"vocab"`` (a one-element list wrapping the list of distinct
        characters, in arbitrary order) and ``"all_text"`` (a one-element list
        wrapping the space-joined transcripts).
    """
    joined = " ".join(batch["transcript"])
    unique_chars = [*set(joined)]
    return dict(vocab=[unique_chars], all_text=[joined])


In [37]:
# Extract the character inventory of each split with the same map() settings.
# Every original column is removed, so only the "vocab" and "all_text" columns
# returned by extract_all_chars remain.
vocab_train, vocab_test = (
    split.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        remove_columns=split.column_names,
    )
    for split in (common_voice_train, common_voice_test)
)

In [41]:
# Sorted union of the characters seen in either split, without the plain space and
# the combining dot above (U+0307), which get no vocabulary entry of their own.
vocab_list = [
    char
    for char in sorted(set(vocab_train["vocab"][0]).union(vocab_test["vocab"][0]))
    if char not in (" ", "\u0307")
]

# Special tokens take the lowest ids; the characters follow in sorted order.
special_vocab = ["<pad>", "<s>", "</s>", "<unk>", "|"]
vocab_dict = {token: position for position, token in enumerate([*special_vocab, *vocab_list])}
print(len(vocab_dict))
print(vocab_dict)


38
{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, 'ა': 5, 'ბ': 6, 'გ': 7, 'დ': 8, 'ე': 9, 'ვ': 10, 'ზ': 11, 'თ': 12, 'ი': 13, 'კ': 14, 'ლ': 15, 'მ': 16, 'ნ': 17, 'ო': 18, 'პ': 19, 'ჟ': 20, 'რ': 21, 'ს': 22, 'ტ': 23, 'უ': 24, 'ფ': 25, 'ქ': 26, 'ღ': 27, 'ყ': 28, 'შ': 29, 'ჩ': 30, 'ც': 31, 'ძ': 32, 'წ': 33, 'ჭ': 34, 'ხ': 35, 'ჯ': 36, 'ჰ': 37}


In [40]:
import json

# Persist the token -> id mapping; the tokenizer cell loads it from ./vocab.json.
with open('vocab.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))

In [43]:
# Pretrained checkpoint passed to Wav2Vec2ForCTC.from_pretrained further down.
model_name_or_path = "facebook/wav2vec2-base"
# Directory scanned for an existing checkpoint in the next cell.
# NOTE(review): save_dir is reassigned to './new_model/' before TrainingArguments is
# built, so this value is not the training output directory. It is also an absolute,
# machine-specific path.
save_dir = 'D:\\Maya\\asr_geo2'

In [44]:
import os
from transformers.trainer_utils import get_last_checkpoint

# Look for a previous checkpoint only when the directory is present; otherwise
# there is nothing to resume from.
last_checkpoint = get_last_checkpoint(save_dir) if os.path.exists(save_dir) else None

print(last_checkpoint or str(None))

None


In [46]:
from transformers import Wav2Vec2CTCTokenizer

# Character-level CTC tokenizer backed by the vocabulary file written earlier.
# Every special token named here has an entry in that vocabulary.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    pad_token="<pad>",
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>",
    word_delimiter_token="|",
    do_lower_case=False,
)

In [47]:
# Round-trip check: show the character tokens, then confirm that encode -> decode
# gives back the original sentence.
text = "თრიალეთის ქედის აღმოსავლეთ კალთაზე"
tokens = tokenizer.tokenize(text)
print(" ".join(tokens))
token_ids = tokenizer.encode(text)
print(tokenizer.decode(token_ids))

თ რ ი ა ლ ე თ ი ს | ქ ე დ ი ს | ა ღ მ ო ს ა ვ ლ ე თ | კ ა ლ თ ა ზ ე
თრიალეთის ქედის აღმოსავლეთ კალთაზე


In [49]:
from transformers import Wav2Vec2FeatureExtractor

# Load the preprocessing config from the same checkpoint the model is later loaded
# from. Reusing model_name_or_path (instead of repeating the literal) keeps the two
# from drifting apart when the base checkpoint is changed.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)

In [61]:
from transformers import Wav2Vec2Processor

# Bundle audio preprocessing and text tokenization behind a single object.
processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)

In [51]:
# Print the tokenizer size only when it agrees with the size of its vocab mapping;
# nothing is printed on a mismatch.
tokenizer_size = len(processor.tokenizer)
if tokenizer_size == len(processor.tokenizer.get_vocab()):
    print(tokenizer_size)

38


In [63]:
# Write the processor to disk once; an existing directory is left untouched.
processor_dir = './new_model/processor/'
if not os.path.exists(processor_dir):
    print("Saving ...")
    processor.save_pretrained(processor_dir)
    print("Saved!")

In [59]:
import torchaudio
import librosa
import numpy as np

# Sampling rate (Hz) that every clip is resampled to before feature extraction.
target_sampling_rate = 16_000

def speech_file_to_array_fn(batch):
    """Load one audio file as a mono waveform resampled to ``target_sampling_rate``.

    Args:
        batch: A single dataset row (mapping) with a ``"path"`` entry readable by
            ``torchaudio.load`` and a ``"transcript"`` entry.

    Returns:
        The same mapping with ``"speech"`` (1-D array at the target rate),
        ``"sampling_rate"``, ``"duration_in_seconds"`` and ``"target_text"`` set.
    """
    waveform, sampling_rate = torchaudio.load(batch["path"])
    # torchaudio returns the waveform as (channels, frames). Averaging over the
    # channel axis gives a 1-D signal for any channel count; the previous squeeze()
    # left multi-channel files 2-D, so len() below counted channels, not samples.
    speech_array = waveform.mean(dim=0).numpy()
    speech_array = librosa.resample(
        np.asarray(speech_array),
        orig_sr=sampling_rate,
        target_sr=target_sampling_rate,
    )

    batch["speech"] = speech_array
    batch["sampling_rate"] = target_sampling_rate
    batch["duration_in_seconds"] = len(batch["speech"]) / target_sampling_rate
    batch["target_text"] = batch["transcript"]
    return batch

# You can change the num_proc below, depends on your cpu and cpu cores.
# The function is applied row by row (batched is not set), so the former
# batch_size=4 argument had no effect and has been dropped.
# NOTE: this rebinds common_voice_train/common_voice_test and removes the "path"
# column, so re-running this cell without re-running the load_dataset cell fails.
common_voice_train = common_voice_train.map(speech_file_to_array_fn, remove_columns=common_voice_train.column_names, num_proc=1)
common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names, num_proc=1)

# Spot-check that both splits now report the target sampling rate.
print(common_voice_train[0]["sampling_rate"])
print(common_voice_test[0]["sampling_rate"])

Map:   0%|          | 0/11941 [00:00<?, ? examples/s]



Map:   0%|          | 0/2986 [00:00<?, ? examples/s]

16000
16000


In [64]:
def prepare_dataset(batch):
    """Turn a batch of waveforms and transcripts into model inputs and label ids.

    Args:
        batch: Batched mapping with ``"speech"`` (list of waveforms),
            ``"sampling_rate"`` (list of ints) and ``"target_text"`` (list of str).

    Returns:
        The same mapping with ``"input_values"`` and ``"labels"`` added.

    Raises:
        AssertionError: If the batch mixes different sampling rates.
    """
    # (optional) sanity check
    assert len(set(batch["sampling_rate"])) == 1, "Mixed sampling rates inside the batch."

    batch["input_values"] = processor(
        batch["speech"],
        sampling_rate=batch["sampling_rate"][0],
    ).input_values

    # Passing text= sends the transcripts to the tokenizer; this replaces the
    # deprecated `with processor.as_target_processor():` context manager.
    batch["labels"] = processor(text=batch["target_text"]).input_ids

    return batch


# Apply prepare_dataset to both splits in batches of four rows, dropping every
# source column so that only "input_values" and "labels" remain.
_common_voice_train, _common_voice_test = (
    split.map(
        prepare_dataset,
        batched=True,
        batch_size=4,
        num_proc=1,
        remove_columns=split.column_names,
    )
    for split in (common_voice_train, common_voice_test)
)


Map:   0%|          | 0/11941 [00:00<?, ? examples/s]



Map:   0%|          | 0/2986 [00:00<?, ? examples/s]

In [66]:
# Inspect the first prepared training example: columns, input type and length,
# and the label ids decoded back to text and to individual tokens.
first_example = _common_voice_train[0]
first_labels = first_example["labels"]

print(_common_voice_train.column_names)
print(_common_voice_test.column_names)
print(type(first_example["input_values"]))
print(len(first_example["input_values"]))
print(first_labels[:20])
decoded = processor.tokenizer.decode(first_labels)
print(decoded)
print(processor.tokenizer.convert_ids_to_tokens(first_labels[:30]))


['input_values', 'labels']
['input_values', 'labels']
<class 'list'>
111168
[22, 5, 22, 23, 24, 16, 21, 18, 4, 12, 5, 17, 5, 16, 9, 7, 18, 6, 21, 18]
სასტუმრო თანამეგობრობის სამხედრო მრჩევლის გენერალ დაგლას მაკართურის რეზიდენცია გახდა
['ს', 'ა', 'ს', 'ტ', 'უ', 'მ', 'რ', 'ო', '|', 'თ', 'ა', 'ნ', 'ა', 'მ', 'ე', 'გ', 'ო', 'ბ', 'რ', 'ო', 'ბ', 'ი', 'ს', '|', 'ს', 'ა', 'მ', 'ხ', 'ე', 'დ']


In [None]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import torch 

@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of prepared examples into one batch for CTC training.

    Audio inputs and label ids are padded separately, each with its own length
    options. Padded label positions are replaced by -100.

    Attributes are forwarded to ``processor.pad``: ``padding``, ``max_length`` and
    ``pad_to_multiple_of`` for the inputs; ``padding``, ``max_length_labels`` and
    ``pad_to_multiple_of_labels`` for the labels.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` (dicts with "input_values" and "labels") into tensors."""
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # labels= sends the sequences to the tokenizer's padding; this replaces the
        # deprecated `with self.processor.as_target_processor():` context manager.
        labels_batch = self.processor.pad(
            labels=label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Wherever the attention mask marks padding, overwrite the label id with -100.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

# Collator instance handed to the Trainer; padding=True is passed through to
# processor.pad for both inputs and labels.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [68]:
# Smoke-test the collator on the first few prepared training examples.
sample_count = 3
batch = [_common_voice_train[position] for position in range(sample_count)]

out = data_collator(batch)

print(out.keys())
print(out["labels"])


KeysView({'input_values': tensor([[0.0007, 0.0007, 0.0007,  ..., 0.0000, 0.0000, 0.0000],
        [0.0020, 0.0020, 0.0020,  ..., 0.0000, 0.0000, 0.0000],
        [0.0003, 0.0003, 0.0003,  ..., 0.0003, 0.0003, 0.0003]]), 'labels': tensor([[  22,    5,   22,   23,   24,   16,   21,   18,    4,   12,    5,   17,
            5,   16,    9,    7,   18,    6,   21,   18,    6,   13,   22,    4,
           22,    5,   16,   35,    9,    8,   21,   18,    4,   16,   21,   30,
            9,   10,   15,   13,   22,    4,    7,    9,   17,    9,   21,    5,
           15,    4,    8,    5,    7,   15,    5,   22,    4,   16,    5,   14,
            5,   21,   12,   24,   21,   13,   22,    4,   21,    9,   11,   13,
            8,    9,   17,   31,   13,    5,    4,    7,    5,   35,    8,    5,
         -100, -100, -100, -100, -100],
        [  16,   18,   12,    5,   16,    5,   29,    9,   22,    4,   24,   33,
            9,   10,   22,    4,   22,   35,   10,    5,    8,    5,   22,   35,
 

In [70]:
import evaluate
import os

wer_metric = evaluate.load("wer")
def compute_metrics(pred):
    """Compute word error rate for one evaluation pass and print a few samples.

    Args:
        pred: Prediction object exposing ``predictions`` (logits) and
            ``label_ids`` (label ids with -100 at padded positions).

    Returns:
        ``{"wer": <word error rate>}``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Padded label positions hold -100; map them to the pad token id so they can be
    # decoded. np.where builds a new array instead of overwriting pred.label_ids.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # group_tokens=False: references are plain label ids, so repeated characters
    # must not be merged the way repeated CTC predictions are.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # Show up to three random reference/prediction pairs. The sample size is capped
    # so evaluation sets with fewer than three examples no longer raise ValueError.
    if len(pred_str) == len(label_str):
        sample_size = min(3, len(label_str))
        for index in random.sample(range(len(label_str)), sample_size):
            print(f'reference: "{label_str[index]}"')
            print(f'predicted: "{pred_str[index]}"')

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

# Set before the Trainer is constructed further down.
# NOTE(review): presumably switches off the Weights & Biases reporting integration —
# confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

In [84]:
from transformers import Wav2Vec2ForCTC

# Load the pretrained encoder with config overrides for CTC fine-tuning.
model = Wav2Vec2ForCTC.from_pretrained(
    model_name_or_path,
    # Output size and special-token ids come from the tokenizer built above.
    vocab_size=len(processor.tokenizer.get_vocab()),
    pad_token_id=processor.tokenizer.pad_token_id,
    bos_token_id=processor.tokenizer.bos_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    # CTC loss settings.
    ctc_loss_reduction="mean",
    ctc_zero_infinity=True,
    # Dropout / masking overrides.
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
)

# model.freeze_feature_extractor()
# Output directory for checkpoints and the final model. This rebinds the save_dir
# that an earlier cell pointed at a different location.
save_dir = './new_model/'
import torch
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir=save_dir,
    group_by_length=True,

    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    seed=0,
    # The per-device batch is small, so gradients are accumulated over 2 batches
    # before each optimizer step (effective train batch size 20).
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    num_train_epochs=4,
    # Request fp16 mixed precision only when a CUDA device is present, so the same
    # cell also runs on a CPU-only torch build.
    fp16=torch.cuda.is_available(),
    # save_steps=10,
    eval_steps=50,
    logging_steps=10,
    learning_rate=1e-4,
    warmup_steps=500,
    # save_total_limit=2,
)

from transformers import Trainer

# Wire model, data, collation and metric computation together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=_common_voice_train,
    eval_dataset=_common_voice_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

loading configuration file config.json from cache at C:\Users\MayaD\.cache\huggingface\hub\models--facebook--wav2vec2-base\snapshots\0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8\config.json
Model config Wav2Vec2Config {
  "activation_dropout": 0.0,
  "adapter_attn_dim": null,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activati

Safetensors PR exists


In [83]:
from transformers.utils import logging
# Raise transformers' log level to INFO for the run below.
logging.set_verbosity_info()

# Fine-tune, then evaluate on the eval dataset given to the Trainer.
# NOTE(review): train() is called without resume_from_checkpoint, so the
# last_checkpoint looked up in an earlier cell is never used — confirm intended.
trainer.train()
metrics = trainer.evaluate()
# Write the model, the evaluation metrics and the processor to save_dir.
trainer.save_model(save_dir)
trainer.save_metrics("eval", metrics)
processor.save_pretrained(save_dir)

***** Running training *****
  Num examples = 11,941
  Num Epochs = 4
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 20
  Gradient Accumulation steps = 2
  Total optimization steps = 2,392
  Number of trainable parameters = 94,400,934


Step,Training Loss,Validation Loss,Wer
50,47.6087,13.208774,1.0



***** Running Evaluation *****
  Num examples = 2986
  Batch size = 10


reference: "ჩიმენტოს აკადემია წარმოადგენდა ევროპის პირველ სამეცნიერო საზოგადოებას"
predicted: ""
reference: "არსებობს ვერსია რომ სურათი დადგმულია და მასზე გამოსახული პიროვნება არ არის გარსია"
predicted: ""
reference: "ქალაქი მდებარეობს კუნძულის დასავლეთ სანაპირო ზოლში"
predicted: ""


KeyboardInterrupt: 

In [88]:
import torch

# Report the installed torch build, whether CUDA is usable, and the CUDA version
# torch was built against (None for CPU-only builds).
print(torch.__version__)
print(f"CUDA available: {torch.cuda.is_available()}")
print(torch.version.cuda)

2.8.0+cpu
CUDA available: False
None
