In [25]:
# Импорт необходимых библиотек
import pandas as pd
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, Trainer, TrainingArguments
import torch
import os
from sklearn.model_selection import train_test_split
!pip install optuna
import optuna
from optuna.trial import TrialState

# Путь к папке с аудиофайлами
audio_dir = '/kaggle/input/bengaliai-speech/examples/'

# Получение списка аудиофайлов
audio_files = [f for f in os.listdir(audio_dir) if f.endswith('.wav')]

# Создание DataFrame
df = pd.DataFrame({
    'audio_path': [os.path.join(audio_dir, file) for file in audio_files]
})

# Разделение на обучающую и валидационную выборки
train_df, validation_df = train_test_split(df, test_size=0.2, random_state=42)

# Загрузка предварительно обученной модели и токенизатора
model_name = "ai4bharat/indicwav2vec_v1_bengali"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# Функция для чтения аудиофайла
def read_audio(file_path):
    speech, _ = sf.read(file_path)
    return speech

# Функция для подготовки данных
def prepare_data(df):
    input_values = []
    for _, row in df.iterrows():
        speech = read_audio(row['audio_path'])
        input_value = tokenizer(speech, return_tensors="pt", padding="longest").input_values[0]
        input_values.append(input_value)

    return torch.stack(input_values)

# Подготовка данных
train_input_values = prepare_data(train_df)
validation_input_values = prepare_data(validation_df)

# Настройки обучения
training_args = TrainingArguments(
  output_dir="/kaggle/working/results",
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=1,
  save_steps=400,
  eval_steps=400,
  logging_steps=400
)

# Создание Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_input_values,
    eval_dataset=validation_input_values
)

# Запуск обучения
trainer.train()

# Сохранение модели
model.save_pretrained("/kaggle/working/fine_tuned_model")

best_params = {"beam_width": 768}

if FIND_PARAMS:

    valid = pd.read_csv(DATA / "train.csv") # dtype={"id": str}
    valid = valid.query('split=="valid"').sample(n=10000, random_state=42).reset_index(drop=True)
    valid_audio_paths = [str(TRAIN / f"{aid}.mp3") for aid in valid["id"].values]

    valid_dataset = BengaliSRTestDataset(
        valid_audio_paths, SAMPLING_RATE
    )

    collate_func = partial(
        processor_with_lm.feature_extractor,
        return_tensors="pt", sampling_rate=SAMPLING_RATE,
        padding=True,
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=8, shuffle=False,
        num_workers=2, collate_fn=collate_func, drop_last=False,
        pin_memory=True,
    )
    # Calculating the base score
    print(constants)
    logits = inference(model, valid_loader)
    base_preds = decode(logits)
    gts = valid["sentence"].values.tolist()
    base_wer_score = score(gts, base_preds)
    print(f"Base wer score: {base_wer_score}")

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=25)

    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    if study.best_value < base_wer_score:
        print(f"Base score improved to {study.best_value} from {base_wer_score}. Assigning {study.best_params} to best_params")
        best_params = study.best_params


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


ValueError: The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_values,attention_mask,output_attentions,output_hidden_states,return_dict,labels,label,labels,label_ids.

In [None]:
best_params = {'alpha': 0.3802723523729998, 'beta': 0.053996879617918436, 'beam_width': 768}

In [None]:
print(f"Running the inference with params: {best_params}")

In [None]:
test = pd.read_csv(DATA / "sample_submission.csv", dtype={"id": str})
test_audio_paths = [str(TEST / f"{aid}.mp3") for aid in test["id"].values]

test_dataset = BengaliSRTestDataset(
    test_audio_paths, SAMPLING_RATE
)
collate_func = partial(
    processor_with_lm.feature_extractor,
    return_tensors="pt", sampling_rate=SAMPLING_RATE,
    padding=True,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=8, shuffle=False,
    num_workers=2, collate_fn=collate_func, drop_last=False,
    pin_memory=True,
)

pred_sentence_list = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        x = batch["input_values"]
        x = x.to(device, non_blocking=True)
        with torch.cuda.amp.autocast(True):
            y = model(x).logits
        y = y.detach().cpu().numpy()

        for l in y:
            sentence = processor_with_lm.decode(l, **best_params).text
            pred_sentence_list.append(sentence)


pp_pred_sentence_list = [postprocess(s) for s in tqdm(pred_sentence_list)]

# SUBMITION

In [None]:
test["sentence"] = pp_pred_sentence_list
test.to_csv("submission.csv", index=False)
print(test.head())