In [19]:
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from datasets import load_dataset,load_metric,Dataset
import librosa
import numpy as np


In [20]:
dataset = load_dataset('csv',data_files=r'D:\model_code\wav2vec2\wav2vec_ds.csv',split='train')

In [21]:
split = dataset.train_test_split(train_size=0.9)
train = split['train']
test = split['test']
_train = Dataset.from_dict(train[:])
_test = Dataset.from_dict(test[:])

In [22]:
label_names = set((i['label'],i['class']) for i in dataset)

In [23]:
label2id, id2label = dict(), dict()
for i, label in label_names:
    label2id[label] = str(i)
    id2label[str(i)] = label


In [24]:
label2id, id2label

({'unknown': '0', 'adele': '1', 'hilfe_hilfe': '2'},
 {'0': 'unknown', '1': 'adele', '2': 'hilfe_hilfe'})

In [25]:
model_id = 'facebook/wav2vec2-base'
max_duration = 3

In [26]:
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)



In [27]:
def preprocess_function(examples):
    audio_arrays = [librosa.load(i,sr=None)[0] for i in examples['path']]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
    )
    return inputs

In [28]:
train_encodings = _train.map(preprocess_function, remove_columns=["path", "class"], batched=True)
test_encodings = _test.map(preprocess_function, remove_columns=["path", "class"], batched=True)


Map:   0%|          | 0/6750 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

In [29]:
model_name = 'wav2vec2-finetune'
batch_size = 32

args = TrainingArguments(
    f"{model_name}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # push_to_hub=True,
)



In [30]:
metric = load_metric('accuracy')
def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [31]:
#to run on cpu
train_encodings.set_format(type='torch')
test_encodings.set_format(type='torch')

In [32]:
num_labels = len(label_names)
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_encodings,
    eval_dataset=test_encodings,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

In [34]:
trainer.train()

  0%|          | 0/520 [00:00<?, ?it/s]

{'loss': 1.0889, 'grad_norm': 1.428753137588501, 'learning_rate': 5.76923076923077e-06, 'epoch': 0.19}
{'loss': 1.0655, 'grad_norm': 2.1796815395355225, 'learning_rate': 1.153846153846154e-05, 'epoch': 0.38}
{'loss': 1.0239, 'grad_norm': 6.573427200317383, 'learning_rate': 1.7307692307692306e-05, 'epoch': 0.57}
{'loss': 0.9634, 'grad_norm': 3.7728822231292725, 'learning_rate': 2.307692307692308e-05, 'epoch': 0.76}
{'loss': 0.8823, 'grad_norm': 21.10834503173828, 'learning_rate': 2.884615384615385e-05, 'epoch': 0.95}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.7840138077735901, 'eval_accuracy': 0.7293333333333333, 'eval_runtime': 367.2582, 'eval_samples_per_second': 2.042, 'eval_steps_per_second': 0.065, 'epoch': 0.99}
{'loss': 0.782, 'grad_norm': 12.104599952697754, 'learning_rate': 2.9487179487179487e-05, 'epoch': 1.14}
{'loss': 0.6887, 'grad_norm': 16.3126220703125, 'learning_rate': 2.884615384615385e-05, 'epoch': 1.33}
{'loss': 0.6527, 'grad_norm': 14.112991333007812, 'learning_rate': 2.8205128205128207e-05, 'epoch': 1.52}
{'loss': 0.5829, 'grad_norm': 15.667430877685547, 'learning_rate': 2.7564102564102562e-05, 'epoch': 1.71}
{'loss': 0.5926, 'grad_norm': 10.122722625732422, 'learning_rate': 2.6923076923076923e-05, 'epoch': 1.9}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.5586026906967163, 'eval_accuracy': 0.7906666666666666, 'eval_runtime': 368.1825, 'eval_samples_per_second': 2.037, 'eval_steps_per_second': 0.065, 'epoch': 1.99}
{'loss': 0.5904, 'grad_norm': 17.335559844970703, 'learning_rate': 2.628205128205128e-05, 'epoch': 2.09}
{'loss': 0.5135, 'grad_norm': 50.1258430480957, 'learning_rate': 2.564102564102564e-05, 'epoch': 2.27}
{'loss': 0.5002, 'grad_norm': 29.135984420776367, 'learning_rate': 2.5e-05, 'epoch': 2.46}
{'loss': 0.4867, 'grad_norm': 27.903987884521484, 'learning_rate': 2.435897435897436e-05, 'epoch': 2.65}
{'loss': 0.4782, 'grad_norm': 18.028261184692383, 'learning_rate': 2.3717948717948718e-05, 'epoch': 2.84}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.4885983467102051, 'eval_accuracy': 0.8173333333333334, 'eval_runtime': 362.8129, 'eval_samples_per_second': 2.067, 'eval_steps_per_second': 0.066, 'epoch': 3.0}
{'loss': 0.4791, 'grad_norm': 183.9615936279297, 'learning_rate': 2.307692307692308e-05, 'epoch': 3.03}
{'loss': 0.4494, 'grad_norm': 26.864418029785156, 'learning_rate': 2.2435897435897437e-05, 'epoch': 3.22}
{'loss': 0.4551, 'grad_norm': 17.395029067993164, 'learning_rate': 2.1794871794871795e-05, 'epoch': 3.41}
{'loss': 0.4302, 'grad_norm': 17.27464485168457, 'learning_rate': 2.1153846153846157e-05, 'epoch': 3.6}
{'loss': 0.4208, 'grad_norm': 31.04962921142578, 'learning_rate': 2.0512820512820515e-05, 'epoch': 3.79}
{'loss': 0.4516, 'grad_norm': 19.88152313232422, 'learning_rate': 1.9871794871794873e-05, 'epoch': 3.98}


  0%|          | 0/24 [00:00<?, ?it/s]

{'eval_loss': 0.4183487892150879, 'eval_accuracy': 0.8373333333333334, 'eval_runtime': 363.2037, 'eval_samples_per_second': 2.065, 'eval_steps_per_second': 0.066, 'epoch': 4.0}
{'loss': 0.399, 'grad_norm': 11.915410995483398, 'learning_rate': 1.923076923076923e-05, 'epoch': 4.17}
{'loss': 0.4224, 'grad_norm': 15.922377586364746, 'learning_rate': 1.858974358974359e-05, 'epoch': 4.36}
{'loss': 0.3955, 'grad_norm': 27.909072875976562, 'learning_rate': 1.7948717948717948e-05, 'epoch': 4.55}


In [None]:
model_path = f'D:/model_code/models/wav2vec2/trail 1'
trainer.save_model(f'{model_path}/{model_name}')