In [97]:
from transformers import AutoFeatureExtractor
from transformers import TFAutoModelForAudioClassification
from datasets import load_dataset,Dataset
import librosa
import tensorflow as tf
import numpy as np

In [10]:
# Hub checkpoint name; passed to both the feature extractor and the model loader.
model_id = 'facebook/wav2vec2-base'
# Multiplied by the extractor's sampling rate to get the truncation length in
# samples, i.e. the clip length (in seconds) kept per example.
max_duration = 1

In [4]:
# Load the CSV index as a single `train` split. Later cells read its `path`,
# `label` and `class` columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data
# directory so the notebook can run elsewhere.
dataset = load_dataset('csv',data_files=r'D:\model_code\wav2vec2\wav2vec_ds.csv',split='train')

In [6]:
# Hold out 10% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same train/test partition.
split = dataset.train_test_split(train_size=0.9, seed=42)
train = split['train']
test = split['test']
# `subset[:]` returns the subset's columns as a dict; build a fresh Dataset
# from that dict for each side of the split.
_train = Dataset.from_dict(train[:])
_test = Dataset.from_dict(test[:])

In [7]:
# Distinct (numeric label, class name) pairs found in the dataset.
label_names = {(row['label'], row['class']) for row in dataset}
# Two-way lookup tables between class names and ids; ids are stored as strings.
label2id = {class_name: str(class_id) for class_id, class_name in label_names}
id2label = {str(class_id): class_name for class_id, class_name in label_names}


In [8]:
# Feature extractor matching the checkpoint in `model_id`; its `sampling_rate`
# is used by the preprocessing step below.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)



In [12]:
def preprocess_function(examples):
    """Turn a batch of audio file paths into fixed-length model inputs.

    Args:
        examples: A batch (dict of columns) containing a ``'path'`` column
            with the audio file locations.

    Returns:
        The feature extractor's output for the batch, with every clip
        truncated or zero-padded to ``max_duration`` seconds.
    """
    target_sr = feature_extractor.sampling_rate
    # Resample on load: the extractor is told the audio is at `target_sr`, so
    # loading at each file's native rate (sr=None) would mislabel any file
    # recorded at a different rate.
    audio_arrays = [librosa.load(path, sr=target_sr)[0] for path in examples['path']]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=target_sr,
        max_length=int(target_sr * max_duration),
        # Pad short clips up to max_length so every example has the same
        # length and can be stacked into fixed-size batches downstream.
        padding='max_length',
        truncation=True,
    )
    return inputs

In [13]:
# Run the same batched preprocessing over both splits (train first, then test),
# dropping the columns that are no longer needed once the audio is encoded.
train_encodings, test_encodings = (
    subset.map(preprocess_function, remove_columns=["path", "class"], batched=True)
    for subset in (_train, _test)
)

Map:   0%|          | 0/6750 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

In [15]:
# One output class per distinct (label, class) pair found in the dataset.
num_labels = len(label_names)
# Load the checkpoint as a TF audio-classification model and attach the
# label mappings to it.
model = TFAutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    from_pt=True,  # initialize the TF model from the PyTorch checkpoint weights
)





TFWav2Vec2ForSequenceClassification has backpropagation operations that are NOT supported on CPU. If you wish to train/fine-tune this model, you need a GPU or a TPU
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFWav2Vec2ForSequenceClassification: ['project_hid.weight', 'project_q.weight', 'project_q.bias', 'project_hid.bias', 'quantizer.codevectors', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias']
- This IS expected if you are initializing TFWav2Vec2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFWav2Vec2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFWav2Ve

In [149]:
def to_tf_dataset(encodings, batch_size=32):
    """Wrap encoded examples in a batched ``tf.data.Dataset``.

    Args:
        encodings: Iterable of per-example mappings that provide
            ``'input_values'`` and ``'label'``.
        batch_size: Number of examples per batch (defaults to 32).

    Returns:
        A ``tf.data.Dataset`` yielding dicts with ``'input_values'``
        (float32) and ``'labels'`` (int64) batches.
    """
    def gen():
        # Rename `label` -> `labels` in each yielded example.
        for item in encodings:
            yield {
                'input_values': item['input_values'],
                'labels': item['label'],
            }

    dataset = tf.data.Dataset.from_generator(
        gen,
        output_signature={
            'input_values': tf.TensorSpec(shape=(None,), dtype=tf.float32),
            'labels': tf.TensorSpec(shape=(), dtype=tf.int64),
        },
    )
    # padded_batch matches batch() when all clips share a length, and
    # zero-pads to the longest clip in the batch instead of failing when
    # they do not.
    return dataset.padded_batch(batch_size)

In [150]:
# Build the batched tf.data pipelines for both splits (train first, then test).
train_dataset, test_dataset = (
    to_tf_dataset(encoded) for encoded in (train_encodings, test_encodings)
)

In [151]:
# Configure training: Adam at a fixed learning rate, a sparse cross-entropy
# loss (integer class labels, applied to raw logits via from_logits=True),
# and accuracy as the reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

In [152]:
# `train_dataset` is already batched, so `batch_size` is not passed here —
# Keras documents that it must not be specified for tf.data inputs.
# The held-out split is evaluated after every epoch.
model.fit(train_dataset, validation_data=test_dataset, epochs=2)

Epoch 1/2
      1/Unknown - 130s 130s/step - loss: 1.1575 - accuracy: 0.2812

In [138]:
# Print the per-layer parameter counts of the loaded model.
model.summary()

Model: "tf_wav2_vec2_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 wav2vec2 (TFWav2Vec2MainLa  multiple                  94371712  
 yer)                                                            
                                                                 
 projector (Dense)           multiple                  196864    
                                                                 
 classifier (Dense)          multiple                  771       
                                                                 
Total params: 94569347 (360.75 MB)
Trainable params: 94569347 (360.75 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
