In [1]:
! pip install datasets
! pip install evaluate
! pip install transformers
!pip install huggingface_hub



In [21]:
# Create metadata (labels) to create the dataset object.
#
# Every .wav file in `directory` gets one CSV row (file_name, label). The label
# is 1 when the second "_"-separated token of the file name is 'lie', else 0.

import os
import csv

directory = "/content/drive/MyDrive/deceptive-16khz/"
data = []

# os.listdir() returns entries in arbitrary order; sorting keeps the CSV
# identical from one run to the next.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".wav"):
        continue
    name_parts = filename.split("_")
    if len(name_parts) < 2:
        # No second token means there is nothing to derive a label from; fail
        # with a clear message instead of an opaque IndexError.
        raise ValueError(
            f"Cannot derive a label from file name {filename!r}: "
            "expected at least one '_' separator"
        )
    label = 1 if name_parts[1] == 'lie' else 0
    data.append((filename, label))

# Derive the CSV path from `directory` so the two locations cannot drift apart.
csv_file_path = os.path.join(directory, "metadata.csv")


with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["file_name", "label"])
    writer.writerows(data)

print(f"Metadata file created at {csv_file_path}")


Metadata file created at /content/drive/MyDrive/deceptive-16khz/metadata.csv


In [3]:
from datasets import Audio, load_dataset

# Read the audio folder as a single "train" split, then cast its "audio"
# column to Audio(sampling_rate=16000).
dataset = load_dataset(
    "audiofolder",
    data_dir="/content/drive/MyDrive/deceptive-16khz",
    split="train",
).cast_column("audio", Audio(sampling_rate=16000))

Resolving data files:   0%|          | 0/116 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/116 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget for this notebook session.
# NOTE(review): the training cell sets push_to_hub=False, so this login may be
# unnecessary — confirm before keeping it.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
from transformers import AutoFeatureExtractor

# Load the feature extractor published with the "facebook/hubert-base-ls960"
# checkpoint; preprocess_function reads this module-level name.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

In [6]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the
# train/test partition reproducible across runs.
# NOTE(review): `dataset` is rebound to the resulting DatasetDict, so this cell
# presumably cannot be re-run without first re-running the loading cell.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [7]:
# Display the split names, feature names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 92
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 23
    })
})

In [8]:
def preprocess_function(examples, max_length=16000):
    """Turn a batch of decoded audio examples into model inputs.

    Args:
        examples: Batch mapping whose "audio" entry is a list of dicts, each
            holding the raw waveform under the "array" key.
        max_length: Maximum number of samples kept per clip; longer clips are
            truncated. The default of 16000 matches the previously hard-coded
            value (one second of audio if the extractor runs at 16 kHz —
            TODO confirm that discarding the rest of each clip is intended).

    Returns:
        The output of the module-level ``feature_extractor`` for the batch.
    """
    waveforms = [clip["array"] for clip in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=max_length,
        truncation=True,
    )

In [9]:
# Run preprocess_function over the dataset in batches and drop the raw
# "audio" column from the result.
encoded_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns="audio",
)


Map:   0%|          | 0/92 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

In [10]:
# Display the encoded splits to check the remaining features and row counts.
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 92
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 23
    })
})

In [11]:
import evaluate

# Load the "accuracy" metric; compute_metrics reads this module-level name.
accuracy = evaluate.load("accuracy")

In [12]:
import numpy as np


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (reference labels).

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [13]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Two output classes; the metadata cell writes labels 0 and 1 (1 = 'lie').
num_labels = 2
# Load the pretrained checkpoint with a classification head sized for
# num_labels. The log printed below reports the classifier/projector weights
# as newly initialized, i.e. they still have to be trained.
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/hubert-base-ls960", num_labels=num_labels
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'classifier.weight', 'projector.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
!pip install accelerate -U
!pip install transformers[torch]



In [22]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the checkpoint with the best evaluation accuracy when training ends.
# NOTE(review): per_device_train_batch_size * gradient_accumulation_steps = 128,
# which exceeds the 92 training rows shown earlier — confirm this is intended.
# NOTE(review): the "test" split doubles as the model-selection set here, so
# the reported accuracy is not a held-out estimate — confirm this is acceptable.
training_args = TrainingArguments(
    output_dir= "hubert_deception",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=50,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

# Reuse the feature_extractor loaded earlier in the notebook rather than
# loading the same checkpoint a second time.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.690993,0.565217
2,No log,0.691499,0.565217
3,No log,0.686186,0.565217
4,No log,0.680731,0.565217
5,No log,0.67756,0.608696
6,No log,0.677636,0.608696
7,0.338000,0.671796,0.608696
8,0.338000,0.665419,0.608696
9,0.338000,0.658753,0.652174
10,0.338000,0.655076,0.652174


TrainOutput(global_step=50, training_loss=0.28764320373535157, metrics={'train_runtime': 135.4694, 'train_samples_per_second': 33.956, 'train_steps_per_second': 0.369, 'total_flos': 2.785324493952e+16, 'train_loss': 0.28764320373535157, 'epoch': 33.33})

In [17]:
from datasets import Audio, load_dataset

# Reload the complete, unsplit folder so a sample file path is available for
# the inference example; this rebinds `dataset`, which held the train/test
# DatasetDict until now.
dataset = load_dataset(
    "audiofolder",
    data_dir="/content/drive/MyDrive/deceptive-16khz",
    split="train",
).cast_column("audio", Audio(sampling_rate=16000))

Resolving data files:   0%|          | 0/116 [00:00<?, ?it/s]

In [19]:
from transformers import pipeline

# Build an audio-classification pipeline from a saved training checkpoint and
# score the first file of the dataset.
# NOTE(review): the checkpoint number is hard-coded — confirm it is the
# checkpoint you intend to evaluate.
checkpoint_dir = "/content/hubert_deception/checkpoint-9"
classifier = pipeline("audio-classification", model=checkpoint_dir)
audio_file = dataset[0]["audio"]["path"]
classifier(audio_file)

[{'score': 0.5115541219711304, 'label': 'LABEL_0'},
 {'score': 0.488445907831192, 'label': 'LABEL_1'}]