In [1]:
! pip install datasets --quiet
! pip install evaluate --quiet
! pip install transformers --quiet
!pip install huggingface_hub --quiet

!pip install accelerate -U --quiet
!pip install transformers[torch] --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

### Generate Labels for Data

In [2]:
# create metadata (labels) to create the dataset object

import os
import csv

directory = "/content/drive/MyDrive/deceptive-16khz/"
data = []

for filename in os.listdir(directory):
    if filename.endswith(".wav"):
        label = filename.split("_")[1]
        label = 1 if label == 'lie' else 0
        data.append((filename, label))

csv_file_path = "/content/drive/MyDrive/deceptive-16khz/metadata.csv"


with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["file_name", "label"])
    writer.writerows(data)

print(f"Metadata file created at {csv_file_path}")


Metadata file created at /content/drive/MyDrive/deceptive-16khz/metadata.csv


### Split data into sliding windows

In [3]:
# Windowing function
def window_audio(audio_array, window_size=10, overlap=0.75):
    sr = 16000
    window_size_samples = int(window_size * sr)
    overlap_samples = int(window_size_samples * overlap)

    windows = []
    for i in range(0, len(audio_array) - window_size_samples, overlap_samples):
        window = audio_array[i:i + window_size_samples]
        windows.append(window)

    return windows


### Training


In [4]:
from datasets import load_dataset
from datasets import Audio
from datasets import DatasetDict, Dataset
from transformers import AutoFeatureExtractor
import evaluate
import numpy as np
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score,confusion_matrix,precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt
import librosa
from collections import defaultdict
#read data
dataset = load_dataset("audiofolder", data_dir="/content/drive/MyDrive/deceptive-16khz",split='train')
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
print(dataset)
# apply windowing to the dataset
windowed_dataset = defaultdict(list)
for example in dataset:
    audio_array = example["audio"]["array"]
    windows = window_audio(audio_array)

    for window in windows:
        windowed_dataset["audio"].append({"array": window})
        windowed_dataset["label"].append(example["label"])

dataset = Dataset.from_dict(windowed_dataset)
print(dataset)
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000*10, truncation=True)
    return inputs

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)

    # Calculate precision, recall, and f1 score
    precision = precision_score(y_true=eval_pred.label_ids, y_pred=predictions, average='weighted')
    recall = recall_score(y_true=eval_pred.label_ids, y_pred=predictions, average='weighted')
    f1 = f1_score(y_true=eval_pred.label_ids, y_pred=predictions, average='weighted')

    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1}


num_labels = 2
kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_f1_scores = []
all_confusion_matrices = []
for fold, (train_idx, test_idx) in enumerate(kf.split(dataset)):
    print(f"\n----- Fold {fold + 1} -----")

    # Create datasets for this fold
    train_dataset = dataset.select(train_idx)
    test_dataset = dataset.select(test_idx)

    # Preprocess the datasets
    encoded_train_dataset = train_dataset.map(preprocess_function, remove_columns="audio", batched=True)
    encoded_test_dataset = test_dataset.map(preprocess_function, remove_columns="audio", batched=True)

    # Model initialization
    num_labels = 2
    model = AutoModelForAudioClassification.from_pretrained(
        "facebook/hubert-base-ls960", num_labels=num_labels
    )



    training_args = TrainingArguments(
        output_dir= "hubert_deception-1",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        per_device_eval_batch_size=8,
        num_train_epochs=10,
        warmup_ratio=0.1,
        logging_steps=10,
        load_best_model_at_end=True,
        push_to_hub=False,
      )

    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

    trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=encoded_train_dataset,
      eval_dataset=encoded_test_dataset,
      tokenizer=feature_extractor,
      compute_metrics=compute_metrics,
  )

    trainer.train()

    eval_results = trainer.evaluate()

    # Print F1 score for this fold
    print(f"Fold {fold + 1} - F1 Score: {eval_results['eval_f1_score']}")
        # Append F1 score and confusion matrix to lists
    all_f1_scores.append(eval_results['eval_f1_score'])

# Calculate mean F1 score
mean_f1_score = np.mean(all_f1_scores)
print(f"\nMean F1 Score across all folds: {mean_f1_score}")



Resolving data files:   0%|          | 0/117 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/116 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['audio', 'label'],
    num_rows: 115
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 333
})


preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]


----- Fold 1 -----


Map:   0%|          | 0/266 [00:00<?, ? examples/s]

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.688217,0.753957,0.522388,0.374047
2,0.674000,0.578941,0.756756,0.746269,0.743058
4,0.512800,0.406557,0.821885,0.820896,0.820656
6,0.425300,0.388056,0.855568,0.850746,0.850078
8,0.338700,0.308398,0.880597,0.880597,0.880597
9,0.320100,0.306169,0.880597,0.880597,0.880597


Fold 1 - F1 Score: 0.8805970149253731

----- Fold 2 -----


Map:   0%|          | 0/266 [00:00<?, ? examples/s]

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.694571,0.228113,0.477612,0.308759
2,0.674400,0.612674,0.795309,0.641791,0.597192
4,0.506700,0.465042,0.856577,0.850746,0.850613
6,0.398100,0.450693,0.863423,0.850746,0.850147
8,0.334600,0.448055,0.863423,0.850746,0.850147
9,0.306700,0.443194,0.863423,0.850746,0.850147


  _warn_prf(average, modifier, msg_start, len(result))


Fold 2 - F1 Score: 0.8501465884861408

----- Fold 3 -----


Map:   0%|          | 0/266 [00:00<?, ? examples/s]

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.68668,0.751767,0.671642,0.624484
2,0.672300,0.587333,0.670166,0.671642,0.670301
4,0.540200,0.443482,0.858626,0.835821,0.830236
6,0.398000,0.409153,0.880125,0.865672,0.862605
8,0.353700,0.407595,0.880125,0.865672,0.862605
9,0.341600,0.405789,0.880125,0.865672,0.862605


Fold 3 - F1 Score: 0.8626052129290654

----- Fold 4 -----


Map:   0%|          | 0/267 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.687341,0.761364,0.575758,0.449545
2,0.672400,0.602134,0.681596,0.681818,0.677686
4,0.517400,0.420517,0.867769,0.848485,0.844406
6,0.399200,0.400734,0.881423,0.848485,0.842572
8,0.351400,0.379815,0.900826,0.878788,0.875524
9,0.298400,0.375034,0.900826,0.878788,0.875524


Fold 4 - F1 Score: 0.8755244755244754

----- Fold 5 -----


Map:   0%|          | 0/267 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.684645,0.758675,0.727273,0.700296
2,0.675600,0.602439,0.706577,0.69697,0.699229
4,0.531600,0.410779,0.832692,0.833333,0.832816
6,0.421700,0.34393,0.878788,0.878788,0.878788
8,0.394900,0.315921,0.878788,0.878788,0.878788
9,0.352900,0.312037,0.878788,0.878788,0.878788


Fold 5 - F1 Score: 0.8787878787878788

Mean F1 Score across all folds: 0.8695322341305867


In [5]:
from datasets import load_dataset, Audio

dataset = load_dataset("audiofolder", data_dir="/content/drive/MyDrive/deceptive-16khz",split = "train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

Resolving data files:   0%|          | 0/117 [00:00<?, ?it/s]

In [7]:
from transformers import pipeline

classifier = pipeline("audio-classification", model="/content/hubert_deception-1/checkpoint-80")
for file in dataset :
  audio = file["audio"]["path"]
  print(classifier(audio))

[{'score': 0.6854604482650757, 'label': 'LABEL_1'}, {'score': 0.3145395517349243, 'label': 'LABEL_0'}]
[{'score': 0.9022054672241211, 'label': 'LABEL_1'}, {'score': 0.09779457747936249, 'label': 'LABEL_0'}]
[{'score': 0.7441938519477844, 'label': 'LABEL_0'}, {'score': 0.25580617785453796, 'label': 'LABEL_1'}]
[{'score': 0.9032216668128967, 'label': 'LABEL_1'}, {'score': 0.09677833318710327, 'label': 'LABEL_0'}]
[{'score': 0.8836219310760498, 'label': 'LABEL_1'}, {'score': 0.11637815088033676, 'label': 'LABEL_0'}]
[{'score': 0.8940549492835999, 'label': 'LABEL_1'}, {'score': 0.10594508796930313, 'label': 'LABEL_0'}]
[{'score': 0.8764867782592773, 'label': 'LABEL_0'}, {'score': 0.12351320683956146, 'label': 'LABEL_1'}]
[{'score': 0.8802664279937744, 'label': 'LABEL_0'}, {'score': 0.11973357945680618, 'label': 'LABEL_1'}]
[{'score': 0.8819144368171692, 'label': 'LABEL_0'}, {'score': 0.11808554828166962, 'label': 'LABEL_1'}]
[{'score': 0.8738248944282532, 'label': 'LABEL_0'}, {'score': 0.1