## Fine-Tuning superb/wav2vec2-base-superb-er on the ShEMO Persian Dataset


In [21]:
import os
import sys
from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification, Trainer, TrainingArguments
import torch
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
import os
import json
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import gc


# --- Project layout ---------------------------------------------------------
# ROOT_PATH is the parent of the current working directory; every other
# project folder is resolved relative to it.
CWD_PATH = os.getcwd()
ROOT_PATH = os.path.abspath(os.path.join(CWD_PATH, os.pardir))
UTILS_PATH, DATASETS_BASE_PATH, LABELS_PATH, MODELS_PATH = (
    os.path.join(ROOT_PATH, folder)
    for folder in ('utils', 'data', 'labels', 'models')
)

# Make the helper modules under utils/ importable (added only once).
if UTILS_PATH not in sys.path:
    sys.path.append(UTILS_PATH)

# Output folders are created up front so later cells can write into them.
os.makedirs(LABELS_PATH, exist_ok=True)
os.makedirs(MODELS_PATH, exist_ok=True)

# --- Model / audio settings -------------------------------------------------
MODEL_NAME = "superb/wav2vec2-base-superb-er"
SAMPLE_RATE = 16000    # sampling rate (Hz) passed to librosa and the extractor
AUDIO_MAX_LENGTH = 8   # maximum clip length in seconds

print(ROOT_PATH)
print(LABELS_PATH)


/home/dbk/fine-tuned-voice-based-semantic-analytics-for-Persian-language
/home/dbk/fine-tuned-voice-based-semantic-analytics-for-Persian-language/labels


In [5]:
# Load the pretrained feature extractor and the 4-class classification model
# from the same checkpoint.
extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=4)

# Reuse the checkpoint's id -> label table; label2id is simply its inverse.
id2label = model.config.id2label
label2id = dict(zip(id2label.values(), id2label.keys()))



In [8]:
def load_and_validate_data(basePath, representationFilePath):
    """Build a DataFrame of loadable audio files and their emotion labels.

    The JSON file at ``representationFilePath`` maps an entry name to a dict
    that carries a ``"path"`` (relative to ``basePath``) and an ``"emotion"``.
    Entries whose file does not exist are skipped silently; entries whose
    first second of audio cannot be decoded, or whose label cannot be read,
    are skipped with a message.

    Args:
        basePath: Directory that the metadata paths are relative to.
        representationFilePath: Path of the UTF-8 JSON metadata file.

    Returns:
        pandas.DataFrame with columns ``speech`` (file path) and ``label``
        (lower-cased emotion name).
    """
    with open(representationFilePath, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    paths, labels = [], []
    for details in metadata.values():
        filePath = os.path.join(basePath, details["path"])
        if not os.path.exists(filePath):
            continue
        try:
            # Resolve the label BEFORE touching the result lists so a bad entry
            # can never leave `paths` and `labels` with different lengths.
            label = details["emotion"].lower()
            # Decoding one second is enough to detect an unreadable file.
            librosa.load(filePath, sr=SAMPLE_RATE, duration=1)
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C still interrupts the scan.
            print(f"Skipping corrupted file: {filePath}")
            continue
        paths.append(filePath)
        labels.append(label)
    return pd.DataFrame({'speech': paths, 'label': labels})

def extract_features(file_path):
    """Turn one audio file into a fixed-length model input vector.

    The clip is loaded at ``SAMPLE_RATE`` (at most ``AUDIO_MAX_LENGTH``
    seconds) and passed through the feature extractor, which pads or
    truncates it to exactly ``SAMPLE_RATE * AUDIO_MAX_LENGTH`` samples.

    Args:
        file_path: Path of the audio file to process.

    Returns:
        1-D numpy array of length ``SAMPLE_RATE * AUDIO_MAX_LENGTH``. If
        anything goes wrong the error is printed and an all-zero (silent)
        float32 vector of the same length is returned instead.
    """
    target_length = SAMPLE_RATE * AUDIO_MAX_LENGTH
    try:
        audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, duration=AUDIO_MAX_LENGTH)
        inputs = extractor(
            audio,
            sampling_rate=SAMPLE_RATE,
            return_tensors="pt",
            padding="max_length",
            max_length=target_length,
            truncation=True
        )
        return inputs.input_values[0].numpy()
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        # np.zeros defaults to float64; a single float64 fallback would upcast
        # the whole stacked feature array, so the dtype is pinned to float32.
        return np.zeros(target_length, dtype=np.float32)  # Return silence if error

def precompute_and_save_features(df, save_path):
    """Extract features for every file in ``df['speech']`` and save them.

    Args:
        df: DataFrame with a ``speech`` column of audio file paths.
        save_path: Destination passed to ``np.save``; the stacked feature
            array is written there, one row per file in ``df`` order.
    """
    progress = tqdm(df['speech'], desc="Extracting features")
    stacked = np.array([extract_features(audio_path) for audio_path in progress])
    np.save(save_path, stacked)

class AudioFeaturesDataset(torch.utils.data.Dataset):
    """Torch dataset pairing precomputed feature rows with integer labels.

    Args:
        features_path: Path of a ``.npy`` file whose row ``i`` holds the
            features of sample ``i``.
        labels: Indexable sequence of integer class ids, one per sample.
            The dataset length is taken from this sequence.
    """

    def __init__(self, features_path, labels):
        self.features = np.load(features_path, mmap_mode='r')  # Memory-mapped for efficiency
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # dtypes are pinned explicitly: a feature file stored as float64 would
        # otherwise produce double tensors, and class ids must be int64.
        return {
            'input_values': torch.tensor(self.features[idx], dtype=torch.float32),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    The predicted class of each sample is the argmax over the last axis of
    the logits. Returns a dict with the single key ``"accuracy"``.
    """
    scores, references = eval_pred
    accuracy = accuracy_score(references, np.argmax(scores, axis=-1))
    return {"accuracy": accuracy}


In [None]:
# Locations of the ShEMO audio files and of the JSON file describing them.
shemo_dataset_representation_file_path = os.path.join(DATASETS_BASE_PATH, 'shemo/modified_shemo.json')
shemo_dataset_audio_files_path = os.path.join(DATASETS_BASE_PATH, 'shemo/shemo')

# Keep only entries whose audio file exists and can be decoded.
shemo_df = load_and_validate_data(
    basePath=shemo_dataset_audio_files_path,
    representationFilePath=shemo_dataset_representation_file_path,
)

In [11]:
# Dataset emotion names -> label names used by the pretrained checkpoint.
label_mapping = {
    'happiness': 'hap',
    'anger': 'ang',
    'sadness': 'sad',
    'neutral': 'neu',
}

# Drop every row whose emotion is not one of the four mapped classes, then
# rename the labels and attach the matching integer class id.
has_known_label = shemo_df['label'].isin(label_mapping.keys())
shemo_df = shemo_df[has_known_label].copy()
shemo_df['label'] = shemo_df['label'].map(label_mapping)
shemo_df['label_id'] = shemo_df['label'].map(label2id)

In [12]:
# 20% of the rows become the training set (test_size=0.8 leaves 80% in
# rest_df); the seed keeps the split reproducible.
train_df, rest_df = train_test_split(
    shemo_df,
    test_size=0.8,
    random_state=42,
)
# Evaluation subset: the first (up to) 100 held-out rows of every label.
test_df = rest_df.groupby('label').head(100)

In [23]:
# Extract and cache the model inputs for both splits, announcing each stage.
feature_jobs = (
    ("Precomputing training features...", train_df, f"{LABELS_PATH}/shemo_train_labels.npy"),
    ("Precomputing evaluation features...", test_df, f"{LABELS_PATH}/shemo_test_labels.npy"),
)
for stage_message, split_df, features_file in feature_jobs:
    print(stage_message)
    precompute_and_save_features(split_df, features_file)

Precomputing training features...


Extracting features: 100%|██████████| 553/553 [00:01<00:00, 388.12it/s]


Precomputing evaluation features...


Extracting features: 100%|██████████| 400/400 [00:01<00:00, 385.17it/s]


In [26]:
# Evaluation/checkpoint cadence must fit inside the run: the recorded feature
# extraction processed 553 training clips, i.e. ceil(553 / 4) * 3 = 417
# optimizer steps on one device. A 500-step interval would therefore never
# evaluate or save a checkpoint, leaving `load_best_model_at_end` with nothing
# to load, so both intervals are set to 100 steps (kept equal so every saved
# checkpoint has a matching evaluation).
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,
    logging_steps=100,
    report_to="none",
    dataloader_num_workers=2,
)



In [25]:
# Each split reads the feature file that was precomputed for that split, so
# feature row i always belongs to label i.
train_dataset = AudioFeaturesDataset(
    f"{LABELS_PATH}/shemo_train_labels.npy",
    train_df['label_id'].values
)

# The evaluation split must use the evaluation feature file; loading the
# training file here would pair training audio with test labels.
test_dataset = AudioFeaturesDataset(
    f"{LABELS_PATH}/shemo_test_labels.npy",
    test_df['label_id'].values
)


In [28]:

# Wire the model, the run configuration, both datasets and the accuracy
# metric into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [29]:
# Directory under MODELS_PATH where the fine-tuned model and its feature
# extractor are written and later reloaded from.
fine_tuned_with_shemo_model_path = os.path.join(
    MODELS_PATH,
    'w2v_fine_tuned_with_shemo_voice_based_semantic_analytics',
)

In [None]:
# Train, then persist the model together with its feature extractor. If
# anything fails, the current weights are written to a separate "_partial"
# directory so an interrupted run is not lost and is never mistaken for the
# finished model.
print("Starting training...")
try:
    trainer.train()
    print("Training completed successfully!")
    model.save_pretrained(fine_tuned_with_shemo_model_path)
    extractor.save_pretrained(fine_tuned_with_shemo_model_path)
    print("Model saved successfully")
except Exception as e:
    import traceback

    print(f"Training failed: {str(e)}")
    # Show where the failure happened; the message alone is rarely enough.
    traceback.print_exc()
    print("Saving current progress...")
    partial_model_path = f"{fine_tuned_with_shemo_model_path}_partial"
    model.save_pretrained(partial_model_path)
    extractor.save_pretrained(partial_model_path)
    print(f"Partial progress saved to {partial_model_path}")

Starting training...


Step,Training Loss,Validation Loss


In [None]:
# Reload the saved fine-tuned model and feature extractor from disk so the
# evaluation below runs against exactly what was saved.
extractor = AutoFeatureExtractor.from_pretrained(fine_tuned_with_shemo_model_path)
model = Wav2Vec2ForSequenceClassification.from_pretrained(fine_tuned_with_shemo_model_path)

# Label tables come from the reloaded model's config; label2id is the inverse.
id2label = model.config.id2label
label2id = dict(zip(id2label.values(), id2label.keys()))

def predict(audio_path):
    """Predict the emotion label of a single audio file.

    The clip is loaded at ``SAMPLE_RATE`` and capped at ``AUDIO_MAX_LENGTH``
    seconds (the same cap used when extracting training features), run
    through the model without gradient tracking, and the highest-scoring
    class is looked up in ``id2label``.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        The label name stored in ``id2label`` for the predicted class.
    """
    audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE, duration=AUDIO_MAX_LENGTH)
    inputs = extractor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    # Reduce over the class axis explicitly; the batch holds exactly one clip.
    return id2label[torch.argmax(logits, dim=-1).item()]

# Score the fine-tuned model on the held-out evaluation rows.
number_of_correct_predictions = 0
number_of_incorrect_predictions = 0
number_of_total_predictions = len(test_df)


for index, row in test_df.iterrows():
    prediction = predict(row['speech'])
    if prediction == row['label']:
        number_of_correct_predictions += 1
    else:
        number_of_incorrect_predictions += 1
        # Running tally, printed each time a prediction is wrong.
        print(f"number of correct predictions: {number_of_correct_predictions}" )
        print(f"number of incorrect predictions: {number_of_incorrect_predictions}")



print("Final Result")
print(f"number of correct predictions: {number_of_correct_predictions}" )
print(f"number of incorrect predictions: {number_of_incorrect_predictions}")
# True division: floor division would collapse the accuracy to 0 or 1.
# An empty evaluation set reports 0.0 instead of dividing by zero.
accuracy = (
    number_of_correct_predictions / number_of_total_predictions
    if number_of_total_predictions
    else 0.0
)
print(f"accuracy: {accuracy}")
