# Dataset Prep

In [1]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
import torchaudio
from transformers import Wav2Vec2FeatureExtractor
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

# Step 1: Set up paths and initialize processor
data_dir = '/home/cendekiaa/code/Lab/AI701/LMGC/data/train'  # Update this to your actual training data directory
processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v1-330M", trust_remote_code=True)

# Step 2: Create a function to load audio data information
def load_audio_data_info(data_dir):
    audio_files = []
    genre_mapping = {}
    
    for genre in os.listdir(data_dir):
        if genre not in genre_mapping:
            genre_mapping[genre] = len(genre_mapping)
        
        genre_dir = os.path.join(data_dir, genre)
        for audio_file in os.listdir(genre_dir):
            if audio_file.endswith('.wav'):
                audio_files.append({
                    'file': os.path.join(genre_dir, audio_file),
                    'genre': genre,
                    'label': genre_mapping[genre]
                })
    
    return audio_files, genre_mapping

# Step 3: Load audio data info
audio_files, genre_mapping = load_audio_data_info(data_dir)

# Print some information about the dataset
print(f"Total number of audio files: {len(audio_files)}")
print("Genre mapping:", genre_mapping)
print("\nSample entries:")
for entry in audio_files[:5]:
    print(entry)

# Step 4: Create a DataFrame for easy manipulation
df = pd.DataFrame(audio_files)
print("\nDataset summary:")
print(df['genre'].value_counts())

# Step 5: Function to preprocess audio with error handling
def preprocess_audio(audio_file, processor):
    try:
        waveform, sample_rate = torchaudio.load(audio_file)
        
        # Resample if necessary
        if sample_rate != processor.sampling_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, processor.sampling_rate)
            waveform = resampler(waveform)
        
        # Convert to mono if necessary
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        
        # Process audio
        input_values = processor(waveform.squeeze().numpy(), sampling_rate=processor.sampling_rate, return_tensors="pt").input_values
        
        return input_values.squeeze()
    except Exception as e:
        print(f"Error processing {audio_file}: {str(e)}")
        return None

# Step 6: Process all files and identify problematic ones
print("\nProcessing all files:")
problematic_files = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    audio_tensor = preprocess_audio(row['file'], processor)
    if audio_tensor is None:
        problematic_files.append(row['file'])

if problematic_files:
    print(f"\nFound {len(problematic_files)} problematic files:")
    for file in problematic_files:
        print(file)
    df = df[~df['file'].isin(problematic_files)]
    print(f"\nRemoved problematic files. New dataset size: {len(df)}")
else:
    print("\nAll files processed successfully.")

# Step 7: Create a custom dataset class with error handling
class AudioDataset(Dataset):
    def __init__(self, dataframe, processor):
        self.dataframe = dataframe
        self.processor = processor

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        audio_tensor = preprocess_audio(row['file'], self.processor)
        if audio_tensor is None:
            # Return a zero tensor of the expected shape if processing fails
            audio_tensor = torch.zeros((1, self.processor.sampling_rate))  # 1 second of silence
        return {
            'input_values': audio_tensor,
            'label': torch.tensor(row['label']),
            'genre': row['genre']
        }

# Step 8: Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

print("\nTrain set size:", len(train_df))
print("Validation set size:", len(val_df))

# Step 9: Create datasets
train_dataset = AudioDataset(train_df, processor)
val_dataset = AudioDataset(val_df, processor)

print("\nSample from train dataset:")
print(train_dataset[0])

print("\nDataset preparation complete. You can now proceed with model training.")

# Optional: Save the DataFrame for future use
df.to_csv('audio_dataset_info.csv', index=False)
print("\nDataset information saved to 'audio_dataset_info.csv'")

Total number of audio files: 12
Genre mapping: {'Keroncong': 0, 'Dangdut': 1}

Sample entries:
{'file': '/home/cendekiaa/code/Lab/AI701/LMGC/data/train/Keroncong/Keroncong_1_sample_3.wav', 'genre': 'Keroncong', 'label': 0}
{'file': '/home/cendekiaa/code/Lab/AI701/LMGC/data/train/Keroncong/Keroncong_1_sample_2.wav', 'genre': 'Keroncong', 'label': 0}
{'file': '/home/cendekiaa/code/Lab/AI701/LMGC/data/train/Keroncong/Keroncong_2_sample_2.wav', 'genre': 'Keroncong', 'label': 0}
{'file': '/home/cendekiaa/code/Lab/AI701/LMGC/data/train/Keroncong/Keroncong_1_sample_1.wav', 'genre': 'Keroncong', 'label': 0}
{'file': '/home/cendekiaa/code/Lab/AI701/LMGC/data/train/Keroncong/Keroncong_2_sample_3.wav', 'genre': 'Keroncong', 'label': 0}

Dataset summary:
Keroncong    6
Dangdut      6
Name: genre, dtype: int64

Processing all files:


  0%|          | 0/12 [00:00<?, ?it/s]


All files processed successfully.

Train set size: 9
Validation set size: 3

Sample from train dataset:
{'input_values': tensor([0.0436, 0.0688, 0.0688,  ..., 0.3783, 0.3756, 0.4072]), 'label': tensor(0), 'genre': 'Keroncong'}

Dataset preparation complete. You can now proceed with model training.

Dataset information saved to 'audio_dataset_info.csv'


# Start FineTune

In [3]:
import torch
from torch import nn
from transformers import AutoModel, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Step 1: Define the updated model architecture
class GenreClassifier(nn.Module):
    def __init__(self, pretrained_model, num_genres):
        super().__init__()
        self.mert = pretrained_model
        self.classifier = nn.Linear(1024, num_genres)  # MERT's hidden size is 1024
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_values, labels=None):
        outputs = self.mert(input_values)
        pooled_output = torch.mean(outputs.last_hidden_state, dim=1)
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        
        return {'loss': loss, 'logits': logits} if loss is not None else logits

# Step 2: Load the pretrained MERT model
pretrained_model = AutoModel.from_pretrained("m-a-p/MERT-v1-330M", trust_remote_code=True)

# Step 3: Initialize the genre classifier
num_genres = len(genre_mapping)  # Assuming genre_mapping is defined from the dataset preparation
model = GenreClassifier(pretrained_model, num_genres)

# Step 4: Define the compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Step 5: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,  # Adjust as needed
    per_device_train_batch_size=1,  # Adjust based on your GPU memory
    per_device_eval_batch_size=1,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    logging_steps=1,  # Log every step
    logging_strategy="steps",  # Log based on steps instead of epoch
    evaluation_strategy="epoch",  # Still evaluate every epoch
    learning_rate=2e-5,  # Adjust as needed
    warmup_steps=500,
    weight_decay=0.01,
)

# Step 6: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Step 7: Train the model
print("Starting training...")
trainer.train()

# Step 8: Evaluate the model
print("Evaluating the model...")
eval_results = trainer.evaluate()
print(eval_results)

# Step 9: Save the model
print("Saving the model...")
trainer.save_model("./finetuned_mert_genre_classifier")

print("Finetuning complete!")



Some weights of the model checkpoint at m-a-p/MERT-v1-330M were not used when initializing MERTModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing MERTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MERTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MERTModel were not initialized from the model checkpoint at m-a-p/MERT-v1-330M and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7366,0.717193,0.333333,0.166667,0.111111,0.333333
2,0.6903,0.71452,0.333333,0.166667,0.111111,0.333333
3,0.6503,0.711225,0.333333,0.166667,0.111111,0.333333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating the model...


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.7171931266784668, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.16666666666666666, 'eval_precision': 0.1111111111111111, 'eval_recall': 0.3333333333333333, 'eval_runtime': 53.6458, 'eval_samples_per_second': 0.056, 'eval_steps_per_second': 0.056, 'epoch': 3.0}
Saving the model...
Finetuning complete!

Testing the model on a few samples:


RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

In [6]:
print("\nTesting the model on a few samples:")
model.eval()
with torch.no_grad():
    for i in range(min(5, len(val_dataset))):
        sample = val_dataset[i]
        input_values = sample['input_values'].unsqueeze(0).to(device)  # Move to the same device as the model
        outputs = model(input_values)
        
        # Handle different output formats
        if isinstance(outputs, dict):
            logits = outputs['logits']
        else:
            logits = outputs
        
        # Ensure logits is a 2D tensor
        if logits.dim() == 1:
            logits = logits.unsqueeze(0)
        
        predicted_label = logits.cpu().argmax(dim=-1).item()
        predicted_genre = [k for k, v in genre_mapping.items() if v == predicted_label][0]
        actual_genre = sample['genre']
        print(f"Sample {i+1}: Predicted: {predicted_genre}, Actual: {actual_genre}")


Testing the model on a few samples:
Sample 1: Predicted: Dangdut, Actual: Dangdut
Sample 2: Predicted: Dangdut, Actual: Keroncong
Sample 3: Predicted: Dangdut, Actual: Keroncong
