# Klasifikasi Surah Al-Qur'an - Versi `torchaudio` / FFmpeg

### Versi Lanjutan - Dengan Deeper Fine-Tuning & Reciter-Wise Split

Notebook ini menggunakan `torchaudio` untuk memuat audio, yang memerlukan instalasi FFmpeg yang dapat diakses oleh Python.

**Fitur Kunci**:
- **Audio Backend**: Menggunakan `torchaudio`.
- **Deeper Fine-Tuning**: Melatih dua layer teratas dari Wav2Vec 2.0.
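- **Reciter-Wise Split**: Data dibagi berdasarkan qari (pembaca), bukan berdasarkan file, sehingga qari pada data latih, validasi, dan uji tidak saling tumpang tindih dan evaluasi mengukur generalisasi ke qari yang belum pernah didengar model.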

--- 
## 1. Import Pustaka & Pengaturan Awal

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, DataLoader
from torch.amp import GradScaler, autocast
from transformers import Wav2Vec2Model
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from tqdm.notebook import tqdm
import warnings

warnings.filterwarnings('ignore', category=UserWarning)

--- 
## 2. Memuat Dataset (Dengan Reciter-Wise Split)

In [None]:
# Root folder of the preprocessed audio; every immediate sub-directory is
# treated as one reciter (qari).
DATASET_ROOT = r"audio_data_processed"

# Reciter folder names, sorted so that the splits below are reproducible for a
# fixed random_state.
all_qaris = sorted(
    entry
    for entry in os.listdir(DATASET_ROOT)
    if os.path.isdir(os.path.join(DATASET_ROOT, entry))
)
print(f"Found {len(all_qaris)} unique reciters.")

# Split by reciter, not by file: hold out 15% of the reciters for testing,
# then take 15/85 of the remainder for validation (about 15% of all reciters).
train_qaris, test_qaris = train_test_split(
    all_qaris,
    random_state=42,
    test_size=0.15,
)
train_qaris, val_qaris = train_test_split(
    train_qaris,
    random_state=42,
    test_size=(15 / 85),
)

print(f"Training reciters ({len(train_qaris)}): {train_qaris}")
print(f"Validation reciters ({len(val_qaris)}): {val_qaris}")
print(f"Test reciters ({len(test_qaris)}): {test_qaris}")

def get_files_for_qaris(qari_list, root=None):
    """Collect MP3 file paths and their labels for the given reciter folders.

    Args:
        qari_list: Iterable of reciter folder names located under ``root``.
        root: Directory that contains the reciter folders. When omitted, the
            module-level ``DATASET_ROOT`` is used.

    Returns:
        A ``(file_paths, labels)`` tuple of two parallel lists. Each label is
        the first three characters of the file name (presumably the
        zero-padded surah number -- confirm against the dataset layout).
    """
    if root is None:
        root = DATASET_ROOT

    file_paths = []
    labels = []
    for qari_folder in qari_list:
        qari_path = os.path.join(root, qari_folder)
        # os.listdir() yields entries in arbitrary, platform-dependent order;
        # sorting keeps the sample order (and everything derived from it)
        # reproducible across runs and machines.
        for filename in sorted(os.listdir(qari_path)):
            if filename.endswith(".mp3") and len(filename) >= 6:
                file_paths.append(os.path.join(qari_path, filename))
                labels.append(filename[:3])
    return file_paths, labels

# Resolve every reciter split into parallel lists of file paths and raw labels.
X_train_paths, y_train_labels = get_files_for_qaris(train_qaris)
X_val_paths, y_val_labels = get_files_for_qaris(val_qaris)
X_test_paths, y_test_labels = get_files_for_qaris(test_qaris)

# Fit a single encoder on the labels of all three splits so that each split
# maps a given label string to the same integer id.
all_labels = [*y_train_labels, *y_val_labels, *y_test_labels]
label_encoder = LabelEncoder().fit(all_labels)
class_names = label_encoder.classes_

y_train, y_val, y_test = (
    label_encoder.transform(split_labels)
    for split_labels in (y_train_labels, y_val_labels, y_test_labels)
)

print(f"\nUkuran data latih: {len(X_train_paths)} file")
print(f"Ukuran data validasi: {len(X_val_paths)} file")
print(f"Ukuran data uji: {len(X_test_paths)} file")

--- 
## 3. Preprocessing & Dataset Loader (with `torchaudio`)

In [None]:
SAMPLE_RATE = 16000  # target sample rate in Hz
DURATION = 5  # clip length in seconds


class AudioDataset(Dataset):
    """Dataset that loads audio files as fixed-length mono waveforms.

    Every item is a ``(waveform, label)`` pair where ``waveform`` is a 1-D
    tensor of exactly ``num_samples`` values: the file is loaded with
    ``torchaudio``, averaged over its channels, resampled to ``target_sr`` and
    then truncated or zero-padded at the end.

    Args:
        paths: Sequence of audio file paths.
        labels: Sequence of labels, parallel to ``paths``.
        target_sr: Sample rate (Hz) every waveform is converted to.
        duration_s: Clip length in seconds; may be fractional.
    """

    def __init__(self, paths, labels, target_sr=SAMPLE_RATE, duration_s=DURATION):
        self.paths = paths
        self.labels = labels
        self.target_sr = target_sr
        # Must be an int: it is used as a tensor size and slice bound, which
        # would fail for a float produced by a fractional duration.
        self.num_samples = int(round(self.target_sr * duration_s))
        # One Resample transform per source rate, so its kernel is not
        # rebuilt for every single item.
        self._resamplers = {}

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        """Return ``(waveform, label)`` for item ``idx``.

        If the file cannot be loaded, the error is printed and an all-zero
        waveform is returned so that one bad file does not abort an epoch.
        """
        path = self.paths[idx]
        label = self.labels[idx]
        try:
            waveform, sr = torchaudio.load(path)
        except Exception as e:
            print(f"\nError saat memuat file {path}: {e}")
            return torch.zeros(self.num_samples), label

        return self._prepare(waveform, sr), label

    def _get_resampler(self, source_sr):
        """Return a cached ``Resample`` transform from ``source_sr`` to ``target_sr``."""
        resampler = self._resamplers.get(source_sr)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(source_sr, self.target_sr)
            self._resamplers[source_sr] = resampler
        return resampler

    def _prepare(self, waveform, sr):
        """Convert a loaded ``(channels, frames)`` waveform to a 1-D clip of ``num_samples``."""
        # Down-mix multi-channel audio to mono.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        if sr != self.target_sr:
            waveform = self._get_resampler(sr)(waveform)

        # Keep the beginning of long clips; right-pad short ones with zeros.
        num_frames = waveform.shape[1]
        if num_frames > self.num_samples:
            waveform = waveform[:, :self.num_samples]
        elif num_frames < self.num_samples:
            pad_size = self.num_samples - num_frames
            waveform = torch.nn.functional.pad(waveform, (0, pad_size))

        return waveform.squeeze(0)

--- 
## 4. Model & Arsitektur

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Model akan berjalan di: {device.type}")

# Load the pretrained Wav2Vec 2.0 base checkpoint, then move it to the device.
wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
wav2vec_model = wav2vec_model.to(device)

class ContrastiveModel(nn.Module):
    """Backbone plus projection head that maps audio to a 128-d embedding.

    Args:
        base_model: Backbone module whose forward output exposes
            ``last_hidden_state``. The projection input width is read from
            ``base_model.config.hidden_size`` when available and falls back to
            768 otherwise.
    """

    def __init__(self, base_model):
        super().__init__()
        self.wav2vec = base_model
        # Derive the feature width from the backbone instead of hard-coding it,
        # so backbones with a different hidden size work as well.
        config = getattr(base_model, "config", None)
        hidden_size = getattr(config, "hidden_size", 768)
        self.projection = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Linear(256, 128)
        )

    def forward(self, x):
        """Return one 128-d embedding per input in the batch.

        Assumes ``x`` is a batch of raw waveforms accepted by the backbone
        (presumably ``(batch, samples)`` -- confirm against the data loader).
        """
        outputs = self.wav2vec(x).last_hidden_state
        # Average over dim 1 (the frame axis) to get one vector per input.
        pooled_output = torch.mean(outputs, dim=1)
        embedding = self.projection(pooled_output)
        return embedding

# Wrap the pretrained Wav2Vec 2.0 backbone with the projection head and place
# the combined network on the selected device.
contrastive_net = ContrastiveModel(wav2vec_model).to(device)

--- 
## 5. Pembelajaran Representasi dengan Deeper Fine-Tuning

In [None]:
class TripletDataset(Dataset):
    """Dataset yielding ``(anchor, positive, negative)`` waveform triplets.

    The positive is a randomly chosen other sample with the anchor's label
    (the anchor itself when its label has only one sample); the negative is a
    random sample from a randomly chosen different label.

    Args:
        paths: Sequence of audio file paths.
        labels: Sequence of labels, parallel to ``paths``.
        dataset_class: Callable ``dataset_class(paths, labels)`` returning an
            indexable dataset whose items are ``(waveform, label)`` pairs.
    """

    def __init__(self, paths, labels, dataset_class):
        self.paths = paths
        self.labels = np.array(labels)
        self.dataset = dataset_class(self.paths, self.labels)
        self.labels_set = set(self.labels)
        self.label_to_indices = {label: np.where(self.labels == label)[0]
                                 for label in self.labels_set}
        # Candidate negative labels per anchor label, computed once instead of
        # rebuilding a set difference and a list for every item.
        self._negative_labels = {
            label: [other for other in self.labels_set if other != label]
            for label in self.labels_set
        }

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        """Return the ``(anchor, positive, negative)`` waveforms for ``index``.

        Raises:
            ValueError: If the dataset has fewer than two distinct labels, so
                no negative example can be drawn.
        """
        anchor_label = self.labels[index]
        anchor_waveform, _ = self.dataset[index]

        # Draw a different sample of the same class when one exists.
        same_class_indices = self.label_to_indices[anchor_label]
        positive_index = index
        if len(same_class_indices) > 1:
            while positive_index == index:
                positive_index = np.random.choice(same_class_indices)

        positive_waveform, _ = self.dataset[positive_index]

        candidates = self._negative_labels[anchor_label]
        if not candidates:
            raise ValueError(
                "TripletDataset needs at least two distinct labels to sample "
                "a negative example."
            )
        negative_label = np.random.choice(candidates)
        negative_index = np.random.choice(self.label_to_indices[negative_label])
        negative_waveform, _ = self.dataset[negative_index]

        return anchor_waveform, positive_waveform, negative_waveform

In [None]:
# Hyperparameters for the contrastive (triplet) fine-tuning stage.
BATCH_SIZE = 32  # items per DataLoader batch
NUM_WORKERS = 2  # DataLoader worker processes
PROJECTION_HEAD_LR = 1e-4  # learning rate for the projection head
WAV2VEC_LR = 5e-6  # smaller learning rate for the unfrozen Wav2Vec 2.0 layers
NUM_EPOCHS_CONTRASTIVE = 100  # number of passes over the triplet dataset

# Echo the configuration so it is recorded in the notebook output.
print("Hyperparameters:")
print(f"BATCH_SIZE: {BATCH_SIZE}")
print(f"NUM_WORKERS: {NUM_WORKERS}")
print(f"PROJECTION_HEAD_LR: {PROJECTION_HEAD_LR}")
print(f"WAV2VEC_LR: {WAV2VEC_LR}")
print(f"NUM_EPOCHS: {NUM_EPOCHS_CONTRASTIVE}")

# --- Training Setup ---
triplet_train_dataset = TripletDataset(X_train_paths, y_train, AudioDataset)
train_loader = DataLoader(
    triplet_train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    # Pinned host memory only helps host-to-GPU copies, so skip it on CPU.
    pin_memory=(device.type == "cuda"),
)

# Freeze the whole backbone, then unfreeze only its top TWO encoder layers.
for param in contrastive_net.wav2vec.parameters():
    param.requires_grad = False
for layer in contrastive_net.wav2vec.encoder.layers[-2:]:
    for param in layer.parameters():
        param.requires_grad = True
print("\nDua layer teratas Wav2Vec 2.0 telah di-unfreeze untuk fine-tuning.")

# Separate parameter groups so the projection head and the unfrozen backbone
# layers can use different learning rates.
optimizer = optim.Adam([
    {'params': contrastive_net.projection.parameters(), 'lr': PROJECTION_HEAD_LR},
    {'params': contrastive_net.wav2vec.encoder.layers[-2:].parameters(), 'lr': WAV2VEC_LR}
])

loss_fn = nn.TripletMarginLoss(margin=1.0, p=2)
# Loss scaling is only needed for CUDA mixed precision; a disabled scaler makes
# scale()/step()/update() plain pass-throughs when running on the CPU fallback.
scaler = GradScaler(enabled=(device.type == "cuda"))

print("\nStarting Deeper Fine-Tuning...")

# Use mixed precision only when actually running on CUDA; the selected device
# may be the CPU, where CUDA autocast does not apply.
use_amp = device.type == "cuda"

for epoch in range(NUM_EPOCHS_CONTRASTIVE):
    contrastive_net.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS_CONTRASTIVE} | Loss: N/A")

    for anchor, positive, negative in progress_bar:
        anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)
        optimizer.zero_grad(set_to_none=True)

        with autocast(device_type=device.type, enabled=use_amp):
            anchor_embed = contrastive_net(anchor)
            positive_embed = contrastive_net(positive)
            negative_embed = contrastive_net(negative)
            loss = loss_fn(anchor_embed, positive_embed, negative_embed)

        # Scale the loss before backward, then step and update through the
        # scaler so it can manage the scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        progress_bar.set_description(f"Epoch {epoch+1}/{NUM_EPOCHS_CONTRASTIVE} | Loss: {loss.item():.4f}")

    # Guard against an empty loader so the epoch summary cannot divide by zero.
    num_batches = len(train_loader)
    avg_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"    -> Epoch {epoch+1} Selesai, Rata-rata Contrastive Loss: {avg_loss:.4f}")

--- 
## 6. Ekstraksi Embedding & Klasifikasi

In [None]:
def extract_embeddings(model, paths, labels, dataset_class):
    """Run ``model`` over the given files and return their embeddings.

    Args:
        model: Network mapping a batch of waveforms to embeddings; it is put
            into eval mode and left there.
        paths: Sequence of audio file paths.
        labels: Sequence of labels, parallel to ``paths`` (only forwarded to
            ``dataset_class``; not used for the embeddings).
        dataset_class: Callable ``dataset_class(paths, labels)`` returning a
            dataset of ``(waveform, label)`` items.

    Returns:
        A float32 NumPy array with one row per file, in the order of ``paths``.

    Raises:
        ValueError: If ``paths`` is empty (``np.vstack`` has nothing to stack).
    """
    model.eval()
    embeddings = []
    dataset = dataset_class(paths, labels)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
    # Mixed precision only applies on CUDA; the device may be the CPU fallback.
    use_amp = device.type == "cuda"

    with torch.no_grad():
        for waveforms, _ in tqdm(loader, desc="Mengekstrak Embedding"):
            waveforms = waveforms.to(device)
            with autocast(device_type=device.type, enabled=use_amp):
                embeds = model(waveforms)
            # Autocast may produce half-precision outputs; cast back so the
            # result has the same dtype on GPU and CPU.
            embeddings.append(embeds.float().cpu().numpy())

    return np.vstack(embeddings)

print("Mengekstrak embedding final dari data latih dan uji...")
# Embed the training split first, then the test split, with the fine-tuned net.
X_train_embed, X_test_embed = (
    extract_embeddings(contrastive_net, split_paths, split_targets, AudioDataset)
    for split_paths, split_targets in (
        (X_train_paths, y_train),
        (X_test_paths, y_test),
    )
)
print("Ekstraksi embedding selesai.")

### 6.1. Klasifikasi dengan k-NN (Baseline)

In [None]:
print("Melatih Classifier k-NN (Baseline)...")
# 5-nearest-neighbour vote using cosine distance on the training embeddings.
knn_classifier = KNeighborsClassifier(
    metric='cosine',
    n_neighbors=5,
).fit(X_train_embed, y_train)
y_pred_knn = knn_classifier.predict(X_test_embed)
print("Pelatihan k-NN selesai.")

### 6.2. Klasifikasi dengan MLP (Pembanding)

In [None]:
print("Melatih Classifier MLP (Pembanding)...")
# Two hidden layers (256 and 128 units); training stops early after 10
# iterations without improvement, capped at 500 iterations.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    early_stopping=True,
    n_iter_no_change=10,
    max_iter=500,
    random_state=42,
).fit(X_train_embed, y_train)
y_pred_mlp = mlp_classifier.predict(X_test_embed)
print("Pelatihan MLP selesai.")

--- 
## 7. Evaluasi Kinerja Model

In [None]:
print("=" * 50, "HASIL EVALUASI MODEL", "=" * 50, sep="\n")

# --- k-Nearest Neighbors (k-NN) performance ---
print("\n--- Kinerja k-Nearest Neighbors (k-NN) ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_knn):.4f}")
print("\nLaporan Klasifikasi:")
# Report only the classes that occur in the ground truth or the predictions,
# so labels and target names stay aligned.
knn_report_labels = np.union1d(y_test, y_pred_knn)
knn_report_names = label_encoder.inverse_transform(knn_report_labels)
print(
    classification_report(
        y_test,
        y_pred_knn,
        labels=knn_report_labels,
        target_names=knn_report_names,
        zero_division=0,
    )
)

# --- Multilayer Perceptron (MLP) performance ---
print("\n--- Kinerja Multilayer Perceptron (MLP) ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_mlp):.4f}")
print("\nLaporan Klasifikasi:")
mlp_report_labels = np.union1d(y_test, y_pred_mlp)
mlp_report_names = label_encoder.inverse_transform(mlp_report_labels)
print(
    classification_report(
        y_test,
        y_pred_mlp,
        labels=mlp_report_labels,
        target_names=mlp_report_names,
        zero_division=0,
    )
)