In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2ForSequenceClassification
from datasets import load_dataset
import soundfile as sf
import torch
import torch

# load model and tokenizer
processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h")
model = Wav2Vec2ForSequenceClassification.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", num_labels=3)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
model = model.to(device)



Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-base-vietnamese-250h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


## Load dataset

In [45]:
import torchaudio
import os
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import librosa
import numpy as np
# Define labels based on file names and folders


LABELS = {
    "Type A": {
        "1.wav": 1,
        "2.wav": 2
    },
    "Type B": {
        "1.wav": 1,
        "2.wav": 2
    }
}

class CustomDataset(Dataset):
    def __init__(self, folder_path, transform=None, target_length=16000):
        self.folder_path = folder_path
        self.transform = transform
        self.target_length = target_length
        self.file_list = []

        # Traverse the directory and collect all file paths
        for root, _, files in os.walk(folder_path):
            for file in files:
                if file.endswith(".wav"):
                    self.file_list.append(os.path.join(root, file))

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):

        # Get the file path
        file_path = self.file_list[idx]

        # Load the audio file
        waveform, sample_rate = torchaudio.load(file_path)
        if sample_rate != 16000:
            waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)


        # Optionally apply a transformation
        if self.transform:
            waveform = self.transform(waveform)

        # Determine the length of the waveform
        current_length = waveform.size(1)


        if current_length < self.target_length:
            # Padding
            padding = self.target_length - current_length
            waveform = torch.nn.functional.pad(waveform, (0, padding))
        elif current_length > self.target_length:
            # Truncate
            waveform = waveform[:, :self.target_length]
        # Extract folder name and file name
        folder_name = os.path.basename(os.path.dirname(file_path))
        file_name = os.path.basename(file_path)
        # Determine label based on folder and file names

        if folder_name in LABELS:

            label = LABELS[folder_name].get(file_name, 0)  # Use "Other" if file_name not in labels[folder_name]
        else:
            label = 0  # Use "Other" for folders not in labels


        input_values = processor(waveform.squeeze(0).tolist(), return_tensors="pt", padding="longest", sampling_rate=16000).input_values

        return input_values, torch.tensor(label)

# Example usage
dataset = CustomDataset('/content/drive/MyDrive/preprocessing', target_length=40000)

In [46]:
from torch.utils.data import Dataset, DataLoader, random_split

# Xác định kích thước của các phần
dataset_size = len(dataset)
train_size = int(0.7 * dataset_size)
val_size = int(0.15 * dataset_size)
test_size = dataset_size - train_size - val_size

# Chia dataset thành các phần
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size]
)

# Tạo DataLoader
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [None]:
for batch_idx, batch in enumerate(val_loader):
    input_values, labels  = batch
    break


In [None]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

import torch.nn.functional as F

# Giả sử model, train_loader, num_epochs đã được định nghĩa

# Prepare the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = CrossEntropyLoss()
num_epochs=100
# Example of a training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        input_values, labels  = batch

        # input_values = input_values.squeeze(1)

        # Kiểm tra kích thước dữ liệu đầu vào

        # Loại bỏ các chiều thừa
        input_values = input_values.squeeze(1)
        input_values = input_values.to(device)

        # Zero gradients, backward pass, optimize
        optimizer.zero_grad()
        outputs = model(input_values)
        # Calculate loss
        logits = outputs.logits.to(device)

        loss = loss_fn(outputs.logits, labels.to(device))
        epoch_loss += loss.item()  # Accumulate the loss for the epoch

        loss.backward()
        optimizer.step()

        # Print loss for this batch
        print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item()}")

    # Print average loss for the epoch
    avg_epoch_loss = epoch_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}] Average Loss: {avg_epoch_loss}")

    # Validation phase
    model.eval()  # Switch model to evaluation mode
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():  # No need to calculate gradients
        for batch in val_loader:
            input_values, labels = batch

            # Loại bỏ các chiều thừa
            input_values = input_values.squeeze(1)
            input_values = input_values.to(device)
            labels = labels.to(device)

            outputs = model(input_values)
            logits = outputs.logits

            # Calculate loss
            loss = loss_fn(logits, labels)
            val_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Print validation results
    avg_val_loss = val_loss / len(val_loader)
    accuracy = 100 * correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}] Validation Loss: {avg_val_loss}, Accuracy: {accuracy:.2f}%")


Epoch [1/100], Batch [1/5], Loss: 0.7217339277267456
Epoch [1/100], Batch [2/5], Loss: 0.9419032335281372
Epoch [1/100], Batch [3/5], Loss: 0.6877179145812988
Epoch [1/100], Batch [4/5], Loss: 0.7583523392677307
Epoch [1/100], Batch [5/5], Loss: 0.8224935531616211
Epoch [1/100] Average Loss: 0.7864401936531067
Epoch [1/100] Validation Loss: 0.7269726395606995, Accuracy: 74.17%
Epoch [2/100], Batch [1/5], Loss: 0.7344317436218262
Epoch [2/100], Batch [2/5], Loss: 0.8475151658058167
Epoch [2/100], Batch [3/5], Loss: 0.6934850215911865
Epoch [2/100], Batch [4/5], Loss: 1.0502469539642334
Epoch [2/100], Batch [5/5], Loss: 0.7614120841026306
Epoch [2/100] Average Loss: 0.8174181938171386
Epoch [2/100] Validation Loss: 0.7146750688552856, Accuracy: 71.67%
Epoch [3/100], Batch [1/5], Loss: 0.81313556432724
Epoch [3/100], Batch [2/5], Loss: 0.7489638328552246
Epoch [3/100], Batch [3/5], Loss: 0.653595507144928
Epoch [3/100], Batch [4/5], Loss: 0.9386232495307922
Epoch [3/100], Batch [5/5], Los

KeyboardInterrupt: 

## Save checkpoint



In [None]:
import torch

# Lưu trọng số của mô hình
torch.save(model.state_dict(), 'model_weights.pth')

# Nếu bạn muốn lưu cả cấu trúc mô hình và trọng số
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch,
    'loss': avg_epoch_loss,
}, 'checkpoint.pth')

## Load & Usage fineturned Model

In [5]:
import torch
from transformers import Wav2Vec2ForSequenceClassification
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2ForSequenceClassification
from datasets import load_dataset
import soundfile as sf
import torch


# Tạo mô hình mới với cùng cấu trúc
model = Wav2Vec2ForSequenceClassification.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", num_labels=3)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h")

# Tải trọng số vào mô hình
model.load_state_dict(torch.load('/content/drive/MyDrive/model_weights.pth'))
model.to(device)



Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-base-vietnamese-250h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [58]:
import torchaudio

def wav2input(file_path):

  waveform, sample_rate = torchaudio.load(file_path)
  if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)
  if sample_rate != 16000:
      waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

  current_length = waveform.size(1)
  padding = 40000 - current_length
  waveform = torch.nn.functional.pad(waveform, (0, padding))
  input_values = processor(waveform.squeeze(0).tolist(), return_tensors="pt", padding="longest", sampling_rate=16000).input_values
  return input_values


def logit2output(logits):
  _, predicted = torch.max(logits, 1)
  print(logits)
  if predicted == 1:
    return "Bật camera lên"
  elif predicted == 2:
    return "Đóng cửa lại"
  else:
    return "Không tồn tại mệnh lệnh"



In [61]:
input = wav2input("/content/drive/MyDrive/preprocessing/B20DCCN568_Quynh_Nam459/Type A/1.wav")
print(f"predicted: {logit2output(model(input.to(device)).logits)}")

tensor([[-1.1864,  2.0820, -1.3033]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
predicted: Bật camera lên


In [60]:
from torch.nn import CrossEntropyLoss
loss_fn = CrossEntropyLoss()
val_loss = 0
correct = 0
total = 0
with torch.no_grad():  # No need to calculate gradients
  for batch in test_loader:
      input_values, labels = batch

      # Loại bỏ các chiều thừa
      input_values = input_values.squeeze(1)
      input_values = input_values.to(device)
      labels = labels.to(device)

      outputs = model(input_values)
      logits = outputs.logits

      # Calculate loss
      loss = loss_fn(logits, labels)
      val_loss += loss.item()

      # Calculate accuracy
      _, predicted = torch.max(logits, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

  # Print validation results
  avg_val_loss = val_loss / len(val_loader)
  accuracy = 100 * correct / total
  print(f"Validation Loss: {avg_val_loss}, Accuracy: {accuracy:.2f}%")

Validation Loss: 0.19902896881103516, Accuracy: 95.83%


In [None]:
transcription = processor.batch_decode(predicted_ids)


['nừi tôi n làn', 'nừi tôi n làn']

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
