In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/Voice_Vertification

/content/drive/MyDrive/Voice_Vertification


In [None]:
!pip install comet_ml

In [4]:
from comet_ml import Experiment

In [None]:
!pip install torchaudio

In [None]:
!pip3 install transformers==4.9.2 soundfile datasets==1.11.0 pyctcdecode
!pip3 install https://github.com/kpu/kenlm/archive/master.zip

In [7]:
from transformers.file_utils import cached_path, hf_bucket_url
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import os, zipfile

# SET-UP

In [8]:
TEST_ROOT = 'Zalo_Voice_Verification/Train-Test-Data/dataset/272-M-26/'

In [9]:
import glob 
import random

test_files = glob.glob(os.path.join(TEST_ROOT,'*.wav'))

In [10]:
test_file = random.choice(test_files)
print(test_file)

Zalo_Voice_Verification/Train-Test-Data/dataset/272-M-26/speaker_272-12.wav


In [11]:
import torchaudio
import torchaudio.transforms as tf

test , sr = torchaudio.load(test_file)

In [12]:
test = tf.Resample(sr,16000)(test)

In [13]:
tf.MelSpectrogram(sample_rate =  16000, n_fft= 400)(test).shape


At least one mel filterbank has all zero values. The value for `n_mels` (128) may be set too high. Or, the value for `n_freqs` (201) may be set too low.



torch.Size([1, 128, 301])

In [14]:
tf.MFCC(sample_rate =16000,n_mfcc=40)(test).shape


At least one mel filterbank has all zero values. The value for `n_mels` (128) may be set too high. Or, the value for `n_freqs` (201) may be set too low.



torch.Size([1, 40, 301])

In [15]:
!apt-get install libsox-fmt-all libsox-dev sox > /dev/null
! python -m pip install torchaudio > /dev/null
! python -m pip install git+https://github.com/facebookresearch/WavAugment.git > /dev/null
!pip install ffmpeg-python > /dev/null

  Running command git clone -q https://github.com/facebookresearch/WavAugment.git /tmp/pip-req-build-nzkd1pjd


In [16]:
import augment

def augment_wav(signal,sr=16000):
    signal = torch.from_numpy(signal)
    reverb_signal = augment.EffectChain().reverb(50, 50, 50).channels(1).apply(signal, src_info={'rate': sr})
    noise_generator = lambda: torch.zeros_like(reverb_signal).uniform_()
    additive_noise_signal = augment.EffectChain().additive_noise(noise_generator, snr=random.uniform(15,25)).apply(reverb_signal, src_info={'rate': sr})
    time_dropout_signal =  augment.EffectChain().time_dropout(max_seconds=random.uniform(0.1,0.25)).apply(additive_noise_signal, src_info={'rate': sr})
    return time_dropout_signal,sr

In [17]:
cache_dir = './cache/'
processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)
lm_file = hf_bucket_url("nguyenvulebinh/wav2vec2-base-vietnamese-250h", filename='vi_lm_4grams.bin.zip')
lm_file = cached_path(lm_file,cache_dir=cache_dir)
with zipfile.ZipFile(lm_file, 'r') as zip_ref:
    zip_ref.extractall(cache_dir)
lm_file = cache_dir + 'vi_lm_4grams.bin'

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
pretrained_model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", cache_dir=cache_dir)

In [19]:
def wav2vec(signal, sr):
  input_values = processor(
        signal[0], 
        sampling_rate= sr, 
        return_tensors="pt"
  ).input_values
  logits = pretrained_model(input_values).logits[0].T
  return logits.unsqueeze(dim = 0)

In [20]:
def get_embedding(file, type = 'wav2vec', sample_rate =  16000, mode = 'val', n_fft = 400, n_mfcc = 40):
  test , sr = torchaudio.load(file)
  test = tf.Resample(sr,sample_rate)(test)
  if mode == 'train':
    test, sr = augment_wav(test.numpy())
  if type == 'melspec':
    test = tf.MelSpectrogram(sample_rate = sample_rate, n_fft= n_fft)(test)
  elif type == 'mfcc':
    test = tf.MFCC(sample_rate = sample_rate,n_mfcc = n_mfcc)(test)
  elif type == 'wav2vec':
    try:
      test = wav2vec(test, sample_rate)
    except:
      test = torch.ones((1,110,110))
  if test.shape[0] !=1:
    test = test[0].unsqueeze(dim=0)
  return test

get_embedding(test_file,type = 'wav2vec').shape

torch.Size([1, 110, 187])

In [21]:
DATA_ROOT = 'Zalo_Voice_Verification/Train-Test-Data/dataset'
subject_folders = os.listdir(DATA_ROOT)
print(len(subject_folders))
# test_subject_folder =  os.path.join(DATA_ROOT, random.choice(subject_folders)) 

400


In [22]:
paths = glob.glob(DATA_ROOT+'/*/*.wav')
file_n = len(paths)
print(file_n)

10560


In [23]:
def get_prefix(file):
  split = file.split('/')
  return '/'.join(split[:-1]),split[-2]

In [24]:
print(get_prefix(test_file))

('Zalo_Voice_Verification/Train-Test-Data/dataset/272-M-26', '272-M-26')


In [25]:
import json

def read_json(file):
  with open(file,'r') as f:
    data = json.load(f)
  return data

def write_json(file,data):
  with open(file,'w') as f:
    json.dump(data,f,indent = 4)

In [26]:
SAVE_TRAIN_PATH = 'train_data.json'
SAVE_VAL_PATH = 'test_data.json'

In [27]:
from tqdm.notebook import tqdm

def get_data(save_train_file = SAVE_TRAIN_PATH, save_test_file = SAVE_VAL_PATH, test_size = 0.2):
    train_total = {}
    test_total = {}
    test_len = int(file_n * test_size)
    train_len = file_n - test_len

    for i,subject in tqdm(enumerate(subject_folders)):
      paths = glob.glob(os.path.join(DATA_ROOT,subject)+"/*/*.wav")
      for j,path in enumerate(paths):
        if j < int(len(paths) * test_size):
          test_total[path] = i
        else:
          train_total[path] = i
      if i == 5:
        break
        
    write_json(save_train_file,train_total)
    write_json(save_test_file,test_total)

# get_data()

# Dataloader

In [28]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import numpy as np
import cv2


class SpeechDataset(Dataset):
    def __init__(self, json_file, resize = None, up_channel= False, type = 'wav2vec', mode = 'train'):
        super(SpeechDataset, self).__init__()
        self.data = list(read_json(json_file).items())
        if type == 'mfcc':
          max_sequence_len = 40
        elif type == 'melspec':
          max_sequence_len = 128
        elif type == 'wav2vec':
          max_sequence_len = 110
        self.max_sequence_len = max_sequence_len
        self.resize = resize
        self.mode = mode
        self.up_channel = up_channel
        self.type = type
        

    def preprocess(self, em):
        if self.up_channel:
          em = np.stack((em,)*3, axis=-1)
        if self.resize is not None:
          em = cv2.resize(em.T,self.resize).T
          em = np.expand_dims(em, axis =0)
        return torch.FloatTensor(em)

    def __getitem__(self, idx):
        path, label = self.data[idx]
        em = get_embedding(path, type = self.type, mode = self.mode).detach().numpy()
        em = self.pad(em)
        em = self.preprocess(em)
        return self.normalize(em), torch.LongTensor([int(label)])

    def __len__(self):
        return len(self.data)
    
    def normalize(self,em,mean=0.5,std=0.5):
        # return (em - em.min())/(em.max()-em.min())
        return em

    def pad(self,em): 
        if em.shape[2] < self.max_sequence_len:
            pad = np.zeros((em.shape[0],em.shape[1],self.max_sequence_len - em.shape[2]))
            em = np.concatenate((em, pad), axis=2)
        else:
            em = em[:,:,:self.max_sequence_len] 
        assert em.shape == (1,self.max_sequence_len,self.max_sequence_len)
        return em

In [29]:
def get_loader(bs, nw, siz):  

  train_dataset = SpeechDataset(json_file=SAVE_TRAIN_PATH, resize = siz, up_channel=False, mode='val')
  test_dataset = SpeechDataset(json_file=SAVE_VAL_PATH, resize = siz, up_channel=False, mode='val')

  train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS
  )

  test_loader = DataLoader(
          dataset=test_dataset,
          batch_size=BATCH_SIZE,
          num_workers=NUM_WORKERS
  )
  print("DONE!")
  return train_loader,test_loader

# Model

In [30]:
import torch.nn as nn  
import torch.nn.functional as F
import torch 
from torchsummary import summary


class ResNetBlock(nn.Module):
    def __init__(self, in_depth, depth, first=False):
        super(ResNetBlock, self).__init__()
        self.first = first
        self.conv1 = nn.Conv2d(in_depth, depth, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(depth)
        self.lrelu = nn.LeakyReLU(0.01)
        self.dropout = nn.Dropout(0.5)
        self.conv2 = nn.Conv2d(depth, depth, kernel_size=3, stride=3, padding=1)
        self.conv11 = nn.Conv2d(in_depth, depth, kernel_size=3, stride=3, padding=1)
        if not self.first :
            self.pre_bn = nn.BatchNorm2d(in_depth)

    def forward(self, x):
        prev = x
        prev_mp =  self.conv11(x)
        if not self.first:
            out = self.pre_bn(x)
            out = self.lrelu(out)
        else:
            out = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.lrelu(out)
        out = self.dropout(out)
        out = self.conv2(out)
        out = out + prev_mp
        return out


class ResModel(nn.Module):
    def __init__(self):
        super(ResModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.block1 = ResNetBlock(32, 32,  True)
        self.mp = nn.MaxPool2d(3, stride=3, padding=1)
        self.block2 = ResNetBlock(32, 32,  False)
        self.block3 = ResNetBlock(32, 32,  False)
        self.block4= ResNetBlock(32, 32, False)
        self.block5= ResNetBlock(32, 32, False)
        self.block6 = ResNetBlock(32, 32, False)
        self.block7 = ResNetBlock(32, 32, False)
        self.block8 = ResNetBlock(32, 32, False)
        self.block9 = ResNetBlock(32, 32, False)
        self.block10 = ResNetBlock(32, 32, False)
        self.block11 = ResNetBlock(32, 32, False)
        self.lrelu = nn.LeakyReLU(0.01)
        self.bn = nn.BatchNorm2d(32)
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(32, 128)
        self.fc2 = nn.Linear(128, 6)
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        batch_size = x.size(0)
        out = self.conv1(x)
        out = self.block1(out)
        out = self.block2(out)
        out = self.mp(out)
        out = self.block3(out)
        out = self.block4(out)
        out = self.mp(out)
        out = self.block5(out)
        out = self.block6(out)
        out = self.mp(out)
        out = self.block7(out)
        out = self.block8(out)
        out = self.mp(out)
        out = self.block9(out)
        out = self.block10(out)
        out = self.mp(out)
        out = self.block11(out)
        out = self.bn(out)
        out = self.lrelu(out)
        out = self.mp(out)
        out = out.view(batch_size, -1)
        out = self.dropout(out)
        out = self.fc1(out)
        out = self.lrelu(out)
        out = self.fc2(out)
        out = self.logsoftmax(out)
        return out

In [31]:
in1 = torch.randn((2,1,110,110))
model = ResModel()
out = model(in1)
# summary(model, (1, 110, 110),device = 'cpu')
print(out.shape)

torch.Size([2, 6])


# Experiment

In [32]:
from sklearn.metrics import accuracy_score
import warnings
warnings.simplefilter("ignore", UserWarning)

In [None]:
# Create an experiment with your api key
experiment = Experiment(
    api_key="Your api key",
    project_name="Speech Indentification V3",
    workspace="maxph2211",
)

hyper_params = {
    "re_siz": 112,
    "n_fft" : 400,
    "n_mfcc" : 40,  
    "batch_size": 64,
    "num_epochs": 300,
    "learning_rate": 0.0001
}

experiment.log_parameters(hyper_params)

In [34]:
device = 'cuda'
EPOCH_N = hyper_params['num_epochs']
LEARNING_RATE = hyper_params['learning_rate']
ckpt = 'resnet.pth'
BEST_LOSS = np.inf
BATCH_SIZE = hyper_params['batch_size']
NUM_WORKERS = 2
re_siz =  None #(hyper_params['siz'],hyper_params['siz']) 

In [35]:
train_loader,test_loader = get_loader(BATCH_SIZE, NUM_WORKERS, re_siz)

DONE!


In [36]:
# model.load_state_dict(torch.load(ckpt))
model = model.to(device)
# criterion = nn.CrossEntropyLoss().to(device)
criterion = nn.NLLLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.8, patience=5, verbose=True)

In [37]:
def train_epoch(train_loader, model, criterion, optimizer):
  model.train()
  train_loss_epoch = 0
  train_acc_epoch = []
  for em,label in tqdm(train_loader):
    optimizer.zero_grad()
    em = em.to(device)
    label = label.to(device)
    out = model(em)
    loss = criterion(out,label.squeeze(dim=1))
    train_loss_epoch+=loss.item()
    loss.backward()
    optimizer.step()
    _, predict = out.max(dim=1)
    train_acc_epoch.append(accuracy_score(predict.cpu().numpy(), label.cpu().numpy()))
  return sum(train_acc_epoch)/len(train_acc_epoch), train_loss_epoch, model , optimizer


def val_epoch(test_loader, model, criterion, optimizer):
  model.eval()
  val_loss_epoch = 0
  val_acc_epoch = []
  with torch.no_grad():
    for em,label in tqdm(test_loader):
      em = em.to(device)
      label = label.to(device)
      out = model(em)
      loss = criterion(out,label.squeeze(dim=1))
      val_loss_epoch+=loss.item()
      _, predict = out.max(dim=1)
      val_acc_epoch.append(accuracy_score(predict.cpu().numpy(), label.cpu().numpy()))
    return sum(val_acc_epoch)/len(val_acc_epoch), val_loss_epoch


def save_checkpoint(val_loss_epoch,ckpt,model):
  global BEST_LOSS
  if val_loss_epoch < BEST_LOSS:
    BEST_LOSS = val_loss_epoch
    torch.save(model.state_dict(),ckpt)

In [None]:
for epoch in range(EPOCH_N):
    with experiment.train():
      mean_train_acc, train_loss_epoch, model , optimizer = train_epoch(train_loader, model, criterion, optimizer)

      experiment.log_metrics({
            "loss": train_loss_epoch,
            "acc": mean_train_acc
      }, epoch=epoch)

    with experiment.test():
      mean_val_acc, val_loss_epoch = val_epoch(test_loader, model, criterion, optimizer)
      save_checkpoint(val_loss_epoch,ckpt,model)
      scheduler.step(val_loss_epoch)
      experiment.log_metrics({
            "loss": val_loss_epoch,
            "acc": mean_val_acc
      }, epoch=epoch)
    
    print("EPOCH: ", epoch+1," - TRAIN_LOSS: ", train_loss_epoch," - TRAIN_ACC: ",mean_train_acc, " || VAL_LOSS: ", val_loss_epoch, " - VAL_ACC: ", mean_val_acc)

In [None]:
experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/maxph2211/speech-indentification-v3/5fc553f1812c4aaebb69e0f5dd990290
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     test_acc [29]   : (0.10677083333333333, 0.2109375)
COMET INFO:     test_loss [29]  : (3.531217575073242, 3.565745234489441)
COMET INFO:     train_acc [29]  : (0.14192708333333334, 0.23828125)
COMET INFO:     train_loss [53] : (1.6536426544189453, 14.366368532180786)
COMET INFO:   Parameters:
COMET INFO:     batch_size    : 64
COMET INFO:     learning_rate : 0.0001
COMET INFO:     n_fft         : 400
COMET INFO:     n_mfcc        : 40
COMET INFO:     num_epochs    : 300
COMET INFO:     siz           : 112
COMET INFO:   Uploads:
COMET INFO:     environment details : 1
COMET INFO:     filename            : 1
COMET INFO:     i