<a href="https://colab.research.google.com/github/lollipop6370/ML2021/blob/main/ML2021_HW04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download dataset

In [None]:
# Notebook shell commands: fetch the dataset archive with gdown (replace the
# quoted placeholder with your own link/id) and extract it in the working directory.
!gdown --id 'paste your own data download link' --output Dataset.zip # 1gaFy8RaQVUEXo2n0peCBR5gYKCB-mNHc
!unzip Dataset.zip

# Import Packages


In [56]:
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import LambdaLR

# For data preprocess
import numpy as np
import math
import random
import csv
import os
import json
from pathlib import Path
from torch.nn.utils.rnn import pad_sequence

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

myseed = 42069  # set a random seed for reproducibility
# Ask cuDNN for deterministic kernels and turn off its autotuner so repeated
# runs pick the same algorithms.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Seed every RNG this notebook draws from. Python's `random` must be included
# because the dataset uses random.randint to pick the crop offset of each segment.
random.seed(myseed)
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

# Some Utilities

In [12]:
def get_device():
    ''' Return the device name to run on: 'cuda' when a GPU is available, otherwise 'cpu'. '''
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

# Dataset

In [35]:
class MyDataset(Dataset):
    """Training dataset yielding (mel segment, speaker id) pairs.

    The directory ``path`` must contain ``mapping.json`` (with a
    ``speaker2id`` table) and ``metadata.json`` (with a ``speakers`` table
    mapping each speaker name to a list of entries carrying a
    ``feature_path``). Feature files are loaded lazily with ``torch.load``.
    """

    def __init__(self, path, segment_len=128):
        self.path = path
        self.segment_len = segment_len
        root = Path(self.path)

        # Speaker name -> integer id lookup table.
        with (root / "mapping.json").open("r", encoding="utf-8") as fp:
            self.speaker2id = json.load(fp)["speaker2id"]

        # Per-speaker list of utterance records.
        with (root / "metadata.json").open("r", encoding="utf-8") as fp:
            speakers = json.load(fp)["speakers"]

        # Total number of speakers, and one flat [feature_path, id] entry per utterance.
        self.speaker_num = len(speakers)
        self.data = [
            [utterance["feature_path"], self.speaker2id[name]]
            for name, utterances in speakers.items()
            for utterance in utterances
        ]

    def __len__(self):
        """Number of utterances in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Load utterance ``index`` and return ``(mel, speaker)`` tensors.

        Utterances longer than ``segment_len`` frames are cropped to a
        randomly positioned window of exactly ``segment_len`` frames;
        shorter ones are returned whole. ``speaker`` is a one-element
        long tensor.
        """
        feature_file, speaker_id = self.data[index]
        mel = torch.load(Path(self.path) / feature_file)

        n_frames = len(mel)
        if n_frames > self.segment_len:
            # Random crop so each epoch can see a different part of the utterance.
            offset = random.randint(0, n_frames - self.segment_len)
            mel = mel[offset:offset + self.segment_len]
        mel = torch.FloatTensor(mel)

        label = torch.FloatTensor([speaker_id]).long()
        return mel, label

    def get_speaker_number(self):
        """Return the number of distinct speakers listed in the metadata."""
        return self.speaker_num

# DataLoader

In [36]:
def collate_batch(batch):
    """Merge a list of (mel, speaker) samples into one padded batch.

    Returns a tuple ``(mels, speakers)`` where ``mels`` has shape
    (batch size, longest length in the batch, feature dim) and
    ``speakers`` is a LongTensor of speaker ids.
    """
    # Transpose [(mel_1, spk_1), (mel_2, spk_2), ...] into two parallel tuples.
    features, labels = zip(*batch)
    # Shorter utterances are right-padded with -20 so every item has the same length.
    padded = pad_sequence(features, batch_first=True, padding_value=-20)
    return padded, torch.LongTensor(labels)

def getDataLoader(path, batch_size, n_workers):
    """Build the training and validation data loaders.

    Args:
        path: dataset directory handed to ``MyDataset``.
        batch_size: mini-batch size used by both loaders.
        n_workers: number of DataLoader worker processes.

    Returns:
        ``(train_loader, valid_loader, speaker_num)`` where ``speaker_num``
        is the number of speakers reported by the dataset.
    """
    # Create Dataset
    myDataset = MyDataset(path)
    speaker_num = myDataset.get_speaker_number()
    # Randomly split the dataset: 90% for training, the rest for validation.
    trainSetLen = int(0.9 * len(myDataset))
    trainSet, validSet = random_split(myDataset, [trainSetLen, len(myDataset) - trainSetLen])

    train_loader = DataLoader(
        trainSet,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=n_workers,
        pin_memory=True,
        collate_fn=collate_batch,
    )

    # Keep the last partial batch here: dropping it would silently exclude
    # validation samples and skew a loss that is averaged over the whole set.
    valid_loader = DataLoader(
        validSet,
        batch_size=batch_size,
        num_workers=n_workers,
        drop_last=False,
        pin_memory=True,
        collate_fn=collate_batch,
    )

    return train_loader, valid_loader, speaker_num

# Model

In [47]:
# if use conformer
from torchaudio.models import Conformer

class Classifier(nn.Module):
    """Speaker classifier: linear prenet -> one transformer encoder layer ->
    mean pooling over time -> feed-forward prediction head.

    Args:
        d_model: width of the encoder representation.
        n_spks: number of speaker classes (size of the output logits).
        dropout: dropout probability used by the encoder layer and by the
            prediction head.
    """

    def __init__(self, d_model=80, n_spks=600, dropout=0.1):
        super().__init__()

        # Project the 40 input features to d_model:
        # (batch size, length, 40) -> (batch size, length, d_model)
        self.prenet = nn.Linear(40, d_model)

        # NOTE: a torchaudio Conformer (input_dim=d_model, num_heads=2,
        # ffn_dim=256, num_layers=2, depthwise_conv_kernel_size=15) and a
        # stacked nn.TransformerEncoder(num_layers=2) were considered as
        # alternatives; a single encoder layer is what is actually used.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            dim_feedforward=256,
            nhead=2,
            dropout=dropout,
        )

        # Project the pooled feature from d_model to the number of speakers.
        self.pred_layer = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, n_spks),
        )

        self.criterion = nn.CrossEntropyLoss()

    def forward(self, mels):
        """
        args:
          mels: (batch size, sequence length, feature=40)
        return:
          out: (batch size, n_spks)
        """
        out = self.prenet(mels)  # out: (batch size, sequence length, d_model)
        # The encoder layer is sequence-first, so swap dimensions 0 and 1.
        out = out.permute(1, 0, 2)  # out: (sequence length, batch size, d_model)
        out = self.encoder_layer(out)  # out: (sequence length, batch size, d_model)
        # Back to batch-first.
        out = out.transpose(0, 1)  # out: (batch size, sequence length, d_model)
        # Mean pooling over the time axis.
        out = out.mean(dim=1)  # out: (batch size, d_model)
        out = self.pred_layer(out)  # out: (batch size, n_spks)
        return out

    def cal_loss(self, pred, target):
        """Return the cross-entropy loss between logits ``pred`` and class ids ``target``."""
        return self.criterion(pred, target)

# Learning rate scheduler

In [38]:
def get_scheduler(optimizer, num_warmup_steps, num_training_steps, num_cycle=0.5, last_epoch=-1):
    """Create a LambdaLR schedule with linear warmup followed by cosine decay.

    The learning-rate multiplier rises linearly from 0 to 1 over
    ``num_warmup_steps`` steps, then follows
    ``0.5 * (1 + cos(pi * num_cycle * 2 * progress))`` (clamped at 0), where
    ``progress`` is the fraction of the post-warmup steps already taken.
    """
    def lr_lambda(current_step):
        if current_step >= num_warmup_steps:
            # Decay phase
            decay_span = float(max(1, num_training_steps - num_warmup_steps))
            progress = float(current_step - num_warmup_steps) / decay_span
            cosine = math.cos(math.pi * float(num_cycle) * 2.0 * progress)
            return max(0.0, 0.5 * (1.0 + cosine))
        # Warmup phase
        warmup_span = float(max(1, num_warmup_steps))
        return float(current_step) / warmup_span

    return LambdaLR(optimizer, lr_lambda, last_epoch)

# Create object

In [52]:
# Hyperparameters and construction of the objects used by the training cell.
device = get_device()                 # get the current available device ('cpu' or 'cuda')
os.makedirs('models', exist_ok=True)  # The trained model will be saved to ./models/

config = {
    'data_dir': './Dataset',
    'n_epochs': 70000,                # maximum number of epochs
    'batch_size': 32,               # mini-batch size for dataloader
    'optimizer': 'AdamW',              # optimization algorithm (optimizer in torch.optim)
    'optim_hparas': {                # hyper-parameters for the optimizer (depends on which optimizer you are using)
        'lr': 1e-3,                 # learning rate passed to the optimizer selected above
        # 'momentum': 0.9              # momentum (only meaningful if the optimizer is switched to SGD)
    },
    'early_stop': 1000,               # early stopping epochs (the number epochs since your model's last improvement)
    'save_path': 'models/model.ckpt',  # your model will be saved here
    'n_workers': 8,                    # number of DataLoader worker processes
    'warmup_steps': 1000               # number of scheduler steps spent in linear warmup
}
# DataLoader
train_loader, valid_loader, speaker_num = getDataLoader(config['data_dir'], config['batch_size'], config['n_workers'])
# Model
model = Classifier(n_spks=speaker_num).to(device)
# Optimizer: looked up by name in torch.optim and built with the hyper-parameters above.
optimizer = getattr(torch.optim, config['optimizer'])(model.parameters(), **config['optim_hparas'])
# Learning rate scheduler
# NOTE(review): train() calls scheduler.step() once per batch, but the horizon
# given here is config['n_epochs'] (a number of epochs). Confirm which unit is
# intended; past that many steps the cosine term starts rising again.
scheduler = get_scheduler(optimizer, num_warmup_steps=config['warmup_steps'], num_training_steps=config['n_epochs'])

# Train

In [59]:
def train(train_dataLoader, valid_dataLoader, model, optimizer, n_epoch, device):
    """Train ``model`` with early stopping on the validation loss.

    Besides its arguments, this function relies on three module-level names:
    ``scheduler`` (stepped once per batch), ``config`` (keys ``'save_path'``
    and ``'early_stop'``) and ``dev`` (computes the validation loss).

    Args:
        train_dataLoader: loader yielding (mel, speaker) training batches.
        valid_dataLoader: loader passed to ``dev`` after every epoch.
        model: module exposing ``cal_loss(pred, target)``.
        optimizer: optimizer updating ``model``'s parameters.
        n_epoch: maximum number of epochs to run.
        device: device the batches are moved to.

    Returns:
        ``(min_crossEntropy, loss_record)``: the best validation loss seen
        (``inf`` if no epoch produced a finite improvement) and a dict with
        per-batch ``'train'`` losses and per-epoch ``'dev'`` losses.
    """
    min_crossEntropy = float('inf')
    early_stop_cnt = 0  # epochs elapsed since the last validation improvement
    epochs_run = 0
    loss_record = {'train': [], 'dev': []}

    for current_epoch in range(n_epoch):
        model.train()
        for mel, speaker in train_dataLoader:
            optimizer.zero_grad()
            mel, speaker = mel.to(device), speaker.to(device)
            pred = model(mel)
            cross_loss = model.cal_loss(pred, speaker)
            cross_loss.backward()
            optimizer.step()
            scheduler.step()
            loss_record['train'].append(cross_loss.item())
        epochs_run = current_epoch + 1

        dev_crossEntropy = dev(valid_dataLoader, model, device)
        loss_record['dev'].append(dev_crossEntropy)
        if dev_crossEntropy < min_crossEntropy:
            # New best model: checkpoint it and restart the patience counter.
            min_crossEntropy = dev_crossEntropy
            print('Saving model (epoch = {:4d}, loss = {:.4f})'.format(epochs_run, min_crossEntropy))
            torch.save(model.state_dict(), config['save_path'])
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1

        # Stop once the model has not improved for more than config['early_stop'] epochs.
        if early_stop_cnt > config['early_stop']:
            break

    print('Finished training after {} epochs.'.format(epochs_run))
    return min_crossEntropy, loss_record

# Validation

In [58]:
def dev(valid_dataLoader, model, device):
    """Return the mean per-sample validation loss of ``model``.

    The model is switched to eval mode and gradients are disabled. Each
    batch loss is weighted by its batch size and the sum is divided by the
    number of samples actually iterated, so the result stays correct when
    the loader skips samples (e.g. ``drop_last=True``).

    Args:
        valid_dataLoader: loader yielding (mel, speaker) batches.
        model: module exposing ``cal_loss(pred, target)``.
        device: device the batches are moved to.

    Returns:
        The average loss as a float, or ``nan`` when the loader yields no batch.
    """
    model.eval()
    total_loss = 0.0
    n_samples = 0
    with torch.no_grad():
        for mel, speaker in valid_dataLoader:
            mel, speaker = mel.to(device), speaker.to(device)
            batch_size = len(mel)
            dev_loss = model.cal_loss(model(mel), speaker)
            total_loss += dev_loss.item() * batch_size
            n_samples += batch_size

    if n_samples == 0:
        # Nothing was evaluated; do not report a misleading loss of zero.
        return float('nan')
    return total_loss / n_samples

# Test

In [49]:
# Load testing data
class Test_dataset(Dataset):
    """Test-time dataset yielding ``(feature_path, mel)`` for every utterance.

    Reads the ``utterances`` list from ``testdata.json`` inside ``path``;
    each entry must carry a ``feature_path`` relative to ``path``.

    Args:
        path: dataset directory.
        segement_len: stored on the instance but not used when loading, so
            whole utterances are returned. The (misspelled) name is kept
            for compatibility with existing callers.
    """

    def __init__(self, path, segement_len=128):
        self.data_dir = path
        self.test_path = Path(path) / "testdata.json"
        self.segement_len = segement_len
        with self.test_path.open("r", encoding="utf-8") as f:
            self.testdata = json.load(f)["utterances"]

    def __len__(self):
        """Number of test utterances."""
        return len(self.testdata)

    def __getitem__(self, index):
        """Return the relative feature path and the loaded mel features of item ``index``."""
        utterance = self.testdata[index]
        feature_path = utterance["feature_path"]
        mel = torch.load(Path(self.data_dir) / feature_path)

        return feature_path, mel

def collate_batch_test(batch):
    """Collate test samples: return the tuple of feature paths and the mels stacked along a new batch axis."""
    paths, mels = zip(*batch)
    stacked = torch.stack(mels)
    return paths, stacked

def get_test_dataLoader(data_dir, n_workers):
    """Build the DataLoader over the test set found in ``data_dir``.

    Items are served one at a time (batch size 1), in file order, with
    ``n_workers`` worker processes.
    """
    dataset = Test_dataset(data_dir)
    loader_options = {
        'batch_size': 1,
        'shuffle': False,
        'drop_last': False,
        'num_workers': n_workers,
        'pin_memory': True,
        'collate_fn': collate_batch_test,
    }
    return DataLoader(dataset, **loader_options)

def test(data_dir, model_path, n_workers, device, output_path):
    """Run inference on the test set and write the predictions to a CSV file.

    Args:
        data_dir: dataset directory containing ``mapping.json`` and the test data.
        model_path: path of the checkpoint (a state dict) to load.
        n_workers: number of DataLoader worker processes.
        device: device used for inference.
        output_path: destination CSV with an ``Id,Category`` header followed
            by one ``feature_path,speaker`` row per utterance.
    """
    # Create the test data loader
    test_dataLoader = get_test_dataLoader(data_dir, n_workers)

    # Load the id -> speaker name mapping (keys are string ids).
    mapping_path = Path(data_dir) / "mapping.json"
    with mapping_path.open("r", encoding="utf-8") as f:
        id2speaker = json.load(f)["id2speaker"]
    n_speakers = len(id2speaker)

    # Load the model and its weights; map_location lets a checkpoint saved on
    # one device be restored on another.
    model = Classifier(n_spks=n_speakers).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print("[Info]: Finished creating model.")

    # Start testing
    results = [["Id", "Category"]]
    with torch.no_grad():
        for feature_paths, mels in test_dataLoader:
            preds = model(mels.to(device)).argmax(1).cpu().tolist()
            # The collate function yields a tuple of paths; emit one row per item.
            for feature_path, pred in zip(feature_paths, preds):
                results.append([feature_path, id2speaker[str(pred)]])

    with open(output_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(results)

# Start training

In [None]:
# Run training; keeps the best validation loss and the recorded train/dev loss history.
model_loss, loss_record = train(train_loader, valid_loader, model, optimizer, config['n_epochs'], device)

# Testing

In [None]:
# Predict on the test set with the saved checkpoint and write the results to ./output.csv.
test(config["data_dir"], "./models/model.ckpt", config["n_workers"], device, "./output.csv")