## The DARPA TIMIT Acoustic-Phonetic Continuous Speech Corpus (TIMIT)
The TIMIT corpus of read speech is designed to provide speech data for the acquisition of acoustic-phonetic knowledge and for the development and evaluation of automatic speech recognition systems.

This homework is a multiclass classification task:
we are going to train a deep neural network classifier to predict the phoneme of each frame in the TIMIT speech corpus.

link: https://academictorrents.com/details/34e2b78745138186976cbc27939b1b34d18bd5b3

## Download Data
Download the data from Google Drive, then unzip it.

You should have `timit_11/train_11.npy`, `timit_11/train_label_11.npy`, and `timit_11/test_11.npy` after running this block.<br><br>
`timit_11/`
- `train_11.npy`: training data<br>
- `train_label_11.npy`: training label<br>
- `test_11.npy`:  testing data<br><br>

**Note: if the Google Drive link is dead, you can download the data directly from Kaggle and upload it to the workspace.**




In [None]:
# Shell commands (notebook `!` syntax): download the archive as data.zip,
# unpack it, then list the working directory to check what was extracted.
!gdown --id '1HPkcmQmFGu-3OknddKIa5dNDsR05lIQR' --output data.zip
!unzip data.zip
!ls 

In [None]:
import gc
import numpy as np
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
# Read the three arrays unpacked from the archive: training features,
# training labels, and the unlabeled test features.
train, train_label, test = (
    np.load('./timit_11/train_11.npy'),
    np.load('./timit_11/train_label_11.npy'),
    np.load('./timit_11/test_11.npy'),
)

# Report the array shapes as a quick sanity check on the download.
print('Size of training data:', train.shape)
print('Size of testing data:', test.shape)

In [None]:
class TIMITDataset(Dataset):
    """In-memory dataset of feature tensors with optional integer labels.

    Args:
        x: NumPy array of features; stored as a float32 tensor.
        y: Optional NumPy array of labels, one per row of ``x``. Any array
            that ``ndarray.astype(np.int64)`` accepts works (integers or
            numeric strings). ``None`` builds an unlabeled dataset.

    Indexing returns ``(features, label)`` when labels were given and just
    ``features`` otherwise.
    """

    def __init__(self, x, y=None):
        self.x = torch.from_numpy(x).float()
        if y is not None:
            # `np.int` was a deprecated alias of the builtin `int` and was
            # removed in NumPy 1.24; request the 64-bit type LongTensor
            # stores explicitly instead.
            self.y = torch.LongTensor(y.astype(np.int64))
        else:
            self.y = None

    def __getitem__(self, index):
        if self.y is not None:
            return self.x[index], self.y[index]
        return self.x[index]

    def __len__(self):
        return len(self.x)

In [None]:
# set fixed random seed
def set_random_seed(seed=0):
    """Seed the NumPy and PyTorch random generators and pin cuDNN settings.

    Args:
        seed: Integer seed applied to NumPy, to PyTorch, and (when CUDA is
            available) to the CUDA generators.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)

    cuda = torch.cuda
    if cuda.is_available():
        cuda.manual_seed(seed)
        cuda.manual_seed_all(seed)

    # Ask cuDNN for deterministic kernels and turn off its autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Hold out the trailing `split_ratio` fraction of the rows for validation;
# the split is positional (no shuffling).
split_ratio = 0.2
train_size = int(train.shape[0] * (1 - split_ratio))
train_x, valid_x = train[:train_size], train[train_size:]
train_y, valid_y = train_label[:train_size], train_label[train_size:]

# calculate the mean and std of training data
mean_array = np.mean(train_x, axis=0, keepdims=True)
std_array = np.std(train_x, axis=0, keepdims=True)
# A feature that is constant over the training split has std 0; dividing by
# it would fill that column with NaN/inf. Use 1 there so the column is only
# mean-centred.
std_array[std_array == 0] = 1

# normalize data by the mean and std of training data (validation and test
# reuse the training statistics so no information leaks from them)
train_x = (train_x - mean_array) / std_array
valid_x = (valid_x - mean_array) / std_array
test = (test - mean_array) / std_array

# reshape data for CNN: (batch, channel, 11, 39)
# presumably 11 context frames x 39 features per frame — confirm against the
# dataset description
train_x = np.reshape(train_x, (-1, 1, 11, 39))
valid_x = np.reshape(valid_x, (-1, 1, 11, 39))
test = np.reshape(test, (-1, 1, 11, 39))

print('Size of training set:', train_x.shape)
print('Size of validation set:', valid_x.shape)

In [None]:
batch_size = 2048

# Training split: labelled, reshuffled every epoch, loaded by 4 workers.
train_dataset = TIMITDataset(train_x, y=train_y)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)

# Validation split: labelled, kept in its original order.
valid_dataset = TIMITDataset(valid_x, y=valid_y)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
)

# Test split: no labels, and order is kept so row i of the output is sample i.
test_dataset = TIMITDataset(test)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Unbind the raw and intermediate NumPy arrays, then force a collection pass
# so that memory which is no longer referenced can be reclaimed.
del train, train_label
del train_x, train_y
del valid_x, valid_y
del test
del mean_array, std_array
gc.collect()

In [None]:
class Classifier(nn.Module):
    """CNN + MLP classifier producing 39 logits per input sample.

    The first convolution's (3, 39) kernel spans the full 39-wide axis of a
    (batch, 1, 11, 39) input, and the two following (3, 1) convolutions only
    slide along the remaining axis. That leaves 768 channels x 5 positions,
    i.e. the 3840 inputs of the first linear layer. Every hidden layer uses
    dropout (p = 0.3) and batch normalisation.
    """

    def __init__(self):
        super().__init__()
        drop_p = 0.3
        layers = []

        # Convolutional stack: Conv -> Dropout2d -> BatchNorm2d -> ReLU.
        in_channels = 1
        for out_channels, kernel in ((256, (3, 39)), (512, (3, 1)), (768, (3, 1))):
            layers += [
                nn.Conv2d(in_channels, out_channels, kernel),
                nn.Dropout2d(drop_p),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
            ]
            in_channels = out_channels

        layers.append(nn.Flatten())

        # Fully connected stack: Linear -> PReLU -> Dropout -> BatchNorm1d.
        in_features = 3840
        for out_features in (1560, 512, 256, 64):
            layers += [
                nn.Linear(in_features, out_features),
                nn.PReLU(),
                nn.Dropout(drop_p),
                nn.BatchNorm1d(out_features),
            ]
            in_features = out_features

        # Output layer: one logit per class.
        layers.append(nn.Linear(in_features, 39))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw class logits for a batch ``x``."""
        return self.net(x)

In [None]:
# Training schedule: epochs per model, and the optimizer's initial step size.
num_epochs, learning_rate = 150, 1e-4

In [None]:
# Train five independently seeded models. Each one keeps the checkpoint with
# the best validation accuracy, predicts the test set from that checkpoint,
# and appends its smoothed per-sample predictions to `predicts`.

# predictions of all models (one list of class indices per model)
predicts = []

for m in range(1, 6):
    print("Training model {}".format(m))
    # Seed each ensemble member differently so the models do not start from
    # identical weights.
    set_random_seed(m)

    model_path = './model_{}.ckpt'.format(m)
    model = Classifier().to(device)

    # only L2 regularize on weights: parameters whose name contains 'bias'
    # go into a group with weight_decay 0, all others are decayed
    weight_params, bias_params = [], []
    for name, param in model.named_parameters():
        if 'bias' in name:
            bias_params.append(param)
        else:
            weight_params.append(param)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        [
            {'params': weight_params, 'weight_decay': 0.001},
            {'params': bias_params, 'weight_decay': 0.0},
        ],
        lr=learning_rate,
    )

    # Halve the learning rate when validation accuracy has not improved for
    # 8 epochs, wait 3 epochs after each cut, never go below 5e-6.
    # NOTE(review): `verbose` is reported as deprecated in recent PyTorch
    # releases — confirm against the installed version before upgrading.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=0.5,
        patience=8,
        threshold_mode='abs',
        cooldown=3,
        min_lr=0.000005,
        verbose=True,
    )

    best_acc = 0.0  # best validation hit count (not ratio) seen so far
    for epoch in range(num_epochs):
        train_acc = 0.0
        train_loss = 0.0
        val_acc = 0.0
        val_loss = 0.0

        # training
        model.train() # set the model to training mode
        # The progress bar counts batches, so its total is the loader length
        # (which includes the final partial batch), not len(dataset) / batch_size.
        for inputs, labels in tqdm(train_dataloader, total=len(train_dataloader)):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            batch_loss = criterion(outputs, labels)
            _, train_pred = torch.max(outputs, 1) # get the index of the class with the highest probability
            batch_loss.backward()
            optimizer.step()

            # Compare on the device; only the scalar count is moved to host.
            train_acc += (train_pred == labels).sum().item()
            train_loss += batch_loss.item()

        # validation
        if len(valid_dataset) > 0:
            model.eval() # set the model to evaluation mode
            with torch.no_grad():
                for inputs, labels in valid_dataloader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = model(inputs)
                    batch_loss = criterion(outputs, labels)
                    _, val_pred = torch.max(outputs, 1) # get the index of the class with the highest probability

                    val_acc += (val_pred == labels).sum().item()
                    val_loss += batch_loss.item()

            print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f} | Val Acc: {:3.6f} loss: {:3.6f}'.format(
                epoch + 1, num_epochs, train_acc/len(train_dataset), train_loss/len(train_dataloader), val_acc/len(valid_dataset), val_loss/len(valid_dataloader)
            ))

            # if the model improves, save a checkpoint at this epoch
            if val_acc > best_acc:
                best_acc = val_acc
                torch.save(model.state_dict(), model_path)
                print('saving model with acc {:.3f}'.format(best_acc/len(valid_dataset)))

            # The scheduler runs in 'max' mode on the validation accuracy ratio.
            scheduler.step(val_acc / len(valid_dataset))

        else:
            print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f}'.format(
                epoch + 1, num_epochs, train_acc/len(train_dataset), train_loss/len(train_dataloader)
            ))

    # if not validating, save the last epoch
    if len(valid_dataset) == 0:
        torch.save(model.state_dict(), model_path)
        print('saving model at last epoch')

    # Reload the saved weights (best validation epoch, or the last epoch when
    # there is no validation set). map_location puts the tensors on the
    # current device regardless of where the checkpoint was written.
    model.load_state_dict(torch.load(model_path, map_location=device))

    # predict output
    predict = []
    model.eval() # set the model to evaluation mode
    with torch.no_grad():
        for inputs in test_loader:
            outputs = model(inputs.to(device))
            _, test_pred = torch.max(outputs, 1) # get the index of the class with the highest probability
            predict.extend(test_pred.cpu().tolist())

    # post processing: a prediction that differs from two agreeing neighbours
    # is replaced by the neighbours' class. The pass is sequential, so an
    # already corrected entry serves as the left neighbour of the next one.
    for i in range(1, len(predict) - 1):
        if predict[i - 1] == predict[i + 1] and predict[i - 1] != predict[i]:
            predict[i] = predict[i - 1]

    predicts.append(predict)

    # Drop the model before building the next one to keep peak memory down.
    del model
    gc.collect()

In [None]:
from collections import Counter

# ensemble predictions: majority vote per sample over every prediction list in
# `predicts` (however many models were trained). zip(*predicts) yields, for
# each sample, one vote per model in training order, so ties resolve to the
# class voted for by the earliest model.
predict = [Counter(votes).most_common(1)[0][0] for votes in zip(*predicts)]

# save prediction as CSV: a header row, then one "<sample index>,<class>" row
# per test sample
with open('prediction.csv', 'w') as f:
    f.write('Id,Class\n')
    f.writelines('{},{}\n'.format(i, y) for i, y in enumerate(predict))


## **Reference**
This code is adapted from the TA's sample code for the ML 2021 course @ NTUEE.  
Anyone copying or reusing this code must credit the original author.  
Source: https://github.com/ga642381/ML2021-Spring/blob/main/HW02/HW02-1.ipynb

* Convolutional Neural Networks for Speech Recognition (IEEE/ACM)  
https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/CNN_ASLPTrans2-14.pdf