<a href="https://colab.research.google.com/github/patty-13/DEEP_LEARNING_NJIT/blob/main/HW3_DEEPLEARNING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Data
Download the data from Google Drive, then unzip it.

You should have
- `data/train_split.txt`: training metadata
- `data/train_label.txt`: training labels
- `data/test_split.txt`: testing metadata
- `data/feat/train/*.pt`: training features
- `data/feat/test/*.pt`:  testing features

after running the following block.

In [None]:
# Install/upgrade gdown, the Google Drive downloader used by the next command.
!pip install --upgrade gdown

# Main link
# Download the dataset archive by its Drive file id, then unpack it quietly.
# NOTE: the saved output of this cell shows the download being refused
# ("Access denied"), so data.zip may have to be obtained another way.
!gdown --id '1rxGSKq18pO5HbHLx_mTk8NUHM96PxWFs' --output data.zip
!unzip -q data.zip

Access denied with the following error:

 	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id=1rxGSKq18pO5HbHLx_mTk8NUHM96PxWFs 

unzip:  cannot find or open data.zip, data.zip.zip or data.zip.ZIP.


In [None]:
# List the data directory to confirm the expected files are present.
!ls data

feat  test_split.txt  train_label.txt  train_split.txt


# Some Utility Functions
**Fixes random number generator seeds for reproducibility.**

In [None]:
import numpy as np
import torch
import random

def same_seeds(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (plus the CUDA generators
    when a GPU is available) with ``seed``, then disables cuDNN autotuning
    and requests deterministic cuDNN kernels.

    Args:
        seed: integer seed applied to all generators.
    """
    # Host-side generators.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Device-side generators exist only when CUDA is usable.
    if torch.cuda.is_available():
        for seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            seed_fn(seed)

    # Benchmark mode picks kernels by timing, which can differ between runs.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
import os
import torch
from tqdm import tqdm

def load_feat(path):
    """Return the object that ``torch.load`` deserializes from ``path``."""
    return torch.load(path)

def shift(x, n):
    """Shift the rows of ``x`` by ``n`` positions, padding with the edge row.

    The result always has the same number of rows as ``x``.

    Args:
        x: tensor indexed by row along dim 0; the padding logic builds
            ``(k, features)`` blocks from a single row, so a 2-D tensor is
            expected.
        n: row offset. For ``n > 0`` row ``i`` of the result is row ``i + n``
            of ``x`` and the tail is filled with copies of the last row. For
            ``n < 0`` row ``i`` is row ``i + n`` of ``x`` and the head is
            filled with copies of the first row. ``n == 0`` returns ``x``
            itself.

    Returns:
        The shifted tensor (``x`` itself when no shift is needed).
    """
    num_rows = x.size(0)
    # Clamp the offset to the number of rows. Without this, |n| > num_rows
    # produced |n| rows of padding and a result longer than the input, which
    # broke callers that write the result back into a same-sized slot.
    n = max(-num_rows, min(n, num_rows))
    if n == 0:
        return x

    if n < 0:
        left = x[0].repeat(-n, 1)
        right = x[:n]
    else:
        right = x[-1].repeat(n, 1)
        left = x[n:]

    return torch.cat((left, right), dim=0)

def concat_feat(x, concat_n):
    """Append each frame's neighbouring frames to it along the feature axis.

    Args:
        x: tensor of shape ``(seq_len, feature_dim)``.
        concat_n: odd window size; the frame itself plus ``concat_n // 2``
            frames on each side. Values below 2 return ``x`` unchanged.

    Returns:
        Tensor of shape ``(seq_len, concat_n * feature_dim)``. Window
        positions that would fall outside the sequence are filled by
        ``shift`` with the first or last frame.
    """
    assert concat_n % 2 == 1  # the window needs a single centre frame
    if concat_n < 2:
        return x

    num_frames, num_feats = x.shape[0], x.shape[1]
    half = concat_n // 2

    # Tile the features concat_n times, then expose the copies as a leading
    # axis: (concat_n, num_frames, num_feats).
    stacked = x.repeat(1, concat_n).view(num_frames, concat_n, num_feats).permute(1, 0, 2)

    # Copy `half` stays aligned with the original frames; copies further out
    # are shifted by their distance from the centre, in opposite directions.
    for offset in range(1, half + 1):
        ahead, behind = half + offset, half - offset
        stacked[ahead, :] = shift(stacked[ahead], offset)
        stacked[behind, :] = shift(stacked[behind], -offset)

    return stacked.permute(1, 0, 2).view(num_frames, concat_n * num_feats)

def preprocess_data(split, feat_dir, phone_path, concat_nframes, train_ratio=0.8, random_seed=1213):
    """Load one split's per-utterance features (and labels) into dense tensors.

    Args:
        split: ``'train'``, ``'val'`` or ``'test'``. ``'train'`` and ``'val'``
            both read the training files and differ only in which side of the
            shuffled utterance list they keep.
        feat_dir: directory containing ``train/`` and ``test/`` sub-folders of
            per-utterance ``<name>.pt`` feature files.
        phone_path: directory holding ``train_split.txt``, ``train_label.txt``
            and ``test_split.txt``.
        concat_nframes: odd window size forwarded to ``concat_feat``.
        train_ratio: fraction of the shuffled training utterances kept for
            ``'train'``; the remainder goes to ``'val'``.
        random_seed: seed for the utterance shuffle. It is applied to the
            global ``random`` module so both splits see the same order.

    Returns:
        ``(X, y)`` for ``'train'``/``'val'`` and ``X`` alone for ``'test'``.
        ``X`` has shape ``(total_frames, 39 * concat_nframes)`` and ``y`` is a
        long tensor of shape ``(total_frames,)``.

    Raises:
        ValueError: if ``split`` is not one of the three accepted values.
        RuntimeError: if the split holds more frames than the pre-allocated
            buffer (``max_len``) can store.
    """
    class_num = 41 # NOTE: pre-computed, should not need change

    if split in ('train', 'val'):
        mode = 'train'
    elif split == 'test':
        mode = 'test'
    else:
        raise ValueError('Invalid \'split\' argument for dataset: PhoneDataset!')

    label_dict = {}
    if mode == 'train':
        # Each label line is "<utterance> <label> <label> ...".
        with open(os.path.join(phone_path, f'{mode}_label.txt')) as label_file:
            for line in label_file:
                fields = line.strip('\n').split(' ')
                label_dict[fields[0]] = [int(p) for p in fields[1:]]

        # split training and validation data
        with open(os.path.join(phone_path, 'train_split.txt')) as split_file:
            usage_list = split_file.readlines()
        random.seed(random_seed)
        random.shuffle(usage_list)
        train_len = int(len(usage_list) * train_ratio)
        usage_list = usage_list[:train_len] if split == 'train' else usage_list[train_len:]
    else:
        with open(os.path.join(phone_path, 'test_split.txt')) as split_file:
            usage_list = split_file.readlines()

    usage_list = [line.strip('\n') for line in usage_list]
    print('[Dataset] - # phone classes: ' + str(class_num) + ', number of utterances for ' + split + ': ' + str(len(usage_list)))

    # Pre-allocate one large buffer and trim it afterwards, so the frames are
    # written in place instead of being concatenated utterance by utterance.
    max_len = 3000000
    X = torch.empty(max_len, 39 * concat_nframes)
    if mode == 'train':
        y = torch.empty(max_len, dtype=torch.long)

    idx = 0
    for fname in tqdm(usage_list):
        feat = load_feat(os.path.join(feat_dir, mode, f'{fname}.pt'))
        cur_len = len(feat)
        if idx + cur_len > max_len:
            raise RuntimeError(
                f'{split} split needs more than max_len={max_len} frames; '
                'increase max_len in preprocess_data'
            )
        feat = concat_feat(feat, concat_nframes)

        X[idx: idx + cur_len, :] = feat
        if mode == 'train':
            y[idx: idx + cur_len] = torch.LongTensor(label_dict[fname])

        idx += cur_len

    # Drop the unused tail of the buffers.
    X = X[:idx, :]
    if mode == 'train':
        y = y[:idx]

    print(f'[INFO] {split} set')
    print(X.shape)
    if mode == 'train':
        print(y.shape)
        return X, y
    else:
        return X


# Dataset

In [None]:
import torch
from torch.utils.data import Dataset

class SpeechDataset(Dataset):
    """Map-style dataset over frame features with optional labels.

    ``data`` stores ``X`` as given. ``label`` is ``torch.LongTensor(y)`` when
    labels are supplied and ``None`` otherwise, in which case indexing yields
    the feature alone instead of a ``(feature, label)`` pair.
    """

    def __init__(self, X, y=None):
        self.data = X
        self.label = None if y is None else torch.LongTensor(y)

    def __getitem__(self, idx):
        sample = self.data[idx]
        if self.label is None:
            return sample
        return sample, self.label[idx]

    def __len__(self):
        return len(self.data)


# Model
Feel free to modify the structure of the model.

In [None]:
#MODEL 1
# Bidirectional LSTM (BiLSTM)
# import torch.nn as nn
# import torch.optim as optim

# class BiLSTM(nn.Module):
#     def __init__(self, input_dim, num_layers, hidden_dim, num_classes):
#         super(BiLSTM, self).__init__()
#         self.hidden_dim = hidden_dim
#         self.num_layers = num_layers
#         self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
#         self.fc = nn.Linear(hidden_dim * 2, num_classes)

#     def forward(self, x):
#         h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_dim).to(x.device)
#         c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_dim).to(x.device)
#         out, _ = self.lstm(x, (h0, c0))
#         out = self.fc(out[:, -1, :])
#         return out



In [None]:
# #MODEL 3
# #1-D CNN (the class below is CNNClassifier, not a transformer)
# # import torch
# import torch.nn as nn
# from torch.optim import Adam
# from torch.optim.lr_scheduler import ReduceLROnPlateau
# from torch.utils.data import DataLoader
# from tqdm import tqdm

# # Define the CNN model
# class CNNClassifier(nn.Module):
#     def __init__(self, input_dim, output_dim, num_conv_layers=2, num_fc_layers=2, hidden_dim=512, dropout=0.5):
#         super(CNNClassifier, self).__init__()

#         self.conv_layers = nn.ModuleList()
#         self.conv_layers.append(nn.Conv1d(400, hidden_dim, kernel_size=3, padding=1))
#         self.conv_layers.append(nn.ReLU())
#         self.conv_layers.append(nn.MaxPool1d(kernel_size=2))

#         for _ in range(num_conv_layers - 1):
#             self.conv_layers.append(nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1))
#             self.conv_layers.append(nn.ReLU())
#             self.conv_layers.append(nn.MaxPool1d(kernel_size=2))

#         self.fc_layers = nn.ModuleList()
#         self.fc_layers.append(nn.Linear(hidden_dim, hidden_dim))
#         self.fc_layers.append(nn.ReLU())
#         self.fc_layers.append(nn.Dropout(p=dropout))

#         for _ in range(num_fc_layers - 1):
#             self.fc_layers.append(nn.Linear(hidden_dim, hidden_dim))
#             self.fc_layers.append(nn.ReLU())
#             self.fc_layers.append(nn.Dropout(p=dropout))

#         self.output_layer = nn.Linear(hidden_dim, output_dim)

#     # def forward(self, x):
#     #     for layer in self.conv_layers:
#     #         x = layer(x)

#     #     x = x.mean(dim=2)  # Global average pooling

#     #     for layer in self.fc_layers:
#     #         x = layer(x)

#     #     x = self.output_layer(x)
#     #     return x
#     def forward(self, x):

#       for layer in self.conv_layers:
#           x = layer(x)

#       # Check the dimensions before applying global average pooling
#       if x.dim() == 3:  # Ensure it's a 3D tensor (batch_size, channels, sequence_length)
#           x = x.mean(dim=2)  # Global average pooling along the sequence_length dimension
#       else:
#           raise ValueError("Input tensor should be 3D with dimensions (batch_size, channels, sequence_length)")

#       for layer in self.fc_layers:
#           x = layer(x)

#       x = self.output_layer(x)
#       return x


In [None]:
# MODEL 4
# DEEP NEURAL NET

import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
class BasicBlock(nn.Module):
    """Linear -> BatchNorm1d -> ReLU -> Dropout, applied in that order.

    Args:
        input_dim: size of the incoming feature vector.
        output_dim: size of the produced feature vector.
        dropout_prob: probability passed to ``nn.Dropout``.
        bn_momentum: ``momentum`` passed to ``nn.BatchNorm1d``.
        bn_eps: ``eps`` passed to ``nn.BatchNorm1d``.

    NOTE(review): in PyTorch the batch-norm ``momentum`` is the weight given
    to the newest batch statistic (library default 0.1), so 0.9 lets each
    batch largely overwrite the running estimates. Confirm this is intended.
    """

    def __init__(self, input_dim, output_dim, dropout_prob=0.1, bn_momentum=0.9, bn_eps=1e-5):
        super().__init__()

        stages = [
            nn.Linear(input_dim, output_dim),
            nn.BatchNorm1d(output_dim, momentum=bn_momentum, eps=bn_eps),  # batch normalization
            nn.ReLU(),
            nn.Dropout(p=dropout_prob),
        ]
        self.block = nn.Sequential(*stages)

    def forward(self, x):
        return self.block(x)
class Classifier(nn.Module):
    """Fully connected classifier built from ``BasicBlock`` stages.

    The network is one ``BasicBlock`` mapping ``input_dim`` to ``hidden_dim``,
    then ``hidden_layers`` further ``BasicBlock``s of width ``hidden_dim``,
    then a final ``nn.Linear`` producing ``output_dim`` scores.

    Args:
        input_dim: size of each input vector.
        output_dim: number of output scores per sample.
        hidden_layers: number of hidden-to-hidden blocks after the input block.
        hidden_dim: width of every hidden block.
        dropout_prob: dropout probability forwarded to every block.
    """

    def __init__(self, input_dim, output_dim=41, hidden_layers=3, hidden_dim=1024, dropout_prob=0.1):
        super().__init__()

        stem = BasicBlock(input_dim, hidden_dim, dropout_prob)
        trunk = [BasicBlock(hidden_dim, hidden_dim, dropout_prob) for _ in range(hidden_layers)]
        head = nn.Linear(hidden_dim, output_dim)
        self.fc = nn.Sequential(stem, *trunk, head)

    def forward(self, x):
        return self.fc(x)





# DEEP NEURAL NET WITH REGULARIZATION AND DIFFERENT ACTIVATION FUNCTION

# import torch.nn as nn
# from torch.optim.lr_scheduler import ReduceLROnPlateau

# class BasicBlock(nn.Module):
#     def __init__(self, input_dim, output_dim, dropout_prob=0.2, bn_momentum=0.9, bn_eps=1e-3):
#         super(BasicBlock, self).__init__()

#         self.block = nn.Sequential(
#             nn.Linear(input_dim, output_dim),
#             nn.BatchNorm1d(output_dim, momentum=bn_momentum, eps=bn_eps),
#             nn.ReLU(),
#             nn.Dropout(p=dropout_prob)
#         )

#     def forward(self, x):
#         x = self.block(x)
#         return x

# class Classifier(nn.Module):
#     def __init__(self, input_dim, output_dim=41, hidden_layers=1, hidden_dim=256, dropout_prob=0.2, l2_reg=0.001):
#         super(Classifier, self).__init__()

#         layers = [BasicBlock(input_dim, hidden_dim, dropout_prob)]
#         for _ in range(hidden_layers):
#             layers.append(BasicBlock(hidden_dim, hidden_dim, dropout_prob))

#         layers.append(nn.Linear(hidden_dim, output_dim))
#         self.fc = nn.Sequential(*layers)
#         self.l2_reg = l2_reg  # L2 regularization strength

#     def forward(self, x):
#         x = self.fc(x)
#         return x

#     def calculate_l2_regularization(self):
#         l2_reg = 0.0
#         for param in self.parameters():
#             l2_reg += torch.norm(param, p=2)  # L2 norm of parameters
#         return l2_reg

#     def compute_loss(self, outputs, labels):
#         l2_reg = self.calculate_l2_regularization()
#         return nn.CrossEntropyLoss()(outputs, labels) + self.l2_reg * l2_reg


# import torch.nn as nn
# from torch.optim.lr_scheduler import ReduceLROnPlateau

# class GatedReLU(nn.Module):
#     def __init__(self):
#         super(GatedReLU, self).__init__()
#         self.sigmoid = nn.Sigmoid()
#         self.relu = nn.ReLU()

#     def forward(self, x):
#         return self.relu(x) * self.sigmoid(x)

# class BasicBlock(nn.Module):
#     def __init__(self, input_dim, output_dim, dropout_prob=0.2, bn_momentum=0.9, bn_eps=1e-3):
#         super(BasicBlock, self).__init__()

#         self.block = nn.Sequential(
#             nn.Linear(input_dim, output_dim),
#             nn.BatchNorm1d(output_dim, momentum=bn_momentum, eps=bn_eps),
#             GatedReLU(),  # Use GatedReLU instead of ReLU
#             nn.Dropout(p=dropout_prob)
#         )

#     def forward(self, x):
#         x = self.block(x)
#         return x

# class Classifier(nn.Module):
#     def __init__(self, input_dim, output_dim=41, hidden_layers=1, hidden_dim=256, dropout_prob=0.2, l2_reg=0.001):
#         super(Classifier, self).__init__()

#         layers = [BasicBlock(input_dim, hidden_dim, dropout_prob)]
#         for _ in range(hidden_layers):
#             layers.append(BasicBlock(hidden_dim, hidden_dim, dropout_prob))

#         # ReLU activation for the last hidden layer
#         layers.append(nn.ReLU())

#         # Softmax for the output layer
#         layers.append(nn.Linear(hidden_dim, output_dim))

#         self.fc = nn.Sequential(*layers)
#         self.l2_reg = l2_reg  # L2 regularization strength

#     def forward(self, x):
#         x = self.fc(x)
#         return x

#     def calculate_l2_regularization(self):
#         l2_reg = 0.0
#         for param in self.parameters():
#             l2_reg += torch.norm(param, p=2)  # L2 norm of parameters
#         return l2_reg

#     def compute_loss(self, outputs, labels):
#         l2_reg = self.calculate_l2_regularization()
#         return nn.CrossEntropyLoss()(outputs, labels) + self.l2_reg * l2_reg






# Hyper-parameters

In [None]:
# data parameters
# TODO: change the value of "concat_nframes" for medium baseline
concat_nframes = 41   # number of frames concatenated per sample; must be odd (total 2k+1 = n frames)
train_ratio = 0.80  # fraction of utterances used for training; the rest is used for validation

# training parameters
seed = 1213          # random seed
batch_size = 300     # batch size
num_epoch = 50         # number of training epochs
learning_rate = 1e-3     # initial learning rate passed to the optimizer
model_path = './model.ckpt'  # path where the best checkpoint is saved


# model parameters
# TODO: change the value of "hidden_layers" or "hidden_dim" for medium baseline
input_dim = 39 * concat_nframes  # model input size (39 values per frame times the window size); do not change
hidden_layers = 3      # number of hidden-to-hidden blocks (Classifier adds one input block before them)
hidden_dim = 1024       # width of every hidden layer

# Dataloader

In [None]:
from torch.utils.data import DataLoader
import gc

# Fix all RNG seeds before any shuffling or model initialisation happens.
same_seeds(seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'DEVICE: {device}')

# preprocess data
# Both calls use the same random_seed and train_ratio, so 'train' and 'val'
# take complementary slices of the same shuffled utterance list.
train_X, train_y = preprocess_data(split='train', feat_dir='./data/feat', phone_path='./data', concat_nframes=concat_nframes, train_ratio=train_ratio, random_seed=seed)
val_X, val_y = preprocess_data(split='val', feat_dir='./data/feat', phone_path='./data', concat_nframes=concat_nframes, train_ratio=train_ratio, random_seed=seed)

# get dataset
train_set = SpeechDataset(train_X, train_y)
val_set = SpeechDataset(val_X, val_y)

# remove raw feature to save memory
# (drops these extra names; the datasets keep their own references)
del train_X, train_y, val_X, val_y
gc.collect()

# get dataloader
# Only the training loader shuffles; validation order is kept fixed.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

DEVICE: cuda
[Dataset] - # phone classes: 41, number of utterances for train: 2194


2194it [00:08, 264.02it/s]


[INFO] train set
torch.Size([1348721, 1599])
torch.Size([1348721])
[Dataset] - # phone classes: 41, number of utterances for val: 549


549it [00:02, 238.01it/s]


[INFO] val set
torch.Size([348042, 1599])
torch.Size([348042])


# Training

In [None]:
# import torch.nn.functional as F
# class FocalLoss(nn.Module):
#     def __init__(self, alpha=1, gamma=2, reduction='mean'):
#         super(FocalLoss, self).__init__()
#         self.alpha = alpha
#         self.gamma = gamma
#         self.reduction = reduction

#     def forward(self, logits, targets):
#         ce_loss = F.cross_entropy(logits, targets, reduction='none')
#         pt = torch.exp(-ce_loss)
#         focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

#         if self.reduction == 'mean':
#             return torch.mean(focal_loss)
#         elif self.reduction == 'sum':
#             return torch.sum(focal_loss)
#         else:
#           return focal_loss

In [None]:
# create model, define a loss function, and optimizer

from torch.optim.lr_scheduler import ReduceLROnPlateau
#model = CNNClassifier(input_dim=input_dim, output_dim=41, num_conv_layers=num_conv_layers,
#                      num_fc_layers=num_fc_layers, hidden_dim=hidden_dim, dropout=0.5).to(device)

model = Classifier(input_dim=input_dim, hidden_layers=hidden_layers, hidden_dim=hidden_dim).to(device)
#model = BiLSTM(input_dim=input_dim, num_layers=hidden_layers, hidden_dim=hidden_dim).to(device)
#model = TransformerClassifier(input_dim=input_dim, num_layers=num_layers, hidden_dim=hidden_dim,
#                              output_dim=41, nhead=nhead).to(device)
#criterion = FocalLoss(alpha=1, gamma=2, reduction='mean')
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# mode='max' because the monitored value passed to scheduler.step() is the
# validation accuracy; the learning rate is multiplied by `factor` once that
# value has stopped improving for `patience` epochs.
scheduler = ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.1, verbose=True)

# best_acc, train_acc and val_acc are counts of correct predictions; they are
# divided by the dataset size only when printed.
best_acc = 0.0
for epoch in range(num_epoch):
    train_acc = 0.0
    train_loss = 0.0
    val_acc = 0.0
    val_loss = 0.0

    # training
    model.train() # set the model to training mode
    for i, batch in enumerate(tqdm(train_loader)):
        features, labels = batch
        features = features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(features)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        _, train_pred = torch.max(outputs, 1) # get the index of the class with the highest score
        train_acc += (train_pred.detach() == labels.detach()).sum().item()
        train_loss += loss.item()  # sum of per-batch losses; averaged over batches when printed

    # validation
    model.eval() # set the model to evaluation mode
    with torch.no_grad():
        for i, batch in enumerate(tqdm(val_loader)):
            features, labels = batch
            features = features.to(device)
            labels = labels.to(device)
            outputs = model(features)

            loss = criterion(outputs, labels)

            _, val_pred = torch.max(outputs, 1) # get the index of the class with the highest score
            val_acc += (val_pred.cpu() == labels.cpu()).sum().item()
            val_loss += loss.item()

    # Feed this epoch's validation accuracy (as a count) to the plateau scheduler.
    scheduler.step(val_acc)
    print(f'[{epoch+1:03d}/{num_epoch:03d}] Train Acc: {train_acc/len(train_set):3.5f} Loss: {train_loss/len(train_loader):3.5f} | Val Acc: {val_acc/len(val_set):3.5f} loss: {val_loss/len(val_loader):3.5f}')

    # if the model improves, save a checkpoint at this epoch
    # (model_path is overwritten, so only the best-scoring weights are kept)
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), model_path)
        print(f'saving model with acc {best_acc/len(val_set):.5f}')


100%|██████████| 4496/4496 [00:27<00:00, 162.91it/s]
100%|██████████| 1161/1161 [00:04<00:00, 257.71it/s]


[001/050] Train Acc: 0.62228 Loss: 1.21785 | Val Acc: 0.66881 loss: 1.05046
saving model with acc 0.66881


100%|██████████| 4496/4496 [00:27<00:00, 160.62it/s]
100%|██████████| 1161/1161 [00:04<00:00, 255.13it/s]


[002/050] Train Acc: 0.70546 Loss: 0.92192 | Val Acc: 0.69205 loss: 0.98746
saving model with acc 0.69205


100%|██████████| 4496/4496 [00:27<00:00, 161.70it/s]
100%|██████████| 1161/1161 [00:04<00:00, 255.43it/s]


[003/050] Train Acc: 0.74809 Loss: 0.77451 | Val Acc: 0.70138 loss: 0.96670
saving model with acc 0.70138


100%|██████████| 4496/4496 [00:27<00:00, 162.70it/s]
100%|██████████| 1161/1161 [00:04<00:00, 257.39it/s]


[004/050] Train Acc: 0.78087 Loss: 0.66258 | Val Acc: 0.70387 loss: 0.99223
saving model with acc 0.70387


100%|██████████| 4496/4496 [00:27<00:00, 163.79it/s]
100%|██████████| 1161/1161 [00:04<00:00, 255.49it/s]


[005/050] Train Acc: 0.80568 Loss: 0.57796 | Val Acc: 0.70387 loss: 1.01790
saving model with acc 0.70387


100%|██████████| 4496/4496 [00:27<00:00, 162.94it/s]
100%|██████████| 1161/1161 [00:04<00:00, 255.54it/s]


[006/050] Train Acc: 0.82604 Loss: 0.51218 | Val Acc: 0.70451 loss: 1.08152
saving model with acc 0.70451


100%|██████████| 4496/4496 [00:27<00:00, 162.70it/s]
100%|██████████| 1161/1161 [00:04<00:00, 257.39it/s]


[007/050] Train Acc: 0.84181 Loss: 0.46150 | Val Acc: 0.70087 loss: 1.10423


100%|██████████| 4496/4496 [00:27<00:00, 163.34it/s]
100%|██████████| 1161/1161 [00:04<00:00, 253.07it/s]


[008/050] Train Acc: 0.85376 Loss: 0.42397 | Val Acc: 0.70232 loss: 1.13749


100%|██████████| 4496/4496 [00:27<00:00, 162.87it/s]
100%|██████████| 1161/1161 [00:04<00:00, 252.83it/s]


[009/050] Train Acc: 0.86369 Loss: 0.39377 | Val Acc: 0.70205 loss: 1.15867


100%|██████████| 4496/4496 [00:27<00:00, 163.41it/s]
100%|██████████| 1161/1161 [00:04<00:00, 253.08it/s]


Epoch 00010: reducing learning rate of group 0 to 1.0000e-04.
[010/050] Train Acc: 0.87188 Loss: 0.36902 | Val Acc: 0.69901 loss: 1.18642


100%|██████████| 4496/4496 [00:27<00:00, 162.59it/s]
100%|██████████| 1161/1161 [00:04<00:00, 254.24it/s]


[011/050] Train Acc: 0.90421 Loss: 0.27283 | Val Acc: 0.71043 loss: 1.28384
saving model with acc 0.71043


100%|██████████| 4496/4496 [00:27<00:00, 163.63it/s]
100%|██████████| 1161/1161 [00:04<00:00, 256.46it/s]


[012/050] Train Acc: 0.91661 Loss: 0.23708 | Val Acc: 0.71143 loss: 1.28029
saving model with acc 0.71143


100%|██████████| 4496/4496 [00:27<00:00, 163.84it/s]
100%|██████████| 1161/1161 [00:04<00:00, 252.56it/s]


[013/050] Train Acc: 0.92179 Loss: 0.22118 | Val Acc: 0.71160 loss: 1.27459
saving model with acc 0.71160


100%|██████████| 4496/4496 [00:27<00:00, 162.89it/s]
100%|██████████| 1161/1161 [00:04<00:00, 252.39it/s]


[014/050] Train Acc: 0.92520 Loss: 0.21080 | Val Acc: 0.70660 loss: 1.36218


100%|██████████| 4496/4496 [00:27<00:00, 163.41it/s]
100%|██████████| 1161/1161 [00:04<00:00, 251.08it/s]


[015/050] Train Acc: 0.92800 Loss: 0.20354 | Val Acc: 0.70995 loss: 1.31756


100%|██████████| 4496/4496 [00:27<00:00, 163.41it/s]
100%|██████████| 1161/1161 [00:04<00:00, 250.46it/s]


[016/050] Train Acc: 0.93024 Loss: 0.19717 | Val Acc: 0.70951 loss: 1.40253


100%|██████████| 4496/4496 [00:27<00:00, 164.46it/s]
100%|██████████| 1161/1161 [00:04<00:00, 250.41it/s]


[017/050] Train Acc: 0.93210 Loss: 0.19168 | Val Acc: 0.71185 loss: 1.34984
saving model with acc 0.71185


100%|██████████| 4496/4496 [00:27<00:00, 163.86it/s]
100%|██████████| 1161/1161 [00:04<00:00, 248.70it/s]


[018/050] Train Acc: 0.93337 Loss: 0.18734 | Val Acc: 0.70986 loss: 1.39895


100%|██████████| 4496/4496 [00:27<00:00, 165.01it/s]
100%|██████████| 1161/1161 [00:04<00:00, 249.52it/s]


[019/050] Train Acc: 0.93534 Loss: 0.18290 | Val Acc: 0.70785 loss: 1.39735


100%|██████████| 4496/4496 [00:27<00:00, 164.75it/s]
100%|██████████| 1161/1161 [00:04<00:00, 246.59it/s]


[020/050] Train Acc: 0.93627 Loss: 0.17899 | Val Acc: 0.70883 loss: 1.40578


100%|██████████| 4496/4496 [00:27<00:00, 162.62it/s]
100%|██████████| 1161/1161 [00:04<00:00, 251.65it/s]


Epoch 00021: reducing learning rate of group 0 to 1.0000e-05.
[021/050] Train Acc: 0.93757 Loss: 0.17534 | Val Acc: 0.71005 loss: 1.41124


100%|██████████| 4496/4496 [00:27<00:00, 163.04it/s]
100%|██████████| 1161/1161 [00:04<00:00, 248.05it/s]


[022/050] Train Acc: 0.94043 Loss: 0.16751 | Val Acc: 0.70931 loss: 1.40907


100%|██████████| 4496/4496 [00:27<00:00, 164.40it/s]
100%|██████████| 1161/1161 [00:04<00:00, 250.68it/s]


[023/050] Train Acc: 0.94130 Loss: 0.16490 | Val Acc: 0.71133 loss: 1.40363


100%|██████████| 4496/4496 [00:27<00:00, 162.03it/s]
100%|██████████| 1161/1161 [00:04<00:00, 249.07it/s]


[024/050] Train Acc: 0.94180 Loss: 0.16389 | Val Acc: 0.71018 loss: 1.41859


100%|██████████| 4496/4496 [00:27<00:00, 163.96it/s]
100%|██████████| 1161/1161 [00:04<00:00, 251.87it/s]


Epoch 00025: reducing learning rate of group 0 to 1.0000e-06.
[025/050] Train Acc: 0.94166 Loss: 0.16384 | Val Acc: 0.71127 loss: 1.42805


100%|██████████| 4496/4496 [00:27<00:00, 164.11it/s]
100%|██████████| 1161/1161 [00:04<00:00, 251.32it/s]


[026/050] Train Acc: 0.94209 Loss: 0.16225 | Val Acc: 0.71040 loss: 1.45997


100%|██████████| 4496/4496 [00:27<00:00, 163.35it/s]
100%|██████████| 1161/1161 [00:04<00:00, 253.72it/s]


[027/050] Train Acc: 0.94210 Loss: 0.16269 | Val Acc: 0.71193 loss: 1.43168
saving model with acc 0.71193


100%|██████████| 4496/4496 [00:27<00:00, 162.70it/s]
100%|██████████| 1161/1161 [00:04<00:00, 255.25it/s]


[028/050] Train Acc: 0.94213 Loss: 0.16249 | Val Acc: 0.70915 loss: 1.42216


100%|██████████| 4496/4496 [00:27<00:00, 163.12it/s]
100%|██████████| 1161/1161 [00:04<00:00, 255.66it/s]


[029/050] Train Acc: 0.94243 Loss: 0.16169 | Val Acc: 0.70806 loss: 1.49911


100%|██████████| 4496/4496 [00:27<00:00, 163.28it/s]
100%|██████████| 1161/1161 [00:04<00:00, 255.70it/s]


[030/050] Train Acc: 0.94242 Loss: 0.16132 | Val Acc: 0.71091 loss: 1.43654


100%|██████████| 4496/4496 [00:27<00:00, 163.45it/s]
100%|██████████| 1161/1161 [00:04<00:00, 251.16it/s]


Epoch 00031: reducing learning rate of group 0 to 1.0000e-07.
[031/050] Train Acc: 0.94257 Loss: 0.16177 | Val Acc: 0.70979 loss: 1.44783


100%|██████████| 4496/4496 [00:27<00:00, 162.79it/s]
100%|██████████| 1161/1161 [00:04<00:00, 253.53it/s]


[032/050] Train Acc: 0.94249 Loss: 0.16169 | Val Acc: 0.70999 loss: 1.44071


100%|██████████| 4496/4496 [00:27<00:00, 163.55it/s]
100%|██████████| 1161/1161 [00:04<00:00, 253.94it/s]


[033/050] Train Acc: 0.94235 Loss: 0.16151 | Val Acc: 0.71021 loss: 1.44052


100%|██████████| 4496/4496 [00:27<00:00, 164.90it/s]
100%|██████████| 1161/1161 [00:04<00:00, 251.84it/s]


[034/050] Train Acc: 0.94239 Loss: 0.16172 | Val Acc: 0.70920 loss: 1.45064


100%|██████████| 4496/4496 [00:27<00:00, 163.48it/s]
100%|██████████| 1161/1161 [00:04<00:00, 249.14it/s]


Epoch 00035: reducing learning rate of group 0 to 1.0000e-08.
[035/050] Train Acc: 0.94272 Loss: 0.16118 | Val Acc: 0.71055 loss: 1.41796


100%|██████████| 4496/4496 [00:27<00:00, 164.11it/s]
100%|██████████| 1161/1161 [00:04<00:00, 247.66it/s]


[036/050] Train Acc: 0.94256 Loss: 0.16176 | Val Acc: 0.70953 loss: 1.42987


100%|██████████| 4496/4496 [00:27<00:00, 163.71it/s]
100%|██████████| 1161/1161 [00:04<00:00, 250.72it/s]


[037/050] Train Acc: 0.94241 Loss: 0.16178 | Val Acc: 0.71247 loss: 1.39810
saving model with acc 0.71247


100%|██████████| 4496/4496 [00:27<00:00, 163.77it/s]
100%|██████████| 1161/1161 [00:04<00:00, 249.86it/s]


[038/050] Train Acc: 0.94276 Loss: 0.16152 | Val Acc: 0.70947 loss: 1.41546


100%|██████████| 4496/4496 [00:27<00:00, 164.02it/s]
100%|██████████| 1161/1161 [00:04<00:00, 249.10it/s]


[039/050] Train Acc: 0.94257 Loss: 0.16136 | Val Acc: 0.71043 loss: 1.43289


100%|██████████| 4496/4496 [00:27<00:00, 163.16it/s]
100%|██████████| 1161/1161 [00:04<00:00, 249.47it/s]


[040/050] Train Acc: 0.94253 Loss: 0.16117 | Val Acc: 0.71002 loss: 1.41384


100%|██████████| 4496/4496 [00:27<00:00, 163.81it/s]
100%|██████████| 1161/1161 [00:04<00:00, 252.72it/s]


[041/050] Train Acc: 0.94240 Loss: 0.16154 | Val Acc: 0.71117 loss: 1.40389


100%|██████████| 4496/4496 [00:27<00:00, 162.94it/s]
100%|██████████| 1161/1161 [00:04<00:00, 251.60it/s]


[042/050] Train Acc: 0.94247 Loss: 0.16196 | Val Acc: 0.70953 loss: 1.45937


100%|██████████| 4496/4496 [00:27<00:00, 164.01it/s]
100%|██████████| 1161/1161 [00:04<00:00, 250.65it/s]


[043/050] Train Acc: 0.94292 Loss: 0.16084 | Val Acc: 0.71132 loss: 1.43154


100%|██████████| 4496/4496 [00:27<00:00, 162.82it/s]
100%|██████████| 1161/1161 [00:04<00:00, 248.82it/s]


[044/050] Train Acc: 0.94266 Loss: 0.16151 | Val Acc: 0.70975 loss: 1.45121


100%|██████████| 4496/4496 [00:27<00:00, 163.74it/s]
100%|██████████| 1161/1161 [00:04<00:00, 249.90it/s]


[045/050] Train Acc: 0.94246 Loss: 0.16163 | Val Acc: 0.71127 loss: 1.46887


100%|██████████| 4496/4496 [00:27<00:00, 164.59it/s]
100%|██████████| 1161/1161 [00:04<00:00, 248.66it/s]


[046/050] Train Acc: 0.94219 Loss: 0.16230 | Val Acc: 0.71091 loss: 1.43767


100%|██████████| 4496/4496 [00:27<00:00, 165.21it/s]
100%|██████████| 1161/1161 [00:04<00:00, 255.22it/s]


[047/050] Train Acc: 0.94248 Loss: 0.16191 | Val Acc: 0.70925 loss: 1.45135


100%|██████████| 4496/4496 [00:27<00:00, 164.08it/s]
100%|██████████| 1161/1161 [00:04<00:00, 253.45it/s]


[048/050] Train Acc: 0.94287 Loss: 0.16112 | Val Acc: 0.70887 loss: 1.46228


100%|██████████| 4496/4496 [00:27<00:00, 163.60it/s]
100%|██████████| 1161/1161 [00:04<00:00, 254.70it/s]


[049/050] Train Acc: 0.94263 Loss: 0.16111 | Val Acc: 0.71145 loss: 1.41722


100%|██████████| 4496/4496 [00:27<00:00, 163.78it/s]
100%|██████████| 1161/1161 [00:04<00:00, 255.78it/s]

[050/050] Train Acc: 0.94267 Loss: 0.16148 | Val Acc: 0.71187 loss: 1.44660





In [None]:
# Drop the training/validation datasets and loaders before the test features
# are loaded, then ask the garbage collector to reclaim them.
del train_set, val_set
del train_loader, val_loader
gc.collect()

0

# Testing
Create a testing dataset, and load model from the saved checkpoint.

In [None]:
# load data
# The 'test' split returns features only, so the dataset is built without labels
# and each batch from the loader is a plain feature tensor.
test_X = preprocess_data(split='test', feat_dir='./data/feat', phone_path='./data', concat_nframes=concat_nframes)
test_set = SpeechDataset(test_X, None)
# shuffle=False keeps predictions in the same order as the test frames.
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

[Dataset] - # phone classes: 41, number of utterances for test: 686


686it [00:02, 278.57it/s]

[INFO] test set
torch.Size([420031, 1599])





In [None]:
# load model
# Rebuild the architecture with the same hyper-parameters, then restore the
# checkpoint saved during training. map_location remaps the stored tensors
# onto the current device, so a checkpoint written from a GPU run also loads
# on a CPU-only runtime.
model = Classifier(input_dim=input_dim, hidden_layers=hidden_layers, hidden_dim=hidden_dim).to(device)
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

Make prediction.

In [None]:
# Collect per-batch predictions and join them once at the end; concatenating
# inside the loop re-copies the whole growing array on every batch.
# The empty int32 seed chunk keeps the result identical to the original
# accumulation, including when the loader yields no batches.
pred_chunks = [np.array([], dtype=np.int32)]

model.eval()
with torch.no_grad():
    for i, batch in enumerate(tqdm(test_loader)):
        features = batch
        features = features.to(device)

        outputs = model(features)

        _, test_pred = torch.max(outputs, 1) # get the index of the class with the highest score
        pred_chunks.append(test_pred.cpu().numpy())

pred = np.concatenate(pred_chunks, axis=0)


100%|██████████| 1401/1401 [00:04<00:00, 342.68it/s]


Write prediction to a CSV file.

After this block finishes running, download the file it writes (`predictiontrial_94267train_71187_vald_acc.csv`) from the files section on the left-hand side and submit it to Kaggle.

In [None]:
# Write one "Id,Label" row per prediction; Id is the 0-based position of the
# prediction in `pred`.
with open('predictiontrial_94267train_71187_vald_acc.csv', 'w') as f:
    f.write('Id,Label\n')
    for i, y in enumerate(pred):
        f.write('{},{}\n'.format(i, y))

In [None]:
import pandas as pd
# NOTE(review): 'file.txt' is not created anywhere in the visible notebook and
# the saved output of this cell is a FileNotFoundError; confirm which file
# was meant to be read here.
data = pd.read_csv('file.txt', sep=',')

FileNotFoundError: ignored