In [1]:
import os
import csv
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
#!pip install --upgrade --force-reinstall --no-deps kaggle

In [2]:
import json

# Kaggle API credentials that get written to kaggle.json below.
# NOTE(review): as written this is a *set* literal holding one empty string, which
# json.dump cannot serialise (TypeError) -- presumably the real
# {"username": ..., "key": ...} dict was redacted before sharing; confirm, and
# never commit real credentials in the notebook.
TOKEN = {""}

# Install a pinned Kaggle CLI and create the directories that will hold kaggle.json.
! pip install kaggle==1.5.12
! mkdir -p .kaggle
# NOTE(review): a single '&' puts each mkdir in the background instead of chaining
# them; '&&' was probably intended -- confirm.
! mkdir -p /content & mkdir -p /content/.kaggle & mkdir -p /root/.kaggle/

# Write the credentials file under /content/.kaggle.
with open('/content/.kaggle/kaggle.json', 'w') as file:
    json.dump(TOKEN, file)

# NOTE(review): this reinstalls kaggle without a version pin right after the pinned
# install above; the captured output shows 1.5.12 being reinstalled -- confirm it is needed.
! pip install --upgrade --force-reinstall --no-deps kaggle
! ls "/content/.kaggle"
# Restrict the credentials file to the owner, then copy it to /root/.kaggle/.
! chmod 600 /content/.kaggle/kaggle.json
! cp /content/.kaggle/kaggle.json /root/.kaggle/

# Set the Kaggle CLI 'path' config value to /content.
! kaggle config set -n path -v /content

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 4.7 MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=1bf213097aec0d40cb9c7a9f938e8c8bca7ed273166b8a6745c1118e4c56c9fa
  Stored in directory: /root/.cache/pip/wheels/62/d6/58/5853130f941e75b2177d281eb7e44b4a98ed46dd155f556dc5
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
    Uninstalling kaggle-1.5.12:
      Successfully uninstalled kaggle-1.5.12
Successfully installed kaggle-1.5.12
kaggle.json
- path is now set to: /content


In [3]:
# Download the 11-785-s22-hw1p2 competition archive with the Kaggle CLI
# (the captured output below shows it landing under /content/competitions/).
! kaggle competitions download -c 11-785-s22-hw1p2

Downloading 11-785-s22-hw1p2.zip to /content/competitions/11-785-s22-hw1p2
100% 1.86G/1.86G [00:12<00:00, 186MB/s]
100% 1.86G/1.86G [00:12<00:00, 157MB/s]


In [5]:
# ! unzip /content/competitions/11-785-s22-hw1p2/11-785-s22-hw1p2.zip

In [7]:
class Network(torch.nn.Module):
    """Feed-forward classifier over a flattened window of frames.

    The input width is ``(1 + 2 * context) * 13`` (13 values per frame, with
    ``context`` frames on each side of the centre frame). Five hidden stages of
    Linear -> BatchNorm1d -> Softplus -> Dropout are followed by a final
    Linear layer that produces 40 logits.
    """

    def __init__(self, context=0):
        super(Network, self).__init__()
        # TODO: Please try different architectures
        frames_in_window = 2 * context + 1
        # Consecutive entries are the (in, out) widths of each hidden stage.
        widths = [frames_in_window * 13, 1024, 2048, 4096, 1024, 512]
        # Dropout probability used at the end of each hidden stage.
        drop_rates = [0.3, 0.25, 0.2, 0.15, 0.1]

        stack = []
        for fan_in, fan_out, rate in zip(widths[:-1], widths[1:], drop_rates):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.Softplus(),
                nn.Dropout(p=rate),
            ]
        # Output layer: one logit per class.
        stack.append(nn.Linear(widths[-1], 40))
        self.layers = nn.Sequential(*stack)

    def forward(self, A0):
        """Run the batch ``A0`` through the layer stack and return the logits."""
        return self.layers(A0)
          

In [8]:

class LibriSamples(torch.utils.data.Dataset):
    """Chunked dataset over one partition of per-utterance ``.npy`` files.

    Every item is a *chunk*: up to ``sample`` files are loaded, each MFCC file
    is normalised per feature column (zero mean, unit standard deviation over
    its own frames), and the frames / labels of the whole chunk are
    concatenated.

    MFCC and transcript files are paired by position after sorting both
    directory listings. This assumes both directories use the same file names
    (the CSV subset filter below applies one name list to both) -- TODO confirm
    for new data.

    Args:
        data_path: root directory that contains the partition folders.
        sample: number of npy files loaded by one ``__getitem__`` call.
        shuffle: shuffle the file pairs once, at construction time.
        partition: sub-directory of ``data_path``; must contain ``mfcc/`` and,
            when ``train`` is true, ``transcript/``.
        csvpath: optional CSV. With ``train=True`` its second column lists the
            file names to keep (debug subset, header row skipped); with
            ``train=False`` its ``file`` column replaces the MFCC file list.
        train: whether transcripts are available. ``__getitem__`` requires
            them; use ``LibriSamplesTest`` for unlabeled data.
    """

    def __init__(self, data_path, sample=20000, shuffle=True, partition="dev-clean", csvpath=None, train=True):
        # sample represent how many npy files will be preloaded for one __getitem__ call
        self.sample = sample

        self.X_dir = data_path + "/" + partition + "/mfcc/"
        self.Y_dir = data_path + "/" + partition + "/transcript/"

        # os.listdir returns entries in arbitrary order, so two independent
        # listings are not guaranteed to line up; sort both before pairing.
        self.X_names = sorted(os.listdir(self.X_dir))
        self.Y_names = sorted(os.listdir(self.Y_dir)) if train else None

        # using a small part of the dataset to debug
        if csvpath:
            if train:
                subset = set(self.parse_csv(csvpath))
                self.X_names = [name for name in self.X_names if name in subset]
                self.Y_names = [name for name in self.Y_names if name in subset]
            else:
                self.X_names = list(pd.read_csv(csvpath).file)

        # Check the pairing *before* zipping: zip() silently truncates to the
        # shorter list and would hide a mismatch.
        if train:
            assert len(self.X_names) == len(self.Y_names)

        if shuffle:
            if self.Y_names is None:
                self.X_names = list(self.X_names)
                random.shuffle(self.X_names)
            else:
                pairs = list(zip(self.X_names, self.Y_names))
                random.shuffle(pairs)
                # Comprehensions (rather than zip(*pairs)) also work when empty.
                self.X_names = [x_name for x_name, _ in pairs]
                self.Y_names = [y_name for _, y_name in pairs]

        self.length = len(self.X_names)

        self.PHONEMES = [
            'SIL',   'AA',    'AE',    'AH',    'AO',    'AW',    'AY',
            'B',     'CH',    'D',     'DH',    'EH',    'ER',    'EY',
            'F',     'G',     'HH',    'IH',    'IY',    'JH',    'K',
            'L',     'M',     'N',     'NG',    'OW',    'OY',    'P',
            'R',     'S',     'SH',    'T',     'TH',    'UH',    'UW',
            'V',     'W',     'Y',     'Z',     'ZH',    '<sos>', '<eos>']
        # Constant-time symbol -> class index lookup (same indices as PHONEMES.index).
        self._phoneme_to_index = {symbol: idx for idx, symbol in enumerate(self.PHONEMES)}

    @staticmethod
    def parse_csv(filepath):
        """Return the second column of the CSV at ``filepath``, skipping the first row."""
        subset = []
        with open(filepath) as f:
            f_csv = csv.reader(f)
            for row in f_csv:
                subset.append(row[1])
        return subset[1:]

    def __len__(self):
        """Number of chunks of at most ``sample`` files."""
        return int(np.ceil(self.length / self.sample))

    def __getitem__(self, i):
        """Load chunk ``i`` and return ``(X, Y)``: concatenated frames and labels.

        Raises:
            AttributeError: the dataset was built with ``train=False``.
            IndexError: ``i`` is not a valid chunk index.
            ValueError: a transcript contains a symbol missing from ``PHONEMES``.
        """
        if self.Y_names is None:
            raise AttributeError(
                "LibriSamples was built with train=False and has no transcripts")
        if not 0 <= i < len(self):
            raise IndexError("chunk index {} out of range".format(i))

        sample_range = range(i * self.sample, min((i + 1) * self.sample, self.length))

        X, Y = [], []
        for j in sample_range:
            X_path = self.X_dir + self.X_names[j]
            Y_path = self.Y_dir + self.Y_names[j]

            # Drop the first and last transcript entries, then map symbols to indices.
            try:
                label = [self._phoneme_to_index[yy] for yy in np.load(Y_path)[1:-1]]
            except KeyError as err:
                raise ValueError("{} is not in PHONEMES".format(err)) from None

            X_data = np.load(X_path)
            std = X_data.std(axis=0)
            # A constant feature column has zero std; divide by 1 there so the
            # column becomes 0 instead of NaN/inf.
            std = np.where(std == 0, np.ones_like(std), std)
            X_data = (X_data - X_data.mean(axis=0)) / std
            X.append(X_data)
            Y.append(np.array(label))

        X, Y = np.concatenate(X), np.concatenate(Y)
        return X, Y

class LibriSamplesTest(torch.utils.data.Dataset):
    """Chunked dataset over the MFCC files of an unlabeled partition.

    Every item is a *chunk*: up to ``sample`` MFCC files are loaded, each is
    normalised per feature column (zero mean, unit standard deviation over its
    own frames), and all frames are concatenated. No transcripts are read.

    Args:
        data_path: root directory that contains the partition folders.
        sample: number of npy files loaded by one ``__getitem__`` call.
        shuffle: shuffle the file order once, at construction time. Pass
            ``False`` when the output order matters (e.g. for a submission).
        partition: sub-directory of ``data_path``; must contain ``mfcc/``.
        csvpath: optional CSV. With ``train=False`` its ``file`` column is used
            as the ordered file list; with ``train=True`` its second column is
            a subset filter on the directory listing (header row skipped).
        train: only selects how ``csvpath`` is interpreted; labels are never
            loaded by this class.
    """

    def __init__(self, data_path, sample=20000, shuffle=True, partition="dev-clean", csvpath=None, train=False):
        # sample represent how many npy files will be preloaded for one __getitem__ call
        self.sample = sample

        self.X_dir = data_path + "/" + partition + "/mfcc/"

        # os.listdir returns entries in arbitrary order; sort for a reproducible default.
        self.X_names = sorted(os.listdir(self.X_dir))

        # using a small part of the dataset to debug
        if csvpath:
            if train:
                subset = set(self.parse_csv(csvpath))
                self.X_names = [name for name in self.X_names if name in subset]
            else:
                self.X_names = list(pd.read_csv(csvpath).file)

        # There are no transcripts here, so only the MFCC list is shuffled
        # (zipping with a non-existent Y_names used to raise AttributeError).
        if shuffle:
            self.X_names = list(self.X_names)
            random.shuffle(self.X_names)

        self.length = len(self.X_names)

        self.PHONEMES = [
            'SIL',   'AA',    'AE',    'AH',    'AO',    'AW',    'AY',
            'B',     'CH',    'D',     'DH',    'EH',    'ER',    'EY',
            'F',     'G',     'HH',    'IH',    'IY',    'JH',    'K',
            'L',     'M',     'N',     'NG',    'OW',    'OY',    'P',
            'R',     'S',     'SH',    'T',     'TH',    'UH',    'UW',
            'V',     'W',     'Y',     'Z',     'ZH',    '<sos>', '<eos>']

    @staticmethod
    def parse_csv(filepath):
        """Return the second column of the CSV at ``filepath``, skipping the first row."""
        subset = []
        with open(filepath) as f:
            f_csv = csv.reader(f)
            for row in f_csv:
                subset.append(row[1])
        return subset[1:]

    def __len__(self):
        """Number of chunks of at most ``sample`` files."""
        return int(np.ceil(self.length / self.sample))

    def __getitem__(self, i):
        """Load chunk ``i`` and return its concatenated, normalised frames.

        Raises:
            IndexError: ``i`` is not a valid chunk index.
        """
        if not 0 <= i < len(self):
            raise IndexError("chunk index {} out of range".format(i))

        sample_range = range(i * self.sample, min((i + 1) * self.sample, self.length))

        X = []
        for j in sample_range:
            X_path = self.X_dir + self.X_names[j]

            X_data = np.load(X_path)
            std = X_data.std(axis=0)
            # A constant feature column has zero std; divide by 1 there so the
            # column becomes 0 instead of NaN/inf.
            std = np.where(std == 0, np.ones_like(std), std)
            X_data = (X_data - X_data.mean(axis=0)) / std
            X.append(X_data)

        X = np.concatenate(X)
        return X


    
class LibriItems(torch.utils.data.Dataset):
    """Frame-level dataset yielding ``(flattened window, label)`` pairs.

    With ``context > 0`` the feature matrix is zero-padded with ``context``
    rows at both ends, so item ``i`` is the flattened block of
    ``2 * context + 1`` rows centred on original row ``i``. With
    ``context == 0`` item ``i`` is just row ``i`` flattened.
    """

    def __init__(self, X, Y, context = 0):
        assert(X.shape[0] == Y.shape[0])

        self.context = context
        self.length = X.shape[0]
        self.Y = Y
        if context == 0:
            self.X = X
        else:
            # Zero rows before and after the data; columns are left untouched.
            self.X = np.pad(X, ((context, context), (0, 0)), 'constant', constant_values=(0, 0))

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        if self.context == 0:
            features = self.X[i].flatten()
        else:
            # Row i of the padded matrix is `context` rows before original row i.
            last = i + 2 * self.context + 1
            features = self.X[i:last].flatten()
        return features, self.Y[i]

class LibriItemsTest(torch.utils.data.Dataset):
    """Frame-level dataset yielding flattened feature windows without labels.

    With ``context > 0`` the feature matrix is zero-padded with ``context``
    rows at both ends, so item ``i`` is the flattened block of
    ``2 * context + 1`` rows centred on original row ``i``. With
    ``context == 0`` item ``i`` is just row ``i`` flattened.
    """

    def __init__(self, X, context = 0):
        self.context = context
        self.length = X.shape[0]
        if context == 0:
            self.X = X
        else:
            # Zero rows before and after the data; columns are left untouched.
            self.X = np.pad(X, ((context, context), (0, 0)), 'constant', constant_values=(0, 0))

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        if self.context == 0:
            return self.X[i].flatten()
        # Row i of the padded matrix is `context` rows before original row i.
        last = i + 2 * self.context + 1
        return self.X[i:last].flatten()
    



In [9]:
def init_weights(m):
    """Re-initialise ``m`` in place if it is an ``nn.Linear``; ignore other modules.

    Weights are drawn with Kaiming-uniform initialisation and biases from a
    standard normal distribution. Intended for use with ``model.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.kaiming_uniform_(m.weight.data)
    torch.nn.init.normal_(m.bias.data)

In [12]:
def train(args, model, device, train_samples, optimizer, criterion, epoch):
    """Run one training epoch over every chunk of ``train_samples``.

    Args:
        args: dict providing ``'context'``, ``'batch_size'`` and ``'log_interval'``.
        model: network to optimise; switched to training mode here.
        device: ``torch.device`` (or device string) the batches are moved to.
        train_samples: chunked dataset whose items are ``(X, Y)`` arrays.
        optimizer: optimiser stepped once per batch.
        criterion: loss called as ``criterion(output, target)``.
        epoch: epoch number, used only in the progress log.
    """
    model.train()

    # Mixed precision and pinned host memory are only useful on CUDA; disabling
    # them elsewhere keeps the same code path without CUDA-unavailable warnings.
    use_amp = torch.device(device).type == "cuda"
    # One scaler for the whole epoch: creating it per chunk would reset the
    # loss scale it has adapted so far.
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    for i in range(len(train_samples)):
        X, Y = train_samples[i]
        train_items = LibriItems(X, Y, context=args['context'])
        train_loader = torch.utils.data.DataLoader(
            train_items, batch_size=args['batch_size'], shuffle=True,
            num_workers=2, pin_memory=use_amp)

        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.float().to(device)
            target = target.long().to(device)

            optimizer.zero_grad()
            with torch.cuda.amp.autocast(enabled=use_amp):
                output = model(data)
                loss = criterion(output, target)
            # With the scaler disabled these three calls reduce to a plain
            # backward() / optimizer.step().
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            if batch_idx % args['log_interval'] == 0:
                # Counts refer to the current chunk, not to the whole epoch.
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLR: {}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item(),
                    optimizer.param_groups[0]['lr']))


def test(args, model, device, dev_samples):
    """Return the frame-level accuracy of ``model`` over every chunk of ``dev_samples``.

    ``args`` supplies ``'context'`` and ``'batch_size'``. The model is put in
    eval mode and gradients are disabled while predicting.
    """
    model.eval()
    labels_seen, labels_predicted = [], []

    with torch.no_grad():
        for chunk_idx in range(len(dev_samples)):
            feats, labels = dev_samples[chunk_idx]
            loader = torch.utils.data.DataLoader(
                LibriItems(feats, labels, context=args['context']),
                batch_size=args['batch_size'],
                shuffle=False,
            )

            for batch, batch_labels in loader:
                batch = batch.float().to(device)
                batch_labels = batch_labels.long().to(device)

                logits = model(batch)
                # Predicted class = position of the largest logit in each row.
                labels_predicted.extend(torch.argmax(logits, axis=1).tolist())
                labels_seen.extend(batch_labels.tolist())

    return accuracy_score(labels_seen, labels_predicted)

def test_out(args, model, device, test_samples):
    """Return the predicted class index of every frame in ``test_samples``.

    ``args`` supplies ``'context'`` and ``'batch_size'``. Predictions are
    collected in dataset order into a flat Python list. The model is put in
    eval mode and gradients are disabled while predicting.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for chunk_idx in range(len(test_samples)):
            feats = test_samples[chunk_idx]
            loader = torch.utils.data.DataLoader(
                LibriItemsTest(feats, context=args['context']),
                batch_size=args['batch_size'],
                shuffle=False,
            )

            for batch in loader:
                logits = model(batch.float().to(device))
                # Predicted class = position of the largest logit in each row.
                predictions.extend(torch.argmax(logits, axis=1).tolist())

    return predictions

def _write_submission(predictions, path):
    """Write ``predictions`` to ``path`` as a CSV with ``id`` and ``label`` columns."""
    output = pd.DataFrame()
    output['id'] = np.array(range(len(predictions)))
    output['label'] = np.array(predictions)
    output.to_csv(path, index=False)


def main(args):
    """Train the network, checkpoint every epoch and write submission CSVs.

    Args:
        args: dict providing ``'context'``, ``'lr'``, ``'LIBRI_PATH'``,
            ``'epoch'`` plus the keys consumed by ``train`` / ``test`` /
            ``test_out`` (``'batch_size'``, ``'log_interval'``).

    Side effects: saves ``model_epoch_adam{epoch}.pt`` after every epoch,
    writes ``submission_SGD{epoch}.csv`` every fifth epoch and
    ``submission.csv`` at the end.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = Network(context=args['context']).to(device)
    # model.apply(init_weights)
    # model.load_state_dict(torch.load("model_epoch_20.pt"))
    optimizer = optim.Adam(model.parameters(), lr=args['lr'])

    criterion = torch.nn.CrossEntropyLoss()
    # The scheduler is stepped with the dev accuracy, hence mode='max'.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.5, patience=3, mode='max', threshold=0.0001)

    # If you want to use full Dataset, please pass None to csvpath
    train_samples = LibriSamples(data_path=args['LIBRI_PATH'], shuffle=True, partition="train-clean-100", csvpath=None, train=True)
    dev_samples = LibriSamples(data_path=args['LIBRI_PATH'], shuffle=True, partition="dev-clean", train=True)
    # shuffle=False: test predictions must follow the order given by test_order.csv.
    test_samples = LibriSamplesTest(data_path=args['LIBRI_PATH'], shuffle=False, partition="test-clean", csvpath="/content/test_order.csv", train=False)

    # Predictions made with the current weights, or None when they are stale.
    pred_y_submission = None

    for epoch in range(1, args['epoch'] + 1):
        train(args, model, device, train_samples, optimizer, criterion, epoch)
        test_acc = test(args, model, device, dev_samples)
        scheduler.step(test_acc)

        torch.save(model.state_dict(), f"model_epoch_adam{epoch}.pt")

        print('Dev accuracy ', test_acc)

        # The weights changed this epoch, so earlier predictions are stale.
        pred_y_submission = None
        if epoch % 5 == 0:
            pred_y_submission = test_out(args, model, device, test_samples)
            # NOTE(review): the file name says "SGD" although the optimiser is
            # Adam; kept unchanged so existing file names stay stable.
            _write_submission(pred_y_submission, f"submission_SGD{epoch}.csv")

    # Reuse the last epoch's predictions when they were just computed instead
    # of running test inference a second time on identical weights.
    if pred_y_submission is None:
        pred_y_submission = test_out(args, model, device, test_samples)
    _write_submission(pred_y_submission, "submission.csv")

if __name__ == '__main__':
    # Run configuration handed to main(); keys are read by main/train/test/test_out.
    args = {}
    args['batch_size'] = 2048      # frames per mini-batch
    args['context'] = 24           # frames of context on each side of the centre frame
    args['log_interval'] = 300     # batches between two training log lines
    args['LIBRI_PATH'] = '/content/hw1p2_student_data'
    args['lr'] = 0.001             # initial Adam learning rate
    args['epoch'] = 20             # number of training epochs
    main(args)



KeyboardInterrupt: ignored

In [None]:
# ! kaggle competitions submit -c 11-785-s22-hw1p2 -f "submission.csv" -m "50 epochs, 10 layers, exponential LR scheduler"