In [None]:
INPUT_LOC = "/content/drive/MyDrive/ma_dataset/similar_langs_small/100_set/" # remember to keep the trailing "/" at the end
OUTPUT_LOC = "/content/drive/MyDrive/ma_dataset/similar_langs_small/100_set_auto/" # remember to keep the trailing "/": output paths are built by string concatenation
DATASET_PATH = INPUT_LOC # change this depending on the task
JSON_PATH = OUTPUT_LOC+"recordingslist.json" # JSON manifest of audio files and labels written by fetch_data
LANGUAGES = 3 # number of language classes passed to vector_train

#**Imports**

In [None]:
import os
import numpy as np
import glob
import argparse
from itertools import compress
import random
import json
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.utils.data import DataLoader  
from torch import optim
# import torch
# import torch.autograd as grad
# import torch.nn.functional as F
# import torch.nn as nn
# from torch import optim
# from torch.utils.data import DataLoader   
# from sklearn.metrics import accuracy_score

# from SpeechDataGenerator import SpeechDataGenerator
# from utils import utils
# from models.model import RawNet
# from loss import GE2ELoss



# torch.multiprocessing.set_sharing_strategy('file_system')

#**Data Loading**

In [None]:
# Mount Google Drive (Colab only) so the /content/drive/MyDrive/... paths resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def fetch_data(dataset_path, json_path):
    """Index every audio file under ``dataset_path`` and save the index as JSON.

    Each sub-folder of ``dataset_path`` is treated as one language: the folder
    name becomes the label name and the folder's position in the (sorted)
    directory walk, minus one, becomes the integer label code. Files located
    directly in ``dataset_path`` are ignored.

    Args:
        dataset_path: Root folder containing one sub-folder per language
            (``str`` or path-like).
        json_path: Destination of the JSON manifest. The manifest holds three
            parallel lists: ``file_path``, ``file_label_name`` and
            ``file_label_code``. Missing parent folders are created.

    Returns:
        None. The manifest is written to ``json_path`` and progress is printed.
    """
    # dictionary with the parallel lists that make up the manifest
    data = {
        "file_path": [],
        "file_label_name": [],
        "file_label_code": [],
    }

    # Compare normalised path *values* to recognise the root folder. An
    # identity check (`is not`) only works while os.walk happens to hand back
    # the very same str object, and fails for path-like arguments.
    root = os.path.normpath(os.fspath(dataset_path))

    # loop through all language sub-folders
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
        # Sorting in place makes the walk order, and therefore the label
        # codes, reproducible across runs and file systems.
        dirnames.sort()

        # only language sub-folders carry samples; skip the root itself
        if os.path.normpath(dirpath) == root:
            continue

        # the sub-folder name is the language label
        folder = os.path.basename(dirpath)
        print("\nProcessing: {}".format(folder))

        # register all audio files of this sub-folder (n is a progress counter)
        for n, f in enumerate(sorted(filenames), start=1):
            file_path = os.path.join(dirpath, f)
            data["file_path"].append(file_path)
            data["file_label_name"].append(folder)
            data["file_label_code"].append(i - 1)
            print(str(n) + " " + file_path)

    # make sure the destination folder exists before writing the manifest
    out_dir = os.path.dirname(os.fspath(json_path))
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # save list of files to json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [None]:
# Build the file/label manifest for DATASET_PATH and save it to JSON_PATH.
fetch_data(DATASET_PATH, JSON_PATH)


Processing: french
1 /content/drive/MyDrive/ma_dataset/similar_langs_small/100_set/french/french10.wav
2 /content/drive/MyDrive/ma_dataset/similar_langs_small/100_set/french/french3.wav
3 /content/drive/MyDrive/ma_dataset/similar_langs_small/100_set/french/french2.wav
4 /content/drive/MyDrive/ma_dataset/similar_langs_small/100_set/french/french13.wav
5 /content/drive/MyDrive/ma_dataset/similar_langs_small/100_set/french/french11.wav
6 /content/drive/MyDrive/ma_dataset/similar_langs_small/100_set/french/french14.wav
7 /content/drive/MyDrive/ma_dataset/similar_langs_small/100_set/french/french5.wav
8 /content/drive/MyDrive/ma_dataset/similar_langs_small/100_set/french/french1.wav
9 /content/drive/MyDrive/ma_dataset/similar_langs_small/100_set/french/french4.wav
10 /content/drive/MyDrive/ma_dataset/similar_langs_small/100_set/french/french12.wav
11 /content/drive/MyDrive/ma_dataset/similar_langs_small/100_set/french/french7.wav
12 /content/drive/MyDrive/ma_dataset/similar_langs_small/100

#**Train, Test, Validation sets**

In [None]:
def load_data(data_path):
    """Read the JSON manifest and return its file paths and label codes.

    Note: a later cell in this notebook defines another function that is also
    called ``load_data`` (the spectrogram loader); once that cell has run,
    this name refers to the later definition.

    Args:
        data_path: Path of the JSON manifest (needs the keys ``file_path``
            and ``file_label_code``).

    Returns:
        Tuple ``(paths, labels)`` of numpy arrays built from those two lists.
    """
    with open(data_path, "r") as manifest_file:
        manifest = json.load(manifest_file)

    # hand the two lists back as numpy arrays
    return np.array(manifest["file_path"]), np.array(manifest["file_label_code"])

In [None]:
# load data
# NOTE: run this before the feature-extraction helper cell; that cell defines a
# different function that is also named load_data and replaces this one.
paths, labels = load_data(JSON_PATH)

In [None]:
# stratified split: 20% goes to validation first, then 20% of the remaining 80%
# goes to test (overall 64% train / 16% test / 20% validation).
# random_state=42 keeps both splits reproducible.
data_pre, data_val, labels_pre, labels_val = train_test_split(paths, labels, test_size=0.20, stratify= labels, random_state=42)
data_train, data_test, labels_train, labels_test = train_test_split(data_pre, labels_pre, test_size=0.20, stratify= labels_pre, random_state=42)

In [None]:
## ready to use
# data_train, labels_train
# data_test, labels_test
# data_val, labels_val

In [None]:
def numpy_data_to_txt(X, y, output_file):
    """Write paired samples and labels to a text file, one pair per line.

    Every line is ``<x> <y> `` (with a space before the line break, kept so
    the file format stays byte-identical to earlier runs).

    Args:
        X: Iterable of samples (here: audio file paths).
        y: Iterable of labels, paired with ``X`` element by element.
        output_file: Path of the text file; it is created or overwritten.

    Returns:
        None.
    """
    # `with` guarantees the handle is closed even if a write fails
    with open(output_file, "w") as f:
        for sample, label in zip(X, y):
            f.write(f"{sample} {label} \n")


In [None]:
# Persist each split as a "<path> <label>" text manifest under OUTPUT_LOC.
numpy_data_to_txt(data_train,labels_train, OUTPUT_LOC+"training.txt")
numpy_data_to_txt(data_test,labels_test, OUTPUT_LOC+"testing.txt")
numpy_data_to_txt(data_val,labels_val, OUTPUT_LOC+"validation.txt")



#**Feature extraction**

**helper functions**



In [None]:

def load_wav(audio_filepath, sr, min_dur_sec=4):
    """Load an audio file and zero-pad it to a minimum duration.

    Args:
        audio_filepath: Path of the audio file to read.
        sr: Sampling rate in Hz that the audio is loaded at.
        min_dur_sec: Minimum duration in seconds. Shorter recordings are
            padded with trailing zeros; longer ones are returned unchanged.

    Returns:
        numpy array holding at least ``int(min_dur_sec * sr)`` samples.
    """
    # Load at the rate the caller asked for. A hard-coded 16000 here would
    # make the padding length below disagree with the audio for any other sr.
    audio_data, _ = librosa.load(audio_filepath, sr=sr)

    missing = int(min_dur_sec * sr) - len(audio_data)
    if missing > 0:
        # np.zeros yields float64, so the padded signal is float64 as before
        return np.concatenate((audio_data, np.zeros(missing)))
    return audio_data


def lin_mel_from_wav(wav, hop_length, win_length, n_mels, sr=22050):
    """Compute a mel spectrogram of ``wav`` and return it transposed.

    Args:
        wav: Audio samples as a numpy array.
        hop_length: Hop between successive frames, in samples.
        win_length: Analysis window length, in samples.
        n_mels: Number of mel bands.
        sr: Sampling rate of ``wav`` in Hz. Defaults to 22050, the value
            librosa assumes when no rate is given; pass the real rate of the
            audio (e.g. 16000) for a correctly scaled mel filter bank.

    Returns:
        The mel spectrogram transposed, i.e. with frames on the first axis.
    """
    # The audio must be passed by keyword: recent librosa releases no longer
    # accept it as a positional argument.
    mel = librosa.feature.melspectrogram(
        y=wav, sr=sr, n_mels=n_mels, win_length=win_length, hop_length=hop_length
    )
    return mel.T

def lin_spectogram_from_wav(wav, hop_length, win_length, n_fft=512):
    """Compute the linear (STFT) spectrogram of ``wav`` and return it transposed.

    Args:
        wav: Audio samples as a numpy array.
        hop_length: Hop between successive frames, in samples.
        win_length: Analysis window length, in samples.
        n_fft: FFT size.

    Returns:
        The transposed output of ``librosa.stft``.
    """
    stft_matrix = librosa.stft(wav, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
    transposed = stft_matrix.T
    return transposed


def feature_extraction(filepath,sr=16000, min_dur_sec=4,win_length=400,hop_length=160, n_mels=40, spec_len=400,mode='train'):
    """Return the normalised magnitude spectrogram of a complete audio file.

    The waveform is loaded (and padded to ``min_dur_sec``), turned into an
    STFT magnitude spectrogram and normalised with the mean and standard
    deviation taken along axis 0. ``n_mels``, ``spec_len`` and ``mode`` are
    accepted but not used by this function.

    Returns:
        2-D numpy array ``(spec - mean) / (std + 1e-5)``.
    """
    waveform = load_wav(filepath, sr=sr, min_dur_sec=min_dur_sec)
    stft_frames = lin_spectogram_from_wav(waveform, hop_length, win_length, n_fft=512)
    magnitude = librosa.magphase(stft_frames)[0]  # keep magnitude, drop phase
    spec = magnitude.T

    # per-column statistics (reduction over axis 0)
    mean = np.mean(spec, 0, keepdims=True)
    spread = np.std(spec, 0, keepdims=True)
    normalised = (spec - mean) / (spread + 1e-5)
    return normalised
    

def load_raw_data(filepath,sr=16000):
    """Load a raw waveform, zero-padded to at least 3 seconds.

    Args:
        filepath: Path of the audio file.
        sr: Sampling rate in Hz used for loading.

    Returns:
        The waveform produced by ``load_wav``.
    """
    # Return the loaded audio; without this the function would load the file
    # and then hand back None.
    return load_wav(filepath, sr=sr, min_dur_sec=3.0)
    
    
def load_data(filepath,sr=16000, min_dur_sec=4,win_length=400,hop_length=160, n_mels=40, spec_len=400,mode='train'):
    """Load one audio file as a normalised magnitude spectrogram.

    Note: this function shares its name with the JSON manifest loader defined
    in an earlier cell and replaces it once this cell has run.

    Args:
        filepath: Path of the audio file.
        sr: Sampling rate in Hz used for loading.
        min_dur_sec: Minimum duration; shorter audio is zero-padded.
        win_length: STFT window length in samples.
        hop_length: STFT hop in samples.
        n_mels: Unused; kept for interface compatibility.
        spec_len: Width of the random crop taken along the second axis when
            ``mode == 'train'``.
        mode: ``'train'`` takes a random crop of ``spec_len`` columns; any
            other value keeps the whole spectrogram.

    Returns:
        2-D numpy array, normalised with the mean and standard deviation
        taken along axis 0.
    """
    audio_data = load_wav(filepath, sr=sr,min_dur_sec=min_dur_sec)
    linear_spect = lin_spectogram_from_wav(audio_data, hop_length, win_length, n_fft=512)
    mag, _ = librosa.magphase(linear_spect)  # magnitude
    mag_T = mag.T

    if mode=='train':
        # Last valid start index of a spec_len-wide window. randint's upper
        # bound is exclusive, hence the +1. When the spectrogram is not longer
        # than spec_len there is nothing to randomise (and randint would
        # raise), so the crop simply starts at 0.
        max_start = mag_T.shape[1] - spec_len
        randtime = np.random.randint(0, max_start + 1) if max_start > 0 else 0
        spec_mag = mag_T[:, randtime:randtime+spec_len]
    else:
        spec_mag = mag_T

    # preprocessing: subtract the mean and divide by the standard deviation
    mu = np.mean(spec_mag, 0, keepdims=True)
    std = np.std(spec_mag, 0, keepdims=True)
    return (spec_mag - mu) / (std + 1e-5)
    


def load_npy_data(filepath,spec_len=400,mode='train'):
    """Load a saved 2-D feature array, optionally taking a random crop.

    Args:
        filepath: Path of the ``.npy`` file.
        spec_len: Width of the crop along the second axis in train mode.
        mode: ``'train'`` returns a random window of ``spec_len`` columns;
            any other value returns the whole array.

    Returns:
        numpy array: the crop (train mode) or the full array.
    """
    mag_T = np.load(filepath)
    if mode != 'train':
        return mag_T

    # Last valid start index of a spec_len-wide window. randint's upper bound
    # is exclusive, hence the +1. Arrays that are not wider than spec_len are
    # returned from column 0 instead of making randint raise.
    max_start = mag_T.shape[1] - spec_len
    randtime = np.random.randint(0, max_start + 1) if max_start > 0 else 0
    return mag_T[:, randtime:randtime+spec_len]
    


def speech_collate(batch):
    """Split a batch of sample dicts into parallel feature and label lists.

    Args:
        batch: Iterable of dicts that carry the keys ``'features'`` and
            ``'labels'``.

    Returns:
        Tuple ``(specs, targets)`` of two lists, in batch order.
    """
    # single pass over the batch, reading features first and labels second
    pairs = [(sample['features'], sample['labels']) for sample in batch]
    specs = [features for features, _ in pairs]
    targets = [labels for _, labels in pairs]
    return specs, targets


In [None]:

def extract_features(audio_filepath):
    """Compute the features of one audio file using the default settings.

    Thin wrapper around ``feature_extraction``.
    """
    return feature_extraction(audio_filepath)
    
    

def FE_pipeline(feature_list,store_loc,mode):
    """Extract features for every listed file and write a feature manifest.

    For each ``"<audio path> <label>"`` row the features are computed, saved
    as ``.npy`` below ``store_loc/mode/<third-last path part>/<second-last
    path part>/`` and registered as ``"<npy path> <label>"`` in the manifest
    ``OUTPUT_LOC + '<split>_feat.txt'``.

    Args:
        feature_list: Iterable of manifest rows (``"<path> <label>"``); blank
            rows are skipped.
        store_loc: Root folder for the extracted features.
        mode: One of ``'train'``, ``'test'`` or ``'validation'``.

    Raises:
        ValueError: If ``mode`` is not one of the three supported values.
    """
    manifest_names = {
        'train': 'training_feat.txt',
        'test': 'testing_feat.txt',
        'validation': 'validation_feat.txt',
    }
    # Fail early and explicitly: merely printing a message would leave the
    # manifest handle undefined and crash later with an unrelated error.
    if mode not in manifest_names:
        raise ValueError(
            "Unknown mode {!r}; expected one of {}".format(mode, sorted(manifest_names))
        )

    create_root = os.path.join(store_loc,mode)
    os.makedirs(create_root, exist_ok=True)

    # `with` closes the manifest even if feature extraction fails midway
    with open(OUTPUT_LOC+manifest_names[mode],'w') as fid:
        for row in feature_list:
            if not row.strip():
                continue  # ignore blank rows
            fields = row.split(' ')
            filepath, lang_id = fields[0], fields[1]

            path_parts = filepath.split('/')
            filename = path_parts[-1]
            vid_folder = path_parts[-2]
            lang_folder = path_parts[-3]

            create_folders = os.path.join(create_root,lang_folder,vid_folder)
            os.makedirs(create_folders, exist_ok=True)

            extract_feats = extract_features(filepath)
            # splitext copes with extensions of any length (.wav, .flac, ...)
            dest_filepath = create_folders+'/'+os.path.splitext(filename)[0]+'.npy'
            np.save(dest_filepath,extract_feats)
            fid.write(dest_filepath+' '+lang_id+'\n')
    

In [None]:
store_loc = OUTPUT_LOC+'features'

# Each split manifest is read inside a `with` block so its file handle is
# closed right away instead of being left open.
with open(OUTPUT_LOC+'training.txt') as manifest_file:
    read_train = [line.rstrip('\n') for line in manifest_file]
FE_pipeline(read_train,store_loc,mode='train')

with open(OUTPUT_LOC+'testing.txt') as manifest_file:
    read_test = [line.rstrip('\n') for line in manifest_file]
FE_pipeline(read_test,store_loc,mode='test')

with open(OUTPUT_LOC+'validation.txt') as manifest_file:
    read_val = [line.rstrip('\n') for line in manifest_file]
FE_pipeline(read_val,store_loc,mode='validation')
    

#**dataloaders**

In [None]:

class DatasetLoader_D():
    """Speech dataset for the d-vector model.

    Each item bundles the spectrograms of *all* manifest files that share the
    label of the indexed file, together with one label per spectrogram.
    """

    def __init__(self, manifest, mode):
        """Read the manifest text file and collect paths and labels.

        Args:
            manifest: Text file with one ``"<path> <label>"`` entry per line;
                blank lines are ignored.
            mode: Forwarded to ``load_data`` (``'train'`` enables the random
                crop).
        """
        self.mode=mode
        self.audio_links = []
        self.labels = []
        # Read the file once and close it deterministically.
        with open(manifest) as manifest_file:
            for line in manifest_file:
                if not line.strip():
                    continue
                fields = line.rstrip('\n').split(' ')
                self.audio_links.append(fields[0])
                self.labels.append(int(fields[1]))

    def __len__(self):
        """Number of manifest entries."""
        return len(self.audio_links)

    def __getitem__(self, idx):
        """Return the features and labels of every file in item ``idx``'s class.

        Returns:
            dict with ``'features'`` (stacked spectrograms as a tensor) and
            ``'labels'`` (one class id per spectrogram, as a tensor).
        """
        class_id = self.labels[idx]
        # all files that carry the same label as the requested item
        selected_files = [
            link for link, label in zip(self.audio_links, self.labels)
            if label == class_id
        ]
        # Load each selected file. The loop variable must be used here:
        # loading the indexed file again for every entry would give N crops
        # of a single recording instead of N recordings of the class.
        specs = [load_data(audio_filepath, mode=self.mode) for audio_filepath in selected_files]
        feats = np.asarray(specs)
        label_arr = np.asarray([class_id] * len(selected_files))

        sample = {'features': torch.from_numpy(np.ascontiguousarray(feats)), 'labels': torch.from_numpy(np.ascontiguousarray(label_arr))}
        return sample
        
    

In [None]:
import numpy as np
import torch

class DatasetLoader_X():
    """Speech dataset for the x-vector model: one spectrogram per item."""

    def __init__(self, manifest, mode):
        """Read the manifest text file and collect paths and labels.

        Args:
            manifest: Text file with one ``"<path> <label>"`` entry per line;
                blank lines are ignored.
            mode: Forwarded to ``load_data`` (``'train'`` enables the random
                crop).
        """
        self.mode=mode
        self.audio_links = []
        self.labels = []
        # Read the file once and close it deterministically.
        with open(manifest) as manifest_file:
            for line in manifest_file:
                if not line.strip():
                    continue
                fields = line.rstrip('\n').split(' ')
                self.audio_links.append(fields[0])
                self.labels.append(int(fields[1]))

    def __len__(self):
        """Number of manifest entries."""
        return len(self.audio_links)

    def __getitem__(self, idx):
        """Return the spectrogram and label of item ``idx``.

        Returns:
            dict with ``'features'`` (spectrogram tensor) and ``'labels'``
            (tensor holding the class id; ``np.ascontiguousarray`` turns the
            scalar into a one-element array).
        """
        audio_link = self.audio_links[idx]
        class_id = self.labels[idx]
        spec = load_data(audio_link,mode=self.mode)
        sample = {'features': torch.from_numpy(np.ascontiguousarray(spec)), 'labels': torch.from_numpy(np.ascontiguousarray(class_id))}
        return sample
        

#**D-Vector Architecture**

In [None]:
# d-vector
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
import torch.nn.functional as F
import torch

def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 1-D convolution with kernel size 3 and padding 1."""
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv1d(in_planes, out_planes, **conv_kwargs)


class BasicBlock3x3(nn.Module):
    """Residual block: two kernel-3 1-D convolutions with batch norm.

    The block input (optionally passed through ``downsample``) is added to
    the output of the second batch norm before the final LeakyReLU.
    """

    expansion = 1

    def __init__(self, inplanes3, planes, stride=1, downsample=None):
        """
        Args:
            inplanes3: Number of input channels.
            planes: Number of output channels.
            stride: Stride of the first convolution.
            downsample: Optional module applied to the input on the shortcut.
        """
        super().__init__()
        self.conv1 = conv3x3(inplanes3, planes, stride)
        self.bn1 = nn.BatchNorm1d(planes)
        self.relu = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm1d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum."""
        # main branch: conv -> bn -> relu -> conv -> bn
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))

        # shortcut branch: identity unless a downsample module was supplied
        shortcut = x if self.downsample is None else self.downsample(x)

        main += shortcut
        return self.relu(main)





class RawNet(nn.Module):
    """1-D residual CNN followed by a GRU, an embedding layer and a classifier.

    The network consumes a tensor laid out as ``nn.Conv1d`` expects
    (batch, ``input_channel``, length) and returns class logits together with
    the 128-dimensional embedding they were computed from.
    """

    def __init__(self, input_channel, num_classes=1211):
        """
        Args:
            input_channel: Number of input channels of the first convolution.
            num_classes: Size of the output (classification) layer.
        """
        super(RawNet, self).__init__()
        # Channel count fed to the next residual stage; _make_layer3 updates it.
        self.inplanes3 = 128
        self.conv1 = nn.Conv1d(input_channel, 128, kernel_size=3, stride=1, padding=0,
                               bias=False)
        self.bn1 = nn.BatchNorm1d(128)
        self.relu = nn.LeakyReLU(negative_slope=0.01,inplace=True)
        #############################################################################
        # Stage 1: two 128-channel residual blocks, pooled with stride 1.
        self.resblock_1_1 = self._make_layer3(BasicBlock3x3, 128, 1, stride=1)
        self.resblock_1_2 = self._make_layer3(BasicBlock3x3, 128, 1, stride=1)
        self.maxpool_resblock_1 = nn.MaxPool1d(kernel_size=3, stride=1, padding=0)
        #############################################################################
        # Stage 2: four 256-channel residual blocks, pooled with stride 2.
        self.resblock_2_1 = self._make_layer3(BasicBlock3x3, 256, 1, stride=1)
        self.resblock_2_2 = self._make_layer3(BasicBlock3x3, 256, 1, stride=1)
        self.resblock_2_3 = self._make_layer3(BasicBlock3x3, 256, 1, stride=1)
        self.resblock_2_4 = self._make_layer3(BasicBlock3x3, 256, 1, stride=1)
        self.maxpool_resblock_2 = nn.MaxPool1d(kernel_size=3, stride=2, padding=0)

        ############################################################################
        # No `dropout` argument: GRU dropout only acts between stacked layers,
        # so on this single-layer GRU it had no effect and merely triggered a
        # UserWarning at construction time.
        self.gru = nn.GRU(input_size=256, hidden_size=1024,bidirectional=False,batch_first=True)
        self.spk_emb = nn.Linear(1024,128)
        self.output_layer = nn.Linear(128, num_classes)


    def _make_layer3(self, block, planes, blocks, stride=1):
        """Build a sequence of ``blocks`` residual blocks with ``planes`` channels.

        A 1x1 convolution + batch norm is placed on the shortcut of the first
        block whenever the stride or the channel count changes.
        """
        downsample = None
        if stride != 1 or self.inplanes3 != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv1d(self.inplanes3, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm1d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes3, planes, stride, downsample))
        # the following blocks (and stages) start from the new channel count
        self.inplanes3 = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes3, planes))

        return nn.Sequential(*layers)


    def forward(self, inputs):
        """Run the network.

        Returns:
            Tuple ``(preds, spk_embeddings)``: class logits and the
            128-dimensional embedding.
        """
        out = self.conv1(inputs)
        out = self.bn1(out)
        out = self.relu(out)

        ## ResBlock-1
        out = self.resblock_1_1(out)
        out = self.maxpool_resblock_1(out)
        out = self.resblock_1_2(out)
        out = self.maxpool_resblock_1(out)
        ## ResBlock-2
        out = self.resblock_2_1(out)
        out = self.maxpool_resblock_2(out)
        out = self.resblock_2_2(out)
        out = self.maxpool_resblock_2(out)
        out = self.resblock_2_3(out)
        out = self.maxpool_resblock_2(out)
        out = self.resblock_2_4(out)
        out = self.maxpool_resblock_2(out)
        ### GRU (batch_first, so move the channel axis last)
        out = out.permute(0,2,1)
        out,_ = self.gru(out)
        out = out.permute(0,2,1)
        # the GRU output at the last step feeds the embedding layer
        spk_embeddings = self.spk_emb(out[:,:,-1])
        preds = self.output_layer(spk_embeddings)

        return preds,spk_embeddings

#**X-Vector Architecture**

In [None]:
# x-vector

import torch.nn as nn
import torch.nn.functional as F

class TDNN(nn.Module):
    """Time-delay neural network layer built from ``F.unfold`` + ``nn.Linear``."""

    def __init__(
                    self,
                    input_dim=23,
                    output_dim=512,
                    context_size=5,
                    stride=1,
                    dilation=1,
                    batch_norm=False,
                    dropout_p=0.2
                ):
        '''
        TDNN as defined by https://www.danielpovey.com/files/2015_interspeech_multisplice.pdf
        Affine transformation not applied globally to all frames but smaller windows with local context

        input_dim: number of features per input frame
        output_dim: number of features per output frame
        context_size: number of frames combined into one output frame
        stride: step, in frames, between successive context windows
        dilation: spacing, in frames, between the frames of one window
        batch_norm: True to include batch normalisation after the non linearity
        dropout_p: dropout probability; a falsy value disables dropout

        Context size and dilation determine the frames selected
        (although context size is not really defined in the traditional sense)
        For example:
            context size 5 and dilation 1 is equivalent to [-2,-1,0,1,2]
            context size 3 and dilation 2 is equivalent to [-2, 0, 2]
            context size 1 and dilation 1 is equivalent to [0]
        '''
        super(TDNN, self).__init__()
        self.context_size = context_size
        self.stride = stride
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.dilation = dilation
        self.dropout_p = dropout_p
        self.batch_norm = batch_norm

        self.kernel = nn.Linear(input_dim*context_size, output_dim)
        self.nonlinearity = nn.ReLU()
        if self.batch_norm:
            self.bn = nn.BatchNorm1d(output_dim)
        if self.dropout_p:
            self.drop = nn.Dropout(p=self.dropout_p)

    def forward(self, x):
        '''
        input: size (batch, seq_len, input_features)
        output: size (batch, new_seq_len, output_features)
        '''

        _, _, d = x.shape
        assert (d == self.input_dim), 'Input dimension was wrong. Expected ({}), got ({})'.format(self.input_dim, d)
        x = x.unsqueeze(1)

        # Unfold input into smaller temporal contexts. The temporal step is
        # the configured stride (with the default of 1 every frame starts a
        # window); the feature axis is always covered by a single window.
        x = F.unfold(
                        x,
                        (self.context_size, self.input_dim),
                        stride=(self.stride,self.input_dim),
                        dilation=(self.dilation,1)
                    )

        # N, input_dim*context_size, new_t = x.shape
        x = x.transpose(1,2)
        x = self.kernel(x.float())
        x = self.nonlinearity(x)

        if self.dropout_p:
            x = self.drop(x)

        if self.batch_norm:
            # BatchNorm1d normalises over axis 1, so move the features there
            x = x.transpose(1,2)
            x = self.bn(x)
            x = x.transpose(1,2)

        return x

import torch.nn as nn
import torch


class X_vector(nn.Module):
    """X-vector network: five TDNN layers, statistics pooling, linear layers.

    ``forward`` returns the class predictions and the 512-dimensional
    embedding they are computed from.
    """

    def __init__(self, input_dim = 40, num_classes=8):
        """
        Args:
            input_dim: Number of features per input frame.
            num_classes: Size of the output layer.
        """
        super().__init__()
        # frame-level layers
        self.tdnn1 = TDNN(input_dim=input_dim, output_dim=512, context_size=5, dilation=1, dropout_p=0.5)
        self.tdnn2 = TDNN(input_dim=512, output_dim=512, context_size=3, dilation=1, dropout_p=0.5)
        self.tdnn3 = TDNN(input_dim=512, output_dim=512, context_size=2, dilation=2, dropout_p=0.5)
        self.tdnn4 = TDNN(input_dim=512, output_dim=512, context_size=1, dilation=1, dropout_p=0.5)
        self.tdnn5 = TDNN(input_dim=512, output_dim=512, context_size=1, dilation=3, dropout_p=0.5)
        # segment-level layers; the pooled input concatenates two 512-dim statistics
        self.segment6 = nn.Linear(1024, 512)
        self.segment7 = nn.Linear(512, 512)
        self.output = nn.Linear(512, num_classes)
        self.softmax = nn.Softmax(dim=1)  # defined but not applied in forward

    def forward(self, inputs):
        """Return ``(predictions, x_vec)`` for a batch of frame sequences."""
        frames = inputs
        for tdnn_layer in (self.tdnn1, self.tdnn2, self.tdnn3, self.tdnn4, self.tdnn5):
            frames = tdnn_layer(frames)

        # statistics pooling over axis 1: mean and variance, concatenated
        frame_mean = torch.mean(frames, 1)
        frame_var = torch.var(frames, 1)
        pooled = torch.cat((frame_mean, frame_var), 1)

        x_vec = self.segment7(self.segment6(pooled))
        return self.output(x_vec), x_vec

#**training and validation functions** 

In [None]:
from sklearn.metrics import accuracy_score
def train(dataloader_train,epoch, model, device, optimizer,ce_loss,  mode="x"):
    """Run one training epoch and print the mean loss and accuracy.

    Args:
        dataloader_train: Loader yielding ``(specs, targets)`` pairs of lists.
        epoch: Epoch index, used only in the printed message.
        model: Network returning ``(logits, embedding)``.
        device: Device the batches are moved to.
        optimizer: Optimizer stepped once per batch.
        ce_loss: Loss applied to the logits and labels.
        mode: ``"x"`` stacks one transposed spectrogram per sample; any other
            value concatenates the per-item tensors.
    """
    batch_losses = []
    all_predictions = []
    all_targets = []
    model.train()
    for sample_batched in dataloader_train:
        batch_specs = sample_batched[0]
        batch_labels = sample_batched[1]
        if mode == "x":
            stacked = np.asarray([spec.numpy().T for spec in batch_specs])
            features = torch.from_numpy(stacked).float()
            labels = torch.from_numpy(np.asarray([label[0].numpy() for label in batch_labels]))
        else:
            features = torch.cat((batch_specs)).float()
            labels = torch.cat((batch_labels))
        features = features.to(device)
        labels = labels.to(device)
        features.requires_grad = True

        optimizer.zero_grad()
        pred_logits, _ = model(features)
        #### CE loss
        loss = ce_loss(pred_logits, labels)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

        # keep predictions and ground truth for the epoch-level accuracy
        all_predictions.extend(np.argmax(pred_logits.detach().cpu().numpy(), axis=1))
        all_targets.extend(labels.detach().cpu().numpy())

    mean_acc = accuracy_score(all_targets, all_predictions)
    mean_loss = np.mean(np.asarray(batch_losses))
    print('Total training loss {} and training Accuracy {} after {} epochs'.format(mean_loss,mean_acc,epoch))
    


def validation(dataloader_val,epoch, model, device, optimizer,ce_loss,  mode="x"):
    """Evaluate the model on the validation loader and save a checkpoint.

    Prints the mean loss and accuracy, then stores the model and optimizer
    state under ``OUTPUT_LOC + 'save_model'`` (created if missing). A
    checkpoint is written on every call.

    Args:
        dataloader_val: Loader yielding ``(specs, targets)`` pairs of lists.
        epoch: Epoch index, used in the message and the checkpoint name.
        model: Network returning ``(logits, embedding)``.
        device: Device the batches are moved to.
        optimizer: Optimizer whose state is stored in the checkpoint.
        ce_loss: Loss applied to the logits and labels.
        mode: ``"x"`` stacks one transposed spectrogram per sample; any other
            value concatenates the per-item tensors.
    """
    model.eval()
    with torch.no_grad():
        val_loss_list=[]
        full_preds=[]
        full_gts=[]
        for i_batch, sample_batched in enumerate(dataloader_val):
            if mode == "x":
                features = torch.from_numpy(np.asarray([torch_tensor.numpy().T for torch_tensor in sample_batched[0]])).float()
                labels = torch.from_numpy(np.asarray([torch_tensor[0].numpy() for torch_tensor in sample_batched[1]]))
            else:
                features = torch.cat((sample_batched[0])).float()
                labels = torch.cat((sample_batched[1]))
            features, labels = features.to(device),labels.to(device)
            pred_logits,vec = model(features)
            #### CE loss
            loss = ce_loss(pred_logits,labels)
            val_loss_list.append(loss.item())
            predictions = np.argmax(pred_logits.detach().cpu().numpy(),axis=1)
            full_preds.extend(predictions)
            full_gts.extend(labels.detach().cpu().numpy())

        mean_acc = accuracy_score(full_gts,full_preds)
        mean_loss = np.mean(np.asarray(val_loss_list))
        print('Total vlidation loss {} and Validation accuracy {} after {} epochs'.format(mean_loss,mean_acc,epoch))

        # torch.save does not create missing folders, so make sure the
        # checkpoint directory exists before writing into it
        save_dir = OUTPUT_LOC+'save_model'
        os.makedirs(save_dir, exist_ok=True)
        model_save_path = os.path.join(save_dir, 'best_check_point_'+str(epoch)+'_'+str(mean_loss))
        state_dict = {'model': model.state_dict(),'optimizer': optimizer.state_dict(),'epoch': epoch}
        torch.save(state_dict, model_save_path)

#**driver code**

In [None]:
def vector_train(mode="x", languages = 6):
    """Train and validate an x-vector or d-vector model for 20 epochs.

    Args:
        mode: ``"x"`` trains ``X_vector`` on ``DatasetLoader_X``; any other
            value trains ``RawNet`` on ``DatasetLoader_D``.
        languages: Number of output classes of the model.
    """
    ### Data related
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # pick the dataset / model pair that belongs to the requested mode
    if mode == "x":
        dataset_cls, model_cls = DatasetLoader_X, X_vector
    else:
        dataset_cls, model_cls = DatasetLoader_D, RawNet

    def make_loader(manifest_name):
        # both splits are read with mode='train' and shuffled batches of 4
        dataset = dataset_cls(manifest=OUTPUT_LOC+manifest_name, mode='train')
        return DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=speech_collate)

    dataloader_train = make_loader('training.txt')
    dataloader_val = make_loader('validation.txt')

    # first argument is the model's input size (257 features / channels)
    model = model_cls(257, languages).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.0, betas=(0.9, 0.98), eps=1e-9)
    ce_loss = nn.CrossEntropyLoss()

    for epoch in tqdm(range(20)):
        train(dataloader_train, epoch, model, device, optimizer, ce_loss, mode)
        validation(dataloader_val, epoch, model, device, optimizer, ce_loss, mode)

In [None]:
# Train and validate the x-vector model (20 epochs) on LANGUAGES classes.
vector_train(mode="x", languages = LANGUAGES)

  0%|          | 0/20 [00:00<?, ?it/s]

Total training loss 1.103741390009721 and training Accuracy 0.328125 after 0 epochs
Total vlidation loss 1.0985203345616659 and Validation accuracy 0.3333333333333333 after 0 epochs


  5%|▌         | 1/20 [00:33<10:31, 33.25s/it]

Total training loss 1.0981587022542953 and training Accuracy 0.296875 after 1 epochs


 10%|█         | 2/20 [01:03<09:31, 31.76s/it]

Total vlidation loss 1.0929180145263673 and Validation accuracy 0.2833333333333333 after 1 epochs
Total training loss 1.0645711819330852 and training Accuracy 0.375 after 2 epochs


 15%|█▌        | 3/20 [01:35<08:55, 31.48s/it]

Total vlidation loss 1.0811744213104248 and Validation accuracy 0.43333333333333335 after 2 epochs
Total training loss 1.0026376756529014 and training Accuracy 0.4270833333333333 after 3 epochs


 20%|██        | 4/20 [02:06<08:22, 31.40s/it]

Total vlidation loss 1.0288559516270956 and Validation accuracy 0.4666666666666667 after 3 epochs
Total training loss 0.9270490060249964 and training Accuracy 0.5416666666666666 after 4 epochs


 25%|██▌       | 5/20 [02:37<07:49, 31.33s/it]

Total vlidation loss 0.9776626348495483 and Validation accuracy 0.5333333333333333 after 4 epochs
Total training loss 0.8215190898627043 and training Accuracy 0.5572916666666666 after 5 epochs


 30%|███       | 6/20 [03:08<07:14, 31.07s/it]

Total vlidation loss 0.9983925938606262 and Validation accuracy 0.5 after 5 epochs
Total training loss 0.78577094959716 and training Accuracy 0.5104166666666666 after 6 epochs


 35%|███▌      | 7/20 [03:38<06:42, 30.96s/it]

Total vlidation loss 0.9754094084103903 and Validation accuracy 0.5833333333333334 after 6 epochs
Total training loss 0.7424181327223778 and training Accuracy 0.6041666666666666 after 7 epochs


 40%|████      | 8/20 [04:09<06:09, 30.82s/it]

Total vlidation loss 1.0301928997039795 and Validation accuracy 0.5666666666666667 after 7 epochs
Total training loss 0.7027506893500686 and training Accuracy 0.6822916666666666 after 8 epochs


 45%|████▌     | 9/20 [04:41<05:42, 31.12s/it]

Total vlidation loss 0.9100666999816894 and Validation accuracy 0.6 after 8 epochs
Total training loss 0.6422528040905794 and training Accuracy 0.6822916666666666 after 9 epochs


 50%|█████     | 10/20 [05:11<05:09, 30.97s/it]

Total vlidation loss 0.9605853408575058 and Validation accuracy 0.6166666666666667 after 9 epochs
Total training loss 0.6205099048092961 and training Accuracy 0.6927083333333334 after 10 epochs


 55%|█████▌    | 11/20 [05:42<04:37, 30.85s/it]

Total vlidation loss 0.9611064632733662 and Validation accuracy 0.5666666666666667 after 10 epochs
Total training loss 0.6228311744829019 and training Accuracy 0.6927083333333334 after 11 epochs


 60%|██████    | 12/20 [06:12<04:06, 30.76s/it]

Total vlidation loss 0.9068485021591186 and Validation accuracy 0.6 after 11 epochs
Total training loss 0.527316767256707 and training Accuracy 0.796875 after 12 epochs


 65%|██████▌   | 13/20 [06:44<03:37, 31.07s/it]

Total vlidation loss 1.6872923791408538 and Validation accuracy 0.5 after 12 epochs
Total training loss 0.6061593654255072 and training Accuracy 0.734375 after 13 epochs


 70%|███████   | 14/20 [07:15<03:05, 30.90s/it]

Total vlidation loss 0.9567465866605441 and Validation accuracy 0.5666666666666667 after 13 epochs
Total training loss 0.4687606973069099 and training Accuracy 0.8229166666666666 after 14 epochs


 75%|███████▌  | 15/20 [07:45<02:34, 30.85s/it]

Total vlidation loss 1.1951275726159414 and Validation accuracy 0.5666666666666667 after 14 epochs
Total training loss 0.540039622845749 and training Accuracy 0.7395833333333334 after 15 epochs


 80%|████████  | 16/20 [08:16<02:03, 30.76s/it]

Total vlidation loss 0.9776402354240418 and Validation accuracy 0.55 after 15 epochs
Total training loss 0.4156379991521438 and training Accuracy 0.8229166666666666 after 16 epochs


 85%|████████▌ | 17/20 [08:48<01:33, 31.10s/it]

Total vlidation loss 1.3798449700077375 and Validation accuracy 0.5666666666666667 after 16 epochs
Total training loss 0.3762348381569609 and training Accuracy 0.859375 after 17 epochs


 90%|█████████ | 18/20 [09:19<01:01, 30.97s/it]

Total vlidation loss 1.135534823934237 and Validation accuracy 0.5833333333333334 after 17 epochs
Total training loss 0.3865354358373831 and training Accuracy 0.8385416666666666 after 18 epochs


 95%|█████████▌| 19/20 [09:49<00:30, 30.85s/it]

Total vlidation loss 1.0312312444051106 and Validation accuracy 0.6333333333333333 after 18 epochs
Total training loss 0.28545340164176497 and training Accuracy 0.8802083333333334 after 19 epochs


100%|██████████| 20/20 [10:20<00:00, 31.02s/it]

Total vlidation loss 1.183060358464718 and Validation accuracy 0.6 after 19 epochs





In [None]:
# Train and validate the d-vector (RawNet) model (20 epochs) on LANGUAGES classes.
vector_train(mode="d", languages = LANGUAGES)

  "num_layers={}".format(dropout, num_layers))
  0%|          | 0/20 [00:00<?, ?it/s]

Total training loss 1.0710926577448845 and training Accuracy 0.4823404947916667 after 0 epochs
Total vlidation loss 1.1973120411237081 and Validation accuracy 0.3641666666666667 after 0 epochs


  5%|▌         | 1/20 [26:33<8:24:33, 1593.36s/it]

Total training loss 0.8898469346264998 and training Accuracy 0.6216634114583334 after 1 epochs
Total vlidation loss 1.1137486060460409 and Validation accuracy 0.42333333333333334 after 1 epochs


 10%|█         | 2/20 [52:49<7:55:03, 1583.51s/it]

Total training loss 0.7266308528681596 and training Accuracy 0.69775390625 after 2 epochs
Total vlidation loss 1.1037898321946462 and Validation accuracy 0.5308333333333334 after 2 epochs


 15%|█▌        | 3/20 [1:19:23<7:29:57, 1588.08s/it]

Total training loss 0.5917930697711805 and training Accuracy 0.741455078125 after 3 epochs
Total vlidation loss 0.8282718032598495 and Validation accuracy 0.6491666666666667 after 3 epochs


 20%|██        | 4/20 [1:46:01<7:04:34, 1592.18s/it]

Total training loss 0.4387635923922062 and training Accuracy 0.8451334635416666 after 4 epochs
Total vlidation loss 0.9018092423677444 and Validation accuracy 0.6333333333333333 after 4 epochs


 25%|██▌       | 5/20 [2:12:40<6:38:39, 1594.62s/it]

Total training loss 0.42164528820042807 and training Accuracy 0.846923828125 after 5 epochs
Total vlidation loss 1.0273851851622264 and Validation accuracy 0.6016666666666667 after 5 epochs


 30%|███       | 6/20 [2:39:25<6:12:51, 1597.94s/it]

Total training loss 0.2582693822332658 and training Accuracy 0.9049479166666666 after 6 epochs
Total vlidation loss 0.9956939200560252 and Validation accuracy 0.6008333333333333 after 6 epochs


 35%|███▌      | 7/20 [3:06:05<5:46:23, 1598.77s/it]

Total training loss 0.24873027694411576 and training Accuracy 0.9235026041666666 after 7 epochs
Total vlidation loss 1.3389460613330206 and Validation accuracy 0.5466666666666666 after 7 epochs


 40%|████      | 8/20 [3:32:46<5:19:53, 1599.45s/it]

Total training loss 0.2290441335790092 and training Accuracy 0.89404296875 after 8 epochs
Total vlidation loss 1.309056439002355 and Validation accuracy 0.595 after 8 epochs


 45%|████▌     | 9/20 [3:59:28<4:53:23, 1600.31s/it]

Total training loss 0.12881136843255567 and training Accuracy 0.9620768229166666 after 9 epochs
Total vlidation loss 1.0536251078049341 and Validation accuracy 0.6475 after 9 epochs


 50%|█████     | 10/20 [4:26:13<4:26:55, 1601.57s/it]

Total training loss 0.1387709007152201 and training Accuracy 0.9384765625 after 10 epochs


 55%|█████▌    | 11/20 [4:52:50<4:00:01, 1600.14s/it]

Total vlidation loss 1.301651124904553 and Validation accuracy 0.63 after 10 epochs
Total training loss 0.12046699700780057 and training Accuracy 0.95556640625 after 11 epochs


 60%|██████    | 12/20 [5:19:21<3:32:58, 1597.33s/it]

Total vlidation loss 1.188602396969994 and Validation accuracy 0.645 after 11 epochs
Total training loss 0.09969641213441112 and training Accuracy 0.9645182291666666 after 12 epochs
Total vlidation loss 1.7637654994924863 and Validation accuracy 0.5841666666666666 after 12 epochs


 65%|██████▌   | 13/20 [5:45:50<3:06:04, 1594.98s/it]

Total training loss 0.14582442666505813 and training Accuracy 0.9534505208333334 after 13 epochs
Total vlidation loss 1.4232726410031318 and Validation accuracy 0.645 after 13 epochs


 70%|███████   | 14/20 [6:12:24<2:39:27, 1594.52s/it]

In [None]:
%env CUDA_LAUNCH_BLOCKING=1
# TODO
# Lower the learning rate; it might be the problem.
# 1) Gradually decrease the learning rate to 0.0001.
# 2) Add more data.
# 3) Gradually increase the dropout rates to ~0.2. Keep it consistent throughout the network.
# 4) Decrease the batch size.
# 5) Try a different optimizer (choose the one that gives the smallest loss).
# Change shuffle to False.

# source of the implementations:
# https://github.com/KrishnaDN/d-vector-language-recognition
# https://github.com/KrishnaDN/x-vector-pytorch
# https://github.com/gzhu06/Y-vector
