In [None]:
import math
import random
import csv
from collections import Counter

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Select the GPU when CUDA is usable, otherwise fall back to the CPU.
# `is_available` has to be *called*: the bare function object is always truthy,
# so without the parentheses "cuda" would be chosen even on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
path     = 'data/transcriptions/train.csv'

# newline='' lets the csv module do its own line-ending handling, as its
# documentation requires for file objects passed to csv.reader.
with open(path, 'r', newline='') as f:
    reader = csv.reader(f)
    lines   = list(reader)

# Skip the first row (presumably a header) and transpose the remaining rows into
# columns; each row must have exactly three fields, the first of which is discarded.
_, words, trans = zip(*lines[1:])

<h4>Shuffle and split data</h4>

In [None]:
# TODO: shuffle the examples here, keeping every word aligned with its transcription

val_size = 0.1  # presumably the fraction of examples held out for validation — confirm
train_words, val_words = # TODO: code here — train / validation parts of the words
train_trans, val_trans = # TODO: code here — train / validation parts of the transcriptions

In [None]:
def words_tokenize(line):
    # Not implemented yet. Intended behavior: split a word into its characters.
    # Example: 'APPLE' -> ['A', 'P', 'P', 'L', 'E']
    # TODO: code here

def trans_tokenize(line):
    # Not implemented yet. Intended behavior: split a transcription string into its
    # space-separated tokens.
    # Example: 'AH N K AO R K' -> ['AH', 'N', 'K', 'AO', 'R', 'K']
    # TODO: code here

<h4>Count words and transcriptions</h4>

In [10]:
# Token frequency tables; both are passed to Vocab(...) further down in the notebook.
words_counter = Counter()
trans_counter = Counter()

# TODO: code here — fill both counters with token counts
# (presumably from the tokenized training words / transcriptions — confirm)

In [11]:
class Vocab:
    '''
    Two-way mapping between tokens and integer indices.

    Indices 0..3 are reserved for the special tokens (pad=0, unk=1, sos=2, eos=3);
    tokens taken from `counter` get consecutive indices starting at 4, in the
    iteration order of `counter`.
    '''

    def __init__(self, counter, sos, eos, pad, unk, min_freq=None):
        '''
        Inputs:
            counter:  mapping token -> count (e.g. collections.Counter)
            sos, eos, pad, unk: the special tokens
            min_freq: a token is kept only when its count is strictly greater
                      than this value; None is treated as 0
        '''
        self.sos = sos
        self.eos = eos
        self.pad = pad
        self.unk = unk

        self.pad_idx = 0
        self.unk_idx = 1
        self.sos_idx = 2
        self.eos_idx = 3

        self._token2idx = {
            self.sos: self.sos_idx,
            self.eos: self.eos_idx,
            self.pad: self.pad_idx,
            self.unk: self.unk_idx,
        }
        self._idx2token = {idx: token for token, idx in self._token2idx.items()}

        idx = len(self._token2idx)
        min_freq = 0 if min_freq is None else min_freq

        for token, count in counter.items():
            # A special token that also occurs in the data must keep its reserved
            # index, otherwise pad_idx/unk_idx/sos_idx/eos_idx would go stale.
            if token in self._token2idx:
                continue
            if count > min_freq:
                self._token2idx[token] = idx
                self._idx2token[idx]   = token
                idx += 1

        self.vocab_size = len(self._token2idx)
        self.tokens     = list(self._token2idx.keys())

    def token2idx(self, token):
        '''Return the index of `token`; out-of-vocabulary tokens map to unk_idx.'''
        return self._token2idx.get(token, self.unk_idx)

    def idx2token(self, idx):
        '''Return the token stored at `idx`; unknown indices map to the unk token.'''
        return self._idx2token.get(idx, self.unk)

    def __len__(self):
        '''Number of entries in the vocabulary, special tokens included.'''
        return len(self._token2idx)

In [12]:
def padding(sequences, pad_idx):
    '''
    Right-pad every sequence with `pad_idx` up to the length of the longest one.

    Inputs:
        sequences: list of list of tokens (inner sequences may also be tuples)
        pad_idx:   value appended to the shorter sequences
    Outputs:
        a new list of new lists, all of equal length; [] when `sequences` is empty
    '''
    sequences = list(sequences)
    # default=0 keeps an empty batch from raising ValueError inside max().
    max_length = max(map(len, sequences), default=0)

    return [list(seq) + [pad_idx] * (max_length - len(seq)) for seq in sequences]

In [13]:
# Special tokens shared by both vocabularies.
sos, eos, pad, unk = "<sos>", "<eos>", "<pad>", "<unk>"

# One vocabulary per side: input words and output transcriptions.
words_vocab = Vocab(words_counter, sos, eos, pad, unk)
trans_vocab = Vocab(trans_counter, sos, eos, pad, unk)

<h4>Tokenize data</h4>

In [None]:
def _encode(examples, tokenize, vocab):
    '''Tokenize every example and map each token to its index in `vocab`.'''
    return [[vocab.token2idx(token) for token in tokenize(example)] for example in examples]

# Note: the raw string lists are replaced by their index-encoded versions.
train_words = _encode(train_words, words_tokenize, words_vocab)
val_words   = _encode(val_words,   words_tokenize, words_vocab)

train_trans = _encode(train_trans, trans_tokenize, trans_vocab)
val_trans   = _encode(val_trans,   trans_tokenize, trans_vocab)

In [15]:
def get_batch(batch_size, sort=False, val=False):
    '''
    Build one batch of examples. The body is not implemented yet: the five
    returned names must be defined by the code that replaces the TODO below.

    Inputs:
        batch_size: presumably the number of examples in the batch — confirm
        sort:       NOTE(review): meaning not visible here; presumably sorts the
                    batch by length — confirm once implemented
        val:        presumably selects validation data instead of training data — confirm
    Outputs:
        batch_words:    (batch x seq_len)   torch.LongTensor
        batch_trans_in: (batch x seq_len)   torch.LongTensor
        batch_trans_out: (batch x seq_len)  torch.LongTensor
        words_lens: (batch)                 torch.LongTensor
        trans_lens: (batch)                 torch.LongTensor
    '''
        
    # TODO: code here

    return batch_words, batch_trans_in, batch_trans_out, words_lens, trans_lens

In [18]:
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence  as unpack

#code Encoder
#forward(batch_words, words_lens) -> outputs, hidden
    
#code Decoder
#forward(batch_trans_in, hidden) -> logits, hidden

In [20]:
class Model(nn.Module):
    '''
    Encoder-decoder model skeleton.

    forward() reads self.encoder and self.decoder, which __init__ does not
    create yet, and the loss computation is still a TODO.
    '''
    def __init__(self):
        super(Model, self).__init__()
        # TODO: create self.encoder and self.decoder here (both are used in forward)
        
    def forward(self, batch_words, words_lens, batch_trans_in, batch_trans_out):
        '''
        Run the encoder on (batch_words, words_lens), pass its second return
        value to the decoder together with batch_trans_in, and return `loss`.

        `loss` is not computed yet (see the TODO below). batch_trans_out is not
        used by the visible code; presumably it holds the targets for the loss — confirm.
        '''
        _, hidden = self.encoder(batch_words, words_lens)
        logits, _ = self.decoder(batch_trans_in, hidden)
        
        # TODO: mask = code here
        # TODO: loss = code here
        
        return loss
    
    def generate(self, sos_idx, eos_idx, batch_words):
        '''
        Not implemented yet: the body is only this docstring, so the method
        currently returns None.

        Inputs:
            sos_idx, eos_idx: presumably the start / end token indices used while decoding — confirm
            batch_words: (1 x seq_len)
        Outputs:
            tokens: [45, 30, 122, 4, 8, 5]
        '''
        

In [23]:
def plot(epoch, batch_idx, train_losses, val_losses):
    '''
    Redraw the training and validation loss curves in the cell output.

    Inputs:
        epoch:        value shown in both titles
        batch_idx:    value shown in the training title
        train_losses: sequence of training losses
        val_losses:   sequence of validation losses

    Either history may still be empty (e.g. before the first validation pass);
    its title then shows 'n/a' instead of raising IndexError.
    '''
    # len() rather than truthiness so that array-like histories work too.
    last_train = train_losses[-1] if len(train_losses) else 'n/a'
    last_val   = val_losses[-1] if len(val_losses) else 'n/a'

    clear_output(True)
    plt.figure(figsize=(20,5))
    plt.subplot(131)
    plt.title('epoch %s. | batch: %s | loss: %s' % (epoch, batch_idx, last_train))
    plt.plot(train_losses)
    plt.subplot(132)
    plt.title('epoch %s. | loss: %s' % (epoch, last_val))
    plt.plot(val_losses)
    plt.show()

In [24]:
#Declare models, criterion, optimizer

In [26]:
#train epochs

In [27]:
def _print(val):
    '''
    Print one example: the source word, the generated transcription and the
    reference transcription.

    Inputs:
        val: forwarded to get_batch(..., val=val)
    '''
    batch_words, _, batch_trans_out, _, _ = get_batch(1, sort=True, val=val)
    batch_words     = batch_words.to(device)
    batch_trans_out = batch_trans_out.to(device)

    # generate() produces transcription tokens, so the start/end indices must
    # come from the transcription vocabulary, not from the words vocabulary.
    pred_ids = model.generate(trans_vocab.sos_idx, trans_vocab.eos_idx, batch_words)

    # Indices that are not printed. Kept as a tuple so membership is tested
    # with ==, exactly like the original list-based filter.
    special = (trans_vocab.sos_idx, trans_vocab.eos_idx, trans_vocab.pad_idx)

    def decode(ids):
        return [trans_vocab.idx2token(idx) for idx in ids if idx not in special]

    print('Src: ', ''.join([words_vocab.idx2token(idx) for idx in batch_words[0].tolist()]))
    # Transcription tokens can be longer than one character (e.g. 'AH', 'AO'),
    # so they are joined with spaces to keep token boundaries readable.
    print('Pred:', ' '.join(decode(pred_ids)))
    print('Real:', ' '.join(decode(batch_trans_out[0].tolist())))

In [None]:
# Show ten validation examples, separated by blank lines.
for _example in range(10):
    _print(val=True)
    print()

<h1>Практика</h1>

In [29]:
class Task:
    '''Skeleton for the practice section: both methods are empty placeholders.'''
    def __init__(self,):
        # TODO: not implemented
        pass
    
    def get_batch(self, ):
        # TODO: not implemented (currently returns None)
        pass

class Vocab:
    '''
    Skeleton for the practice section: every method is an empty placeholder.

    NOTE: running this cell rebinds the name `Vocab`, replacing the implemented
    Vocab class defined earlier in this notebook.
    '''
    def __init__(self,):
        # TODO: not implemented
        pass
    
    def token2idx(self,):
        # TODO: not implemented (currently returns None)
        pass
    
    def idx2token(self,):
        # TODO: not implemented (currently returns None)
        pass
    
    def __len__(self,):
        # TODO: not implemented; len() on an instance fails while this returns None
        pass

class Trainer:
    '''Skeleton for the practice section: every method is an empty placeholder.'''
    def __init__(self, model, optimizer, criterion):
        # TODO: not implemented — the three arguments are not stored yet
        pass
    
    def one_train_step(self, ):
        # TODO: not implemented
        pass
    
    def one_val_step(self,):
        # TODO: not implemented
        pass
    
    def train(self, epochs):
        # TODO: not implemented
        pass
    
    def plot(self,):
        # TODO: not implemented
        pass

class Model:
    '''
    Skeleton for the practice section.

    NOTE: running this cell rebinds the name `Model`, replacing the
    nn.Module-based Model class defined earlier in this notebook.
    '''
    def forward(self, ):
        # TODO: not implemented — `loss` is not defined in this body yet
        return loss
    
    def generate(self, ):
        # TODO: not implemented — `pred_tokens` is not defined in this body yet
        return pred_tokens