In [1]:
import torch
import torch.nn as nn
import numpy as np

# PREPROCESSING
def gen_files(length=10000, input_file="words.txt", output_file="translated.txt"):
    count = 0
    suffixes = ["ay", "way"]
    vowels = ["a", "e", "i", "o", "u"]
    with open(input_file, 'r') as input_file:
        with open(output_file, 'w') as output_file:
            for line in input_file:
                # if count > length:
                #     break
                for word in line.split():
                    word = word.lower()
                    consanants = ""
                    translated = ""
                    while word[0] not in vowels:
                        consanants = consanants + word[0]
                        if len(word) > 1:
                            word = word[1:]
                        else: 
                            word = ""
                            break
                    if len(consanants) > 0:
                        translated = word + consanants + suffixes[0]
                    else:
                        translated = word +suffixes[1]
                    output_file.write(translated+"\n")
                    count += 1
    input_file.close()
    output_file.close()

def gen_examples(input_file="words.txt", output_file="translated.txt"):
    input_data = []
    output_data = []
    for line in open(input_file, "r"): input_data.append("".join([c for c in line[:-1]]))
    for line in open(output_file, "r"): output_data.append("".join([c for c in line[:-1]]))
    return input_data, output_data

In [2]:
# ENCODING
def encode_single(data, chars, int_to_char):
    char_to_int = {ch: i for i, ch in int_to_char.items()}
    en_data = np.array([char_to_int[ch] for ch in data])
    return en_data

def encode_list(data, chars, int_to_char):
    return [encode_single(single, chars, int_to_char) for single in list(data)]

def transform(data, vector_size=33):
    transformed_data = np.zeros(shape=(vector_size))
    for i in range(0, len(data)):
        transformed_data[i] = data[i]
    return transformed_data

In [13]:
class Encoder(nn.Module):
    def __init__(self, input_dict_size=28):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dict_size, 64)
        self.lstm = nn.LSTM(32, 64, batch_first=True, bidirectional=True;likujyhgfdsjkl;)


    def forward(self, encoder_input_sequences):
        embedded = self.embedding(encoder_input_sequences)
        output, _ = self.lstm(embedded)
        return output[:, -1]

SyntaxError: invalid syntax (<ipython-input-13-6d8ff01ecf55>, line 5)

In [4]:
class Decoder(nn.Module):
    def __init__(self, output_dict_size=28):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dict_size, 64)
        self.lstm = nn.LSTM(64, 64, batch_first=True)
        self.linear = nn.Linear(64, output_dict_size)

    def forward(self, encoder_output, decoder_input_sequences):
        encoder_output = encoder_output.unsqueeze(0)
        embedded = self.embedding(decoder_input_sequences)
        output, _ = self.lstm(embedded, [encoder_output, encoder_output])
        output = self.linear(output)
        
        return output

In [5]:
class CharRNN(nn.Module):
    def __init__(self):
        super(CharRNN, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
    
    def forward(self, encoder_input_sequences, decoder_input_sequences):
        encoder_output = self.encoder(encoder_input_sequences)
        decoder_output = self.decoder(encoder_output, decoder_input_sequences)
        return decoder_output

In [6]:
CHAR_CODE_START = 27
INPUT_LENGTH = 33
OUTPUT_LENGTH = 33

gen_files()
input_data, output_data = gen_examples()

chars = set(' '.join(input_data)) | set(' '.join(output_data))
int_to_char = dict(enumerate(chars, 1))
int_to_char[28] = 'START'

data_input = np.array([transform(d) for d in encode_list(input_data, chars, int_to_char)])
data_output = np.array([transform(d) for d in encode_list(output_data, chars, int_to_char)])

perm = np.random.permutation(data_input.shape[0])
data_input = data_input[perm]
data_output = data_output[perm]

data_size = len(input_data)
encoded_training_input = data_input
encoded_training_output = data_output

training_encoder_input = encoded_training_input
training_decoder_output = encoded_training_output

training_decoder_input = np.zeros_like(encoded_training_output)
training_decoder_input[:, 1:] = encoded_training_output[:,:-1]
training_decoder_input[:, 0] = CHAR_CODE_START

In [None]:
def train_epoch(model, optimizer, batch_size=64, criterion=nn.CrossEntropyLoss(), encoder_input=training_encoder_input,decoder_input=training_decoder_input, decoder_output=training_decoder_output):
    permutation = np.random.permutation(encoder_input.shape[0])
    encoder_input = encoder_input[permutation]
    decoder_input = decoder_input[permutation]
    decoder_output = decoder_output[permutation]
    
    epoch_loss = 0
    iteration_count = 0
    
    for begin_index in range(0, int(len(encoder_input)*0.1), batch_size):    
        end_index = begin_index + batch_size
        iteration_count += 1
        
        e_in = torch.tensor(encoder_input[begin_index:end_index]).to(torch.int64)
        d_in = torch.tensor(decoder_input[begin_index:end_index]).to(torch.int64)
        d_out = torch.tensor(decoder_output[begin_index:end_index]).to(torch.int64)
        
        optimizer.zero_grad()
        
        output = model(e_in, d_in)
        target = d_out.view(-1)
        output = output.view(-1, output.shape[-1])
        loss = criterion(output, target)
        loss.backward()

        optimizer.step()
        epoch_loss = epoch_loss + loss.item()
            
    return epoch_loss / iteration_count

In [None]:
def train_model(model, optimizer, n_epoch=30):
    for i in range(1, n_epoch + 1):
        loss = train_epoch(model, optimizer)
        print('epoch ', i, ', loss: ', loss)

def generate_output(input_sequence):
    decoder_input = np.zeros(shape=(len(input_sequence), OUTPUT_LENGTH), dtype='int')
    decoder_input[:,0] =  CHAR_CODE_START
    
    encoder_input = torch.tensor(input_sequence).to(torch.int64)
    decoder_input = torch.tensor(decoder_input).to(torch.int64)
    
    for i in range(1, OUTPUT_LENGTH):
        model.cpu()
        output = model(encoder_input, decoder_input)
        output = output.argmax(dim=2)
        decoder_input[:,i] = output[:,i-1]
        
    return decoder_input[:,1:].detach().numpy()

def decode_single(data, chars, int_to_char):
    de_data = ''
    data = data[0]
    for ch in data:
        if ch == 0:
            break
        if ch == 28:
            continue
        de_data += int_to_char[ch]
    return de_data

def translate_sentence(sentence):
    words = sentence.split(' ')
    translated = []
    for word in words:
        inp = [transform(encode_single(word, chars, int_to_char))]
        out = generate_output(inp)
        translated.append(decode_single(out, chars, int_to_char))
    return ' '.join(translated)

In [11]:
model = CharRNN()
optimizer = torch.optim.Adam(model.parameters())
train_model(model, optimizer, n_epoch=30)

epoch  1 , loss:  0.7918949072917532
epoch  2 , loss:  0.7013959657905424
epoch  3 , loss:  0.6853027545753061
epoch  4 , loss:  0.6750695450161651
epoch  5 , loss:  0.6700472347674521
epoch  6 , loss:  0.6655973805817815
epoch  7 , loss:  0.6467723959117526
epoch  8 , loss:  0.5833438515491376
epoch  9 , loss:  0.5112464204129981
epoch  10 , loss:  0.44520724730120614
epoch  11 , loss:  0.3855056169568977
epoch  12 , loss:  0.34311881936249194
epoch  13 , loss:  0.3117845743121606
epoch  14 , loss:  0.2855542089135228
epoch  15 , loss:  0.26448184335953223
epoch  16 , loss:  0.24506816001721693
epoch  17 , loss:  0.2306771616856715
epoch  18 , loss:  0.2129188483317922
epoch  19 , loss:  0.19964998507018736
epoch  20 , loss:  0.1966515133978654
epoch  21 , loss:  0.186656533632567
epoch  22 , loss:  0.1740190249655707
epoch  23 , loss:  0.16503155371898875
epoch  24 , loss:  0.157720651093237
epoch  25 , loss:  0.1510531153948575
epoch  26 , loss:  0.14551580659508362
epoch  27 , loss

In [12]:
translate_sentence("fuck donald trump")

'uckfay onaldday umpsay'