In [1]:
import matplotlib.pyplot as plt
import os
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence, hashing_trick
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the raw sample text and split it into lowercase word tokens.
sample_file = 'data/text/finnish.txt'

# Decode explicitly instead of relying on the platform's default encoding,
# which would otherwise decide how the non-ASCII letters (ä, ö) are read.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open(sample_file, encoding='utf-8') as f:
    x = f.read()

# `i` is the flat list of word tokens consumed by the cells below.
i = text_to_word_sequence(
    x,
    # Same filter characters as before; the backslash is now written as an
    # explicit `\\` because `\]` is not a valid escape sequence and newer
    # Python versions warn about it. The resulting string is unchanged.
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_,\n',
    lower=True,
    split=' ',
)

In [3]:
# Settings reused by the tokenizer here and by the padding calls further down.
num_words, oov_token = 1000, '<UNK>'
pad_type = trunc_type = 'post'

# Build the tokenizer and fit it on the token list `i`.
tokenizer = Tokenizer(
    num_words=num_words,
    oov_token=oov_token,
)
# NOTE(review): fit_on_texts presumably updates `tokenizer` in place and
# returns None, so `z` carries no information — confirm before relying on it.
z = tokenizer.fit_on_texts(i)

In [4]:
# Word -> integer index mapping taken from the fitted tokenizer; it is printed
# for reference in later cells.
word_index = tokenizer.word_index

In [5]:
# Encode every entry of `i` as a list of word indices.
train_sequences = tokenizer.texts_to_sequences(i)

# Pad/truncate all sequences to the length of the longest encoded entry.
maxlen = max(map(len, train_sequences))
train_padded = pad_sequences(
    train_sequences,
    maxlen=maxlen,
    padding=pad_type,
    truncating=trunc_type,
)

In [6]:
# Dump the fitted vocabulary and the encoded/padded training data, one
# labelled entry per print call (label and value separated by a space).
for label, value in (
    ("Word index:\n", word_index),
    ("\nTraining sequences:\n", train_sequences),
    ("\nPadded training sequences:\n", train_padded),
    ("\nPadded training shape:", train_padded.shape),
    ("Training sequences data type:", type(train_sequences)),
    ("Padded Training sequences data type:", type(train_padded)),
):
    print(label, value)

Word index:
 {'<UNK>': 1, 'on': 2, 'lauloi': 3, 'väinämöinen': 4, 'nuori': 5, 'joukahainen': 6, 'vanha': 7, 'sanoi': 8, 'ei': 9, 'siitä': 10, 'en': 11, 'kun': 12, 'joukahaisen': 13, 'joka': 14, 'nuoren': 15, 'jo': 16, 'itse': 17, 'noita': 18, 'sanan': 19, 'oi': 20, 'mitä': 21, 'virkkoi': 22, 'noin': 23, 'nimesi': 24, 'tieän': 25, 'tuon': 26, 'kaksi': 27, 'vaka': 28, 'väinölän': 29, 'ahoilla': 30, 'sai': 31, 'emo': 32, 'ajoi': 33, 'olet': 34, 'toinen': 35, 'sen': 36, 'ole': 37, 'pyhät': 38, 'itselläniki': 39, 'kalevalan': 40, 'oli': 41, 'emoni': 42, 'jos': 43, 'päivän': 44, 'iän': 45, 'ikuinen': 46, 'miehen': 47, 'ilman': 48, 'eikä': 49, 'sanoiksi': 50, 'virkki': 51, 'miekan': 52, 'tästä': 53, 'annan': 54, 'huoli': 55, 'siitäki': 56, 'syvemmä': 57, 'noilla': 58, 'kankahilla': 59, 'laula': 60, 'urohon': 61, 'se': 62, 'tiesi': 63, 'tuosta': 64, 'kaiken': 65, 'laulajaksi': 66, 'luoksi': 67, 'kera': 68, 'laulan': 69, 'läksi': 70, 'toisen': 71, 'tietäjä': 72, 'vesi': 73, 'silloin': 74, 'vaan

In [7]:
# Encode the same token list `i` again as the "test" input and pad it with the
# settings and `maxlen` established for the training sequences.
test_sequences = tokenizer.texts_to_sequences(i)
test_padded = pad_sequences(
    test_sequences,
    maxlen=maxlen,
    padding=pad_type,
    truncating=trunc_type,
)

# Show the encoded and padded test data plus the padded array's shape.
for label, value in (
    ("Testing sequences:\n", test_sequences),
    ("\nPadded testing sequences:\n", test_padded),
    ("\nPadded testing shape:", test_padded.shape),
):
    print(label, value)

Testing sequences:
 [[28], [7], [4], [261], [262], [58], [29], [30], [40], [59], [106], [263], [106], [264], [3], [265], [266], [267], [268], [269], [270], [271], [18], [107], [108], [109], [9], [60], [110], [111], [272], [273], [112], [113], [114], [115], [116], [117], [274], [275], [276], [277], [118], [278], [119], [279], [61], [280], [118], [281], [282], [31], [283], [284], [285], [5], [6], [286], [287], [288], [62], [289], [290], [291], [292], [293], [294], [295], [296], [297], [298], [58], [29], [30], [40], [59], [299], [21], [300], [63], [41], [301], [302], [303], [64], [304], [305], [65], [306], [307], [120], [66], [308], [309], [16], [121], [122], [67], [67], [123], [310], [311], [312], [313], [314], [29], [315], [68], [124], [125], [126], [127], [316], [126], [127], [32], [317], [318], [319], [68], [124], [125], [320], [321], [128], [128], [322], [323], [324], [129], [325], [326], [327], [328], [329], [330], [331], [332], [8], [5], [6], [333], [130], [131], [42], [334], [132]

In [8]:
# Show each token next to its padded encoding.
# NOTE: the loop rebinds the notebook-level name `x` (the raw text read from
# the sample file earlier) to the individual tokens.
for x, y in zip(i, test_padded):
    print(f'{x} -> {y}')

print("\nWord index (for reference):", word_index)

vaka -> [28]
vanha -> [7]
väinämöinen -> [4]
elelevi -> [261]
aikojansa -> [262]
noilla -> [58]
väinölän -> [29]
ahoilla -> [30]
kalevalan -> [40]
kankahilla -> [59]
laulelevi -> [106]
virsiänsä -> [263]
laulelevi -> [106]
taitelevi -> [264]
lauloi -> [3]
päivät -> [265]
pääksytysten -> [266]
yhytysten -> [267]
yöt -> [268]
saneli -> [269]
muinaisia -> [270]
muisteloita -> [271]
noita -> [18]
syntyjä -> [107]
syviä -> [108]
joit -> [109]
ei -> [9]
laula -> [60]
kaikki -> [110]
lapset -> [111]
ymmärrä -> [272]
yhet -> [273]
urohot -> [112]
tällä -> [113]
inhalla -> [114]
iällä -> [115]
katovalla -> [116]
kannikalla -> [117]
kauas -> [274]
kuuluvi -> [275]
sanoma -> [276]
ulos -> [277]
viestit -> [118]
vierähtävät -> [278]
väinämöisen -> [119]
laulannasta -> [279]
urohon -> [61]
osoannasta -> [280]
viestit -> [118]
vierähti -> [281]
suvehen -> [282]
sai -> [31]
sanomat -> [283]
pohjolahan -> [284]
olipa -> [285]
nuori -> [5]
joukahainen -> [6]
laiha -> [286]
poika -> [287]
lappalainen ->