In [1]:
import pyphen
import string
import re
import collections

import numpy as np

from keras.models import Sequential
from keras.layers import Dense
import keras

Using TensorFlow backend.


In [2]:
hypher = pyphen.Pyphen(lang='hu_HU')

In [3]:
def hyph_class(word, hypher=pyphen.Pyphen(lang='hu_HU'), aslist=False):
    """Hyphenating classification of the characters in the word. {B(egin),M(iddle),E(nd),S(ingle)}"""
    if(len(word)==0):
        raise IndexError("0 length word")
    ret = list('M' * len(word))
    ret[0]='B'
    ret[-1]='E'
    for i in hypher.positions(word):
        ret[i]='B'
        if(ret[i-1]=='B'):
            ret[i-1]='S'
        else:
            ret[i-1]='E'
    if(aslist):
        return ret
    return "".join(ret)
def same_char_num(word, hypher=pyphen.Pyphen(lang='hu_HU')):
    """Return true if the hyphenated word has as many chars as the original"""
    return len(hypher.inserted(word))==len(word)+len(hypher.positions(word))
def cleaning(data):
    """Text cleaning:
        lower the letters
        punctuation, digits ellimination"""
    formated_data = data.lower()
    formated_data = re.sub('['+string.punctuation+']','',formated_data)
    formated_data = re.sub('['+string.digits+']','',formated_data)
    return formated_data

In [4]:
counter_hu_data = collections.Counter()
word_list = []
c_all = 0
c_same_char_num = 0
with open('web2.2-freq-sorted.txt','r',errors='ignore',encoding='latin2') as f:
    i = 0
    for line in f:
        i = i+1
        words = line.split()
        if len(words)>1:
            if(words[1].isdigit()):
                counter_hu_data[cleaning(words[0])]+=int(words[1])
        if i>100000:
            break
for words in counter_hu_data.most_common():
    c_all+=1
    next_word = words[0]
    if(len(next_word)!=0 and same_char_num(next_word)):
        c_same_char_num+=1
        word_list.append([next_word,hyph_class(next_word)])

In [5]:
c_same_char_num/c_all

0.9867324640395745

In [6]:
print(word_list[100:200])

[['ember', 'BEBME'], ['ezek', 'BMME'], ['tovább', 'BEBMME'], ['vagyok', 'BEBMME'], ['ma', 'BE'], ['miatt', 'BEBME'], ['this', 'BMME'], ['mellett', 'BMEBMME'], ['vannak', 'BMEBME'], ['by', 'BE'], ['alapján', 'BMMEBME'], ['lenne', 'BMEBE'], ['with', 'BMME'], ['that', 'BMME'], ['tehát', 'BEBME'], ['te', 'BE'], ['it', 'BE'], ['ben', 'BME'], ['egész', 'BMMME'], ['néhány', 'BEBMME'], ['milyen', 'BEBMME'], ['át', 'BE'], ['nekem', 'BEBME'], ['előtt', 'BMMME'], ['ezzel', 'BEBME'], ['mivel', 'BEBME'], ['ezen', 'BMME'], ['nélkül', 'BMEBME'], ['lett', 'BMME'], ['stb', 'BME'], ['you', 'BME'], ['viszont', 'BEBMMME'], ['év', 'BE'], ['teljes', 'BMEBME'], ['erre', 'BEBE'], ['hiszen', 'BEBMME'], ['másik', 'BEBME'], ['ebben', 'BEBME'], ['mind', 'BMME'], ['der', 'BME'], ['három', 'BEBME'], ['talán', 'BEBME'], ['valami', 'BEBEBE'], ['die', 'BME'], ['bár', 'BME'], ['are', 'BME'], ['legyen', 'BEBMME'], ['tudom', 'BEBME'], ['from', 'BMME'], ['inkább', 'BEBMME'], ['során', 'BEBME'], ['például', 'BMEBEBE'], ['r

In [7]:
len(word_list)

82181

In [40]:
#onehot: {'B','M','E','S'}
def one_hot_encode(char,dictionary='BMES'):
    ret = [0]*len(dictionary)
    if char in dictionary:
        ret[dictionary.find(char)]=1
        return ret
    raise ValueError('Value out of dictionary range'+char)

In [36]:
hun_chars='aábcdeéfghiíjklmnoóöőpqrstuúüűvwxyz'+'.'
def generate_network_data(data, ret_input = [],ret_output=[],length = 2, start_char='.'):
    """from [word,hyph_class(word) to length-long input-output data"""
    word = data[0]
    word_plus = start_char*(length-1)+word
    hyph_word = data[1]
    for i in range(0,len(word)):
        input_next_iter = []
        for c in word_plus[i:i+length]:
            input_next_iter.append(one_hot_encode(c,hun_chars))
        output_next_iter = one_hot_encode(hyph_word[i])
        ret_input.append(input_next_iter)
        ret_output.append(output_next_iter)

In [38]:
wc_in=[]
wc_out=[]

generate_network_data(['alma','BEBE'],wc_in, wc_out)
print(wc_in[0],wc_out[0])

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] [1, 0, 0, 0]


In [45]:
data_in = []
data_out = []
wrong_word = 0
for word in word_list:
    try:
        generate_network_data(word,data_in,data_out,2)
    except ValueError:
        wrong_word+=1
print('Data len: ',len(data_in))
print('Wrong words: ',wrong_word)

Data len:  695849
Wrong words:  640


In [78]:
window_length = 2


model = Sequential()
model.add(Dense(input_shape=(2,36), units=10, name='input_layer'))
model.add(Dense(4,name='output_layer'))

In [79]:
model.compile(loss='mse',optimizer='sgd')

In [85]:
model.fit(data_in[0:10],data_out[0:10], epochs=10)

In [84]:
data_out[0:10]

[[0, 0, 1, 0],
 [1, 0, 0, 0],
 [0, 0, 1, 0],
 [1, 0, 0, 0],
 [0, 0, 1, 0],
 [1, 0, 0, 0],
 [0, 1, 0, 0],
 [0, 1, 0, 0],
 [0, 0, 1, 0],
 [1, 0, 0, 0]]