In [1]:
import tensorflow
from tensorflow.keras.layers import Reshape
from keras.layers import Input, Embedding, Reshape, LSTM, Dense, Flatten
from keras.models import Model
import os
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import pickle


In [2]:
batch_size = 16
max_sequence_length = 384
max_word_length = 20
embedding_size = 128
lstm_units = 256
num_classes = 14

directory_train='./training-data'
directory_test='./testing-data'
directory_all='./all-data'
model_path='model.keras' #modelin konumu
model_test_path='./deneme-veriler'#modelin testi için kullanılacak verinin konumu

In [12]:
train_test_texts=[]

for filename in os.listdir(directory_all):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory_all, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            content = re.sub(r'<start_".*?">', '', content)
            content = re.sub(r'<end_".*?">', '', content)
            txt_data = content.split()
            train_test_texts.extend(txt_data)

tokenizer = Tokenizer(char_level=True, lower=False)
tokenizer.fit_on_texts(train_test_texts)

vocab_len = len(tokenizer.word_index) + 1

In [4]:
#SAVE TOKENIZER
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(tokenizer.word_index)

{'0': 1, 'A': 2, '1': 3, 'E': 4, 'T': 5, 'I': 6, '2': 7, 'R': 8, 'N': 9, 'L': 10, 'K': 11, '*': 12, 'S': 13, 'O': 14, 'M': 15, '5': 16, ':': 17, '.': 18, '3': 19, '8': 20, '4': 21, '9': 22, 'D': 23, '6': 24, 'U': 25, 'İ': 26, '7': 27, ',': 28, 'B': 29, 'C': 30, 'Y': 31, 'P': 32, 'V': 33, 'e': 34, 'a': 35, 'i': 36, 'H': 37, 'G': 38, 'Z': 39, 'r': 40, 'F': 41, '/': 42, 'Ş': 43, 'n': 44, 'o': 45, 'l': 46, '%': 47, 't': 48, 's': 49, 'Ü': 50, '-': 51, 'k': 52, 'd': 53, 'm': 54, 'u': 55, 'X': 56, 'y': 57, 'c': 58, 'g': 59, 'h': 60, 'z': 61, '#': 62, 'ı': 63, 'b': 64, 'w': 65, 'Ç': 66, 'Ğ': 67, 'Ö': 68, 'p': 69, 'v': 70, 'ş': 71, 'W': 72, 'ü': 73, ')': 74, '(': 75, 'J': 76, 'f': 77, 'x': 78, "'": 79, '+': 80, '$': 81, 'ğ': 82, '»': 83, 'ç': 84, '!': 85, 'Q': 86, '=': 87, 'ö': 88, 'Í': 89, 'ж': 90, '&': 91, 'j': 92, '>': 93, '"': 94, '\\': 95, 'Т': 96, 'О': 97, 'Р': 98, 'q': 99, '×': 100, '@': 101, ';': 102, 'Ø': 103, 'К': 104, '|': 105, '<': 106, 'Á': 107, 'Ú': 108, 'Ș': 109, '[': 110, '_': 1

In [5]:
def process_text_files(directory):
    all_texts = []
    all_texts_extend = []

    # Iterate over all files in the directory
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                content = re.sub(r'<start_".*?">', '', content)
                content = re.sub(r'<end_".*?">', '', content)
                txt_data = content.split()
                all_texts.append(txt_data)
                all_texts_extend.extend(txt_data)

    #tokenizer = Tokenizer(char_level=True, lower=False)
    #tokenizer.fit_on_texts(all_texts_extend)

    all_sequences=[]
    for i in range(len(all_texts)):
        all_sequences.append(tokenizer.texts_to_sequences(all_texts[i]))
        

    max_word_length = 20
    trunc_type = 'post'
    padding_type = 'post'

    # Pad each word sequence to the length of the longest word
    all_padded_sequences=[]
    for i in range(len(all_sequences)):
        padded_sequence = pad_sequences(all_sequences[i], maxlen=max_word_length, padding=padding_type, truncating=trunc_type)
        all_padded_sequences.append(padded_sequence)

    return all_padded_sequences, tokenizer.word_index


#   TRAIN
directory_path = directory_train
padded_sequences, word_index = process_text_files(directory_path)
padded_arrays = []
for arr in padded_sequences:
    pad_width = ((0, max_sequence_length - len(arr)), (0, 0))
    padded_arr = np.pad(arr, pad_width, mode='constant', constant_values=0)
    padded_arrays.append(padded_arr)

#  TEST
directory_path_test = directory_test
padded_sequences_test, word_index_test = process_text_files(directory_path_test)
padded_arrays_test = []
for arr in padded_sequences_test:
    pad_width = ((0, max_sequence_length - len(arr)), (0, 0))
    padded_arr = np.pad(arr, pad_width, mode='constant', constant_values=0)
    padded_arrays_test.append(padded_arr)


np.set_printoptions(threshold=np.inf)

#son array'i numpy array'ine çevir
x_train=np.array(padded_arrays)
x_test=np.array(padded_arrays_test)

print(x_train.shape)
print(x_test.shape)

np.set_printoptions(threshold=1000)

(654, 384, 20)
(440, 384, 20)


In [6]:
global wordsList,wordsIndextList

wordsList = []
wordsIndextList = []
max_sequence_length = 384


label_list = ["<start_\"company_name\">",
              "<end_\"company_name\">",
                  "<start_\"date\">",
                  "<end_\"date\">",
                  "<start_\"time\">",
                  "<end_\"time\">",
                  "<start_\"receipt_number\">",
                  "<end_\"receipt_number\">",
                  "<start_\"tax\">",
                  "<end_\"tax\">",
                  "<start_\"amount\">",
                  "<end_\"amount\">"]

label_list2= ["Pad","Others","B_Comp","l_Comp","B_Date","l_Date","B_Time","l_Time",
                  "B_Receipt","l_Receipt","B_Tax","l_Tax","B_Amount","l_Amount"]



def one_hot_encode_index_list():
    oneHotEncodedList = []
    unique_labels_len = len(sorted(set(label_list2)))
    for i in wordsIndextList:
        bitlist = list(0 for i in range(unique_labels_len))
        bitlist[i] = 1
        oneHotEncodedList.append(bitlist)

    for i in  range(max_sequence_length-len(oneHotEncodedList)):
        oneHotEncodedList.append([1,0,0,0,0,0,0,0,0,0,0,0,0,0])
    return oneHotEncodedList


def splitfunction(text:str):
    global wordsList,wordsIndextList
    wordsList = []
    wordsIndextList = []
    words = text.split()
    for word in words:
        wordsList.append(word)
        wordsIndextList.append(1)



def indexAssignment(tag,last):
    if(tag==None):
        return None
    index = label_list.index(tag)+2
    if(last == "l"):
       index +=1
    return index


def checkTag(word:str):
    for tag in label_list:
        index = word.find(tag)
        if index != -1:
            return tag
    return None



def tagingWords():
    control = None
    counter = 0
    for word in wordsList:
        tag = checkTag(word)
        if tag != None:
            if(word.find("<start_") != -1):
                control = tag
            if(word.find("<start_") != -1 and word.find("<end_") == -1):
                wordsIndextList[counter] = indexAssignment(tag,"B")
                counter+=1
            elif(word.find("<start_") != -1 and word.find("<end_") != -1):
                wordsIndextList[counter] = indexAssignment(tag,"B")
                counter+=1
            else:
                wordsIndextList[counter] = indexAssignment(tag,"B")
                counter+=1
            if(word.find("<end_") != -1):
                control = None
        elif control !=None:
            wordsIndextList[counter] = indexAssignment(control,"l")
            counter+=1
        else:
            counter+=1


def oneHotEncodedFunction(texts):
    splitfunction(texts)
    tagingWords()
    return np.array(one_hot_encode_index_list())



# TRAIN
folder_path_train =  directory_train
file_list_train = os.listdir(folder_path_train)
y_train= []
for file_name in file_list_train:
    if file_name.endswith(".txt"):
        file_path = os.path.join(folder_path_train, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
            result = oneHotEncodedFunction(text)

            if len(y_train) == 0:
                y_train = np.array([result])
            else:
                y_train = np.concatenate((y_train, [result]), axis=0)


# TEST
folder_path_test =  directory_test
file_list_test = os.listdir(folder_path_test)
y_test= []
for file_name in file_list_test:
    if file_name.endswith(".txt"):
        file_path = os.path.join(folder_path_test, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
            result = oneHotEncodedFunction(text)

            if len(y_test) == 0:
                y_test = np.array([result])
            else:
                y_test = np.concatenate((y_test, [result]), axis=0)


print(y_train.shape)
print(y_test.shape)

(654, 384, 14)
(440, 384, 14)


In [7]:
# LSTM Modeli

input_shape = (max_sequence_length, max_word_length)
input_layer = Input(shape=(max_sequence_length,max_word_length))
print(input_layer.shape)


(None, 384, 20)


In [9]:
char_embedding = Embedding(input_dim=vocab_len, output_dim=embedding_size)(input_layer)
print(char_embedding.shape)

(None, 384, 20, 128)


In [10]:
reshaped_embedding = tensorflow.reshape(char_embedding, (-1, max_word_length, embedding_size))
print(reshaped_embedding.shape)

(None, 20, 128)


In [11]:
char_lstm, state_h, state_c = LSTM(units=lstm_units, return_sequences=True, return_state=True)(reshaped_embedding)
print(char_lstm)

char_lstm2, state_h, state_c = LSTM(units=lstm_units, return_sequences=True, return_state=True)(char_lstm, initial_state=[state_h, state_c])
print(char_lstm2)

KerasTensor(type_spec=TensorSpec(shape=(None, 20, 256), dtype=tf.float32, name=None), name='lstm/PartitionedCall:1', description="created by layer 'lstm'")
KerasTensor(type_spec=TensorSpec(shape=(None, 20, 256), dtype=tf.float32, name=None), name='lstm_1/PartitionedCall:1', description="created by layer 'lstm_1'")


In [12]:
reshaped = tensorflow.reshape(state_h, (-1,max_sequence_length, state_h.shape[-1]))
print(reshaped.shape)

(None, 384, 256)


In [13]:
word_lstm1, state_h, state_c = LSTM(units=lstm_units, return_sequences=True, return_state=True)(reshaped)
print(word_lstm1)
word_lstm2, state_h, state_c = LSTM(units=lstm_units, return_sequences=True, return_state=True)(word_lstm1, initial_state=[state_h, state_c])
print(word_lstm2)

KerasTensor(type_spec=TensorSpec(shape=(None, 384, 256), dtype=tf.float32, name=None), name='lstm_2/PartitionedCall:1', description="created by layer 'lstm_2'")
KerasTensor(type_spec=TensorSpec(shape=(None, 384, 256), dtype=tf.float32, name=None), name='lstm_3/PartitionedCall:1', description="created by layer 'lstm_3'")


In [14]:
classification_output = Dense(units=num_classes, activation='softmax')(word_lstm2)
print(word_lstm2)

KerasTensor(type_spec=TensorSpec(shape=(None, 384, 256), dtype=tf.float32, name=None), name='lstm_3/PartitionedCall:1', description="created by layer 'lstm_3'")


In [15]:
model = Model(inputs=input_layer, outputs=classification_output)

In [16]:
#model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

#model.summary()

In [17]:
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow.keras as keras

checkpoint = ModelCheckpoint('best_model.keras', monitor='val_accuracy', save_best_only=True)

opt = keras.optimizers.Adam(learning_rate=0.0001)

model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])



In [18]:

model.fit(x_train, y_train, epochs=1000, batch_size=16, validation_data=(x_test, y_test), callbacks=[ checkpoint])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.callbacks.History at 0x27d333ff088>

In [19]:
model.save("model.keras")

In [26]:
#MODELİ ÇALIŞTIR
def model_run(directory):
    all_texts = []

    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                txt_data = content.split()
                all_texts.append(txt_data)
                
    #TOKENIZER YÜKLE
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer_model = pickle.load(handle)     
    print(tokenizer_model.word_index)        

    all_sequences=[]
    for i in range(len(all_texts)):
        all_sequences.append(tokenizer_model.texts_to_sequences(all_texts[i]))
        
    max_word_length,trunc_type,padding_type = 20,'post','post'

    all_padded_sequences=[]
    for i in range(len(all_sequences)):
        padded_sequence = pad_sequences(all_sequences[i], maxlen=max_word_length, padding=padding_type, truncating=trunc_type)
        all_padded_sequences.append(padded_sequence)

    padded_arrays = []
    for arr in all_padded_sequences:
        pad_width = ((0, max_sequence_length - len(arr)), (0, 0))
        padded_arr = np.pad(arr, pad_width, mode='constant', constant_values=0)
        padded_arrays.append(padded_arr)

    xtrain=np.array(padded_arrays)
    model = keras.models.load_model(model_path)

    prediction=model.predict(xtrain)
    print(prediction.shape)
    #------------------------------------------------------------
    label_list = ["Pad", "Others", "B_Comp", "I_Comp", "B_Date", "I_Date", "B_Time", "I_Time", "B_Receipt", "I_Receipt", "B_Tax", "I_Tax", "B_Amount", "I_Amount"]

    #deneme yapılacak fişin konumu
    with open("./deneme-veriler/deneme-veri.txt", "r", encoding="utf-8") as file:
        words = file.read().split()
    
    predicted_labels = np.argmax(prediction, axis=-1)
    print(predicted_labels)
    
    #klasördeki ilk fişi temsil eder
    fis_1=predicted_labels[0]

    truncated_values = fis_1[:len(words)]#uzun olan diziyi kısalt ve iki dizi boyutunu eşle
    my_map = dict(zip(words, truncated_values))#kelimelere ve karşılarına predictionları 0 1 2 formatında maple

    result_map = {}
    for key, value in my_map.items():#anlamlı gözükmesi için 0 1 2 yerine etiket isimleri olacak şekilde maple
        index = min(value, len(label_list) - 1)
        label = label_list[index]
        result_map[key] = label

    print(result_map)
    for key, value in result_map.items():
        print(f'{key}: {value}')
        

In [27]:

#model_test_path içinde test edilecek veriler txt halde bulunmalıdır
model_run(model_test_path)

{'0': 1, 'A': 2, '1': 3, 'E': 4, 'T': 5, 'I': 6, '2': 7, 'R': 8, 'N': 9, 'L': 10, 'K': 11, '*': 12, 'S': 13, 'O': 14, 'M': 15, '5': 16, ':': 17, '.': 18, '3': 19, '8': 20, '4': 21, '9': 22, 'D': 23, '6': 24, 'U': 25, 'İ': 26, '7': 27, ',': 28, 'B': 29, 'C': 30, 'Y': 31, 'P': 32, 'V': 33, 'e': 34, 'a': 35, 'i': 36, 'H': 37, 'G': 38, 'Z': 39, 'r': 40, 'F': 41, '/': 42, 'Ş': 43, 'n': 44, 'o': 45, 'l': 46, '%': 47, 't': 48, 's': 49, 'Ü': 50, '-': 51, 'k': 52, 'd': 53, 'm': 54, 'u': 55, 'X': 56, 'y': 57, 'c': 58, 'g': 59, 'h': 60, 'z': 61, '#': 62, 'ı': 63, 'b': 64, 'w': 65, 'Ç': 66, 'Ğ': 67, 'Ö': 68, 'p': 69, 'v': 70, 'ş': 71, 'W': 72, 'ü': 73, ')': 74, '(': 75, 'J': 76, 'f': 77, 'x': 78, "'": 79, '+': 80, '$': 81, 'ğ': 82, '»': 83, 'ç': 84, '!': 85, 'Q': 86, '=': 87, 'ö': 88, 'Í': 89, 'ж': 90, '&': 91, 'j': 92, '>': 93, '"': 94, '\\': 95, 'Т': 96, 'О': 97, 'Р': 98, 'q': 99, '×': 100, '@': 101, ';': 102, 'Ø': 103, 'К': 104, '|': 105, '<': 106, 'Á': 107, 'Ú': 108, 'Ș': 109, '[': 110, '_': 1