In [13]:
# %%
import pandas as pd
from keras.src.callbacks import EarlyStopping
from keras.models import Model
from keras.layers import Embedding, LSTM, Dense, RepeatVector, Input, Bidirectional, Flatten
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
import numpy as np
import keras_nlp

ROW_CSV_FILE = r"row_locks.csv"

# Load and preprocess data
data = pd.read_csv(ROW_CSV_FILE)

# Strip spaces from column headers
data.columns = data.columns.str.strip()

# Create features
# TODO: add row_id, add token for rowid and page id token
# TODO: try transformer
data["page_table_combined"] = (
    # TODO: uncomment the line below  and comment the next one for char-based tokenization
    # data["PAGEID"].astype(str).apply(lambda x: " ".join(x))
    data["PAGEID"].astype(str)
    + " "
    + data["TABNAME"].astype(str).apply(lambda x: x.replace("_", ""))
)


# Prepare sequences for LSTM
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(
            data.iloc[i : i + seq_length][["page_table_combined"]]
            .apply(" ".join)
            .reset_index()
            .values[0][1]
        )
        y.append(
            data.iloc[i + seq_length]["page_table_combined"]
        )  # Predicting combined feature
    return X, y


seq_length = 50  # Define sequence length
out_seq_length = 2  # Define output sequence length I.e., page_id and table_name
source_texts, target_texts = create_sequences(data, seq_length)

# Parameters
vocab_size = 900  # Vocabulary size
embedding_dim = 128  # Embedding dimension
max_length = seq_length  # Maximum length of the input sequences
lstm_units = 256  # Number of LSTM units

def check_oov(tokenized_texts):
    """Check how many OOV tokens are present in the tokenized texts"""
    for text in tokenized_texts:
        if 1 in text:
            return True
    return False
    

# Tokenization
# TODO: create one unified tokenizer for input and output
source_tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
source_tokenizer.fit_on_texts(source_texts)
source_sequences = source_tokenizer.texts_to_sequences(source_texts)
padded_source_sequences = pad_sequences(
    source_sequences, maxlen=max_length, padding="post"
)
if check_oov(source_sequences):
    raise ValueError("OOV tokens found in source sequences")

target_tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
target_tokenizer.fit_on_texts(target_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)
padded_target_sequences = pad_sequences(
    target_sequences, maxlen=out_seq_length, padding="post"
)
if check_oov(target_sequences):
    raise ValueError("OOV tokens found in target sequences")

# Shifting target sequences to be the expected output (next token)
input_data = padded_source_sequences
output_data = to_categorical(padded_target_sequences, num_classes=vocab_size)


In [89]:
set_unique_source_texts = set((" ".join(source_texts)).split())
len(set_unique_source_texts)

# counter dict for source texts
source_texts_counter = {}
for text in source_texts:
    for word in text.split():
        if word in source_texts_counter:
            source_texts_counter[word] += 1
        else:
            source_texts_counter[word] = 1

# list in ascending order of frequency
source_texts_counter_sorted = sorted(
    source_texts_counter.items(), key=lambda x: x[1], reverse=True
)


In [90]:
source_texts_counter_sorted

[('ORDERLINE', 338128),
 ('4', 28917),
 ('SYSDATATYPES', 28917),
 ('77850', 1400),
 ('78492', 1250),
 ('80006', 1200),
 ('79998', 1200),
 ('78955', 1100),
 ('80832', 1050),
 ('80156', 1050),
 ('78810', 1050),
 ('79905', 1000),
 ('80659', 1000),
 ('81029', 950),
 ('81473', 950),
 ('81759', 950),
 ('65836', 900),
 ('81505', 900),
 ('81268', 800),
 ('78415', 800),
 ('80361', 750),
 ('81085', 750),
 ('79527', 750),
 ('71146', 750),
 ('76052', 750),
 ('79489', 750),
 ('71664', 750),
 ('79288', 750),
 ('79333', 750),
 ('77000', 750),
 ('80827', 750),
 ('76577', 750),
 ('81622', 750),
 ('80769', 750),
 ('65759', 750),
 ('79708', 750),
 ('79172', 750),
 ('80003', 750),
 ('78842', 750),
 ('79841', 750),
 ('78386', 750),
 ('80604', 750),
 ('80220', 750),
 ('81717', 750),
 ('75901', 750),
 ('66633', 750),
 ('80632', 750),
 ('79542', 750),
 ('80419', 750),
 ('81651', 750),
 ('79779', 750),
 ('80633', 750),
 ('73040', 750),
 ('73052', 700),
 ('79778', 700),
 ('80295', 700),
 ('72309', 700),
 ('7920

In [14]:
# write source_text to file
with open("source_text.txt", "w") as f:
    for text in source_texts:
        f.write(text + "\n")

# write target_text to file
with open("target_text.txt", "w") as f:
    for text in target_texts:
        f.write(text + "\n")

In [15]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

In [16]:
from tokenizers.pre_tokenizers import Whitespace

tokenizer.pre_tokenizer = Whitespace()

In [106]:
from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(limit_alphabet=28, vocab_size=100)
tokenizer.train(files=["source_text.txt", "target_text.txt"], trainer=trainer)






In [143]:
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.decoders import BPEDecoder

In [165]:
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()
tokenizer.decoder = BPEDecoder()
from tokenizers.trainers import BpeTrainer

In [166]:
table_names = data["TABNAME"].astype(str).apply(lambda x: x.replace("_", "")).unique().tolist()

In [173]:
trainer = BpeTrainer(vocab_size=1000, max_token_length=5, special_tokens=table_names)
tokenizer.train_from_iterator([*source_texts, *target_texts], trainer=trainer)






In [174]:
tokenizer.get_vocab_size()

476

In [175]:
tokenizer.model.save('.')

['./vocab.json', './merges.txt']

In [176]:
sen_enc2=tokenizer.encode(source_texts[0])
print(f"Output: {format(sen_enc2.tokens)}")

Output: ['4', 'SYSDATATYPES', '4', 'SYSDATATYPES', '4', 'SYSDATATYPES', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '733', '01', 'ORDERLINE', '7094', '3', 'ORDERLINE', '7094', '3', 'ORDERLINE', '7094', '3', 'ORDERLINE', '7094', '3', 'ORDERLINE', '7094', '3', 'ORDERLINE', '7094', '3', 'ORDERLINE', '4', 'SYSDATATYPES', '76', '731', 'ORDERLINE', '76', '731', 'ORDERLINE', '76', '731', 'ORDERLINE', '76', '731', 'ORDERLINE', '76', '731', 'ORDERLINE', '4', 'SYSDATATYPES', '4', 'SYSDATATYPES', '4', 'SYSDATATYPES', '808', '71', 'ORDERLINE', '808', '71', 'ORDERLINE', '808', '71', 'ORDERLINE', '808', '71', 'ORDERLINE', '808', '71', 'ORDERLINE', '808', '71', 'ORDERLINE', '808', '71',

In [108]:
for i in range(len(source_texts)):
    output = tokenizer.encode(source_texts[i])
    print(output.tokens)

thread '<unnamed>' panicked at /home/runner/work/tokenizers/tokenizers/tokenizers/src/models/bpe/model.rs:459:66:
no entry found for key


PanicException: no entry found for key

In [109]:
!pip install tiktoken

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting tiktoken
  Downloading tiktoken-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [110]:
# bpe tokenization using tiktoken
from tiktoken import BPE

bpe = BPE()
bpe.train("source_text.txt", "source_text.bpe")



ImportError: cannot import name 'BPE' from 'tiktoken' (/usr/local/lib/python3.11/dist-packages/tiktoken/__init__.py)

In [111]:
from tiktoken.tokenizer import Tokenizer
from tiktoken.learn import BPELearner

# Path to your text file
file_path = 'source_text.txt'

# Read the text file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Create a BPE learner
learner = BPELearner()

# Feed text to the learner
learner.add_text(text)

# Train the BPE model
tokenizer = learner.learn(vocab_size=1000)  # Set your desired vocabulary size

# Optionally, you can save your trained model
tokenizer.save('bpe_model.json')

# To tokenize a new text
tokens = tokenizer.tokenize('Some new text to tokenize')
print(tokens)


ModuleNotFoundError: No module named 'tiktoken.tokenizer'

In [140]:
import sentencepiece as spm

# Define the path to your text file
input_file = 'source_text.txt'  # Change this to the path of your text file

# Define the model prefix, it will output files with this prefix
model_prefix = 'bpe_model'

# Define the model type and the number of BPE merge operations (vocabulary size)
vocab_size = 1000  # You can adjust this number based on your needs

# Train the model
spm.SentencePieceTrainer.train(
    input=input_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    split_by_number=False,
    model_type='bpe',  # Specifies that we are using BPE
    character_coverage=1.0  # Adjust as necessary to include various characters

)

print(f"BPE model trained and saved as {model_prefix}.model and {model_prefix}.vocab")


BPE model trained and saved as bpe_model.model and bpe_model.vocab


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: source_text.txt
  input_format: 
  model_prefix: bpe_model
  model_type: BPE
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 0
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differenti

In [141]:
source_texts[0]

'4 SYSDATATYPES 4 SYSDATATYPES 4 SYSDATATYPES 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 73301 ORDERLINE 70943 ORDERLINE 70943 ORDERLINE 70943 ORDERLINE 70943 ORDERLINE 70943 ORDERLINE 70943 ORDERLINE 4 SYSDATATYPES 76731 ORDERLINE 76731 ORDERLINE 76731 ORDERLINE 76731 ORDERLINE 76731 ORDERLINE 4 SYSDATATYPES 4 SYSDATATYPES 4 SYSDATATYPES 80871 ORDERLINE 80871 ORDERLINE 80871 ORDERLINE 80871 ORDERLINE 80871 ORDERLINE 80871 ORDERLINE 80871 ORDERLINE 80873 ORDERLINE 80872 ORDERLINE 80872 ORDERLINE 80872 ORDERLINE 80872 ORDERLINE 4 SYSDATATYPES 65197 ORDERLINE 65197 ORDERLINE 65197 ORDERLINE 65197 ORDERLINE'

In [142]:
# encode the text
sp = spm.SentencePieceProcessor()
sp.load('bpe_model.model')
encoded_text = sp.encode(source_texts[0])
print(encoded_text)

[19, 29, 19, 29, 19, 29, 190, 55, 11, 190, 55, 11, 190, 55, 11, 190, 55, 11, 190, 55, 11, 190, 55, 11, 190, 55, 11, 190, 55, 11, 190, 55, 11, 190, 55, 11, 190, 55, 11, 190, 55, 11, 190, 55, 11, 190, 55, 11, 190, 55, 11, 78, 52, 987, 11, 78, 52, 987, 11, 78, 52, 987, 11, 78, 52, 987, 11, 78, 52, 987, 11, 78, 52, 987, 11, 19, 29, 213, 170, 11, 213, 170, 11, 213, 170, 11, 213, 170, 11, 213, 170, 11, 19, 29, 19, 29, 19, 29, 96, 56, 11, 96, 56, 11, 96, 56, 11, 96, 56, 11, 96, 56, 11, 96, 56, 11, 96, 56, 11, 96, 979, 987, 11, 96, 76, 11, 96, 76, 11, 96, 76, 11, 96, 76, 11, 19, 29, 403, 11, 403, 11, 403, 11, 403, 11]
