**First step to create a LLM**

---



In [1]:
# Data preparation (replace later with actual data)
text = "hello world hello python hello llm"

In [2]:
# Character-Level Tokenization
chars = sorted(list(set(text)))
char_to_index = {ch: i for i, ch in enumerate(chars)}
index_to_char = {i: ch for i, ch in enumerate(chars)}

vocab_size = len(chars)
print(f"Vocab size: {vocab_size}") # Vocab size: 11
print(f"Characters: {chars}") # Characters: [' ', 'd', 'e', 'h', 'l', 'm', 'n', 'o', 'p', 't', 'w', 'y']

Vocab size: 13
Characters: [' ', 'd', 'e', 'h', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'w', 'y']


In [3]:
# pre-processing
sequence_length = 4  # Length of input sequences
sequences = []
next_chars = []

for i in range(0, len(text) - sequence_length):
    seq = text[i:i + sequence_length]
    next_char = text[i + sequence_length]
    sequences.append([char_to_index[char] for char in seq])
    next_chars.append(char_to_index[next_char])

print(f"Number of sequences: {len(sequences)}") # Number of sequences: 26
print(sequences[:5]) # Example: [[7, 3, 4, 4], [3, 4, 4, 7], [4, 4, 7, 8], [4, 7, 8, 7], [7, 8, 7, 7]]
print(next_chars[:5]) # Example: [7, 8, 7, 7, 0]

Number of sequences: 30
[[3, 2, 4, 4], [2, 4, 4, 7], [4, 4, 7, 0], [4, 7, 0, 11], [7, 0, 11, 7]]
[7, 0, 11, 7, 9]


In [4]:
# actual model using recurrent neural network (RNN) with an embedding layer
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.layers import Embedding, LSTM, Dense

# Convert lists to numpy arrays
sequences = np.array(sequences)
next_chars = np.array(next_chars)

model = keras.Sequential([
    Embedding(vocab_size, 128, input_length=sequence_length), # Learn vector representations of characters.
    LSTM(128),  # Recurrent layer to process sequences.
    Dense(vocab_size, activation='softmax') # Output layer, probabilities for each character.
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model.summary()



In [5]:
# train
model.fit(sequences, next_chars, epochs=100, verbose=1) # Adjust epochs as needed

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 2.5657
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 2.5553
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 2.5448
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 2.5341
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - loss: 2.5229
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 2.5111
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 2.4983
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 2.4843
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 2.4689
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - loss: 2.4518
Epoch 11/10

<keras.src.callbacks.history.History at 0x7b88cbfaf430>

In [6]:
# generate text
def generate_text(seed_text, length=100):
    generated_text = seed_text
    for _ in range(length):
        sequence = [char_to_index[char] for char in seed_text[-sequence_length:]]
        sequence = np.array([sequence]) # Reshape for input to the model
        predictions = model.predict(sequence)
        predicted_index = np.argmax(predictions)
        predicted_char = index_to_char[predicted_index]
        generated_text += predicted_char
        seed_text += predicted_char
    return generated_text

seed = "hell"
generated = generate_text(seed)
print(f"Generated text: {generated}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1