In [24]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import polars as pl
import numpy as np

In [25]:
# --- Hyperparameters -------------------------------------------------------
# Text preprocessing
max_vocab_size = 50_000  # vocabulary cap; also the embedding / output-layer size
sequence_len = 90        # length every vectorized sequence is padded/truncated to

# Input pipeline
batch_size = 64

# Transformer dimensions
embed_dim = 256          # width of the token embeddings
num_heads = 4            # heads per MultiHeadAttention layer
ff_dim = 512             # hidden width of the feed-forward sub-block
dropout_rate = 0.1

In [26]:
# Load the three question/answer CSV files, keeping only the two columns
# that the rest of the notebook uses.
_qa_columns = ['question', 'answer']

df = pl.read_csv('data/general.csv', quote_char='"').select(_qa_columns)
# The next two files are read with ignore_errors=True.
df2 = pl.read_csv('data/reddit.csv', ignore_errors=True).select(_qa_columns)
df3 = pl.read_csv('data/suicidio.csv', ignore_errors=True).select(_qa_columns)

In [27]:
# Stack the three frames row-wise, then discard rows containing a null and
# exact duplicate rows.
# NOTE(review): unique() is called without maintain_order, so the resulting
# row order is presumably not guaranteed -- confirm against the polars docs.
df_total = (
    pl.concat([df, df2, df3], how="vertical")
    .drop_nulls()
    .unique()
)

In [28]:
# Pull both columns out as plain Python string lists.
questions = df_total["question"].cast(str).to_list()
answer_texts = df_total["answer"].cast(str).to_list()

# Wrap every (whitespace-trimmed) answer in explicit start/end markers.
answers = [f"[START] {text.strip()} [END]" for text in answer_texts]

In [29]:
# Every ASCII punctuation character EXCEPT "[" and "]" (RE2 character class).
_PUNCT_EXCEPT_BRACKETS = r"""[!"#$%&'()*+,\-./:;<=>?@\\^_`{|}~]"""


def _standardize_keep_brackets(text):
    """Lowercase ``text`` and strip punctuation, but keep ``[`` and ``]``.

    TextVectorization's default standardization ("lower_and_strip_punctuation")
    also removes square brackets, which would turn the "[START]" / "[END]"
    markers added to the answers into the ordinary words "start" / "end" and
    make them indistinguishable from those words in real answers.  With this
    standardizer the markers survive as the distinct tokens "[start]" and
    "[end]".

    Args:
        text: string tensor handed over by the TextVectorization layer.

    Returns:
        String tensor of the same shape, lowercased and without punctuation
        other than square brackets.
    """
    lowered = tf.strings.lower(text)
    return tf.strings.regex_replace(lowered, _PUNCT_EXCEPT_BRACKETS, "")


# Questions carry no special markers, so the default standardization is kept.
question_vectorizer = layers.TextVectorization(
    max_tokens=max_vocab_size, output_mode="int", output_sequence_length=sequence_len
)
# Answers use the bracket-preserving standardizer so the sequence markers stay
# unique vocabulary entries.  Any code that looks the markers up in the
# vocabulary must use "[start]" / "[end]".
answer_vectorizer = layers.TextVectorization(
    max_tokens=max_vocab_size,
    standardize=_standardize_keep_brackets,
    output_mode="int",
    output_sequence_length=sequence_len,
)

In [30]:
# Build each vocabulary from its own corpus.
for _vectorizer, _corpus in (
    (question_vectorizer, questions),
    (answer_vectorizer, answers),
):
    _vectorizer.adapt(_corpus)

In [31]:
# Run the adapted vectorizers over the raw strings; each call yields one
# fixed-length row of token ids per input string.
answers_tensor = answer_vectorizer(answers)
questions_tensor = question_vectorizer(questions)

In [32]:
# The encoder sees the vectorized question unchanged.
encoder_input = questions_tensor

# Teacher forcing: the target is the answer shifted one step to the left of
# what the decoder is fed, so position t is trained to predict token t + 1.
decoder_target = answers_tensor[:, 1:]
decoder_input = answers_tensor[:, :-1]

In [33]:
# Number of examples, and how many of them (10%, rounded down) are held out
# for validation.
val_fraction = 0.1
data_size = encoder_input.shape[0]
val_split = int(data_size * val_fraction)

In [34]:
# Split on a seeded random permutation of the row indices rather than slicing
# the head off the data: the rows come from three concatenated sources, so a
# positional slice makes the hold-out set depend on however the rows happen to
# be ordered.  With the permutation, validation is a uniform random sample.
_split_order = np.random.default_rng(42).permutation(data_size)
_val_idx = _split_order[:val_split]
_train_idx = _split_order[val_split:]

# Each element is ((encoder_input, decoder_input), decoder_target).
train_data = tf.data.Dataset.from_tensor_slices(
    (
        (tf.gather(encoder_input, _train_idx), tf.gather(decoder_input, _train_idx)),
        tf.gather(decoder_target, _train_idx),
    )
)
val_data = tf.data.Dataset.from_tensor_slices(
    (
        (tf.gather(encoder_input, _val_idx), tf.gather(decoder_input, _val_idx)),
        tf.gather(decoder_target, _val_idx),
    )
)

In [35]:
# NOTE: both names are rebound from their own previous value, so running this
# cell a second time would batch the already-batched datasets again.
train_data = (
    train_data
    .shuffle(1024)  # only the training stream is shuffled
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
val_data = (
    val_data
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [36]:
def transformer_encoder(embed_dim, num_heads, ff_dim, dropout=0.1):
    """Build a single-block Transformer encoder as a Keras functional model.

    The model maps a batch of token-id sequences to a sequence of
    ``embed_dim``-wide vectors: embedding -> layer norm -> self-attention with
    a residual connection -> dropout -> layer norm -> feed-forward network
    with a residual connection.  No positional information is added to the
    embeddings.

    The vocabulary size is read from the module-level ``max_vocab_size``.

    Args:
        embed_dim: Width of the token embeddings and of the block output.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout: Dropout rate applied after the attention residual.

    Returns:
        A ``keras.Model`` taking ``(batch, seq_len)`` token ids and returning
        ``(batch, seq_len, embed_dim)`` activations.
    """
    inputs = layers.Input(shape=(None,))
    # mask_zero=True marks id 0 (the padding id emitted by TextVectorization)
    # as padding.  Keras propagates that mask, so self-attention stops
    # attending to pad positions and the mask travels with the encoder output
    # to whatever consumes it.
    x = layers.Embedding(max_vocab_size, embed_dim, mask_zero=True)(inputs)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    # key_dim is the per-head projection size; here each head is embed_dim wide.
    attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x, x)
    x = layers.Add()([x, attn])
    x = layers.Dropout(dropout)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    # Position-wise feed-forward network, projected back to embed_dim so it can
    # be added to its own input.
    ffn = keras.Sequential([
        layers.Dense(ff_dim, activation="relu"),
        layers.Dense(embed_dim),
    ])
    x = layers.Add()([x, ffn(x)])
    return keras.Model(inputs, x)

In [37]:
def transformer_decoder(embed_dim, num_heads, ff_dim, dropout=0.1):
    """Build a single-block Transformer decoder as a Keras functional model.

    The model takes the encoder output and the (shifted) target token ids and
    returns a probability distribution over the vocabulary for every target
    position: embedding -> layer norm -> causal self-attention -> dropout ->
    layer norm -> cross-attention over the encoder output -> dropout ->
    layer norm -> feed-forward network -> softmax projection.  Each attention
    and feed-forward step is wrapped in a residual connection.  No positional
    information is added to the embeddings.

    The vocabulary size is read from the module-level ``max_vocab_size``.

    Args:
        embed_dim: Width of the token embeddings; must match the last
            dimension of the encoder output.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout: Dropout rate applied after each attention residual.

    Returns:
        A ``keras.Model`` taking ``[encoder_output, decoder_token_ids]`` and
        returning ``(batch, seq_len, max_vocab_size)`` softmax probabilities.
    """
    enc_inputs = layers.Input(shape=(None, embed_dim))
    dec_inputs = layers.Input(shape=(None,))
    # mask_zero=True marks id 0 (the padding id emitted by TextVectorization)
    # as padding.  The mask is propagated to the output, so pad positions are
    # excluded from the compiled loss and metrics instead of inflating them.
    x = layers.Embedding(max_vocab_size, embed_dim, mask_zero=True)(dec_inputs)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    # Self-attention must be causal: without the mask every position can attend
    # to the later tokens it is being trained to predict, so the training loss
    # says nothing about step-by-step generation.
    attn1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(
        x, x, use_causal_mask=True
    )
    x = layers.Add()([x, attn1])
    x = layers.Dropout(dropout)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    # Cross-attention: decoder states are the queries, the encoder output
    # supplies keys and values.
    attn2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x, enc_inputs)
    x = layers.Add()([x, attn2])
    x = layers.Dropout(dropout)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    # Position-wise feed-forward network, projected back to embed_dim so it can
    # be added to its own input.
    ffn = keras.Sequential([
        layers.Dense(ff_dim, activation="relu"),
        layers.Dense(embed_dim),
    ])
    x = layers.Add()([x, ffn(x)])
    # Per-position distribution over the vocabulary.
    outputs = layers.Dense(max_vocab_size, activation="softmax")(x)
    return keras.Model([enc_inputs, dec_inputs], outputs)

In [38]:
# Instantiate both halves with the same hyperparameters.
encoder = transformer_encoder(
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
    dropout=dropout_rate,
)
decoder = transformer_decoder(
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
    dropout=dropout_rate,
)

In [39]:
# Symbolic inputs for the end-to-end model: one variable-length sequence for
# the question side and one for the answer side.
enc_inputs = layers.Input(shape=(None,))
dec_inputs = layers.Input(shape=(None,))

# Encode the question, then let the decoder attend over that encoding while
# it processes the answer tokens.
enc_outputs = encoder(enc_inputs)
dec_outputs = decoder([enc_outputs, dec_inputs])

In [40]:
# End-to-end model: (question ids, shifted answer ids) -> per-token vocabulary
# probabilities.
model = keras.Model(
    inputs=[enc_inputs, dec_inputs],
    outputs=dec_outputs,
)

In [41]:
# Targets are integer token ids and the model outputs probabilities, hence the
# sparse categorical cross-entropy loss.
_adam = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=_adam,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [42]:
# Stop once the monitored quantity has not improved for 2 consecutive epochs
# and roll the model back to its best weights.
_early_stopping = keras.callbacks.EarlyStopping(
    patience=2,
    restore_best_weights=True,
)
callbacks = [_early_stopping]

In [None]:
# Train for at most 10 epochs; the callbacks may end the run earlier.
history = model.fit(
    train_data,
    validation_data=val_data,
    epochs=10,
    callbacks=callbacks,
)

# NOTE(review): the save path has no file extension. Newer Keras releases
# appear to require a ".keras" or ".h5" suffix here -- confirm against the
# installed version before relying on this after a long training run.
model.save('CoreMate_v1')