In [None]:
# Fetch the shared helper module (imported below as `helper_funcs`) and the
# zipped dataset archive that the rest of the notebook reads from.
!wget https://raw.githubusercontent.com/mufaddalhamidofficial/tensorflow_course/main/helper_funcs.py
!wget https://media.githubusercontent.com/media/mufaddalhamidofficial/skimlit_ai/main/data.zip

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from helper_funcs import create_tensorboard_callback, create_checkpoint_callback, plot_loss_curves, compare_historys, unzip_data, calculate_results
import tensorflow_hub as hub
import io

In [None]:
# Display the installed TensorFlow version so the run's environment is recorded.
tf.__version__

In [None]:
# Unpack the downloaded archive with the helper from `helper_funcs`
# (presumably into the working directory, since later cells read `data/...` — confirm),
# then delete the zip file, which is no longer needed.
unzip_data("data.zip")
!rm -rf data.zip

In [None]:
# Dataset directories, one per training-set fraction.
DIR_NAME_01 = "data/01_percent"
DIR_NAME_1 = "data/1_percent"
DIR_NAME_10 = "data/10_percent"
DIR_NAME_100 = "data/100_percent"

# Load the test split of the full dataset and discard its first column by position.
# NOTE(review): the first column is presumably a saved index — confirm against the CSV.
test_df_100 = pd.read_csv(f"{DIR_NAME_100}/test.csv").iloc[:, 1:]

# Reference copies of the test sentences and labels; later cells take `.copy()` of these.
test_sentences_ = test_df_100["text"].to_numpy()
test_labels_ = test_df_100["target"].to_numpy()

## Callbacks:

In [None]:
def get_callbacks(experiment_name, use_early_stopping = True, patience=10, use_tensorboard = True, use_model_checkpoint = False):
    """Assemble the list of Keras callbacks for one training run.

    Args:
        experiment_name: Name forwarded to the TensorBoard and checkpoint
            helper factories to identify this run.
        use_early_stopping: If true, add an ``EarlyStopping`` callback that
            monitors ``val_accuracy`` and restores the best weights.
        patience: ``patience`` value passed to ``EarlyStopping``.
        use_tensorboard: If true, add the TensorBoard callback created under
            ``skim_lit/tensorboard``.
        use_model_checkpoint: If true, add the checkpoint callback created
            under ``skim_lit/checkpoint`` (monitoring ``val_accuracy``).

    Returns:
        A list of callbacks in the order TensorBoard, checkpoint, early
        stopping; empty when every flag is false.
    """
    selected = []

    if use_tensorboard:
        tensorboard_cb = create_tensorboard_callback(
            dir_name="skim_lit/tensorboard",
            experiment_name=experiment_name,
        )
        selected.append(tensorboard_cb)

    if use_model_checkpoint:
        checkpoint_cb = create_checkpoint_callback(
            dir_name="skim_lit/checkpoint",
            experiment_name=experiment_name,
            monitor="val_accuracy",
        )
        selected.append(checkpoint_cb)

    if use_early_stopping:
        early_stopping_cb = tf.keras.callbacks.EarlyStopping(
            monitor="val_accuracy",
            patience=patience,
            restore_best_weights=True,
        )
        selected.append(early_stopping_cb)

    return selected

# Modelling

## Model s data preparation

### Get and Prepare data

In [None]:
# Training and validation splits both come from the 1% subset.
train_data_1, val_data_1 = (
    pd.read_csv(f"{DIR_NAME_1}/{split_name}.csv") for split_name in ("train", "val")
)

train_sentences, train_labels = train_data_1["text"].to_numpy(), train_data_1["target"].to_numpy()
val_sentences, val_labels = val_data_1["text"].to_numpy(), val_data_1["target"].to_numpy()

# The test split is always the full-dataset test set; work on copies so the
# reference arrays loaded earlier stay untouched.
test_sentences, test_labels = test_sentences_.copy(), test_labels_.copy()

### One hot encode labels

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training labels only, then reuse the fitted
# categories for validation and test. The encoder expects a 2-D column, so
# each 1-D label array gets a trailing axis.
one_hot_encoder = OneHotEncoder(sparse_output=False)
train_labels_one_hot = one_hot_encoder.fit_transform(train_labels[:, np.newaxis])
val_labels_one_hot = one_hot_encoder.transform(val_labels[:, np.newaxis])
test_labels_one_hot = one_hot_encoder.transform(test_labels[:, np.newaxis])

train_labels_one_hot

In [None]:
# Category values learned by the encoder for its single input column, in the
# same order as the columns of the one-hot label matrices.
class_names = one_hot_encoder.categories_[0]
classes_count = len(class_names) # type: ignore
class_names

In [None]:
import string

# Insert a space between every character of each sentence so that a
# whitespace-splitting vectorizer sees one token per character.
train_chars, val_chars, test_chars = (
    [" ".join(sentence) for sentence in split]
    for split in (train_sentences, val_sentences, test_sentences)
)

# Sequence length: 90th percentile of the single-space split counts on the training split.
# NOTE(review): an original space turns into a run of three spaces, so splitting
# on ' ' also counts empty strings — this over-estimates the real token count.
char_lens = [len(spaced.split(' ')) for spaced in train_chars]
seq_char_len = int(np.percentile(char_lens, 90))

alphabet = "".join((string.ascii_lowercase, string.digits, string.punctuation))

# `+ 2` presumably reserves slots for the padding and out-of-vocabulary tokens — confirm.
NUM_CHAR_TOKENS = len(alphabet) + 2
seq_char_len, NUM_CHAR_TOKENS, alphabet

## Model s 5

In [None]:
# Positional features: one-hot encode each line's position (15 slots) and the
# "total_lines" value of its abstract (20 slots) for train / val / test.
train_line_numbers_one_hot, val_line_numbers_one_hot, test_line_numbers_one_hot = (
    tf.one_hot(frame["line_number"].to_numpy(), depth=15)
    for frame in (train_data_1, val_data_1, test_df_100)
)

train_total_lines_one_hot, val_total_lines_one_hot, test_total_lines_one_hot = (
    tf.one_hot(frame["total_lines"].to_numpy(), depth=20)
    for frame in (train_data_1, val_data_1, test_df_100)
)

train_line_numbers_one_hot.shape, train_total_lines_one_hot.shape

In [None]:
# Each dataset pairs a 4-tuple of features (sentence, spaced characters,
# line-number one-hot, total-lines one-hot) with the one-hot label, batched
# by 32 and prefetched.
train_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (train_sentences, train_chars, train_line_numbers_one_hot, train_total_lines_one_hot)
)
train_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_dataset = (
    tf.data.Dataset.zip((train_word_char_pos_data, train_word_char_pos_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

val_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (val_sentences, val_chars, val_line_numbers_one_hot, val_total_lines_one_hot)
)
val_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_dataset = (
    tf.data.Dataset.zip((val_word_char_pos_data, val_word_char_pos_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

test_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (test_sentences, test_chars, test_line_numbers_one_hot, test_total_lines_one_hot)
)
test_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_dataset = (
    tf.data.Dataset.zip((test_word_char_pos_data, test_word_char_pos_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)


In [None]:

# Model s 5: sentence embedding + character Bi-LSTM + positional one-hot features.

# Universal Sentence Encoder loaded through TensorFlow Hub, kept frozen (trainable=False).
hub_embedding = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
    trainable=False,
    name="universal_sentence_encoder",
)

# Character vectorizer: vocabulary capped at NUM_CHAR_TOKENS, output length fixed
# to seq_char_len; default standardization is used (nothing is overridden here).
char_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=NUM_CHAR_TOKENS,
    output_sequence_length=seq_char_len,
)

# Learn the character vocabulary from the training split only.
char_vectorizer.adapt(train_chars)

# Trainable 25-dimensional character embedding sized to the learned vocabulary;
# mask_zero=True marks index 0 as padding for downstream layers.
char_embedding = tf.keras.layers.Embedding(
    input_dim=len(char_vectorizer.get_vocabulary()),
    output_dim=25,
    mask_zero=True,
)


# Branch 1: raw sentence string -> frozen sentence embedding -> Dense(128).
token_inputs = tf.keras.layers.Input(shape=[], dtype='string', name='token_inputs')
token_embedding = hub_embedding(token_inputs)
token_outputs = tf.keras.layers.Dense(128, activation='relu')(token_embedding)
token_model = tf.keras.Model(token_inputs, token_outputs)

# Branch 2: space-separated characters -> vectorizer -> embedding -> Bi-LSTM (24 units per direction).
char_inputs = tf.keras.layers.Input(shape=(1,), dtype='string', name='char_inputs')
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embedding(char_vectors)
char_bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(char_inputs, char_bi_lstm)

# Branch 3: line-number one-hot (width 15, matching the depth used when encoding) -> Dense(32).
line_number_inputs = tf.keras.layers.Input(shape=(15,), dtype=tf.float32, name='line_number_inputs')
line_number_outputs = tf.keras.layers.Dense(32, activation='relu')(line_number_inputs)
line_number_model = tf.keras.Model(line_number_inputs, line_number_outputs)

# Branch 4: total-lines one-hot (width 20) -> Dense(32).
total_lines_inputs = tf.keras.layers.Input(shape=(20,), name='total_lines_inputs')
total_lines_outputs = tf.keras.layers.Dense(32, activation='relu')(total_lines_inputs)
total_lines_model = tf.keras.Model(total_lines_inputs, total_lines_outputs)

# Fuse the two text branches first ...
token_char_concatenate = tf.keras.layers.Concatenate(name="token_char_concatenate")([token_model.output, char_model.output])

# ... pass them through Dense(256) and 50% dropout ...
drop_out = tf.keras.layers.Dense(256, activation='relu')(token_char_concatenate)
drop_out = tf.keras.layers.Dropout(0.5)(drop_out)

# ... then append the two positional branches.
final_concatenate = tf.keras.layers.Concatenate(name='final_concatenate')([drop_out, line_number_model.output, total_lines_model.output])

# Softmax over 5 classes.
# NOTE(review): 5 is hard-coded; presumably it should equal `classes_count` — confirm.
outputs = tf.keras.layers.Dense(5, activation='softmax', name='output_layer')(final_concatenate)

# Input order must match the feature tuple order used when building the datasets.
model_s_5 = tf.keras.Model([token_model.input, char_model.input, line_number_model.input, total_lines_model.input], outputs)

model_s_5.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

# Train for up to 10 epochs over the full train/val datasets with the default callbacks.
model_s_5_history = model_s_5.fit(
    train_dataset,
    epochs=10,
    steps_per_epoch=int(len(train_dataset)),
    validation_data=val_dataset,
    validation_steps=int(len(val_dataset)),
    callbacks=get_callbacks("model_s_5"),
)

# Validation metrics: argmax of predicted probabilities vs. argmax of one-hot labels.
model_s_5_val_preds = tf.argmax(model_s_5.predict(val_dataset), axis=1)
model_s_5_val_results = calculate_results(tf.argmax(val_labels_one_hot, axis=1), model_s_5_val_preds)
print(model_s_5_val_results)

plot_loss_curves(model_s_5_history)

# Test metrics on the full-dataset test split.
model_s_5_preds = tf.argmax(model_s_5.predict(test_dataset), axis=1)
model_s_5_results = calculate_results(tf.argmax(test_labels_one_hot, axis=1), model_s_5_preds)
model_s_5_results

## Model s 14

In [None]:
# Count the '@' characters in each sentence.
# NOTE(review): '@' is presumably the dataset's placeholder for numbers — confirm.
train_chars_num_count, val_chars_num_count, test_chars_num_count = (
    np.array([sentence.count('@') for sentence in split])
    for split in (train_sentences, val_sentences, test_sentences)
)

# One-hot encode the counts into 10 slots (counts 0-9); per the tf.one_hot
# documentation, indices outside the depth become all-zero rows.
train_chars_count_one_hot, val_chars_count_one_hot, test_chars_count_one_hot = (
    tf.one_hot(counts, depth=10)
    for counts in (train_chars_num_count, val_chars_num_count, test_chars_num_count)
)

In [None]:
# Rebuild the positional one-hot features (line number: 15 slots, total lines:
# 20 slots) so this section does not depend on an earlier section's cell.
train_line_numbers_one_hot, val_line_numbers_one_hot, test_line_numbers_one_hot = (
    tf.one_hot(frame["line_number"].to_numpy(), depth=15)
    for frame in (train_data_1, val_data_1, test_df_100)
)

train_total_lines_one_hot, val_total_lines_one_hot, test_total_lines_one_hot = (
    tf.one_hot(frame["total_lines"].to_numpy(), depth=20)
    for frame in (train_data_1, val_data_1, test_df_100)
)

train_line_numbers_one_hot.shape, train_total_lines_one_hot.shape

In [None]:
# Datasets for Model s 14: the second feature is the one-hot '@' count
# instead of a character string. Batched by 32 and prefetched.
train_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (train_sentences, train_chars_count_one_hot, train_line_numbers_one_hot, train_total_lines_one_hot)
)
train_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_dataset = (
    tf.data.Dataset.zip((train_word_char_pos_data, train_word_char_pos_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

val_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (val_sentences, val_chars_count_one_hot, val_line_numbers_one_hot, val_total_lines_one_hot)
)
val_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_dataset = (
    tf.data.Dataset.zip((val_word_char_pos_data, val_word_char_pos_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

test_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (test_sentences, test_chars_count_one_hot, test_line_numbers_one_hot, test_total_lines_one_hot)
)
test_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_dataset = (
    tf.data.Dataset.zip((test_word_char_pos_data, test_word_char_pos_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)


In [None]:
# Model s 14: sentence embedding + one-hot '@' count + positional one-hot features.

# Universal Sentence Encoder loaded through TensorFlow Hub, kept frozen (trainable=False).
hub_embedding = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
    trainable=False,
    name="universal_sentence_encoder",
)

# Branch 1: raw sentence string -> frozen sentence embedding -> Dense(128).
token_inputs = tf.keras.layers.Input(shape=[], dtype='string', name='token_inputs')
token_embedding = hub_embedding(token_inputs)
token_outputs = tf.keras.layers.Dense(128, activation='relu')(token_embedding)
token_model = tf.keras.Model(token_inputs, token_outputs)

# Branch 2: one-hot '@' count (width 10, matching the depth used when encoding) -> Dense(32).
# The input keeps the name 'char_inputs' even though it is no longer a character sequence.
char_inputs = tf.keras.layers.Input(shape=(10,), dtype=tf.float32, name='char_inputs')
char_outputs = tf.keras.layers.Dense(32, activation='relu')(char_inputs)
char_model = tf.keras.Model(char_inputs, char_outputs)

# Branch 3: line-number one-hot (width 15) -> Dense(32).
line_number_inputs = tf.keras.layers.Input(shape=(15,), dtype=tf.float32, name='line_number_inputs')
line_number_outputs = tf.keras.layers.Dense(32, activation='relu')(line_number_inputs)
line_number_model = tf.keras.Model(line_number_inputs, line_number_outputs)

# Branch 4: total-lines one-hot (width 20) -> Dense(32).
total_lines_inputs = tf.keras.layers.Input(shape=(20,), name='total_lines_inputs')
total_lines_outputs = tf.keras.layers.Dense(32, activation='relu')(total_lines_inputs)
total_lines_model = tf.keras.Model(total_lines_inputs, total_lines_outputs)

# Fuse the sentence and count branches first ...
token_char_concatenate = tf.keras.layers.Concatenate(name="token_char_concatenate")([token_model.output, char_model.output])

# ... pass them through Dense(256) and 50% dropout ...
drop_out = tf.keras.layers.Dense(256, activation='relu')(token_char_concatenate)
drop_out = tf.keras.layers.Dropout(0.5)(drop_out)

# ... then append the two positional branches.
final_concatenate = tf.keras.layers.Concatenate(name='final_concatenate')([drop_out, line_number_model.output, total_lines_model.output])

# Softmax over 5 classes.
# NOTE(review): 5 is hard-coded; presumably it should equal `classes_count` — confirm.
outputs = tf.keras.layers.Dense(5, activation='softmax', name='output_layer')(final_concatenate)

# Input order must match the feature tuple order used when building the datasets.
model_s_14 = tf.keras.Model([token_model.input, char_model.input, line_number_model.input, total_lines_model.input], outputs)

model_s_14.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

# Train for up to 10 epochs over the full train/val datasets with the default callbacks.
model_s_14_history = model_s_14.fit(
    train_dataset,
    epochs=10,
    steps_per_epoch=int(len(train_dataset)),
    validation_data=val_dataset,
    validation_steps=int(len(val_dataset)),
    callbacks=get_callbacks("model_s_14"),
)

# Validation metrics: argmax of predicted probabilities vs. argmax of one-hot labels.
model_s_14_val_preds = tf.argmax(model_s_14.predict(val_dataset), axis=1)
model_s_14_val_results = calculate_results(tf.argmax(val_labels_one_hot, axis=1), model_s_14_val_preds)
print(model_s_14_val_results)

plot_loss_curves(model_s_14_history)

# Test metrics on the full-dataset test split.
model_s_14_preds = tf.argmax(model_s_14.predict(test_dataset), axis=1)
model_s_14_results = calculate_results(tf.argmax(test_labels_one_hot, axis=1), model_s_14_preds)
model_s_14_results

## Model s 15

In [None]:
import string


def _non_lowercase_chars(sentence):
    """Return the sentence's characters that are neither a-z nor a space, space-separated."""
    return ' '.join([e for e in sentence if e not in string.ascii_lowercase + ' '])


# Keep only digits, punctuation, uppercase letters and other symbols from each sentence.
train_chars_puntuations = [_non_lowercase_chars(sentence) for sentence in train_sentences]
val_chars_puntuations = [_non_lowercase_chars(sentence) for sentence in val_sentences]
test_chars_puntuations = [_non_lowercase_chars(sentence) for sentence in test_sentences]

# Sequence length: 95th percentile of the per-sentence token counts on the training split
# (a sentence with no kept characters still counts as length 1, since ''.split(' ') == ['']).
sentence_lengths = [len(kept.split(' ')) for kept in train_chars_puntuations]
seq_char_punctuation_len = int(np.percentile(sentence_lengths, 95))

seq_char_punctuation_len

# NOTE(review): the literal below looks like output pasted from an earlier version of a
# character-prep cell rather than live code; as the cell's last expression it is what
# gets displayed. Confirm and remove if it is stale.
(23, 34, '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

In [None]:
# Positional one-hot features for Model s 15 (line number: 15 slots, total lines: 20 slots).
train_line_numbers_one_hot, val_line_numbers_one_hot, test_line_numbers_one_hot = (
    tf.one_hot(frame["line_number"].to_numpy(), depth=15)
    for frame in (train_data_1, val_data_1, test_df_100)
)

train_total_lines_one_hot, val_total_lines_one_hot, test_total_lines_one_hot = (
    tf.one_hot(frame["total_lines"].to_numpy(), depth=20)
    for frame in (train_data_1, val_data_1, test_df_100)
)

train_line_numbers_one_hot.shape, train_total_lines_one_hot.shape

In [None]:
# Datasets for Model s 15: the second feature is the space-separated string of
# non-lowercase characters. Batched by 32 and prefetched.
train_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (train_sentences, train_chars_puntuations, train_line_numbers_one_hot, train_total_lines_one_hot)
)
train_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_dataset = (
    tf.data.Dataset.zip((train_word_char_pos_data, train_word_char_pos_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

val_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (val_sentences, val_chars_puntuations, val_line_numbers_one_hot, val_total_lines_one_hot)
)
val_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_dataset = (
    tf.data.Dataset.zip((val_word_char_pos_data, val_word_char_pos_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

test_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (test_sentences, test_chars_puntuations, test_line_numbers_one_hot, test_total_lines_one_hot)
)
test_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_dataset = (
    tf.data.Dataset.zip((test_word_char_pos_data, test_word_char_pos_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)


In [None]:
# Model s 15: sentence embedding + Bi-LSTM over non-lowercase characters + positional features.

# Universal Sentence Encoder loaded through TensorFlow Hub, kept frozen (trainable=False).
hub_embedding = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
    trainable=False,
    name="universal_sentence_encoder",
)

# Vectorizer for the non-lowercase character strings: no vocabulary cap, and
# standardize='lower' only lowercases (so punctuation is kept, unlike the default).
char_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=None,
    output_sequence_length=seq_char_punctuation_len,
    standardize='lower',
)

# Learn the vocabulary from the training split only.
char_vectorizer.adapt(train_chars_puntuations)

# Trainable 25-dimensional embedding sized to the learned vocabulary;
# mask_zero=True marks index 0 as padding for downstream layers.
char_embedding = tf.keras.layers.Embedding(
    input_dim=len(char_vectorizer.get_vocabulary()),
    output_dim=25,
    mask_zero=True,
)


# Branch 1: raw sentence string -> frozen sentence embedding -> Dense(128).
token_inputs = tf.keras.layers.Input(shape=[], dtype='string', name='token_inputs')
token_embedding = hub_embedding(token_inputs)
token_outputs = tf.keras.layers.Dense(128, activation='relu')(token_embedding)
token_model = tf.keras.Model(token_inputs, token_outputs)

# Branch 2: non-lowercase character string -> vectorizer -> embedding -> Bi-LSTM (24 units per direction).
char_inputs = tf.keras.layers.Input(shape=(1,), dtype='string', name='char_inputs')
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embedding(char_vectors)
char_bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(char_inputs, char_bi_lstm)

# Branch 3: line-number one-hot (width 15) -> Dense(32).
line_number_inputs = tf.keras.layers.Input(shape=(15,), dtype=tf.float32, name='line_number_inputs')
line_number_outputs = tf.keras.layers.Dense(32, activation='relu')(line_number_inputs)
line_number_model = tf.keras.Model(line_number_inputs, line_number_outputs)

# Branch 4: total-lines one-hot (width 20) -> Dense(32).
total_lines_inputs = tf.keras.layers.Input(shape=(20,), name='total_lines_inputs')
total_lines_outputs = tf.keras.layers.Dense(32, activation='relu')(total_lines_inputs)
total_lines_model = tf.keras.Model(total_lines_inputs, total_lines_outputs)

# Fuse the two text branches first ...
token_char_concatenate = tf.keras.layers.Concatenate(name="token_char_concatenate")([token_model.output, char_model.output])

# ... pass them through Dense(256) and 50% dropout ...
drop_out = tf.keras.layers.Dense(256, activation='relu')(token_char_concatenate)
drop_out = tf.keras.layers.Dropout(0.5)(drop_out)

# ... then append the two positional branches.
final_concatenate = tf.keras.layers.Concatenate(name='final_concatenate')([drop_out, line_number_model.output, total_lines_model.output])

# Softmax over 5 classes.
# NOTE(review): 5 is hard-coded; presumably it should equal `classes_count` — confirm.
outputs = tf.keras.layers.Dense(5, activation='softmax', name='output_layer')(final_concatenate)

# Input order must match the feature tuple order used when building the datasets.
model_s_15 = tf.keras.Model([token_model.input, char_model.input, line_number_model.input, total_lines_model.input], outputs)

model_s_15.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

# Train for up to 10 epochs over the full train/val datasets with the default callbacks.
model_s_15_history = model_s_15.fit(
    train_dataset,
    epochs=10,
    steps_per_epoch=int(len(train_dataset)),
    validation_data=val_dataset,
    validation_steps=int(len(val_dataset)),
    callbacks=get_callbacks("model_s_15"),
)

# Validation metrics: argmax of predicted probabilities vs. argmax of one-hot labels.
model_s_15_val_preds = tf.argmax(model_s_15.predict(val_dataset), axis=1)
model_s_15_val_results = calculate_results(tf.argmax(val_labels_one_hot, axis=1), model_s_15_val_preds)
print(model_s_15_val_results)

plot_loss_curves(model_s_15_history)

# Test metrics on the full-dataset test split.
model_s_15_preds = tf.argmax(model_s_15.predict(test_dataset), axis=1)
model_s_15_results = calculate_results(tf.argmax(test_labels_one_hot, axis=1), model_s_15_preds)
model_s_15_results

## Model s 16

In [None]:
import string

# Keep only the characters that are neither a-z nor a space, space-separated,
# for each of the three splits.
train_chars_puntuations, val_chars_puntuations, test_chars_puntuations = (
    [' '.join(ch for ch in sentence if ch not in string.ascii_lowercase + ' ') for sentence in split]
    for split in (train_sentences, val_sentences, test_sentences)
)

# 95th-percentile token count on the training split, rounded down to a multiple of 4.
sentence_lengths = [len(kept.split(' ')) for kept in train_chars_puntuations]
seq_char_punctuation_len = int(np.percentile(sentence_lengths, 95) / 4) * 4

# standardize='lower' only lowercases, so punctuation tokens survive vectorization.
char_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=None,
    output_sequence_length=seq_char_punctuation_len,
    standardize='lower',
)

char_vectorizer.adapt(train_chars_puntuations)

# Integer token ids, one row of length seq_char_punctuation_len per sentence.
train_chars_puntuations_vectorized = np.array(char_vectorizer(np.array(train_chars_puntuations)))
val_chars_puntuations_vectorized = np.array(char_vectorizer(np.array(val_chars_puntuations)))
test_chars_puntuations_vectorized = np.array(char_vectorizer(np.array(test_chars_puntuations)))

# Identity matrix used as a one-hot lookup table over the vocabulary.
one_hot_matrix = np.eye(len(char_vectorizer.get_vocabulary()))

# Drop the first two vocabulary columns before the lookup (per the TextVectorization
# docs these are the padding and out-of-vocabulary entries), so those tokens
# become all-zero rows. Result shape: (sentences, sequence length, vocabulary size - 2).
one_hot_lookup = one_hot_matrix[:, 2:]
train_chars_puntuations_one_hot = one_hot_lookup[train_chars_puntuations_vectorized]
val_chars_puntuations_one_hot = one_hot_lookup[val_chars_puntuations_vectorized]
test_chars_puntuations_one_hot = one_hot_lookup[test_chars_puntuations_vectorized]
train_chars_puntuations_one_hot.shape, val_chars_puntuations_one_hot.shape, test_chars_puntuations_one_hot.shape

In [None]:
# Model s 16: sentence embedding + Bi-LSTM over pre-computed one-hot
# non-lowercase characters + positional one-hot features.

# Universal Sentence Encoder loaded through TensorFlow Hub, kept frozen (trainable=False).
hub_embedding = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
    trainable=False,
    name="universal_sentence_encoder",
)


def _build_model_s_16_dataset(sentences, chars_one_hot, line_numbers_one_hot, total_lines_one_hot, labels_one_hot):
    """Zip the four Model s 16 features with their labels into a batched, prefetched dataset.

    Args:
        sentences: Array of raw sentence strings.
        chars_one_hot: NumPy array of one-hot character tensors, one per sentence.
        line_numbers_one_hot: One-hot line-number features.
        total_lines_one_hot: One-hot total-lines features.
        labels_one_hot: One-hot labels.

    Returns:
        A `tf.data.Dataset` yielding `((sentence, chars, line_number, total_lines), label)`
        batches of 32.
    """
    features = tf.data.Dataset.from_tensor_slices((
        sentences,
        # np.eye produces float64; the model's character input is float32.
        chars_one_hot.astype(np.float32),
        line_numbers_one_hot,
        total_lines_one_hot,
    ))
    labels = tf.data.Dataset.from_tensor_slices(labels_one_hot)
    return tf.data.Dataset.zip((features, labels)).batch(32).prefetch(tf.data.AUTOTUNE)


# This model needs its own datasets. Without them, the cell would train on the
# datasets left over from Model s 15, whose second feature is a raw string
# and does not match the float one-hot character input defined below.
train_dataset = _build_model_s_16_dataset(
    train_sentences, train_chars_puntuations_one_hot,
    train_line_numbers_one_hot, train_total_lines_one_hot, train_labels_one_hot,
)
val_dataset = _build_model_s_16_dataset(
    val_sentences, val_chars_puntuations_one_hot,
    val_line_numbers_one_hot, val_total_lines_one_hot, val_labels_one_hot,
)
test_dataset = _build_model_s_16_dataset(
    test_sentences, test_chars_puntuations_one_hot,
    test_line_numbers_one_hot, test_total_lines_one_hot, test_labels_one_hot,
)

# Per-sentence shape of the one-hot character tensor: (sequence length, vocabulary size - 2).
# Taken from the data instead of hard-coding (20, 26), so it stays correct when
# the percentile-based sequence length or the learned vocabulary changes.
char_input_shape = tuple(train_chars_puntuations_one_hot.shape[1:])


# Branch 1: raw sentence string -> frozen sentence embedding -> Dense(128).
token_inputs = tf.keras.layers.Input(shape=[], dtype='string', name='token_inputs')
token_embedding = hub_embedding(token_inputs)
token_outputs = tf.keras.layers.Dense(128, activation='relu')(token_embedding)
token_model = tf.keras.Model(token_inputs, token_outputs)

# Branch 2: one-hot character sequence -> Bi-LSTM (24 units per direction) -> Dense(128).
# No mask is applied, so all-zero (padding / out-of-vocabulary) steps are processed too.
char_inputs = tf.keras.layers.Input(shape=char_input_shape, dtype=tf.float32, name='char_puntuation_inputs')
char_bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(24))(char_inputs)
char_outputs = tf.keras.layers.Dense(128, activation='relu')(char_bi_lstm)
char_model = tf.keras.Model(char_inputs, char_outputs)

# Branch 3: line-number one-hot (width 15) -> Dense(32).
line_number_inputs = tf.keras.layers.Input(shape=(15,), dtype=tf.float32, name='line_number_inputs')
line_number_outputs = tf.keras.layers.Dense(32, activation='relu')(line_number_inputs)
line_number_model = tf.keras.Model(line_number_inputs, line_number_outputs)

# Branch 4: total-lines one-hot (width 20) -> Dense(32).
total_lines_inputs = tf.keras.layers.Input(shape=(20,), name='total_lines_inputs')
total_lines_outputs = tf.keras.layers.Dense(32, activation='relu')(total_lines_inputs)
total_lines_model = tf.keras.Model(total_lines_inputs, total_lines_outputs)

# Fuse the two text branches first ...
token_char_concatenate = tf.keras.layers.Concatenate(name="token_char_concatenate")([token_model.output, char_model.output])

# ... pass them through Dense(256) and 50% dropout ...
drop_out = tf.keras.layers.Dense(256, activation='relu')(token_char_concatenate)
drop_out = tf.keras.layers.Dropout(0.5)(drop_out)

# ... then append the two positional branches.
final_concatenate = tf.keras.layers.Concatenate(name='final_concatenate')([drop_out, line_number_model.output, total_lines_model.output])

# Softmax over 5 classes.
# NOTE(review): 5 is hard-coded; presumably it should equal `classes_count` — confirm.
outputs = tf.keras.layers.Dense(5, activation='softmax', name='output_layer')(final_concatenate)

# Input order must match the feature tuple order in `_build_model_s_16_dataset`.
model_s_16 = tf.keras.Model([token_model.input, char_model.input, line_number_model.input, total_lines_model.input], outputs)

model_s_16.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

# Train for up to 10 epochs over the full train/val datasets with the default callbacks.
model_s_16_history = model_s_16.fit(
    train_dataset,
    epochs=10,
    steps_per_epoch=int(len(train_dataset)),
    validation_data=val_dataset,
    validation_steps=int(len(val_dataset)),
    callbacks=get_callbacks("model_s_16"),
)

# Validation metrics: argmax of predicted probabilities vs. argmax of one-hot labels.
model_s_16_val_preds = tf.argmax(model_s_16.predict(val_dataset), axis=1)
model_s_16_val_results = calculate_results(tf.argmax(val_labels_one_hot, axis=1), model_s_16_val_preds)
print(model_s_16_val_results)

plot_loss_curves(model_s_16_history)

# Test metrics on the full-dataset test split.
model_s_16_preds = tf.argmax(model_s_16.predict(test_dataset), axis=1)
model_s_16_results = calculate_results(tf.argmax(test_labels_one_hot, axis=1), model_s_16_preds)
model_s_16_results

## Model s 17

In [None]:
import string

# Rebuild the space-separated character strings (one token per character);
# earlier sections may have left different character features in scope.
train_chars = [" ".join(sentence) for sentence in train_sentences]
val_chars = [" ".join(sentence) for sentence in val_sentences]
test_chars = [" ".join(sentence) for sentence in test_sentences]

# Sequence length: 90th percentile of the single-space split counts on the training split.
# NOTE(review): an original space turns into a run of three spaces, so splitting
# on ' ' also counts empty strings — this over-estimates the real token count.
char_lens = [len(spaced.split(' ')) for spaced in train_chars]
seq_char_len = int(np.percentile(char_lens, 90))

alphabet = "".join((string.ascii_lowercase, string.digits, string.punctuation))

# `+ 2` presumably reserves slots for the padding and out-of-vocabulary tokens — confirm.
NUM_CHAR_TOKENS = len(alphabet) + 2
seq_char_len, NUM_CHAR_TOKENS, alphabet

In [None]:
# Positional one-hot features for Model s 17 (line number: 15 slots, total lines: 20 slots).
train_line_numbers_one_hot, val_line_numbers_one_hot, test_line_numbers_one_hot = (
    tf.one_hot(frame["line_number"].to_numpy(), depth=15)
    for frame in (train_data_1, val_data_1, test_df_100)
)

train_total_lines_one_hot, val_total_lines_one_hot, test_total_lines_one_hot = (
    tf.one_hot(frame["total_lines"].to_numpy(), depth=20)
    for frame in (train_data_1, val_data_1, test_df_100)
)

train_line_numbers_one_hot.shape, train_total_lines_one_hot.shape

In [None]:
# Datasets for Model s 17: features are (sentence, spaced characters,
# line-number one-hot, total-lines one-hot). Batched by 32 and prefetched.
train_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (train_sentences, train_chars, train_line_numbers_one_hot, train_total_lines_one_hot)
)
train_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_dataset = (
    tf.data.Dataset.zip((train_word_char_pos_data, train_word_char_pos_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

val_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (val_sentences, val_chars, val_line_numbers_one_hot, val_total_lines_one_hot)
)
val_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_dataset = (
    tf.data.Dataset.zip((val_word_char_pos_data, val_word_char_pos_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

test_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (test_sentences, test_chars, test_line_numbers_one_hot, test_total_lines_one_hot)
)
test_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_dataset = (
    tf.data.Dataset.zip((test_word_char_pos_data, test_word_char_pos_labels))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)


In [None]:

# Model s 17: same four-branch layout as Model s 5, but the fused text branch
# uses Dense(128) with no Dropout layer.

# Universal Sentence Encoder loaded through TensorFlow Hub, kept frozen (trainable=False).
hub_embedding = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
    trainable=False,
    name="universal_sentence_encoder",
)

# Character vectorizer: vocabulary capped at NUM_CHAR_TOKENS, output length fixed
# to seq_char_len; default standardization is used (nothing is overridden here).
char_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=NUM_CHAR_TOKENS,
    output_sequence_length=seq_char_len,
)

# Learn the character vocabulary from the training split only.
char_vectorizer.adapt(train_chars)

# Trainable 25-dimensional character embedding sized to the learned vocabulary;
# mask_zero=True marks index 0 as padding for downstream layers.
char_embedding = tf.keras.layers.Embedding(
    input_dim=len(char_vectorizer.get_vocabulary()),
    output_dim=25,
    mask_zero=True,
)


# Branch 1: raw sentence string -> frozen sentence embedding -> Dense(128).
token_inputs = tf.keras.layers.Input(shape=[], dtype='string', name='token_inputs')
token_embedding = hub_embedding(token_inputs)
token_outputs = tf.keras.layers.Dense(128, activation='relu')(token_embedding)
token_model = tf.keras.Model(token_inputs, token_outputs)

# Branch 2: space-separated characters -> vectorizer -> embedding -> Bi-LSTM (24 units per direction).
char_inputs = tf.keras.layers.Input(shape=(1,), dtype='string', name='char_inputs')
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embedding(char_vectors)
char_bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(char_inputs, char_bi_lstm)

# Branch 3: line-number one-hot (width 15) -> Dense(32).
line_number_inputs = tf.keras.layers.Input(shape=(15,), dtype=tf.float32, name='line_number_inputs')
line_number_outputs = tf.keras.layers.Dense(32, activation='relu')(line_number_inputs)
line_number_model = tf.keras.Model(line_number_inputs, line_number_outputs)

# Branch 4: total-lines one-hot (width 20) -> Dense(32).
total_lines_inputs = tf.keras.layers.Input(shape=(20,), name='total_lines_inputs')
total_lines_outputs = tf.keras.layers.Dense(32, activation='relu')(total_lines_inputs)
total_lines_model = tf.keras.Model(total_lines_inputs, total_lines_outputs)

# Fuse the two text branches first.
token_char_concatenate = tf.keras.layers.Concatenate(name="token_char_concatenate")([token_model.output, char_model.output])

# Despite its name, `drop_out` here is a plain Dense(128) output; this model has no Dropout layer.
drop_out = tf.keras.layers.Dense(128, activation='relu')(token_char_concatenate)

# Append the two positional branches.
final_concatenate = tf.keras.layers.Concatenate(name='final_concatenate')([drop_out, line_number_model.output, total_lines_model.output])

# Softmax over 5 classes.
# NOTE(review): 5 is hard-coded; presumably it should equal `classes_count` — confirm.
outputs = tf.keras.layers.Dense(5, activation='softmax', name='output_layer')(final_concatenate)

# Input order must match the feature tuple order used when building the datasets.
model_s_17 = tf.keras.Model([token_model.input, char_model.input, line_number_model.input, total_lines_model.input], outputs)

model_s_17.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

# Train for up to 10 epochs over the full train/val datasets with the default callbacks.
model_s_17_history = model_s_17.fit(
    train_dataset,
    epochs=10,
    steps_per_epoch=int(len(train_dataset)),
    validation_data=val_dataset,
    validation_steps=int(len(val_dataset)),
    callbacks=get_callbacks("model_s_17"),
)

# Validation metrics: argmax of predicted probabilities vs. argmax of one-hot labels.
model_s_17_val_preds = tf.argmax(model_s_17.predict(val_dataset), axis=1)
model_s_17_val_results = calculate_results(tf.argmax(val_labels_one_hot, axis=1), model_s_17_val_preds)
print(model_s_17_val_results)

plot_loss_curves(model_s_17_history)

# Test metrics on the full-dataset test split.
model_s_17_preds = tf.argmax(model_s_17.predict(test_dataset), axis=1)
model_s_17_results = calculate_results(tf.argmax(test_labels_one_hot, axis=1), model_s_17_preds)
model_s_17_results