In [None]:
# Download helper_funcs.py (imported in the next cell) and the data.zip archive
# into the current working directory.
!wget https://raw.githubusercontent.com/mufaddalhamidofficial/tensorflow_course/main/helper_funcs.py
!wget https://media.githubusercontent.com/media/mufaddalhamidofficial/skimlit_ai/main/data.zip

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from helper_funcs import create_tensorboard_callback, create_checkpoint_callback, plot_loss_curves, compare_historys, unzip_data, calculate_results
import tensorflow_hub as hub
import io

In [None]:
# Show the installed TensorFlow version as the cell output.
tf.__version__

In [None]:
# unzip_data comes from helper_funcs; presumably it extracts data.zip into
# /kaggle/temp/ (the paths used below live under /kaggle/temp/data) - TODO confirm.
unzip_data("data.zip", '/kaggle/temp/')
# Remove the downloaded archive once it has been processed.
!rm -rf data.zip

In [None]:
# One directory per dataset subset, as laid out under /kaggle/temp/data.
DIR_NAME_01 = "/kaggle/temp/data/01_percent"
DIR_NAME_1 = "/kaggle/temp/data/1_percent"
DIR_NAME_10 = "/kaggle/temp/data/10_percent"
DIR_NAME_100 = "/kaggle/temp/data/100_percent"

# Read the test split from the 100_percent directory and drop its first column
# by position.
test_df_100 = pd.read_csv(f"{DIR_NAME_100}/test.csv").iloc[:, 1:]

# Raw test sentences and labels as NumPy arrays; the trailing underscore marks
# these as the originals that later cells copy from.
test_sentences_ = test_df_100["text"].to_numpy()
test_labels_ = test_df_100["target"].to_numpy()

## Callbacks:

In [None]:
def get_callbacks(experiment_name, use_early_stopping = True, patience=10, use_tensorboard = True, use_model_checkpoint = True):
    """Collect the training callbacks for one experiment.

    Args:
        experiment_name: Name forwarded to the TensorBoard and checkpoint
            helper factories.
        use_early_stopping: When truthy, add an EarlyStopping callback that
            monitors "val_accuracy" and restores the best weights.
        patience: Patience value handed to EarlyStopping.
        use_tensorboard: When truthy, add the callback built by
            create_tensorboard_callback (dir "skim_lit/tensorboard").
        use_model_checkpoint: When truthy, add the callback built by
            create_checkpoint_callback (dir "skim_lit/checkpoint",
            monitoring "val_accuracy").

    Returns:
        A new list with the enabled callbacks, ordered TensorBoard,
        checkpoint, early stopping.
    """
    # Each entry pairs a switch with a lazy factory, so a disabled callback
    # is never constructed.
    switchboard = (
        (
            use_tensorboard,
            lambda: create_tensorboard_callback(
                dir_name="skim_lit/tensorboard",
                experiment_name=experiment_name,
            ),
        ),
        (
            use_model_checkpoint,
            lambda: create_checkpoint_callback(
                dir_name="skim_lit/checkpoint",
                experiment_name=experiment_name,
                monitor="val_accuracy",
            ),
        ),
        (
            use_early_stopping,
            lambda: tf.keras.callbacks.EarlyStopping(
                monitor="val_accuracy",
                patience=patience,
                restore_best_weights=True,
            ),
        ),
    )
    return [build() for enabled, build in switchboard if enabled]

# Modelling

## Model s data preparation:

### Get and Prepare data

In [None]:
# Training split of the 1_percent subset: sentences and labels as arrays.
train_data_1 = pd.read_csv(f"{DIR_NAME_1}/train.csv")
train_sentences = train_data_1["text"].to_numpy()
train_labels = train_data_1["target"].to_numpy()

# Validation split of the same subset.
val_data_1 = pd.read_csv(f"{DIR_NAME_1}/val.csv")
val_sentences = val_data_1["text"].to_numpy()
val_labels = val_data_1["target"].to_numpy()

# The test arrays are copies of the ones loaded earlier, so those originals
# are not aliased by the working names used from here on.
test_sentences, test_labels = test_sentences_.copy(), test_labels_.copy()

### One-hot encode labels

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense (non-sparse) one-hot encoder. It is fitted on the training labels only
# and then reused, unchanged, for the validation and test labels.
one_hot_encoder = OneHotEncoder(sparse_output=False)
# reshape(-1, 1) turns each 1-D label array into a single-column 2-D array
# before it is handed to the encoder.
train_labels_one_hot = one_hot_encoder.fit_transform(train_labels.reshape(-1, 1))
val_labels_one_hot = one_hot_encoder.transform(val_labels.reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.transform(test_labels.reshape(-1, 1))

# Cell output: the encoded training labels.
train_labels_one_hot

In [None]:
# Categories learned by the encoder for its single input column, in the same
# order as the one-hot columns.
class_names = list(one_hot_encoder.categories_[0])
# Number of distinct classes.
classes_count = len(class_names) # type: ignore
# Cell output: the list of class names.
class_names

In [None]:
import string

# Character-level view of each sentence: every character (spaces included)
# separated from its neighbours by a single space.
train_chars = [" ".join(list(sentence)) for sentence in train_sentences]
val_chars = [" ".join(list(sentence)) for sentence in val_sentences]
test_chars = [" ".join(list(sentence)) for sentence in test_sentences]

# Number of character tokens per training sentence.
# str.split() with no argument collapses whitespace runs, so the result is the
# number of non-whitespace characters - what a whitespace-splitting tokenizer
# sees. Splitting on ' ' instead yields empty strings around every original
# space character, which counted each of those spaces as an extra token.
char_lens = [len(sentence.split()) for sentence in train_chars]

# Sequence length that covers 90% of the training sentences.
seq_char_len = int(np.percentile(char_lens, 90))

alphabet = string.ascii_lowercase + string.digits + string.punctuation

# Alphabet size plus two extra slots - presumably for the padding and
# out-of-vocabulary tokens; TODO confirm against the consumer of this value.
NUM_CHAR_TOKENS = len(alphabet) + 2
seq_char_len, NUM_CHAR_TOKENS, alphabet

## Model s 55

In [None]:
import string

# Characters removed from every sentence: lowercase ASCII letters and the space.
_dropped_chars = string.ascii_lowercase + ' '


def _keep_other_chars(sentence):
    """Return the characters of `sentence` not in `_dropped_chars`, joined by single spaces."""
    return ' '.join(ch for ch in sentence if ch not in _dropped_chars)


train_chars_puntuations = [_keep_other_chars(sentence) for sentence in train_sentences]
val_chars_puntuations = [_keep_other_chars(sentence) for sentence in val_sentences]
test_chars_puntuations = [_keep_other_chars(sentence) for sentence in test_sentences]

# Pieces per training sentence when split on ' ': one more than the number of
# separators (an empty string therefore counts as 1).
sentence_lengths = [joined.count(' ') + 1 for joined in train_chars_puntuations]

# 95th-percentile length, truncated down to a multiple of 4.
seq_char_punctuation_len = 4 * int(np.percentile(sentence_lengths, 95) / 4)

# Vocabulary is learned from the training split only; text is lowercased and
# nothing else is stripped.
char_vectorizer = tf.keras.layers.TextVectorization(
    standardize='lower',
    output_sequence_length=seq_char_punctuation_len,
    max_tokens=None,
)
char_vectorizer.adapt(train_chars_puntuations)


def _char_token_ids(texts):
    """Run `char_vectorizer` over `texts` and return the token ids as a NumPy array."""
    return np.array(char_vectorizer(np.array(texts)))


train_chars_puntuations_vectorized = _char_token_ids(train_chars_puntuations)
val_chars_puntuations_vectorized = _char_token_ids(val_chars_puntuations)
test_chars_puntuations_vectorized = _char_token_ids(test_chars_puntuations)

# Identity matrix used as a lookup table: row i is the one-hot vector of token id i.
one_hot_matrix = np.eye(len(char_vectorizer.get_vocabulary()))

# Look up one-hot rows, then drop the first two columns (vocabulary indices 0
# and 1, which TextVectorization reserves for padding and out-of-vocabulary tokens).
train_chars_puntuations_one_hot = one_hot_matrix[train_chars_puntuations_vectorized][..., 2:]
val_chars_puntuations_one_hot = one_hot_matrix[val_chars_puntuations_vectorized][..., 2:]
test_chars_puntuations_one_hot = one_hot_matrix[test_chars_puntuations_vectorized][..., 2:]
train_chars_puntuations_one_hot.shape, val_chars_puntuations_one_hot.shape, test_chars_puntuations_one_hot.shape

In [None]:
def tf_count(t, val):
    """Count the elements of tensor `t` that are equal to `val`.

    Args:
        t: Tensor to search.
        val: Value compared element-wise against `t`.

    Returns:
        Scalar tf.int32 tensor with the number of matching elements.
    """
    elements_equal_to_value = tf.equal(t, val)
    as_ints = tf.cast(elements_equal_to_value, tf.int32)
    count = tf.reduce_sum(as_ints)
    return count


# Provisional vectorizer, adapted only to obtain the training vocabulary.
text_vectorizer = tf.keras.layers.TextVectorization(output_sequence_length=50)
text_vectorizer.adapt(train_sentences)
# Skip the two reserved entries at the front of the vocabulary.
vocab = text_vectorizer.get_vocabulary()[2:]

# Standardise the corpus the same way the vectorizer above does by default
# (lowercase, strip punctuation, split on whitespace). The vocabulary is
# lowercased, so without the lowercase step any word that appears capitalised
# in the corpus would be undercounted.
standardized_sentences = tf.strings.regex_replace(
    tf.strings.lower(train_sentences),
    r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']',
    "",
)
words = tf.strings.split(standardized_sentences).flat_values

# Count every distinct word in one pass over the corpus instead of scanning
# the whole corpus once per vocabulary word.
unique_words, unique_word_index, unique_word_counts = tf.unique_with_counts(words)
corpus_counts = dict(zip(unique_words.numpy(), unique_word_counts.numpy()))
# Corpus words come back as bytes; vocabulary entries are str.
counts = {word: corpus_counts.get(word.encode("utf-8"), np.int32(0)) for word in vocab}
counts_df = pd.DataFrame(counts.items(), columns=["word", "count"])

# Vocabulary budget: number of words that occur at least 5 times.
# NOTE(review): TextVectorization's max_tokens also counts its reserved
# padding/OOV entries, so passing this value as max_tokens keeps two fewer
# real words than counted here - confirm that is intended.
max_tokens = counts_df[counts_df['count'] >= 5].shape[0]
max_tokens

In [None]:
# The three frames the positional features are read from, in train/val/test order.
# Despite the `_one_hot` suffix, the arrays below hold the raw column values;
# no encoding is applied here.
train_line_numbers_one_hot, val_line_numbers_one_hot, test_line_numbers_one_hot = (
    frame["line_number"].to_numpy() for frame in (train_data_1, val_data_1, test_df_100)
)

train_total_lines_one_hot, val_total_lines_one_hot, test_total_lines_one_hot = (
    frame["total_lines"].to_numpy() for frame in (train_data_1, val_data_1, test_df_100)
)

train_line_numbers_one_hot.shape, train_total_lines_one_hot.shape

In [None]:
def _zip_batch_prefetch(features, labels):
    """Pair a feature dataset with a label dataset, batch by 32 and prefetch."""
    paired = tf.data.Dataset.zip((features, labels))
    return paired.batch(32).prefetch(tf.data.AUTOTUNE)


# Feature tuple order for every split:
# (sentences, punctuation-char one-hot, line numbers, total lines).
train_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (train_sentences, train_chars_puntuations_one_hot, train_line_numbers_one_hot, train_total_lines_one_hot)
)
train_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_dataset = _zip_batch_prefetch(train_word_char_pos_data, train_word_char_pos_labels)

val_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (val_sentences, val_chars_puntuations_one_hot, val_line_numbers_one_hot, val_total_lines_one_hot)
)
val_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_dataset = _zip_batch_prefetch(val_word_char_pos_data, val_word_char_pos_labels)

test_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (test_sentences, test_chars_puntuations_one_hot, test_line_numbers_one_hot, test_total_lines_one_hot)
)
test_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_dataset = _zip_batch_prefetch(test_word_char_pos_data, test_word_char_pos_labels)


In [None]:
# Word-level vectorizer restricted to the max_tokens budget computed earlier;
# every sentence is padded/truncated to 50 tokens.
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=50,
)

text_vectorizer.adapt(train_sentences)

# Embedding sized to the adapted vocabulary.
embedding = tf.keras.layers.Embedding(
    input_dim=len(text_vectorizer.get_vocabulary()),
    output_dim=2048,
    mask_zero=True,
)

# --- Token branch: raw string -> token ids -> embeddings -> dense stack.
token_inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
token_vectorization = text_vectorizer(token_inputs)
token_embedding = embedding(token_vectorization)
token_merge = tf.keras.layers.Flatten()(token_embedding)
token_x = tf.keras.layers.Dense(512, activation='relu')(token_merge)
token_x = tf.keras.layers.Dense(256, activation='relu')(token_x)
token_outputs = tf.keras.layers.Dense(128, activation='relu')(token_x)
token_model = tf.keras.Model(token_inputs, token_outputs)

# --- Character branch: one-hot character sequences -> BiLSTM -> dense stack.
# The per-sample shape is read from the prepared array instead of being
# hard-coded as (20, 26); both numbers depend on the data (sequence length
# from a percentile, width from the adapted character vocabulary), so a
# literal breaks as soon as the training subset changes.
char_inputs = tf.keras.layers.Input(shape=train_chars_puntuations_one_hot.shape[1:], dtype=tf.float32, name='char_puntuation_inputs')
char_bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(char_inputs)
char_x = tf.keras.layers.Dense(256, activation='relu')(char_bi_lstm)
char_outputs = tf.keras.layers.Dense(128, activation='relu')(char_x)
char_model = tf.keras.Model(char_inputs, char_outputs)

# --- Positional branches: each scalar feature goes through a small dense layer.
line_number_inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.float32, name='line_number_inputs')
line_number_outputs = tf.keras.layers.Dense(32, activation='relu')(line_number_inputs)
line_number_model = tf.keras.Model(line_number_inputs, line_number_outputs)

total_lines_inputs = tf.keras.layers.Input(shape=(1,), name='total_lines_inputs')
total_lines_outputs = tf.keras.layers.Dense(32, activation='relu')(total_lines_inputs)
total_lines_model = tf.keras.Model(total_lines_inputs, total_lines_outputs)

# Merge token and character representations, then regularise with dropout.
token_char_concatenate = tf.keras.layers.Concatenate(name="token_char_concatenate")([token_model.output, char_model.output])

drop_out = tf.keras.layers.Dense(256, activation='relu')(token_char_concatenate)
drop_out = tf.keras.layers.Dropout(0.5)(drop_out)

# Append the positional features to the merged text representation.
final_concatenate = tf.keras.layers.Concatenate(name='final_concatenate')([drop_out, line_number_model.output, total_lines_model.output])

# One softmax unit per class found by the label encoder (previously a literal 5).
outputs = tf.keras.layers.Dense(classes_count, activation='softmax', name='output_layer')(final_concatenate)

# Input order must match the feature tuple order used to build the datasets.
model_s_55 = tf.keras.Model([token_model.input, char_model.input, line_number_model.input, total_lines_model.input], outputs)

model_s_55.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

model_s_55_history = model_s_55.fit(
    train_dataset,
    epochs=10,
    steps_per_epoch=int(len(train_dataset)),
    validation_data=val_dataset,
    validation_steps=int(len(val_dataset)),
    callbacks=get_callbacks("model_s_55"),
)

# Reload weights from the checkpoint path - presumably where the checkpoint
# callback created by get_callbacks saves them; TODO confirm against helper_funcs.
model_s_55.load_weights("skim_lit/checkpoint/model_s_55/checkpoint.ckpt")

# Validation metrics: predicted class index vs. true class index.
model_s_55_val_preds = tf.argmax(model_s_55.predict(val_dataset), axis=1)
model_s_55_val_results = calculate_results(tf.argmax(val_labels_one_hot, axis=1), model_s_55_val_preds)
print(model_s_55_val_results)

plot_loss_curves(model_s_55_history)

# Test metrics, shown as the cell output.
model_s_55_preds = tf.argmax(model_s_55.predict(test_dataset), axis=1)
model_s_55_results = calculate_results(tf.argmax(test_labels_one_hot, axis=1), model_s_55_preds)
model_s_55_results

## Model s 56

In [None]:
import string

# Characters removed from every sentence: lowercase ASCII letters and the space.
_dropped_chars = string.ascii_lowercase + ' '


def _keep_other_chars(sentence):
    """Return the characters of `sentence` not in `_dropped_chars`, joined by single spaces."""
    return ' '.join(ch for ch in sentence if ch not in _dropped_chars)


train_chars_puntuations = [_keep_other_chars(sentence) for sentence in train_sentences]
val_chars_puntuations = [_keep_other_chars(sentence) for sentence in val_sentences]
test_chars_puntuations = [_keep_other_chars(sentence) for sentence in test_sentences]

# Pieces per training sentence when split on ' ': one more than the number of
# separators (an empty string therefore counts as 1).
sentence_lengths = [joined.count(' ') + 1 for joined in train_chars_puntuations]

# 95th-percentile length, truncated down to a multiple of 4.
seq_char_punctuation_len = 4 * int(np.percentile(sentence_lengths, 95) / 4)

# Vocabulary is learned from the training split only; text is lowercased and
# nothing else is stripped.
char_vectorizer = tf.keras.layers.TextVectorization(
    standardize='lower',
    output_sequence_length=seq_char_punctuation_len,
    max_tokens=None,
)
char_vectorizer.adapt(train_chars_puntuations)


def _char_token_ids(texts):
    """Run `char_vectorizer` over `texts` and return the token ids as a NumPy array."""
    return np.array(char_vectorizer(np.array(texts)))


train_chars_puntuations_vectorized = _char_token_ids(train_chars_puntuations)
val_chars_puntuations_vectorized = _char_token_ids(val_chars_puntuations)
test_chars_puntuations_vectorized = _char_token_ids(test_chars_puntuations)

# Identity matrix used as a lookup table: row i is the one-hot vector of token id i.
one_hot_matrix = np.eye(len(char_vectorizer.get_vocabulary()))

# Look up one-hot rows, then drop the first two columns (vocabulary indices 0
# and 1, which TextVectorization reserves for padding and out-of-vocabulary tokens).
train_chars_puntuations_one_hot = one_hot_matrix[train_chars_puntuations_vectorized][..., 2:]
val_chars_puntuations_one_hot = one_hot_matrix[val_chars_puntuations_vectorized][..., 2:]
test_chars_puntuations_one_hot = one_hot_matrix[test_chars_puntuations_vectorized][..., 2:]
train_chars_puntuations_one_hot.shape, val_chars_puntuations_one_hot.shape, test_chars_puntuations_one_hot.shape

In [None]:
def tf_count(t, val):
    """Count the elements of tensor `t` that are equal to `val`.

    Args:
        t: Tensor to search.
        val: Value compared element-wise against `t`.

    Returns:
        Scalar tf.int32 tensor with the number of matching elements.
    """
    elements_equal_to_value = tf.equal(t, val)
    as_ints = tf.cast(elements_equal_to_value, tf.int32)
    count = tf.reduce_sum(as_ints)
    return count


# Provisional vectorizer, adapted only to obtain the training vocabulary.
text_vectorizer = tf.keras.layers.TextVectorization(output_sequence_length=50)
text_vectorizer.adapt(train_sentences)
# Skip the two reserved entries at the front of the vocabulary.
vocab = text_vectorizer.get_vocabulary()[2:]

# Standardise the corpus the same way the vectorizer above does by default
# (lowercase, strip punctuation, split on whitespace). The vocabulary is
# lowercased, so without the lowercase step any word that appears capitalised
# in the corpus would be undercounted.
standardized_sentences = tf.strings.regex_replace(
    tf.strings.lower(train_sentences),
    r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']',
    "",
)
words = tf.strings.split(standardized_sentences).flat_values

# Count every distinct word in one pass over the corpus instead of scanning
# the whole corpus once per vocabulary word.
unique_words, unique_word_index, unique_word_counts = tf.unique_with_counts(words)
corpus_counts = dict(zip(unique_words.numpy(), unique_word_counts.numpy()))
# Corpus words come back as bytes; vocabulary entries are str.
counts = {word: corpus_counts.get(word.encode("utf-8"), np.int32(0)) for word in vocab}
counts_df = pd.DataFrame(counts.items(), columns=["word", "count"])

# Vocabulary budget: number of words that occur at least 5 times.
# NOTE(review): this value is later used as an upper bound on feature count;
# confirm consumers do not assume it is the exact width of their inputs.
max_tokens = counts_df[counts_df['count'] >= 5].shape[0]
max_tokens

In [None]:
# The three frames the positional features are read from, in train/val/test order.
# Despite the `_one_hot` suffix, the arrays below hold the raw column values;
# no encoding is applied here.
train_line_numbers_one_hot, val_line_numbers_one_hot, test_line_numbers_one_hot = (
    frame["line_number"].to_numpy() for frame in (train_data_1, val_data_1, test_df_100)
)

train_total_lines_one_hot, val_total_lines_one_hot, test_total_lines_one_hot = (
    frame["total_lines"].to_numpy() for frame in (train_data_1, val_data_1, test_df_100)
)

train_line_numbers_one_hot.shape, train_total_lines_one_hot.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the raw sentences. The vocabulary is capped at
# max_tokens features and learned from the training split only; validation
# and test sentences are transformed with that fitted vocabulary.
# NOTE(review): max_features is an upper bound, so the resulting matrices may
# have fewer than max_tokens columns - verify the width any consumer expects.
vectorizer = TfidfVectorizer(max_features=max_tokens)
# .toarray() converts each sparse result into a dense NumPy array.
train_sentences_vectorized = vectorizer.fit_transform(train_sentences).toarray()
val_sentences_vectorized = vectorizer.transform(val_sentences).toarray()
test_sentences_vectorized = vectorizer.transform(test_sentences).toarray()


In [None]:
def _zip_batch_prefetch(features, labels):
    """Pair a feature dataset with a label dataset, batch by 32 and prefetch."""
    paired = tf.data.Dataset.zip((features, labels))
    return paired.batch(32).prefetch(tf.data.AUTOTUNE)


# Feature tuple order for every split:
# (TF-IDF sentence vectors, punctuation-char one-hot, line numbers, total lines).
train_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (train_sentences_vectorized, train_chars_puntuations_one_hot, train_line_numbers_one_hot, train_total_lines_one_hot)
)
train_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_dataset = _zip_batch_prefetch(train_word_char_pos_data, train_word_char_pos_labels)

val_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (val_sentences_vectorized, val_chars_puntuations_one_hot, val_line_numbers_one_hot, val_total_lines_one_hot)
)
val_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_dataset = _zip_batch_prefetch(val_word_char_pos_data, val_word_char_pos_labels)

test_word_char_pos_data = tf.data.Dataset.from_tensor_slices(
    (test_sentences_vectorized, test_chars_puntuations_one_hot, test_line_numbers_one_hot, test_total_lines_one_hot)
)
test_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_dataset = _zip_batch_prefetch(test_word_char_pos_data, test_word_char_pos_labels)


In [None]:


# --- Token branch: dense TF-IDF vector -> dense stack.
# The input width is read from the TF-IDF matrix itself. TfidfVectorizer's
# max_features is only an upper bound, so assuming exactly max_tokens columns
# fails whenever the fitted vocabulary is smaller.
token_inputs = tf.keras.layers.Input(shape=(train_sentences_vectorized.shape[1],), dtype=tf.float32)
token_x = tf.keras.layers.Dense(512, activation='relu')(token_inputs)
token_x = tf.keras.layers.Dense(256, activation='relu')(token_x)
token_outputs = tf.keras.layers.Dense(128, activation='relu')(token_x)
token_model = tf.keras.Model(token_inputs, token_outputs)

# --- Character branch: one-hot character sequences -> BiLSTM -> dense stack.
# The per-sample shape is read from the prepared array instead of being
# hard-coded as (20, 26); both numbers depend on the data (sequence length
# from a percentile, width from the adapted character vocabulary).
char_inputs = tf.keras.layers.Input(shape=train_chars_puntuations_one_hot.shape[1:], dtype=tf.float32, name='char_puntuation_inputs')
char_bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(char_inputs)
char_x = tf.keras.layers.Dense(256, activation='relu')(char_bi_lstm)
char_outputs = tf.keras.layers.Dense(128, activation='relu')(char_x)
char_model = tf.keras.Model(char_inputs, char_outputs)

# --- Positional branches: each scalar feature goes through a small dense layer.
line_number_inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.float32, name='line_number_inputs')
line_number_outputs = tf.keras.layers.Dense(32, activation='relu')(line_number_inputs)
line_number_model = tf.keras.Model(line_number_inputs, line_number_outputs)

total_lines_inputs = tf.keras.layers.Input(shape=(1,), name='total_lines_inputs')
total_lines_outputs = tf.keras.layers.Dense(32, activation='relu')(total_lines_inputs)
total_lines_model = tf.keras.Model(total_lines_inputs, total_lines_outputs)

# Merge token and character representations, then regularise with dropout.
token_char_concatenate = tf.keras.layers.Concatenate(name="token_char_concatenate")([token_model.output, char_model.output])

drop_out = tf.keras.layers.Dense(256, activation='relu')(token_char_concatenate)
drop_out = tf.keras.layers.Dropout(0.5)(drop_out)

# Append the positional features to the merged text representation.
final_concatenate = tf.keras.layers.Concatenate(name='final_concatenate')([drop_out, line_number_model.output, total_lines_model.output])

# One softmax unit per class found by the label encoder (previously a literal 5).
outputs = tf.keras.layers.Dense(classes_count, activation='softmax', name='output_layer')(final_concatenate)

# Input order must match the feature tuple order used to build the datasets.
model_s_56 = tf.keras.Model([token_model.input, char_model.input, line_number_model.input, total_lines_model.input], outputs)

model_s_56.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=["accuracy"],
)

model_s_56_history = model_s_56.fit(
    train_dataset,
    epochs=10,
    steps_per_epoch=int(len(train_dataset)),
    validation_data=val_dataset,
    validation_steps=int(len(val_dataset)),
    callbacks=get_callbacks("model_s_56"),
)

# Reload weights from the checkpoint path - presumably where the checkpoint
# callback created by get_callbacks saves them; TODO confirm against helper_funcs.
model_s_56.load_weights("skim_lit/checkpoint/model_s_56/checkpoint.ckpt")

# Validation metrics: predicted class index vs. true class index.
model_s_56_val_preds = tf.argmax(model_s_56.predict(val_dataset), axis=1)
model_s_56_val_results = calculate_results(tf.argmax(val_labels_one_hot, axis=1), model_s_56_val_preds)
print(model_s_56_val_results)

plot_loss_curves(model_s_56_history)

# Test metrics, shown as the cell output.
model_s_56_preds = tf.argmax(model_s_56.predict(test_dataset), axis=1)
model_s_56_results = calculate_results(tf.argmax(test_labels_one_hot, axis=1), model_s_56_preds)
model_s_56_results