In [21]:
!wget https://raw.githubusercontent.com/mufaddalhamidofficial/tensorflow_course/main/helper_funcs.py
!wget https://media.githubusercontent.com/media/mufaddalhamidofficial/skimlit_ai/main/data.zip

zsh:1: command not found: wget
zsh:1: command not found: wget


In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from helper_funcs import create_tensorboard_callback, create_checkpoint_callback, plot_loss_curves, compare_historys, unzip_data, calculate_results
import tensorflow_hub as hub
import io

In [3]:
tf.__version__

'2.15.0'

In [24]:
unzip_data("data.zip")
!rm -rf data.zip

In [4]:
DIR_NAME_01 = "data/01_percent"
DIR_NAME_1 = "data/1_percent"
DIR_NAME_10 = "data/10_percent"
DIR_NAME_100 = "data/100_percent"

test_df_100 = pd.read_csv(DIR_NAME_100 + "/test.csv")
test_df_100 = test_df_100.iloc[:, 1:]

test_sentences_ = test_df_100.text.to_numpy()
test_labels_ = test_df_100.target.to_numpy()

## Callbacks:

In [5]:
def get_callbacks(experiment_name, use_early_stopping = True, patience=10, use_tensorboard = True, use_model_checkpoint = False):
    callbacks = [
    ]
    if use_tensorboard:
        callbacks.append(
            create_tensorboard_callback(
                dir_name="skim_lit/tensorboard",
                experiment_name=experiment_name,
            )
        )
    if use_model_checkpoint:
        callbacks.append(
            create_checkpoint_callback(
                dir_name="skim_lit/checkpoint",
                experiment_name=experiment_name,
                monitor="val_accuracy",
            )
        )
    if use_early_stopping:
        
        callbacks.append(tf.keras.callbacks.EarlyStopping(
            monitor="val_accuracy",
            patience=patience,
            restore_best_weights=True,
        ))
    return callbacks

# Modelling

## Model s Data Preps:

### Get and Prepare data

In [6]:
train_data_1 = pd.read_csv(DIR_NAME_1 + "/train.csv")
train_sentences = train_data_1.text.to_numpy()
train_labels = train_data_1.target.to_numpy()

val_data_1 = pd.read_csv(DIR_NAME_1 + "/val.csv")
val_sentences = val_data_1.text.to_numpy()
val_labels = val_data_1.target.to_numpy()

test_sentences = test_labels_.copy()
test_labels = test_labels_.copy()

### One hot encode labels

In [7]:
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(sparse_output=False)
train_labels_one_hot = one_hot_encoder.fit_transform(train_labels.reshape(-1, 1))
val_labels_one_hot = one_hot_encoder.transform(val_labels.reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.transform(test_labels.reshape(-1, 1))

train_labels_one_hot

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [8]:
class_names = one_hot_encoder.categories_[0]
classes_count = len(class_names) # type: ignore
class_names

array(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'],
      dtype=object)

In [9]:
import string

train_chars = [" ".join(list(sentence)) for sentence in train_sentences]
val_chars = [" ".join(list(sentence)) for sentence in val_sentences]
test_chars = [" ".join(list(sentence)) for sentence in test_sentences]

char_lens = [len(sentence) for sentence in train_sentences]

seq_char_len = int(np.percentile(char_lens, 90))

alphabet = string.ascii_lowercase + string.digits + string.punctuation

NUM_CHAR_TOKENS = len(alphabet) + 2
seq_char_len, NUM_CHAR_TOKENS, alphabet

(243,
 70,
 'abcdefghijklmnopqrstuvwxyz0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

## Model s 6

In [10]:
train_pos_data = train_data_1["line_number"].to_numpy() / train_data_1["total_lines"].to_numpy()
train_pos_rounded = ((train_pos_data * 20).round(0)/20)
train_pos_one_hot = tf.one_hot(train_pos_rounded, depth=20)

val_pos_data = val_data_1["line_number"].to_numpy() / val_data_1["total_lines"].to_numpy()
val_pos_rounded = ((val_pos_data * 20).round(0)/20)
val_pos_one_hot = tf.one_hot(val_pos_rounded, depth=20)

test_pos_data = test_df_100["line_number"].to_numpy() / test_df_100["total_lines"].to_numpy()
test_pos_rounded = ((test_pos_data * 20).round(0)/20)
test_pos_one_hot = tf.one_hot(test_pos_rounded, depth=20)


2024-02-03 13:55:23.527838: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-02-03 13:55:23.527863: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-02-03 13:55:23.527868: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-02-03 13:55:23.527919: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-03 13:55:23.527945: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [11]:
train_word_char_pos_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars, train_pos_one_hot))
train_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_dataset = tf.data.Dataset.zip((train_word_char_pos_data, train_word_char_pos_labels)).batch(32).prefetch(tf.data.AUTOTUNE)

val_word_char_pos_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars, val_pos_one_hot))
val_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_dataset = tf.data.Dataset.zip((val_word_char_pos_data, val_word_char_pos_labels)).batch(32).prefetch(tf.data.AUTOTUNE)

test_word_char_pos_data = tf.data.Dataset.from_tensor_slices((test_sentences, test_chars, test_pos_one_hot))
test_word_char_pos_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_dataset = tf.data.Dataset.zip((test_word_char_pos_data, test_word_char_pos_labels)).batch(32).prefetch(tf.data.AUTOTUNE)


In [12]:
hub_embedding = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
    trainable=False,
    name="universal_sentence_encoder",
)
char_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=NUM_CHAR_TOKENS,
    output_sequence_length=seq_char_len,
)

char_vectorizer.adapt(train_chars)

char_embedding = tf.keras.layers.Embedding(
    input_dim=len(char_vectorizer.get_vocabulary()),
    output_dim=25,
    mask_zero=True,
)

token_inputs = tf.keras.layers.Input(shape=[], dtype='string', name='token_inputs')
token_embedding = hub_embedding(token_inputs)
token_outputs = tf.keras.layers.Dense(128, activation='relu')(token_embedding)
token_model = tf.keras.Model(token_inputs, token_outputs)

char_inputs = tf.keras.layers.Input(shape=(1,), dtype='string', name='char_inputs')
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embedding(char_vectors)
char_bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(char_inputs, char_bi_lstm)


pos_inputs = tf.keras.layers.Input(shape=(20,), name='pos_inputs')
pos_outputs = tf.keras.layers.Dense(32, activation='relu')(pos_inputs)
pos_model = tf.keras.Model(pos_inputs, pos_outputs)

final_concatenate = tf.keras.layers.Concatenate(name='final_concatenate')([token_model.output, char_model.output, pos_model.output])

x = tf.keras.layers.Dense(32, activation='softmax', name='preoutput_dense_layer')(final_concatenate)
outputs = tf.keras.layers.Dense(classes_count, activation='softmax', name='output_layer')(x)

model_s_6 = tf.keras.Model([token_model.input, char_model.input, pos_model.input], outputs)

model_s_6.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

model_s_6_history = model_s_6.fit(
    train_dataset,
    epochs=10,
    steps_per_epoch=int(len(train_dataset)),
    validation_data=val_dataset,
    validation_steps=int(len(val_dataset)),
    callbacks=get_callbacks("model_s_6"),
)

model_s_6_val_preds = tf.argmax(model_s_6.predict(val_dataset), axis=1)
model_s_6_val_results = calculate_results(tf.argmax(val_labels_one_hot, axis=1), model_s_6_val_preds)
print(model_s_6_val_results)

plot_loss_curves(model_s_6_history)

model_s_6_preds = tf.argmax(model_s_6.predict(test_dataset), axis=1)
model_s_6_results = calculate_results(tf.argmax(test_labels_one_hot, axis=1), model_s_6_preds)
model_s_6_results

2024-02-03 13:55:28.148048: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


## Model s 7

In [None]:
train_pos_data = train_data_1["line_number"].to_numpy() / train_data_1["total_lines"].to_numpy()
train_pos_rounded = ((train_pos_data * 20).round(0)/20)
train_pos_one_hot = tf.one_hot(train_pos_rounded, depth=20)

val_pos_data = val_data_1["line_number"].to_numpy() / val_data_1["total_lines"].to_numpy()
val_pos_rounded = ((val_pos_data * 20).round(0)/20)
val_pos_one_hot = tf.one_hot(val_pos_rounded, depth=20)

test_pos_data = test_df_100["line_number"].to_numpy() / test_df_100["total_lines"].to_numpy()
test_pos_rounded = ((test_pos_data * 20).round(0)/20)
test_pos_one_hot = tf.one_hot(test_pos_rounded, depth=20)


2024-02-03 13:55:23.527838: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-02-03 13:55:23.527863: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-02-03 13:55:23.527868: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-02-03 13:55:23.527919: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-03 13:55:23.527945: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
train_word_pos_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_pos_one_hot))
train_word_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_dataset = tf.data.Dataset.zip((train_word_pos_data, train_word_pos_labels)).batch(32).prefetch(tf.data.AUTOTUNE)

val_word_pos_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_pos_one_hot))
val_word_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_dataset = tf.data.Dataset.zip((val_word_pos_data, val_word_pos_labels)).batch(32).prefetch(tf.data.AUTOTUNE)

test_word_pos_data = tf.data.Dataset.from_tensor_slices((test_sentences, test_pos_one_hot))
test_word_pos_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_dataset = tf.data.Dataset.zip((test_word_pos_data, test_word_pos_labels)).batch(32).prefetch(tf.data.AUTOTUNE)


In [None]:

# Open a strategy scope.
hub_embedding = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2",
    trainable=False,
    name="universal_sentence_encoder",
)

token_inputs = tf.keras.layers.Input(shape=[], dtype='string', name='token_inputs')
token_embedding = hub_embedding(token_inputs)
token_outputs = tf.keras.layers.Dense(128, activation='relu')(token_embedding)
token_model = tf.keras.Model(token_inputs, token_outputs)

pos_inputs = tf.keras.layers.Input(shape=(20,), name='pos_inputs')
pos_outputs = tf.keras.layers.Dense(32, activation='relu')(pos_inputs)
pos_model = tf.keras.Model(pos_inputs, pos_outputs)

final_concatenate = tf.keras.layers.Concatenate(name='final_concatenate')([token_model.output, pos_model.output])

x = tf.keras.layers.Dense(32, activation='softmax', name='preoutput_dense_layer')(final_concatenate)
outputs = tf.keras.layers.Dense(classes_count, activation='softmax', name='output_layer')(x)

model_s_7 = tf.keras.Model([token_model.input, pos_model.input], outputs)

model_s_7.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

model_s_7_history = model_s_7.fit(
    train_dataset,
    epochs=10,
    steps_per_epoch=int(len(train_dataset)),
    validation_data=val_dataset,
    validation_steps=int(len(val_dataset)),
    callbacks=get_callbacks("model_s_7"),
)

model_s_7_val_preds = tf.argmax(model_s_7.predict(val_dataset), axis=1)
model_s_7_val_results = calculate_results(tf.argmax(val_labels_one_hot, axis=1), model_s_7_val_preds)
print(model_s_7_val_results)

plot_loss_curves(model_s_7_history)

model_s_7_preds = tf.argmax(model_s_7.predict(test_dataset), axis=1)
model_s_7_results = calculate_results(tf.argmax(test_labels_one_hot, axis=1), model_s_7_preds)
model_s_7_results

2024-02-03 13:55:28.148048: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
