Hyperparameters

In [41]:
# Batch sizes to sweep: the powers of two from 2**3 (8) up to 2**8 (256).
BATCH_SIZE = [2 ** exponent for exponent in range(3, 9)]
# Value passed as `epochs` to every training run.
EPOCHS = 1

Install Keras Packages

In [37]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade

# This sample uses Keras Core, the multi-backend version of Keras.
# The selected backend is TensorFlow (other supported backends are 'jax' and 'torch')
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'



Imports and directories

In [38]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import keras_core as keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import time

# Record the library versions in the cell output so a run can be traced back
# to the environment that produced it.
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

# Dataset locations: a `data` directory one level above the working directory.
DIR_DATA = os.path.join(os.pardir, "data")
PATH_DATA_TRAIN, PATH_DATA_TEST = (
    os.path.join(DIR_DATA, file_name)
    for file_name in ("train.csv", "test-labeled.csv")
)

TensorFlow version: 2.17.0-dev20240310
Keras version: 0.1.7


Load disaster tweets

In [39]:
# Read the training and the labelled test CSV files into DataFrames.
df_train = pd.read_csv(PATH_DATA_TRAIN)
df_test = pd.read_csv(PATH_DATA_TEST)

# Summarise each frame: its dimensions and its memory footprint
# (bytes divided by 1024**2, printed with two decimals).
print(f'Training Set Shape = {df_train.shape}')
print(f'Training Set Memory Usage = {df_train.memory_usage().sum() / 1024**2:.2f} MB')
print(f'Test Set Shape = {df_test.shape}')
print(f'Test Set Memory Usage = {df_test.memory_usage().sum() / 1024**2:.2f} MB')

# Last expression of the cell, so the notebook renders the first rows.
df_train.head()

Training Set Shape = (65, 5)
Training Set Memory Usage = 0.00 MB
Test Set Shape = (19, 5)
Test Set Memory Usage = 0.00 MB


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Preprocess the data

Stable parameters

In [40]:
# Row count of the full labelled training frame (before any validation split).
NUM_TRAINING_EXAMPLES = len(df_train)

# VAL_SPLIT is the fraction handed to train_test_split as `test_size`;
# TRAIN_SPLIT is its complement.
TRAIN_SPLIT, VAL_SPLIT = 0.8, 0.2

# tf.data's AUTOTUNE constant, kept under a short alias.
AUTO = tf.data.experimental.AUTOTUNE

Split training and validation data, and select the test data

In [42]:
from sklearn.model_selection import train_test_split

# Input text and label columns of the labelled training frame.
X, y = df_train["text"], df_train["target"]

# Hold out VAL_SPLIT of the rows for validation; the fixed seed keeps the
# split identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_SPLIT,
    random_state=42,
)

# The separately loaded test frame is used as-is, without further splitting.
X_test, Y_test = df_test["text"], df_test["target"]

Train with validation data and evaluate on the test data

In [43]:
class ConfusionMatrix:
    """Binary-classification confusion matrix with derived metrics.

    Holds the four cell counts and derives precision, recall and F1 from them.
    The counts start out as whatever numeric values are passed in and become
    floats after ``div`` / ``average``.
    """

    tp: int  # true positives
    fp: int  # false positives
    tn: int  # true negatives
    fn: int  # false negatives

    def __init__(self, tp: int = 0, tn: int = 0, fp: int = 0, fn: int = 0):
        """Store the four counts.

        Note the positional order ``(tp, tn, fp, fn)``; it differs from the
        attribute declaration order above.
        """
        self.tp = tp
        self.fp = fp
        self.tn = tn
        self.fn = fn

    def __str__(self):
        """Return ``total,tp,tn,fp,fn,precision,recall,f1`` as a CSV fragment."""
        total = self.tp + self.fp + self.tn + self.fn
        return f'{total},{self.tp},{self.tn},{self.fp},{self.fn},{self.get_precision()},{self.get_recall()},{self.get_f1()}'

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    def get_f1(self):
        """Return the harmonic mean of precision and recall.

        Returns 0.0 when precision and recall are both zero, instead of
        dividing by zero.
        """
        precision = self.get_precision()
        recall = self.get_recall()
        if precision + recall == 0:
            return 0.0
        return 2 * ((precision * recall) / (precision + recall))

    def get_precision(self):
        """Return ``tp / (tp + fp)``; 0.0 when nothing was predicted positive."""
        return self._safe_ratio(self.tp, self.tp + self.fp)

    def get_recall(self):
        """Return ``tp / (tp + fn)``; 0.0 when there are no actual positives."""
        return self._safe_ratio(self.tp, self.tp + self.fn)

    def add(self, rhs: 'ConfusionMatrix'):
        """Add the counts of ``rhs`` to this matrix in place."""
        self.tp += rhs.tp
        self.fp += rhs.fp
        self.tn += rhs.tn
        self.fn += rhs.fn

    def div(self, divisor):
        """Divide every count by ``divisor`` in place (true division).

        A zero ``divisor`` is a caller error and is not guarded against here.
        """
        self.tp /= divisor
        self.fp /= divisor
        self.tn /= divisor
        self.fn /= divisor

    @classmethod
    def average(cls, matrices: list['ConfusionMatrix']):
        """Return a new matrix holding the element-wise mean of ``matrices``.

        An empty list yields an all-zero matrix rather than a division by zero.
        """
        # Build through ``cls`` so subclasses get an instance of their own type.
        result = cls()
        if not matrices:
            return result

        for cm in matrices:
            result.add(cm)
        result.div(len(matrices))

        return result
        

In [44]:
from keras.optimizers import Adam

def trainAndValidate(_batch_size, _epochs):
    """Fine-tune a DistilBERT classifier and append its test results to a file.

    Builds a fresh classifier from the ``distil_bert_base_en_uncased`` preset,
    fits it on ``X_train`` / ``y_train`` with ``X_val`` / ``y_val`` as
    validation data, predicts on ``X_test`` and appends one line to
    ``results-keras-test.txt`` in the form
    ``batch_size,epochs,<ConfusionMatrix fields>,training_seconds,prediction_seconds``.

    The timings come from ``time.process_time_ns`` (process CPU time, not
    wall-clock time). The training figure also covers building and compiling
    the model, because the timer starts before the preset is loaded.

    Args:
        _batch_size: Batch size passed to ``fit``.
        _epochs: Number of epochs passed to ``fit``.
    """
    # process_time_ns() reports nanoseconds; one second is 10**9 of them.
    ns_per_second = 10**9

    time_train_start = time.process_time_ns()

    # Load a DistilBERT model.
    preset = "distil_bert_base_en_uncased"

    # Use a shorter sequence length.
    preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
        preset,
        sequence_length=160,
        name="preprocessor_4_tweets",
    )

    # Pretrained classifier with a two-class head.
    classifier = keras_nlp.models.DistilBertClassifier.from_preset(
        preset,
        preprocessor=preprocessor,
        num_classes=2,
    )

    # The loss is configured with from_logits=True, i.e. it expects raw
    # logits from the classifier rather than probabilities.
    classifier.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=Adam(1e-5),
        metrics=["accuracy"],
    )

    # Fit
    classifier.fit(
        x=X_train,
        y=y_train,
        batch_size=_batch_size,
        epochs=_epochs,
        validation_data=(X_val, y_val),
    )

    time_train_end = time.process_time_ns()

    y_pred_test = classifier.predict(X_test)

    time_pred_end = time.process_time_ns()

    time_training = (time_train_end - time_train_start) / ns_per_second
    time_prediction = (time_pred_end - time_train_end) / ns_per_second

    # Pin the label set so the matrix is always 2x2, even when only one class
    # occurs in both the labels and the predictions; otherwise ravel() yields
    # fewer than four values and the unpacking fails.
    predicted_labels = np.argmax(y_pred_test, axis=1)
    tn, fp, fn, tp = confusion_matrix(Y_test, predicted_labels, labels=[0, 1]).ravel()

    with open('results-keras-test.txt', 'a') as fout:
        fout.write(f'{_batch_size},{_epochs},')
        fout.write(str(ConfusionMatrix(tp, tn, fp, fn)) + ',')
        fout.write(f'{time_training},{time_prediction}\n')


In [45]:
# Run one full train/evaluate cycle per configured batch size; every call
# appends its own line to the results file.
for batch_size in BATCH_SIZE:
    trainAndValidate(_batch_size=batch_size, _epochs=EPOCHS)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 2s/step - accuracy: 0.4543 - loss: 0.6949 - val_accuracy: 0.3846 - val_loss: 0.6945
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
