Install Keras Packages

In [1]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade

# This sample uses Keras Core, the multi-backend version of Keras.
# The selected backend is TensorFlow (other supported backends are 'jax' and 'torch')
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'



Imports and directories

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import keras_core as keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Report the library versions this run was produced with.
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

# The data directory sits next to the notebook's parent folder.
DIR_DATA = os.path.join(os.pardir, "data")
PATH_DATA_TRAIN = os.path.join(DIR_DATA, "train.csv")
PATH_DATA_TEST = os.path.join(DIR_DATA, "test.csv")

2024-03-13 15:14:06.621337: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-13 15:14:06.626306: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-13 15:14:06.688471: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow backend
TensorFlow version: 2.17.0-dev20240310
Keras version: 0.1.7


Load disaster tweets

In [3]:
# Read the train and test CSV files.
df_train = pd.read_csv(PATH_DATA_TRAIN)
df_test = pd.read_csv(PATH_DATA_TEST)

# Memory footprint of each frame in MB (bytes / 1024**2).
train_mem_mb = df_train.memory_usage().sum() / 1024**2
test_mem_mb = df_test.memory_usage().sum() / 1024**2

print(f'Training Set Shape = {df_train.shape}')
print(f'Training Set Memory Usage = {train_mem_mb:.2f} MB')
print(f'Test Set Shape = {df_test.shape}')
print(f'Test Set Memory Usage = {test_mem_mb:.2f} MB')

# Last expression of the cell: rich preview of the first rows.
df_train.head()

Training Set Shape = (7613, 5)
Training Set Memory Usage = 0.29 MB
Test Set Shape = (3263, 4)
Test Set Memory Usage = 0.10 MB


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Preprocess the data

Stable parameters

In [4]:
# Number of rows in the training frame.
NUM_TRAINING_EXAMPLES = len(df_train)

# Fractions of the training frame used for training and for validation.
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2

# Shorthand for tf.data's AUTOTUNE constant.
AUTO = tf.data.experimental.AUTOTUNE

Hyperparameters

In [None]:
# Candidate batch sizes: the powers of two from 1 up to 128.
BATCH_SIZE = [2 ** exponent for exponent in range(8)]

# Candidate epoch counts.
EPOCHS = [
    1, 2, 4, 6,
    8, 10, 12, 16,
]

Split the labelled data into training and validation sets, and select the test inputs

In [5]:
from sklearn.model_selection import train_test_split

# Inputs are the `text` column, labels the `target` column.
X, y = df_train["text"], df_train["target"]

# Hold out a VAL_SPLIT fraction for validation; the fixed random_state keeps
# the split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_SPLIT,
    random_state=42,
)

# The test frame only contributes its `text` column.
X_test = df_test["text"]

Train the model and evaluate it on the validation data

In [None]:
class ConfusionMatrix:
    """Confusion-matrix cell counts for a binary classifier, plus derived metrics.

    Stores true/false positives and negatives and derives precision, recall
    and F1 from them. Instances are mutable: `add` and `div` update the counts
    in place, and `average` builds the element-wise mean of several matrices.
    """

    # Cell counts. They start as whatever is passed to __init__ and become
    # floats once `div` (and therefore `average`) has been applied.
    tp: float
    fp: float
    tn: float
    fn: float

    def __init__(self, tp: int = 0, tn: int = 0, fp: int = 0, fn: int = 0):
        """Create a matrix from its four counts.

        Note the positional order: tp, tn, fp, fn.
        """
        self.tp = tp
        self.fp = fp
        self.tn = tn
        self.fn = fn

    def __str__(self):
        """Return a one-line summary of the counts and derived metrics."""
        total = self.tp + self.fp + self.tn + self.fn
        return f'Total: {total:.1f}, TP: {self.tp:.1f}, TN: {self.tn:.1f}, FP: {self.fp:.1f}, FN: {self.fn:.1f}, Prc: {self.get_precision():.3f}, Rec. {self.get_recall():.3f}, F1: {self.get_f1():.3f}'

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is zero."""
        # A zero denominator means the metric is undefined (e.g. no positive
        # predictions at all); report 0.0 instead of raising or yielding NaN.
        if denominator == 0:
            return 0.0
        return numerator / denominator

    def get_f1(self):
        """Return the F1 score, 2 * P * R / (P + R); 0.0 when P + R is zero."""
        precision = self.get_precision()
        recall = self.get_recall()
        return 2 * self._ratio(precision * recall, precision + recall)

    def get_precision(self):
        """Return tp / (tp + fp); 0.0 when there are no positive predictions."""
        return self._ratio(self.tp, self.tp + self.fp)

    def get_recall(self):
        """Return tp / (tp + fn); 0.0 when there are no actual positives."""
        return self._ratio(self.tp, self.tp + self.fn)

    def add(self, rhs: 'ConfusionMatrix'):
        """Add the counts of `rhs` to this matrix in place."""
        self.tp += rhs.tp
        self.fp += rhs.fp
        self.tn += rhs.tn
        self.fn += rhs.fn

    def div(self, divisor):
        """Divide every count by `divisor` in place (true division)."""
        self.tp /= divisor
        self.fp /= divisor
        self.tn /= divisor
        self.fn /= divisor

    @classmethod
    def average(cls, matrices: list['ConfusionMatrix']):
        """Return a new matrix holding the element-wise mean of `matrices`.

        The inputs are not modified. An empty list yields an all-zero matrix
        instead of raising ZeroDivisionError.
        """
        # Use cls() so subclasses get an instance of their own type.
        result = cls()
        if not matrices:
            return result

        for cm in matrices:
            result.add(cm)
        result.div(len(matrices))

        return result
        

In [None]:
from keras.optimizers import Adam

def trainAndValidate(_batch_size, _epochs):
    """Fine-tune a DistilBERT classifier and log its validation confusion matrix.

    Trains on the notebook-level `X_train` / `y_train`, validates on
    `X_val` / `y_val`, and appends one line with the hyperparameters and the
    resulting metrics to 'results-keras.txt'.

    Args:
        _batch_size: batch size passed to `fit`.
        _epochs: number of epochs passed to `fit`.

    Returns:
        The ConfusionMatrix computed on the validation split.
    """
    # Load a DistilBERT model.
    preset = "distil_bert_base_en_uncased"

    # Use a shorter sequence length.
    preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
        preset,
        sequence_length=160,
        name="preprocessor_4_tweets",
    )

    # Pretrained classifier.
    classifier = keras_nlp.models.DistilBertClassifier.from_preset(
        preset,
        preprocessor=preprocessor,
        num_classes=2,
    )

    # Compile
    classifier.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=Adam(1e-5),
        metrics=["accuracy"],
    )

    # Fit
    classifier.fit(
        x=X_train,
        y=y_train,
        batch_size=_batch_size,
        epochs=_epochs,
        validation_data=(X_val, y_val),
    )

    # Predict with the classifier that was just fine-tuned. Re-loading the
    # preset here would throw the training away and evaluate an untrained head.
    y_pred_val = classifier.predict(X_val)

    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking below
    # cannot fail when a class is missing from the labels and predictions.
    tn, fp, fn, tp = confusion_matrix(
        y_val,
        np.argmax(y_pred_val, axis=1),
        labels=[0, 1],
    ).ravel()
    result = ConfusionMatrix(tp, tn, fp, fn)

    # One line per run; the separator and trailing newline keep appended
    # results from running into each other.
    with open('results-keras.txt', 'a') as fout:
        fout.write(f'batch_size: {_batch_size}, epochs: {_epochs}, {result}\n')

    return result


In [None]:
# Sweep over every candidate batch size, training for 2 epochs each time.
for batch_size in BATCH_SIZE:
    trainAndValidate(_batch_size=batch_size, _epochs=2)