In [1]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_core as keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping



2024-12-16 11:57:39.307676: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-16 11:57:40.186306: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /gpfs/space/software/cluster_software/spack/linux-centos7-x86_64/gcc-9.2.0/cudnn-8.2.0.53-11.3-3qb34ykh5rffuzu6j3rl4tm6yarrjg7w/lib64:/gpfs/space/software/cluster_software/spack/linux-centos7-x86_64/gcc-9.2.0/cuda-11.3.1-oqzddj7nezymwww6ennwec7qb6kktktw/lib64::/usr/local/cuda-12.6/lib64
2024-12-16 11:57:40.186474: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

Using TensorFlow backend


ModuleNotFoundError: No module named 'nltk'

In [8]:
# ---------------------------------------------------------------------------
# Data pipeline: read train.csv once, fit the categorical vocabularies, carve
# out a fixed validation split and build the model-ready train/val datasets.
# ---------------------------------------------------------------------------
batch_size = 16
split_seed = 42  # fixed so the train/validation partition is reproducible

# First load full dataset.
# The file is read exactly once (num_epochs=1) and in file order
# (shuffle=False): repeating or reshuffling at this stage would put the same
# rows on both sides of the take/skip split below.
train_full = tf.data.experimental.make_csv_dataset(
    "train.csv",
    batch_size=batch_size,
    label_name='target',
    select_columns=['text', 'keyword', 'location', 'target'],
    num_epochs=1,
    shuffle=False,
)
print("Started")

# Create preprocessors
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    "distil_bert_base_en_uncased",
    sequence_length=160
)

# `keras` is keras_core in this notebook, so the callback is taken from the
# same package that builds and trains the model.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    min_delta=0.001,
    mode='min',
    restore_best_weights=True
)

keyword_lookup = keras.layers.StringLookup(output_mode="multi_hot")
location_lookup = keras.layers.StringLookup(output_mode="multi_hot")

# Adapt lookups
keyword_data = train_full.map(lambda x, y: x['keyword'])
location_data = train_full.map(lambda x, y: x['location'])
keyword_lookup.adapt(keyword_data)
location_lookup.adapt(location_data)

# Vocabulary sizes define the width of the categorical model inputs.
keyword_size = len(keyword_lookup.get_vocabulary())
location_size = len(location_lookup.get_vocabulary())

# Split data.
# All sizes below are counted in *batches*, because train_full yields batches.
# The dataset is walked a single time to obtain the count.
num_batches = sum(1 for _ in train_full)
val_size = num_batches // 5
train_size = num_batches - val_size
steps_per_epoch = train_size  # one epoch == one full pass over the train split

# Shuffle once with a fixed seed and reshuffle_each_iteration=False so that
# every iteration sees the same order; only then do take() and skip() select
# disjoint, stable sets of batches.
train_data = train_full.shuffle(
    buffer_size=max(num_batches, 1),
    seed=split_seed,
    reshuffle_each_iteration=False,
)
val_ds = train_data.take(val_size)
train_ds = train_data.skip(val_size)


def prepare_data(features, label):
    """Convert one raw CSV batch into the input dict expected by the model.

    Args:
        features: mapping with 'text', 'keyword' and 'location' string
            tensors; assumed to have shape (batch,) as produced by
            make_csv_dataset.
        label: target tensor for the batch, returned unchanged.

    Returns:
        A ``(features_dict, label)`` tuple where ``features_dict`` holds the
        preprocessor's 'token_ids' and 'padding_mask' plus
        'keyword_encoding' / 'location_encoding' of shape
        (batch, 1, vocabulary_size).
    """
    text_preprocessed = preprocessor(features['text'])

    # With output_mode="multi_hot" the lookup treats the last axis as one
    # sample, so a (batch, 1) input yields one vocabulary-sized row per
    # example in a single call (no per-element tf.map_fn needed).
    keyword_encoded = tf.cast(
        keyword_lookup(tf.expand_dims(features['keyword'], axis=-1)),
        tf.float32,
    )
    location_encoded = tf.cast(
        location_lookup(tf.expand_dims(features['location'], axis=-1)),
        tf.float32,
    )

    # Add middle dimension
    keyword_encoded = tf.expand_dims(keyword_encoded, axis=1)
    location_encoded = tf.expand_dims(location_encoded, axis=1)

    features_dict = {
        'token_ids': text_preprocessed['token_ids'],
        'padding_mask': text_preprocessed['padding_mask'],
        'keyword_encoding': keyword_encoded,
        'location_encoding': location_encoded
    }
    return features_dict, label


# Apply the data preparation.
# Training batches are reshuffled on every pass and repeated indefinitely;
# steps_per_epoch tells fit() where one epoch ends.  Validation is a single
# ordered pass over its fixed set of batches.
train_ds = (
    train_ds
    .shuffle(buffer_size=max(train_size, 1))
    .repeat()
    .map(prepare_data)
    .prefetch(tf.data.AUTOTUNE)
)
val_ds = val_ds.map(prepare_data).prefetch(tf.data.AUTOTUNE)

# Create model: DistilBERT text branch + multi-hot keyword/location inputs,
# merged into a two-logit classification head.
backbone = keras_nlp.models.DistilBertBackbone.from_preset(
    "distil_bert_base_en_uncased"
)
backbone_inputs = backbone.input

# Categorical side inputs; the last axis matches the lookup vocabulary sizes.
keyword_input = keras.layers.Input(
    name="keyword_encoding",
    shape=(1, keyword_size),
    dtype=tf.float32,
)
location_input = keras.layers.Input(
    name="location_encoding",
    shape=(1, location_size),
    dtype=tf.float32,
)

# Text branch: average the backbone output over the token axis, then treat
# the pooled vector as a length-1 sequence of width 768.
text_features = backbone(backbone_inputs)
text_features = keras.layers.GlobalAveragePooling1D()(text_features)
text_features = keras.layers.Reshape((1, 768))(text_features)
text_features = keras.layers.Dense(768, activation="gelu")(text_features)

# 2 x 384 LSTM units keep the feature width at 768 so the residual Add below
# lines up with the attention output.
recurrent = keras.layers.Bidirectional(
    keras.layers.LSTM(384, return_sequences=True)
)
text_features = recurrent(text_features)

self_attention = keras.layers.MultiHeadAttention(num_heads=8, key_dim=48)
attended = self_attention(text_features, text_features)
text_features = keras.layers.Add()([text_features, attended])
text_features = keras.layers.LayerNormalization()(text_features)

# Merge text and categorical features along the feature axis and flatten.
merged = keras.layers.Concatenate(axis=-1)(
    [text_features, keyword_input, location_input]
)
merged = keras.layers.Flatten()(merged)

# Classification head: two L2-regularised GELU layers with dropout.
hidden = keras.layers.Dense(
    512,
    activation="gelu",
    kernel_regularizer=keras.regularizers.l2(0.01),
)(merged)
hidden = keras.layers.Dropout(0.3)(hidden)
hidden = keras.layers.Dense(
    256,
    activation="gelu",
    kernel_regularizer=keras.regularizers.l2(0.01),
)(hidden)
hidden = keras.layers.Dropout(0.3)(hidden)
outputs = keras.layers.Dense(2)(hidden)  # raw logits for the two classes

# The model takes the backbone's own inputs plus the two categorical inputs.
model_inputs = dict(backbone_inputs)
model_inputs['keyword_encoding'] = keyword_input
model_inputs['location_encoding'] = location_input

model = keras.Model(inputs=model_inputs, outputs=outputs)

# from_logits=True because the final Dense layer has no activation.
model.compile(
    optimizer=keras.optimizers.Adam(1e-5),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

# Train for at most 10 epochs, validating on val_ds after each one; the
# early-stopping callback may end the run sooner.
fit_options = {
    'validation_data': val_ds,
    'epochs': 10,
    'steps_per_epoch': steps_per_epoch,
    'callbacks': [early_stopping],
}
history = model.fit(train_ds, **fit_options)

print("done")

Started


NameError: name 'augment_batch' is not defined

In [7]:
# Load the test set twice: as a DataFrame (for ids and reporting) and as a
# tf.data pipeline (for inference).
test_df = pd.read_csv("test.csv")

# Replace missing values in the string columns with empty strings.
for column in ('text', 'keyword', 'location'):
    test_df[column] = test_df[column].fillna('')

test_ids = test_df['id'].tolist()

# Note: this dataset reads "test.csv" from disk on its own, not from test_df.
# shuffle=False and num_epochs=1 give a single pass in file order.
test_ds = tf.data.experimental.make_csv_dataset(
    "test.csv",
    batch_size=16,
    select_columns=['text', 'keyword', 'location'],
    label_name=None,
    num_epochs=1,
    shuffle=False,
)

def prepare_test_data(features):
    """Convert one raw, unlabeled CSV batch into the model's input dict.

    Args:
        features: mapping with 'text', 'keyword' and 'location' string
            tensors; assumed to have shape (batch,) as produced by
            make_csv_dataset.

    Returns:
        Dict with the preprocessor's 'token_ids' and 'padding_mask' plus
        'keyword_encoding' / 'location_encoding' of shape
        (batch, 1, vocabulary_size).
    """
    text_preprocessed = preprocessor(features['text'])

    # With output_mode="multi_hot" the lookup treats the last axis as one
    # sample, so a (batch, 1) input yields one vocabulary-sized row per
    # example in a single call (no per-element tf.map_fn needed).
    keyword_encoded = tf.cast(
        keyword_lookup(tf.expand_dims(features['keyword'], axis=-1)),
        tf.float32,
    )
    location_encoded = tf.cast(
        location_lookup(tf.expand_dims(features['location'], axis=-1)),
        tf.float32,
    )

    # Add middle dimension
    keyword_encoded = tf.expand_dims(keyword_encoded, axis=1)
    location_encoded = tf.expand_dims(location_encoded, axis=1)

    features_dict = {
        'token_ids': text_preprocessed['token_ids'],
        'padding_mask': text_preprocessed['padding_mask'],
        'keyword_encoding': keyword_encoded,
        'location_encoding': location_encoded
    }
    return features_dict
    
# Run inference over the test set and write the submission file.
test_ds = test_ds.map(prepare_test_data)

# Collect the predicted class (index of the larger logit) for every row,
# in dataset order.
all_predictions = []
for batch in test_ds:
    batch_preds = model.predict_on_batch(batch)
    all_predictions.extend(np.argmax(batch_preds, axis=1).tolist())

# Ids and predictions are paired by position, so their counts must match.
# Fail loudly instead of silently truncating the id list.
if len(all_predictions) != len(test_ids):
    raise ValueError(
        f"Got {len(all_predictions)} predictions for {len(test_ids)} test ids; "
        "cannot pair them by position."
    )

submission = pd.DataFrame({
    'id': test_ids,
    'target': all_predictions
})

print("Test file shape:", test_df.shape)
print("Submission shape:", submission.shape)
print("Predictions distribution:")
print(pd.Series(all_predictions).value_counts())

submission.to_csv('submission.csv', index=False)

Test file shape: (3263, 4)
Submission shape: (3263, 2)
Predictions distribution:
0    1955
1    1308
Name: count, dtype: int64
