In [1]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_core as keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

2024-12-15 18:57:14.287667: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-15 18:57:20.594852: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /gpfs/space/software/cluster_software/spack/linux-centos7-x86_64/gcc-9.2.0/cudnn-8.2.0.53-11.3-3qb34ykh5rffuzu6j3rl4tm6yarrjg7w/lib64:/gpfs/space/software/cluster_software/spack/linux-centos7-x86_64/gcc-9.2.0/cuda-11.3.1-oqzddj7nezymwww6ennwec7qb6kktktw/lib64::/usr/local/cuda-12.6/lib64
2024-12-15 18:57:20.595033: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

Using TensorFlow backend


In [2]:
BATCH_SIZE = 32
#NUM_TRAINING_EXAMPLES = df_train.shape[0]
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2
#STEPS_PER_EPOCH = int(NUM_TRAINING_EXAMPLES)*TRAIN_SPLIT // BATCH_SIZE
EPOCHS = 2
AUTO = tf.data.experimental.AUTOTUNE

In [3]:
def load_data(train = "train.csv", test = "test.csv"):
    df_train = pd.read_csv(train)
    df_test = pd.read_csv(test)

    X_train, X_val, y_train, y_val = train_test_split(df_train["text"], df_train["target"], test_size = 0.2, random_state=42)
    X_test = df_test["text"]
    test_ids = df_test["id"]

    return (X_train, y_train), (X_val, y_val), (X_test, test_ids)
    

In [4]:
def create_preprocessor(model_name: str, sequence_length: int = 160):
    if 'distil' in model_name.lower():
        return keras_nlp.models.DistilBertPreprocessor.from_preset(
            model_name,
            sequence_length=sequence_length
        )
    elif 'roberta' in model_name.lower():
        return keras_nlp.models.RobertaPreprocessor.from_preset(
            model_name,
            sequence_length=sequence_length
        )
    elif 'bert' in model_name.lower():
        return keras_nlp.models.BertPreprocessor.from_preset(
            model_name,
            sequence_length=sequence_length
        )
    else:
        raise ValueError(f"Unsupported model: {model_name}")

In [5]:
def create_backbone(model_name: str):
    if 'distil' in model_name.lower():
        return keras_nlp.models.DistilBertBackbone.from_preset(model_name)
    elif 'roberta' in model_name.lower():
        return keras_nlp.models.RobertaBackbone.from_preset(model_name)
    elif 'bert' in model_name.lower():
        return keras_nlp.models.BertBackbone.from_preset(model_name)
    else:
        raise ValueError(f"Unsupported model: {model_name}")

In [6]:
def create_custom_classifier_model(backbone, preprocessor, hidden_dims=64):
    """Create a custom classification model."""
    # Create input layers
    input_text = keras.layers.Input(shape=(), dtype=tf.string, name="text")
    
    # Preprocess the text
    preprocessed = preprocessor(input_text)
    
    # Get backbone features
    sequence_output = backbone(preprocessed)
    
    # Pool the sequence output
    pooled = keras.layers.GlobalAveragePooling1D()(sequence_output)
    
    # Add classification layers
    hidden = keras.layers.Dense(hidden_dims, activation="relu")(pooled)
    hidden = keras.layers.Dropout(0.1)(hidden)
    outputs = keras.layers.Dense(2)(hidden)
    
    # Create model
    model = keras.Model(inputs=input_text, outputs=outputs)
    return model

In [7]:
def create_default_classifier(preprocessor):
    preset = "distil_bert_base_en_uncased"
    classifier = keras_nlp.models.DistilBertClassifier.from_preset(preset,
                                                               preprocessor = preprocessor, 
                                                               num_classes=2)
    return classifier

In [8]:
def train_model(model, X_train, y_train, X_val, y_val, epochs=2, batch_size=32):
    f1 = keras.metrics.F1Score(
        threshold=0.5,
        average="macro",
        name="f1")
    # Compile model
    model.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=keras.optimizers.Adam(1e-5),
        metrics=["accuracy"]
    )
    
    # Train - note that the model will handle preprocessing internally
    # since we included the preprocessor in the model definition
    history = model.fit(
        x=X_train,
        y=y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val)
    )
    
    return history

In [9]:
def create_submission(model, X_test, test_ids, output_path="submission.csv"):
   # Convert to numpy array if needed
        
    predictions = model.predict(X_test)
    binary_predictions = np.argmax(predictions, axis=1)
    
    submission = pd.DataFrame({
        'id': test_ids,
        'target': binary_predictions
    })
    
    submission.to_csv(output_path, index=False)
    print(f"Submission saved to {output_path}")
    
    return submission

In [10]:
AVAILABLE_MODELS = {
    'distilbert': 'distil_bert_base_en_uncased',
    'bert': 'bert_base_en_uncased',
    'roberta': 'roberta_base_en',
}

def run_experiment(model_name: str):
    """Run a complete experiment with specified model."""
    print(f"\nRunning experiment with {model_name}")
    preset = AVAILABLE_MODELS[model_name]
    
    # 1. Load data
    (X_train, y_train), (X_val, y_val), (X_test, test_ids) = load_data("train.csv", "test.csv")
    print("Data loaded")
    
    # 2. Create components
    preprocessor = create_preprocessor(preset)
    #backbone = create_backbone(preset)
    model = create_default_classifier(preprocessor)
    print("Model components created")
    
    
    # 3. Train
    history = train_model(model, X_train, y_train, X_val, y_val)
    print("Training completed")
    
    # 4. Create submission
    submission = create_submission(
        model, 
        X_test, 
        test_ids, 
        f"submission_{model_name}.csv"
    )
    
    return model, history, submission

In [11]:
model, history, submission = run_experiment('distilbert')
print("\nFinal metrics:")
print(f"Val accuracy: {history.history['val_accuracy'][-1]:.4f}")
print(f"Val loss: {history.history['val_loss'][-1]:.4f}")


Running experiment with distilbert
Data loaded
Model components created
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
Epoch 1/2
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 312ms/step - accuracy: 0.7184 - loss: 0.5646 - val_accuracy: 0.8431 - val_loss: 0.3927
Epoch 2/2
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 154ms/step - accuracy: 0.8499 - loss: 0.3763 - val_accuracy: 0.8418 - val_loss: 0.3822
Training completed
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 78ms/step
Submission saved to submission_distilbert.csv

Final metrics:
Val accuracy: 0.8418
Val loss: 0.3822


In [14]:
# First load full dataset
train_full = tf.data.experimental.make_csv_dataset(
    "train.csv",
    batch_size=16,
    label_name='target',
    select_columns=['text', 'keyword', 'location', 'target'], 
    num_epochs=10
)
print("Started")
# Create preprocessors
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    "distil_bert_base_en_uncased",
    sequence_length=160
)

keyword_lookup = keras.layers.StringLookup(output_mode="multi_hot")
location_lookup = keras.layers.StringLookup(output_mode="multi_hot")

# Adapt lookups
keyword_data = train_full.map(lambda x, y: x['keyword'])
location_data = train_full.map(lambda x, y: x['location'])
keyword_lookup.adapt(keyword_data)
location_lookup.adapt(location_data)

keyword_size = len(keyword_lookup.get_vocabulary())
location_size = len(location_lookup.get_vocabulary())

# Split data
train_data = train_full.shuffle(buffer_size=1000)
val_size = len(list(train_full)) // 5
val_ds = train_data.take(val_size)
train_ds = train_data.skip(val_size)
train_size = len(list(train_full)) - val_size
batch_size = 16
steps_per_epoch = train_size // batch_size

def prepare_data(features, label):
    text_preprocessed = preprocessor(features['text'])
    
    # Use map_fn to apply lookup to each element in the batch
    keyword_encoded = tf.map_fn(
        keyword_lookup, 
        features['keyword'],
        fn_output_signature=tf.float32
    )
    location_encoded = tf.map_fn(
        location_lookup, 
        features['location'],
        fn_output_signature=tf.float32
    )
    
    # Add middle dimension
    keyword_encoded = tf.expand_dims(keyword_encoded, axis=1)
    location_encoded = tf.expand_dims(location_encoded, axis=1)
    
    features_dict = {
        'token_ids': text_preprocessed['token_ids'],
        'padding_mask': text_preprocessed['padding_mask'],
        'keyword_encoding': keyword_encoded,
        'location_encoding': location_encoded
    }
    return features_dict, label

# Apply the data preparation
train_ds = train_ds.map(prepare_data)
val_ds = val_ds.map(prepare_data)

train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.prefetch(tf.data.AUTOTUNE)
train_size = len(list(train_full)) - val_size
batch_size = 16
steps_per_epoch = train_size // batch_size
# Verify shapes
for features, label in train_ds.take(1):
    print("\nFinal shapes in dataset:")
    print("Token ids shape:", features['token_ids'].shape)
    print("Keyword shape:", features['keyword_encoding'].shape)
    print("Location shape:", features['location_encoding'].shape)
# Create model
backbone = keras_nlp.models.DistilBertBackbone.from_preset(
    "distil_bert_base_en_uncased"
)

backbone_inputs = backbone.input
keyword_input = keras.layers.Input(shape=(1, keyword_size), dtype=tf.float32, name="keyword_encoding")
location_input = keras.layers.Input(shape=(1, location_size), dtype=tf.float32, name="location_encoding")

x = backbone(backbone_inputs)
x = keras.layers.GlobalAveragePooling1D()(x)
x = keras.layers.Reshape((1, 768))(x)

combined = keras.layers.Concatenate(axis=-1)([x, keyword_input, location_input])
combined = keras.layers.Flatten()(combined)

x = keras.layers.Dense(512, activation="relu", kernel_regularizer=keras.regularizers.l2(0.01))(combined)
x = keras.layers.Dropout(0.3)(x)
x = keras.layers.Dense(256, activation="relu", kernel_regularizer=keras.regularizers.l2(0.01))(x)
x = keras.layers.Dropout(0.3)(x)
outputs = keras.layers.Dense(2)(x)

model = keras.Model(
    inputs={**backbone_inputs, 
            'keyword_encoding': keyword_input,
            'location_encoding': location_input},
    outputs=outputs
)

model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(1e-5),
    metrics=["accuracy"]
)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,  # added epochs
    steps_per_epoch = steps_per_epoch
)

print("done")

Started

Final shapes in dataset:
Token ids shape: (16, 160)
Keyword shape: (16, 1, 223)
Location shape: (16, 1, 3343)
Epoch 1/2
[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 179ms/step - accuracy: 0.6751 - loss: 12.6997 - val_accuracy: 0.8435 - val_loss: 11.2568
Epoch 2/2
[1m  2/238[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m17s[0m 75ms/step - accuracy: 0.9375 - loss: 11.1494

  self.gen.throw(typ, value, traceback)


[1m238/238[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 166ms/step - accuracy: 0.8451 - loss: 10.9000 - val_accuracy: 0.8542 - val_loss: 9.8368
done


In [1]:
# Load test data as a pandas DataFrame first
test_df = pd.read_csv("test.csv")

# Convert data types and handle missing values
test_df['text'] = test_df['text'].fillna('')
test_df['keyword'] = test_df['keyword'].fillna('')
test_df['location'] = test_df['location'].fillna('')

# Create dataset using make_csv_dataset but with fixed batch size and no shuffling
test_ds = tf.data.experimental.make_csv_dataset(
    "test.csv",
    batch_size=16,
    label_name=None,
    select_columns=['text', 'keyword', 'location'],
    shuffle=False,
    num_epochs=1
)

# Create ordered list of IDs
test_ids = test_df['id'].tolist()

# Apply preprocessing
test_ds = test_ds.map(prepare_test_data)

# Get predictions
all_predictions = []
for batch in test_ds:
    batch_preds = model.predict_on_batch(batch)
    all_predictions.extend(tf.argmax(batch_preds, axis=1).numpy())

# Create submission DataFrame ensuring length matches
submission = pd.DataFrame({
    'id': test_ids[:len(all_predictions)],
    'target': all_predictions
})

# Verify the submission
print("Test file shape:", test_df.shape)
print("Submission shape:", submission.shape)
print("Predictions distribution:")
print(pd.Series(all_predictions).value_counts())

# Save submission
submission.to_csv('submission.csv', index=False)

NameError: name 'pd' is not defined