In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


#from datasets import load_dataset



2024-11-05 15:09:43.093815: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
all_train_df = pd.read_csv("datasets/train.csv").rename(columns={"prompt": "text", "type": "label"})
test_df = pd.read_csv("datasets/test.csv").rename(columns={"prompt": "text", "type": "label"})



In [3]:
# shuffle and with random seed for reproducibility and split the data into train and val 0.8/0.2
all_train_df = all_train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the data into training (80%) and validation (20%)
train_size = int(0.8 * len(all_train_df))
train_df = all_train_df[:train_size].reset_index(drop=True)
val_df = all_train_df[train_size:].reset_index(drop=True)

In [4]:


# Preprocess text data
max_vocab_size = 10000
max_sequence_length = 100

# Tokenization
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(train_df['text'])
sequences = tokenizer.texts_to_sequences(train_df['text'])
X_train = pad_sequences(sequences, maxlen=max_sequence_length)
X_val = pad_sequences(tokenizer.texts_to_sequences(val_df['text']), maxlen=max_sequence_length)

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['label'])
y_val = label_encoder.transform(val_df['label'])


# Define the model
model = tf.keras.Sequential([
    layers.Embedding(input_dim=max_vocab_size, output_dim=128, input_length=max_sequence_length),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation='relu'),
    # Single output neuron for binary classification
    layers.Dense(1, activation='sigmoid')  # Use sigmoid for binary classification
])

# Compile the model with binary crossentropy loss
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on the test set
test_sequences = tokenizer.texts_to_sequences(test_df['text'])
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length)
y_test = label_encoder.transform(test_df['label'])

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

2024-11-05 15:09:46.422796: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-05 15:09:46.422842: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/10


2024-11-05 15:09:47.444123: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-11-05 15:09:47.521909: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp_10.




2024-11-05 15:09:51.632508: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9809


In [5]:
import keras_tuner as kt
from keras import regularizers


def build_simple_binary_model():
    model = tf.keras.Sequential()

    # Fixed embedding layer
    model.add(layers.Embedding(
        input_dim=max_vocab_size, 
        output_dim=128,  # Fixed output dimension
        input_length=max_sequence_length
    ))
    
    # Add a fixed convolutional layer
    model.add(layers.Conv1D(filters=128, kernel_size=3, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    
    # Dense layer with fixed units and dropout
    model.add(layers.Dense(
        units=128,  # Fixed units
        activation='relu',
        kernel_regularizer=regularizers.l2(0.001)  # Fixed L2 regularization
    ))
    model.add(layers.Dropout(rate=0.5))  # Fixed dropout rate
    
    # Output layer with sigmoid activation for binary classification
    model.add(layers.Dense(1, activation='sigmoid'))  # Single output neuron

    # Compile model with binary crossentropy loss
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),  # Fixed learning rate
        loss='binary_crossentropy',  # Use binary crossentropy for binary classification
        metrics=['accuracy']
    )
    
    return model


In [6]:
# Define the tuner
tuner = kt.RandomSearch(
    build_simple_binary_model,
    objective='val_loss',
    max_trials=5)

# Perform search
tuner.search(X_train, y_train, epochs=10, validation_data=(X_val, y_val), batch_size=32)

TypeError: build_simple_binary_model() takes 0 positional arguments but 1 was given