In [1]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras
import tensorflow as tf
import numpy as np
from keras import layers
import pandas as pd



In [20]:
test = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")[["text","label"]].sample(frac=1,random_state=42)
train = pd.read_csv("/kaggle/input/topic-dataset-daigt/topics.csv")[["text","label"]].sample(frac=1,random_state=42)

In [85]:
train_df = train.sample(frac=0.9,random_state=42)
val_df = train.sample(frac=0.1,random_state=42)

In [87]:
seed = 2 
dataset_tr = tf.data.Dataset.from_tensor_slices((train_df.text.values,train_df.label.values ))
dataset_tr = dataset_tr.shuffle(buffer_size=len(train_df)).batch(batch_size=2)

dataset_val = tf.data.Dataset.from_tensor_slices((val_df.text.values,val_df.label.values ))
dataset_val = dataset_val.shuffle(buffer_size=len(val_df)).batch(batch_size=2)

dataset_test = tf.data.Dataset.from_tensor_slices((test.text.values))
dataset_test = dataset_test.batch(batch_size=2)


In [88]:
import string
import re
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, f"[{re.escape(string.punctuation)}]", ""
    )
max_features = 20000
embedding_dim = 128
sequence_length = 500
vectorize_layer = keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)
text_ds = dataset_tr.map(lambda x, y: x).concatenate(dataset_val.map(lambda x, y: x)).concatenate(dataset_test)
vectorize_layer.adapt(text_ds)

In [91]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label
def vectorize_text_test(text):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text)
train_ds = dataset_tr.map(vectorize_text)
train_ds = train_ds.cache().prefetch(buffer_size=10)

val_ds = dataset_val.map(vectorize_text)
val_ds = val_ds.cache().prefetch(buffer_size=10)

test_ds = dataset_test.map(vectorize_text_test)
test_ds = test_ds.cache().prefetch(buffer_size=10)

In [92]:
inputs = keras.Input(shape=(None,), dtype="int64")
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)
model = keras.Model(inputs, predictions)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy",tf.keras.metrics.AUC()])

In [93]:
epochs = 1
tf.random.set_seed(1)
model.fit(train_ds, validation_data=val_ds, epochs=epochs,workers=4)



<keras.src.callbacks.History at 0x7c912a14d1b0>

In [94]:
model.evaluate(val_ds)



[0.16604194045066833, 0.9441744685173035, 0.9702588319778442]

In [95]:
preds = model.predict(test_ds)



In [96]:
from sklearn.metrics import roc_auc_score

In [99]:
roc_auc_score(1-test.label,preds)

0.9963834288271556

In [None]:
# inputs = keras.Input(shape=(1,), dtype="string")
# indices = vectorize_layer(inputs)
# outputs = model(indices)
# end_to_end_model = keras.Model(inputs, outputs)
# end_to_end_model.compile(
#     loss="binary_crossentropy", optimizer="adam", metrics=["accuracy",tf.keras.metrics.AUC()]
# )
# end_to_end_model.predict(test_ds)