In [8]:
import numpy as np 
import pandas as pd
import time
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import random

In [9]:
# Raw dataset; later cells read its `text` and `is_ad` columns.
data_path = '/kaggle/input/all-texts/text.csv'
df = pd.read_csv(data_path)

In [10]:
# Placeholder token handed to the tokenizer for out-of-vocabulary words.
oov_tok = "<OOV>"

# Build the vocabulary from every row of the raw frame.
# NOTE(review): this runs before de-duplication and before the
# train/valid/test split, so validation/test text contributes to the
# vocabulary -- confirm this is intended.
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(df["text"])

In [11]:
# Remove exact duplicate rows before splitting.
df = df.drop_duplicates()

# Positional (unshuffled) split of the de-duplicated frame.
# Rows 0..9999 are not assigned to any of the three subsets.
train = df.iloc[10000:100000]
valid = df.iloc[100000:150000]
test = df.iloc[150000:]

In [12]:
# --- Text-encoding / model hyperparameters ---
embedding_dim = 8            # width of each learned word vector
max_length = 1000            # every sequence is padded/truncated to this many tokens
trunc_type = 'post'          # cut overly long sequences at the end
padding_type = 'post'        # pad short sequences at the end
oov_tok = "<OOV>"

# One slot per word known to the tokenizer plus one extra index
# (presumably index 0, which is what padding fills with -- TODO confirm).
vocab_size = len(tokenizer.word_index) + 1

def _encode_and_pad(texts):
    """Turn raw texts into fixed-length integer sequences.

    Texts are mapped to token ids with the fitted `tokenizer`, then
    padded/truncated to `max_length` using `padding_type` / `trunc_type`.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )


training_padded = _encode_and_pad(train.text)
valid_padded = _encode_and_pad(valid.text)
testing_padded = _encode_and_pad(test.text)
# Seed every random number source used by this notebook with the same value.
SEED = 1
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it
# here only influences processes spawned afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

num_epochs = 17

# Widths of the fully connected ReLU layers, in order.
hidden_units = (200, 100, 50, 24, 12)

# Bag-of-embeddings binary classifier:
#   token ids -> embedding vectors -> mean over the sequence axis -> MLP -> sigmoid.
# `vocab_size` and `embedding_dim` come from the hyperparameter section above.
# The deprecated `input_length` argument is intentionally omitted: Keras 3
# ignores it and only emits a deprecation warning; the input shape is
# inferred from the data on the first call to `fit`.
# NOTE(review): `mask_zero` is not enabled, so padded positions are included
# in the average computed by GlobalAveragePooling1D -- confirm this is intended.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    *(tf.keras.layers.Dense(units, activation='relu') for units in hidden_units),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimizer (by its Keras string identifier),
# and AUC as the only tracked metric.
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['AUC'])

# Train on the padded training sequences, monitoring the validation subset
# after every epoch.
with tf.device('/GPU:0'):
    model.fit(
        training_padded,
        train.is_ad,
        epochs=num_epochs,
        batch_size=32,
        validation_data=(valid_padded, valid.is_ad),
        verbose=1,
    )

# `evaluate` returns the loss followed by the compiled metrics. The only
# compiled metric is AUC, so the second value is AUC, not accuracy.
loss, auc = model.evaluate(testing_padded, test.is_ad)
print("Test Loss:", loss)
print("Test AUC:", auc)



Epoch 1/17
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step - AUC: 0.7198 - loss: 0.4602 - val_AUC: 0.8514 - val_loss: 0.3997
Epoch 2/17
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - AUC: 0.7582 - loss: 0.4112 - val_AUC: 0.9252 - val_loss: 0.3899
Epoch 3/17
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - AUC: 0.7639 - loss: 0.4074 - val_AUC: 0.9787 - val_loss: 0.3527
Epoch 4/17
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - AUC: 0.7615 - loss: 0.4368 - val_AUC: 0.8775 - val_loss: 0.3863
Epoch 5/17
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - AUC: 0.7623 - loss: 0.4123 - val_AUC: 0.9651 - val_loss: 0.3716
Epoch 6/17
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - AUC: 0.8114 - loss: 0.3808 - val_AUC: 0.9761 - val_loss: 0.3696
Epoch 7/17
[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [13]:
import pickle

# Persist the trained weights only; the architecture has to be rebuilt in
# code before these weights can be loaded back.
model.save_weights('tx_weights.weights.h5')

# Store the fitted tokenizer so the same word -> id mapping can be reused.
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through.
# NOTE: unpickling executes arbitrary code; only load this file from a
# trusted location.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)