In [None]:
!pip install gensim
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Shell escape: unpack the competition archive. The next cell reads "train.csv"
# by relative path, so the archive is presumably extracted into the current
# working directory -- confirm if the kernel's cwd is changed.
!unzip /kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip

In [None]:
# Load the extracted training table.
data = pd.read_csv("train.csv")

# Preview only the first rows instead of rendering the whole frame as cell output.
data.head()

In [None]:
max_len = 100

# Fit the Keras tokenizer first: it defines the vocabulary (and the integer ids)
# that the embedding matrix and the model are built on.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['comment_text'])
token_ids = tokenizer.texts_to_sequences(data['comment_text'])

# Train Word2Vec on exactly the tokens the tokenizer produced. Splitting the raw
# text on whitespace instead would keep case and attached punctuation, so the
# Word2Vec vocabulary would not line up with `tokenizer.word_index` and many
# words would later be looked up under a spelling Word2Vec never saw.
sentences = [[tokenizer.index_word[idx] for idx in ids] for ids in token_ids]
word2vec_model = Word2Vec(sentences, vector_size=300, window=5, min_count=1, workers=4)

# Fixed-length integer sequences fed to the network.
text_seq = pad_sequences(token_ids, maxlen=max_len)

In [None]:
# Build the embedding matrix: row i holds the Word2Vec vector of the word whose
# tokenizer id is i. Rows for words without a vector stay all-zero.
vocab = tokenizer.word_index

# Take the width from the trained model so the matrix can never disagree with it.
embed_dim = word2vec_model.wv.vector_size

# Tokenizer ids run from 1 to len(vocab); id 0 is what pad_sequences fills with,
# so len(vocab) + 1 rows cover every id that can occur.
num_tokens = len(vocab) + 1
final_embed = np.zeros((num_tokens, embed_dim))

for word, i in vocab.items():
    if word in word2vec_model.wv:
        final_embed[i] = word2vec_model.wv[word]


In [None]:
# Label matrix: one column per toxicity category, in this fixed order.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
toxicity_labels = data[label_columns].values

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    text_seq,
    toxicity_labels,
    test_size=0.2,
    random_state=42,
)


# TPU configuration: use a TPU when one can be resolved, otherwise fall back to
# TensorFlow's current default strategy. Either way the result is bound to
# `tpu_strategy`, which the model-building code uses as its scope.
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()  # Detects the TPU
    print("Running on TPU ", resolver.master())
    tf.config.experimental_connect_to_cluster(resolver)
    try:
        tf.tpu.experimental.initialize_tpu_system(resolver)
    except tf.errors.InvalidArgumentError:
        # Re-running this cell in a live kernel lands here; keep going.
        print("TPU system already initialized. Continuing without reinitializing.")
    # `tf.distribute.TPUStrategy` is the supported name; the
    # `tf.distribute.experimental.TPUStrategy` alias is deprecated.
    tpu_strategy = tf.distribute.TPUStrategy(resolver)
except ValueError:
    print("Could not detect TPU; defaulting to CPU or GPU.")
    tpu_strategy = tf.distribute.get_strategy()


with tpu_strategy.scope():
    # Bidirectional-LSTM classifier over frozen, pre-computed word embeddings.
    # Layer sizes that must agree with the data are read from the data itself
    # (embedding matrix shape, number of label columns) rather than repeated as
    # literals, so they cannot drift apart.
    vocab_rows, embed_width = final_embed.shape
    num_labels = toxicity_labels.shape[1]

    embed_input = keras.layers.Input(shape=(max_len,))
    embed_layer = keras.layers.Embedding(
        vocab_rows,
        embed_width,
        embeddings_initializer=keras.initializers.Constant(final_embed),
        trainable=False,  # keep the Word2Vec vectors fixed during training
    )(embed_input)
    lstm_layer = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True))(embed_layer)
    # Collapse the per-timestep outputs to one vector per comment.
    global_pool = keras.layers.GlobalMaxPool1D()(lstm_layer)
    dense_layer = keras.layers.Dense(64, activation='relu')(global_pool)
    # One independent sigmoid per label column (multi-label output).
    output = keras.layers.Dense(num_labels, activation='sigmoid')(dense_layer)

    model = keras.Model(inputs=embed_input, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Callback that records the training accuracy roughly once per N processed inputs
class BatchAccumulator(keras.callbacks.Callback):
    """Collect the logged training accuracy in windows of `inputs_per_point` inputs.

    After every training batch the logged ``'accuracy'`` value is buffered. Once
    at least `inputs_per_point` inputs have been counted, the buffered values are
    averaged into one entry of `batch_accs` and the buffer is cleared.

    Inputs are counted as `batch_size` per batch. The callback is not told the
    real size of each batch, so a shorter final batch of an epoch is counted as
    a full one.

    NOTE(review): the value Keras puts in ``logs['accuracy']`` is presumably the
    running metric for the current epoch rather than a strictly per-batch
    value -- confirm if per-batch accuracy is required.

    Args:
        batch_size: Number of inputs per training batch. Must match the
            `batch_size` passed to `model.fit`. Defaults to 64.
        inputs_per_point: Number of inputs each recorded point should cover.
            Defaults to 100.

    Raises:
        ValueError: If `batch_size` or `inputs_per_point` is not positive.
    """

    def __init__(self, batch_size=64, inputs_per_point=100):
        super().__init__()
        if batch_size <= 0 or inputs_per_point <= 0:
            raise ValueError("batch_size and inputs_per_point must be positive")
        self.batch_size = batch_size
        self.inputs_per_point = inputs_per_point
        self.batch_accs = []   # One averaged accuracy per completed window
        self.temp_accs = []    # Accuracies buffered for the current window
        self.batch_counter = 0  # Inputs counted towards the current window

    def on_train_begin(self, logs=None):
        # Start every `fit` call from a clean slate.
        self.batch_accs = []
        self.temp_accs = []
        self.batch_counter = 0

    def on_batch_end(self, batch, logs=None):
        acc = (logs or {}).get('accuracy')
        if acc is None:
            # Nothing to record if the model is not tracking 'accuracy'.
            return
        self.temp_accs.append(float(acc))
        # Count the inputs of this batch. (`batch` is only the batch index, so
        # it cannot be used to look up how many inputs were processed.)
        self.batch_counter += self.batch_size

        if self.batch_counter >= self.inputs_per_point:
            self.batch_accs.append(np.mean(self.temp_accs))
            self.temp_accs = []
            # Carry the overshoot into the next window so the long-run spacing
            # stays at one point per `inputs_per_point` inputs.
            self.batch_counter %= self.inputs_per_point

# Train with the accuracy tracker attached
batch_accumulator = BatchAccumulator()
history = model.fit(
    X_train,
    y_train,
    epochs=3,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[batch_accumulator],
)

# Plot what the tracker recorded during training
fig, ax = plt.subplots()
ax.plot(batch_accumulator.batch_accs)
ax.set_title('Accuracy every 100 inputs')
ax.set_xlabel('Batch (each represents 100 inputs)')
ax.set_ylabel('Accuracy')
plt.show()

model.summary()

In [None]:
# Features are the padded sequences; labels are every column except the id and the raw text.
X = text_seq
y = data.drop(columns=['id', 'comment_text'])

# Sanity check: both should report the same number of rows.
print(len(X), len(y))

In [None]:
from sklearn.model_selection import train_test_split

# `model` has already been trained on the X_train rows produced by the earlier
# train_test_split(..., test_size=0.2, random_state=42) call. An unseeded
# re-split here would put many of those rows into test_X and inflate the score
# computed on it. Using the same test_size and random_state on inputs of the
# same length reproduces the same row partition, so test_X contains only rows
# the model was not trained on.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Continue training the already-compiled model on this split; no other
# arguments are passed, so Keras' own fit defaults apply.
model.fit(x=train_X, y=train_y)

In [None]:
from sklearn.metrics import roc_auc_score

# Score the held-out rows and compare the predictions against the true labels.
preds = model.predict(test_X)
auc = roc_auc_score(test_y, preds)
print("ROC AUC Score", auc)

In [None]:
import matplotlib.pyplot as plt

# The model is compiled with metrics=['accuracy'], so the history keys are
# 'accuracy' / 'val_accuracy'. Fall back to the short 'acc' / 'val_acc'
# spelling only if that is what this History object actually contains.
train_key = 'accuracy' if 'accuracy' in history.history else 'acc'
val_key = 'val_' + train_key

# One value per epoch in each list.
num_epochs = len(history.history[train_key])
epochs = range(1, num_epochs + 1)

plt.plot(epochs, history.history[train_key], label='Training Accuracy')
plt.plot(epochs, history.history[val_key], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.xticks(epochs)  # Set x-ticks to correspond to the epochs
plt.legend()
plt.show()