In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!unzip /kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip

In [None]:
# Load the training CSV from the current working directory into a DataFrame.
data = pd.read_csv("train.csv")
# Bare last expression: the notebook renders the DataFrame as this cell's output.
data

In [None]:
!pip install gensim
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

# Maximum number of token ids kept per comment after padding/truncation.
max_len = 100

# Fit the Keras tokenizer first; its vocabulary (tokenizer.word_index) is what the
# embedding matrix is later indexed by.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['comment_text'])
token_ids = tokenizer.texts_to_sequences(data['comment_text'])

# Train Word2Vec on exactly the tokens the tokenizer produced. Splitting the raw text
# with str.split() would keep case and punctuation ("Hello," vs "hello"), so most
# tokenizer vocabulary entries would have no matching Word2Vec vector and their
# embedding rows would silently stay zero.
sentences = [[tokenizer.index_word[idx] for idx in ids] for ids in token_ids]
word2vec_model = Word2Vec(sentences, vector_size=300, window=5, min_count=1, workers=4)

# Pad/truncate every comment to max_len token ids (pad_sequences defaults apply).
text_seq = pad_sequences(token_ids, maxlen=max_len)

In [None]:
# Build the embedding matrix: row i holds the Word2Vec vector of the token whose
# tokenizer index is i; rows for tokens without a vector stay all-zero.
vocab = tokenizer.word_index
num_tokens = len(vocab) + 2
final_embed = np.zeros((num_tokens, 300))

keyed_vectors = word2vec_model.wv
for token, row in vocab.items():
    if token not in keyed_vectors:
        continue
    final_embed[row] = keyed_vectors[token]

In [None]:
from keras.regularizers import l2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.datasets import make_multilabel_classification
# LSTM model: frozen pre-trained embeddings -> BiLSTM -> LSTM -> max-pool -> dense head
# with six sigmoid outputs (one per toxicity label).

# One token id per padded position of each comment.
embed_input = keras.layers.Input(shape=(text_seq.shape[1],))
# Embedding table initialised from `final_embed` and kept frozen (trainable=False).
embed_layer = keras.layers.Embedding(num_tokens, 300, embeddings_initializer=keras.initializers.Constant(final_embed), trainable=False)(embed_input)
spatial_dropout = keras.layers.SpatialDropout1D(0.2)(embed_layer)
bilstm_layer = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True))(spatial_dropout)
additional_lstm_layer = keras.layers.LSTM(64, return_sequences=True)(bilstm_layer)
# Collapse the time dimension by taking the max over positions.
global_pool = keras.layers.GlobalMaxPool1D()(additional_lstm_layer)
# L2 regularization taken from the same `tensorflow.keras` namespace as every other
# layer here, instead of mixing in the standalone `keras` package.
# NOTE(review): this Dense layer has no activation, so it is linear — confirm that is intended.
dense_128 = keras.layers.Dense(128, kernel_regularizer=keras.regularizers.l2(0.01))(global_pool)
dropout = keras.layers.Dropout(0.1)(dense_128)
batch_normalization = keras.layers.BatchNormalization()(dropout)
# Independent sigmoid per label (multi-label), paired with binary cross-entropy below.
output = keras.layers.Dense(6, activation='sigmoid')(batch_normalization)

model = keras.Model(inputs=embed_input, outputs=output)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0005),  # Reduced learning rate
              loss="binary_crossentropy",
              metrics=["acc"])

# Early stopping callback: stops once val_loss has not improved for 5 epochs.
# It only takes effect when passed to model.fit(..., callbacks=[early_stopping])
# together with validation data.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Label matrix with one column per toxicity label, in the order listed.
toxicity_labels = data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

# Linear baseline: one SGD classifier per toxicity label (one-vs-rest), trained in
# mini-batches on the padded token-id sequences and scored on a held-out validation split.
#
# Use the real features and labels. The previous make_multilabel_classification call
# generated unrelated synthetic data with len(text_seq) samples AND len(toxicity_labels)
# features (an n x n matrix), which ignores the dataset and cannot fit in memory.
# NOTE(review): raw token ids are a weak input for a linear model; this is only a
# smoke-test baseline.
X, y = text_seq, toxicity_labels

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)  # 20% data as validation

# One binary classifier per label column. Each is updated with partial_fit, which
# needs the full set of classes ({0, 1}) declared up front because a given
# mini-batch may contain only negatives for a rare label.
binary_classes = np.array([0, 1])
label_classifiers = [
    SGDClassifier(max_iter=1, tol=None, warm_start=True)
    for _ in range(y.shape[1])
]

batch_size = 100
accuracies = []

# Mini-batch learning: one pass over the training data per epoch.
for epoch in range(10):
    for start in range(0, len(X_train), batch_size):
        X_batch = X_train[start:start + batch_size]
        y_batch = y_train[start:start + batch_size]
        for label_idx, label_clf in enumerate(label_classifiers):
            label_clf.partial_fit(X_batch, y_batch[:, label_idx], classes=binary_classes)

    # Score once per epoch on the validation split (the split created above; there is
    # no X_test / y_test in this cell). accuracy_score on a label-indicator matrix is
    # subset accuracy: a row counts only if all six labels match.
    y_pred = np.column_stack([label_clf.predict(X_val) for label_clf in label_classifiers])
    accuracies.append(accuracy_score(y_val, y_pred))

# Single static figure after training; renders correctly with the inline backend.
fig, ax = plt.subplots()
ax.plot(range(1, len(accuracies) + 1), accuracies, marker='o')
ax.set_title('Validation Accuracy per Epoch')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
plt.show()

In [None]:
# Features are the padded token-id sequences; targets are every column of `data`
# except the id and the raw comment text.
X = text_seq
non_label_columns = ['id', 'comment_text']
y = data.drop(columns=non_label_columns)
# Print the row counts of features and targets side by side.
print(len(X), len(y))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set (train_test_split's default test size). A fixed random_state
# makes the split, and every metric computed from it, reproducible across re-runs.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=42)

In [None]:
# Keep the returned History object: the plotting cells read history.history['acc'],
# ['val_acc'], ['loss'] and ['val_loss'], which only exist when fit() is given
# validation data. The early_stopping callback monitors val_loss, so it is passed here too.
history = model.fit(
    train_X,
    np.asarray(train_y),      # plain array so validation_split can slice it on any Keras version
    validation_split=0.1,     # last 10% of the (already shuffled) training split
    epochs=10,                # upper bound; early stopping may end training sooner
    callbacks=[early_stopping],
)

In [None]:
# print(model.evaluate(test_X,test_y))

In [None]:
from sklearn.metrics import roc_auc_score

# Predict per-label scores for the held-out set and report ROC AUC against the true labels.
preds = model.predict(test_X)
roc_auc = roc_auc_score(test_y, preds)
print("ROC AUC Score", roc_auc)

In [None]:
import matplotlib.pyplot as plt

# Plot training vs. validation accuracy per epoch.
# Expects `history` to be the object returned by model.fit(...), run with validation
# data so that both the 'acc' and 'val_acc' entries exist (one value per epoch).
num_epochs = len(history.history['acc'])
epochs = range(1, num_epochs + 1)

plt.plot(epochs, history.history['acc'], label='Training Accuracy')
plt.plot(epochs, history.history['val_acc'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')  # titled like the loss plot so the figure stands alone
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.xticks(epochs)  # Set x-ticks to correspond to the epochs
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Plot training vs. validation loss per epoch.
# Reads history.history['loss'] and history.history['val_loss'], one value per epoch.
num_epochs = len(history.history['loss'])
epoch_ticks = range(1, num_epochs + 1)

for metric_key, legend_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(epoch_ticks, history.history[metric_key], label=legend_label)

plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.xticks(epoch_ticks)  # one tick per epoch
plt.legend()
plt.show()