In [1]:
import pandas as pd

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Training comments plus their label columns, read from the zipped competition CSV.
train_df = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip"
)

In [4]:
# Comments to be scored for the submission, read from the zipped competition CSV.
test_df = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip"
)

In [5]:
# Quick look at the first rows to check the column layout.
train_df.head(n=5)

In [6]:
# Word-level tokenizer: lower-cases text, strips the punctuation listed in
# `filters`, splits on spaces and restricts encoded indices to below 5000.
tokenizer = Tokenizer(
    num_words=5000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    # `oov_token` is a vocabulary *word* (it is inserted into `word_index`),
    # not an index. An integer key would collide with the literal token "0"
    # once the vocabulary is JSON-serialised and would break
    # `sequences_to_texts`, which joins words as strings. "<" and ">" are in
    # `filters`, so this marker cannot clash with a real token.
    oov_token="<OOV>",
)

In [7]:
# Build the vocabulary from the training comments only.
tokenizer.fit_on_texts(train_df["comment_text"].tolist())

In [8]:
# Encode every training comment as a list of integer word indices.
train_sequences = tokenizer.texts_to_sequences(
    train_df["comment_text"].tolist()
)

In [9]:
# Encode the test comments with the vocabulary fitted on the training set.
test_sequences = tokenizer.texts_to_sequences(
    test_df["comment_text"].tolist()
)

In [10]:
# Snapshot of the fitted tokenizer's configuration.
# NOTE(review): not referenced again in the visible cells — confirm it is still needed.
tok_conf = tokenizer.get_config()

In [11]:
# Number of tokens in each encoded training comment (before padding).
lengths = list(map(len, train_sequences))

In [12]:
from collections import Counter

In [13]:
from matplotlib import pyplot as plt

In [14]:
# Histogram of comment lengths, used to choose a padding/truncation length.
plt.figure(figsize=(18, 6))
plt.hist(x=lengths, bins=1000)
plt.show()

In [15]:
# Force every training sequence to exactly 200 tokens: shorter ones are
# left-padded with 0, longer ones lose their leading tokens.
train_sequences_padded = pad_sequences(
    sequences=train_sequences,
    maxlen=200,
    padding='pre',
    truncating='pre',
    value=0.0,
    dtype='int32',
)

In [16]:
# Sanity check after padding: recompute the per-row lengths and plot them again.
# Note that this rebinds `lengths` to the padded lengths.
lengths = list(map(len, train_sequences_padded))
plt.figure(figsize=(18, 6))
plt.hist(x=lengths, bins=1000)
plt.show()

In [17]:
# Apply the same padding/truncation settings as for the training sequences
# so both sets share the 200-token input width.
test_sequences_padded = pad_sequences(
    sequences=test_sequences,
    maxlen=200,
    padding='pre',
    truncating='pre',
    value=0.0,
    dtype='int32',
)

In [18]:
# Display the first padded test sequence as a spot check.
test_sequences_padded[0]

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding
from tensorflow.keras.callbacks import EarlyStopping

In [20]:
# Start from an empty sequential model; layers are appended in the cells below.
model = Sequential()

In [21]:
# Embedding over the token indices; index 0 is the padding value and is
# masked so that downstream layers skip padded positions.
model.add(
    Embedding(
        input_dim=5001,
        output_dim=64,
        mask_zero=True,
        input_length=200,
    )
)

In [22]:
# Print the architecture after adding the embedding layer.
model.summary()

In [23]:
# Two stacked LSTMs: the first returns the full sequence so the second can
# consume it; the second returns only its final state.
model.add(LSTM(units=100, return_sequences=True, unroll=True))
model.add(LSTM(units=100, unroll=True))

In [24]:
# Print the architecture after adding the recurrent layers.
model.summary()

In [25]:
# Classification head: one hidden layer, then six independent sigmoid outputs
# (one per label column).
model.add(Dense(units=100, activation="relu"))
model.add(Dense(units=6, activation="sigmoid"))

In [26]:
# Print the complete architecture.
model.summary()

In [27]:
from tensorflow.keras.metrics import AUC

In [28]:
# ROC AUC metric reported during training and monitored (as "val_auc") by
# early stopping.
auc = AUC(
    num_thresholds=200,
    curve='ROC',
    summation_method='interpolation',
    # Pin the name: with name=None Keras auto-generates it, so re-running this
    # cell yields "auc_1", "auc_2", ... and "val_auc" would no longer exist.
    name="auc",
    dtype=None,
    thresholds=None,
    # Compute AUC per label and average, rather than flattening all six
    # labels into a single micro-averaged curve dominated by frequent labels.
    multi_label=True,
    num_labels=6,
    label_weights=None,
    from_logits=False
)

In [29]:
# Binary cross-entropy over the sigmoid outputs; track AUC and accuracy.
model.compile(
    optimizer="RMSProp",
    loss="binary_crossentropy",
    metrics=[auc, "accuracy"],
)

In [30]:
# Target matrix: every column from the third onward.
# Assumes the first two columns are the id and the comment text — TODO confirm.
y_train = train_df.iloc[:, 2:].to_numpy()

In [31]:
# Stop when validation AUC has not improved for 5 epochs and roll back to the
# best weights. AUC is higher-is-better, so the direction is stated explicitly
# rather than left to "auto" inference, which is not guaranteed to recognise
# AUC-style metric names across Keras versions.
es = EarlyStopping(
    monitor="val_auc",
    mode="max",
    patience=5,
    restore_best_weights=True,
)

In [32]:
# Train with 20% of the data held out for validation; early stopping may end
# the run before the 10-epoch cap.
model.fit(
    x=train_sequences_padded,
    y=y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[es],
)

In [33]:
# Predicted per-label probabilities for the test comments (progress bar on).
y_test = model.predict(x=test_sequences_padded, verbose=1)

In [34]:
# Create the six label columns on the test frame, initialised to 0, so the
# frame has the submission layout before predictions are written in.
test_df[
    [
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    ]
] = 0

In [35]:
# Display the test frame to confirm the placeholder label columns were added.
test_df

In [36]:
# Write the predictions into the label columns by name instead of by position.
# A positional `iloc[:, 2:]` write pushes floats into the integer placeholder
# columns (deprecated dtype upcasting in recent pandas) and breaks if the
# column layout ever changes. Column order follows the placeholder cell.
# Assumes the model outputs use this same label order — TODO confirm.
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
test_df[label_columns] = y_test

In [37]:
# The submission keeps the id and the six label columns; the raw text is
# dropped and the index is not written.
test_df.drop(columns=["comment_text"]).to_csv(
    "submission.csv",
    index=False,
)