In [None]:
import pandas as pd
import numpy as np
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras import utils
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import one_hot
import tensorflow as tf
from sklearn.utils import shuffle
from sklearn.utils import class_weight

In [None]:
# Kaggle input directory for the competition files. It is stored with a
# trailing path separator so file names can be appended by concatenation.
sep = os.sep
dataDirec = os.path.join('/kaggle/input/jigsaw-multilingual-toxic-comment-classification', '')
# Characters passed as the `filters` argument when the text is tokenised.
filt = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def readDataFrame():
    """Read the four competition CSV files found under ``dataDirec``.

    Returns:
        tuple: ``(dfTrain, dfBias, dfValid, dfTest)`` DataFrames read from the
        toxic-comment train, unintended-bias train, validation and test CSV
        files, in that order.
    """
    csvNames = (
        'jigsaw-toxic-comment-train.csv',
        'jigsaw-unintended-bias-train.csv',
        'validation.csv',
        'test.csv',
    )
    dfTrain, dfBias, dfValid, dfTest = (pd.read_csv(dataDirec + name) for name in csvNames)
    return dfTrain, dfBias, dfValid, dfTest

In [None]:
# Load the train, bias, validation and test CSVs into DataFrames.
train, bias, valid, test = readDataFrame()

In [None]:
def getData(train, bias, valid, test):
    """Turn the raw DataFrames into text and label arrays.

    Args:
        train: DataFrame providing ``comment_text`` and ``toxic`` columns.
        bias: DataFrame providing ``comment_text`` and ``toxic`` columns; its
            ``toxic`` values are rounded before use.
        valid: DataFrame providing ``comment_text`` and ``toxic`` columns.
        test: DataFrame providing a ``content`` column.

    Returns:
        tuple: ``(Xtrain, Xlabel, validData, validLabel, testData)``. The
        training pair is the shuffled concatenation of ``train`` and ``bias``
        cut to the first 1,000,000 rows, the validation pair is shuffled, and
        ``testData`` is returned in its original order.
    """
    rowLimit = 1000000

    # Stack the two labelled sources: texts first, then the matching labels.
    allTexts = np.concatenate([
        np.array(train['comment_text']),
        np.array(bias['comment_text']),
    ])
    allLabels = np.concatenate([
        np.array(train['toxic']),
        np.round(np.array(bias['toxic'])),
    ])

    # Shuffle texts and labels together so each pair stays aligned.
    allTexts, allLabels = shuffle(allTexts, allLabels, random_state=0)
    validTexts, validTargets = shuffle(
        np.array(valid['comment_text']),
        np.array(valid['toxic']),
        random_state=0,
    )

    testTexts = np.array(test['content'])

    return allTexts[:rowLimit], allLabels[:rowLimit], validTexts, validTargets, testTexts

In [None]:
# Build the text/label arrays, then drop the DataFrames they were read from.
Xtrain, Xlabels, validData, validLabel, testData = getData(train, bias, valid, test)
del train, bias, valid, test

In [None]:
def _encodeAndPad(texts, reportEvery):
    """Hash every text to word ids in place, then pad to 400 columns.

    Args:
        texts: Indexable collection of strings. Each element is overwritten
            with the list of ids returned by ``one_hot``.
        reportEvery: The current index is printed whenever it is a multiple
            of this value.

    Returns:
        The result of ``sequence.pad_sequences`` with ``maxlen=400`` and
        post-padding.
    """
    for pos in range(len(texts)):
        if pos % reportEvery == 0:
            print(pos, end='\r')
        texts[pos] = one_hot(texts[pos], 400, filters=filt, lower=True, split=' ')
    padded = sequence.pad_sequences(texts, maxlen=400, padding='post')
    print("Done")
    return padded


def preProcess(Xtrain, validData, test):
    """Encode the train, validation and test texts as padded id sequences.

    The three inputs are processed one after the other and are modified in
    place while being encoded; the padded arrays are returned.

    Args:
        Xtrain: Training texts.
        validData: Validation texts.
        test: Test texts.

    Returns:
        tuple: ``(Xtrain, validData, test)`` as padded sequences.
    """
    Xtrain = _encodeAndPad(Xtrain, 100000)
    validData = _encodeAndPad(validData, 1000)
    test = _encodeAndPad(test, 10000)
    return Xtrain, validData, test

In [None]:
# Replace the raw strings with padded word-id sequences.
Xtrain, validData, testData = preProcess(Xtrain, validData, testData)

In [None]:
# The padded sequences hold word ids that the model's Embedding layer uses as
# lookup indices. They must stay integers: L2-normalising them would turn
# every id into a fraction below 1, which the Embedding layer would cast to
# index 0, erasing the text. So the ids are kept as they are and only the
# dtype is pinned.
Xtrain = np.asarray(Xtrain, dtype='int32')
validData = np.asarray(validData, dtype='int32')
testData = np.asarray(testData, dtype='int32')

In [None]:
def generator(train, labels, batch_size):
    """Yield class-balanced mini-batches forever.

    Every batch holds ``batch_size // 2`` rows whose label equals 0 followed
    by ``batch_size // 2`` rows whose label equals 1, each half drawn with
    replacement from the full input arrays. For an odd ``batch_size`` the
    batch therefore has ``batch_size - 1`` rows.

    Args:
        train: Array of samples, indexed along the first axis.
        labels: 1-D array of labels aligned with ``train``. Only rows whose
            label is exactly 0 or exactly 1 are ever sampled.
        batch_size: Requested number of rows per batch.

    Yields:
        tuple: ``(batchData, batchLabels)``.

    Raises:
        ValueError: Raised by ``np.random.choice`` on the first draw when one
            of the two classes has no rows.
    """
    train = np.asarray(train)
    labels = np.asarray(labels)
    half = batch_size // 2

    # The index pools are built once from the full label array. The batch is
    # stored under separate names so later batches still sample from the
    # whole data set and not from the previous batch.
    negPool = np.where(labels == 0)[0]
    posPool = np.where(labels == 1)[0]

    while True:
        negIdx = np.random.choice(negPool, half)
        posIdx = np.random.choice(posPool, half)
        batchData = np.r_[train[negIdx], train[posIdx]]
        batchLabels = np.r_[labels[negIdx], labels[posIdx]]
        yield batchData, batchLabels

In [None]:
def createModel(Xtrain, Xlabels):
    """Build, compile and fit the Embedding + LSTM toxicity classifier.

    The first 3% of the rows are held out for validation and early stopping;
    the remaining rows feed the class-balanced batch generator.

    Args:
        Xtrain: 2-D array of padded word-id sequences.
        Xlabels: 1-D array of labels aligned with ``Xtrain``.

    Returns:
        The fitted ``Sequential`` model.
    """
    max_words = 400      # Embedding input dimension
    lstmSize = 120       # Embedding output size and number of LSTM units
    epochs = 5
    batch_size = 300
    validation_split = int(0.03 * Xtrain.shape[0])

    callBack = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)
    metrics = [tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.AUC()]
    # The output layer already applies a sigmoid, so the loss receives
    # probabilities and must not apply a second sigmoid.
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

    # Hold the validation rows out of the training pool so that val_loss,
    # which drives early stopping, is measured on rows the model never
    # trained on.
    validData = Xtrain[:validation_split]
    validLabel = Xlabels[:validation_split]
    fitData = Xtrain[validation_split:]
    fitLabel = Xlabels[validation_split:]

    model = Sequential()
    model.add(Embedding(max_words, lstmSize))
    model.add(LSTM(lstmSize, dropout=0.4, recurrent_dropout=0.5))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-5, l2=1e-4), bias_regularizer=tf.keras.regularizers.l2(1e-4)))
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

    model.summary()

    # The generator never ends, so steps_per_epoch defines the epoch length.
    model.fit(generator(fitData, fitLabel, batch_size), epochs=epochs, verbose=1, validation_data=(validData, validLabel), steps_per_epoch=500, callbacks=[callBack])
    return model

In [None]:
# First fit: the training rows with the validation rows appended after them.
model = createModel(np.r_[Xtrain, validData], np.r_[Xlabels, validLabel])

In [None]:
# Score the test set; these predictions are appended as labels for the second fit.
pseudoLabel = model.predict(testData, batch_size=32, verbose=1)

In [None]:
# Second fit with the test rows added under pseudo-labels. The predictions are
# rounded to hard 0/1 labels because the batch generator selects rows with
# `labels == 0` / `labels == 1`; raw probabilities would never be sampled.
model = createModel(np.r_[Xtrain, validData, testData], np.r_[Xlabels, validLabel, np.round(pseudoLabel[:, 0])])

In [None]:
# Final test-set predictions from the refit model.
Label = model.predict(testData, batch_size=32, verbose=1)

In [None]:
# Assemble the submission table: one sequential id per test row next to the
# first (only) column of the prediction array, then write it as CSV.
data = dict(id=np.arange(len(testData)), toxic=Label[:, 0])
dframe = pd.DataFrame(data)
dframe.to_csv('/kaggle/working/submission.csv', index=False)