In [None]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM
from gensim.models import KeyedVectors
import gensim

# Load GloVe vectors
# Raw GloVe text file; glove_to_gensim reads it as header-less, non-binary
# word2vec text.
glove_input_file = '/kaggle/input/glove-6b-300d-txt/glove.6B.300d.txt'
# Where the converted vectors are saved. The path is relative, so it resolves
# against the current working directory.
gensim_output_file = 'glove.6B.300d.gensim'

def glove_to_gensim(glove_input_file, gensim_output_file):
    """Convert a GloVe text file into a saved gensim KeyedVectors file.

    Progress is reported on stdout before loading, before saving and once
    the conversion has finished.

    Args:
        glove_input_file: Path to the vectors in plain-text form without a
            word2vec header line.
        gensim_output_file: Path the loaded vectors are saved to.
    """
    print("Loading GloVe vectors...")
    load_text_vectors = gensim.models.KeyedVectors.load_word2vec_format
    vectors = load_text_vectors(
        glove_input_file,
        binary=False,
        no_header=True,
    )

    print("Saving Gensim vectors...")
    vectors.save(gensim_output_file)

    print("Done!")

# Run the conversion every time the script executes, so the saved file exists
# before build_matrix loads it.
glove_to_gensim(glove_input_file, gensim_output_file)

# Saved embedding files. One matrix is built per entry and the matrices are
# concatenated along the last axis.
EMBEDDING_FILES = [gensim_output_file]
# Number of independently built models whose checkpoints are ensembled.
NUM_MODELS = 2
# Mini-batch size used for training.
BATCH_SIZE = 512
# Units per LSTM direction.
LSTM_UNITS = 128
# Width of the residual dense layers. The model concatenates max- and
# average-pooled bidirectional LSTM outputs and adds the dense output back
# onto that tensor, so the two widths have to match.
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
# Single-epoch fit calls per model; test predictions are taken after each one.
EPOCHS = 4
# Sequences are padded/truncated to this many tokens.
MAX_LEN = 220
# Identity flags; thresholded at 0.5 and used to compute sample weights.
IDENTITY_COLUMNS = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
                    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
# Columns used as targets of the auxiliary output.
AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
TEXT_COLUMN = 'comment_text'
TARGET_COLUMN = 'target'
# Characters passed to the tokenizer as `filters`.
# The backslash before the multiplication sign is written as `\\`: the bare
# `\×` is an invalid escape sequence (SyntaxWarning on Python 3.12+). The
# resulting string is unchanged.
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—'

# Build the embedding matrix
def build_matrix(word_index, path):
    """Build an embedding matrix for a vocabulary from saved gensim vectors.

    Each word is looked up as-is first and then lower-cased; the first form
    found in the vectors supplies the row. Words found in neither form keep
    an all-zero row.

    Args:
        word_index: Mapping of word -> integer row index. Indices must be
            smaller than ``len(word_index) + 1``.
        path: Path to a file saved with ``KeyedVectors.save``.

    Returns:
        A 2-D NumPy array with ``len(word_index) + 1`` rows and one column
        per embedding dimension.
    """
    embedding_index = KeyedVectors.load(path, mmap='r')

    # Take the width from the loaded vectors rather than assuming 300, so
    # embeddings of any dimensionality work.
    embedding_dim = embedding_index.vector_size

    # The extra row leaves index 0 unused by any word.
    # NOTE(review): presumably 0 is the padding id - confirm with the tokenizer.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    for word, i in word_index.items():
        lowered = word.lower()
        # Skip the second lookup when lower-casing changes nothing.
        candidates = (word,) if lowered == word else (word, lowered)
        for candidate in candidates:
            if candidate in embedding_index:
                embedding_matrix[i] = embedding_index[candidate]
                break
    return embedding_matrix

# Build the model
def build_model(embedding_matrix, num_aux_targets):
    """Create and compile the two-headed bidirectional-LSTM classifier.

    Args:
        embedding_matrix: 2-D array of pretrained embeddings; its shape gives
            the Embedding layer's input and output dimensions and its values
            are used as frozen (non-trainable) weights.
        num_aux_targets: Number of units in the auxiliary sigmoid output.

    Returns:
        A compiled Keras ``Model`` with outputs ``main_output`` (1 unit) and
        ``aux_output`` (``num_aux_targets`` units).
    """
    token_ids = Input(shape=(None,))

    # Frozen pretrained embeddings followed by spatial dropout.
    embedded = Embedding(
        *embedding_matrix.shape,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)
    seq_features = SpatialDropout1D(0.2)(embedded)

    # Two stacked bidirectional LSTMs, each returning the full sequence.
    for _ in range(2):
        recurrent = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))
        seq_features = recurrent(seq_features)

    # Collapse the sequence axis with both max and average pooling.
    max_pooled = GlobalMaxPooling1D()(seq_features)
    avg_pooled = GlobalAveragePooling1D()(seq_features)
    hidden = concatenate([max_pooled, avg_pooled])

    # Two residual dense blocks: hidden <- hidden + relu(Dense(hidden)).
    for _ in range(2):
        residual = Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)
        hidden = add([hidden, residual])

    main_head = Dense(1, activation='sigmoid', name='main_output')(hidden)
    aux_head = Dense(num_aux_targets, activation='sigmoid', name='aux_output')(hidden)

    model = Model(inputs=token_ids, outputs=[main_head, aux_head])

    # Metrics are keyed by output layer name, so the keys must match the
    # `name=` arguments of the two heads above.
    per_output_metrics = {
        'main_output': ['accuracy'],
        'aux_output': ['accuracy'],
    }
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=per_output_metrics,
    )
    return model

# Read train and test data
train_df = pd.read_csv('/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test_df = pd.read_csv('/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

# Raw comment text, forced to str.
x_train = train_df[TEXT_COLUMN].astype(str)
x_test = test_df[TEXT_COLUMN].astype(str)

# Training labels are extracted here, before the columns are overwritten
# with booleans below.
y_train = train_df[TARGET_COLUMN].values
y_aux_train = train_df[AUX_COLUMNS].values

# Convert target and identity columns to boolean values (True when >= 0.5)
for column in [*IDENTITY_COLUMNS, TARGET_COLUMN]:
    train_df[column] = train_df[column] >= 0.5

# Tokenize the text data
# The vocabulary is fitted on train and test text together; case is preserved.
tokenizer = Tokenizer(filters=CHARS_TO_REMOVE, lower=False)
tokenizer.fit_on_texts([*x_train, *x_test])

# Replace each text collection with its integer sequences, padded/truncated
# to MAX_LEN tokens.
x_train, x_test = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)
    for texts in (x_train, x_test)
)

# Calculate sample weights
# The identity and target columns were thresholded to booleans earlier in this
# script, so `.sum(axis=1)` counts set flags and `~` is a logical NOT.
# NOTE(review): the training call reads `sample_weights.values`, so this must
# end up as a pandas Series. Presumably the first in-place add with a Series
# rebinds the name to one - confirm on the pandas version in use.
# Every row starts with weight 1.
sample_weights = np.ones(len(x_train), dtype=np.float32)
# + the number of identity flags set on the row.
sample_weights += train_df[IDENTITY_COLUMNS].sum(axis=1)
# + for rows whose target flag is True: the number of identity flags NOT set.
sample_weights += train_df[TARGET_COLUMN] * (~train_df[IDENTITY_COLUMNS]).sum(axis=1)
# + for rows whose target flag is False: 5x the number of identity flags set.
sample_weights += (~train_df[TARGET_COLUMN]) * train_df[IDENTITY_COLUMNS].sum(axis=1) * 5
# Rescale so the weights average to 1.
sample_weights /= sample_weights.mean()

# Build embedding matrix: one matrix per embedding file, joined along the
# embedding (last) axis.
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, embedding_path) for embedding_path in EMBEDDING_FILES],
    axis=-1,
)

# Prepare for storing predictions: one entry per (model, epoch) checkpoint,
# with a matching ensemble weight.
checkpoint_predictions, weights = [], []

# Values that stay the same for every fit call.
num_aux_targets = y_aux_train.shape[-1]
# The main output uses the computed per-row weights; the auxiliary output
# uses a weight of one for every row.
fit_sample_weights = [sample_weights.values, np.ones_like(sample_weights)]

# Train the model
for model_idx in range(NUM_MODELS):
    model = build_model(embedding_matrix, num_aux_targets)

    for global_epoch in range(EPOCHS):
        # One epoch per fit call so test predictions can be taken after each.
        history = model.fit(
            x_train,
            [y_train, y_aux_train],
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=2,
            sample_weight=fit_sample_weights,
        )

        # Latest accuracy reported for the main and the auxiliary output.
        epoch_metrics = history.history
        train_accuracy = epoch_metrics['main_output_accuracy'][-1]
        aux_train_accuracy = epoch_metrics['aux_output_accuracy'][-1]

        print(f"Epoch {global_epoch + 1}/{EPOCHS}, Train accuracy: {train_accuracy}, Aux Train accuracy: {aux_train_accuracy}")

        # Save the main-output test predictions for this checkpoint; later
        # epochs get exponentially larger ensemble weights (1, 2, 4, ...).
        test_outputs = model.predict(x_test, batch_size=2048)
        checkpoint_predictions.append(test_outputs[0].flatten())
        weights.append(2 ** global_epoch)

# Combine the checkpoint predictions from all models into one weighted
# average per test row.
predictions = np.average(checkpoint_predictions, axis=0, weights=weights)

# Create the submission file: test ids alongside the averaged predictions.
submission = pd.DataFrame({
    'id': test_df['id'],
    'prediction': predictions,
})
submission.to_csv('submission.csv', index=False)


Loading GloVe vectors...
Saving Gensim vectors...
Done!
3526/3526 - 11048s - 3s/step - dense_2_loss: 0.4296 - dense_3_loss: 0.1095 - loss: 0.5391
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 5s/step
3526/3526 - 10526s - 3s/step - dense_2_loss: 0.4130 - dense_3_loss: 0.1042 - loss: 0.5172
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 5s/step
3526/3526 - 10770s - 3s/step - dense_2_loss: 0.4081 - dense_3_loss: 0.1030 - loss: 0.5111
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 5s/step
3526/3526 - 10623s - 3s/step - dense_2_loss: 0.4047 - dense_3_loss: 0.1024 - loss: 0.5070
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 6s/step
3526/3526 - 10837s - 3s/step - dense_6_loss: 0.4297 - dense_7_loss: 0.1097 - loss: 0.5393
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 5s/step
3526/3526 - 10388s - 3s/step - dense_6_loss: 0.4132 - dense_7_loss: 0.1042 - loss: 0.5173
[1m48/48[0m [32m━━━━━━━━━━━━━━━