In [None]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Bidirectional, LSTM, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate
from keras.preprocessing import sequence
from keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from gensim.models import KeyedVectors
import gensim

# Paths for GloVe and Gensim files
# glove_path is the raw GloVe text file that convert_glove_to_gensim reads;
# gensim_path is where the converted vectors are saved and later loaded from.
glove_path = '/kaggle/input/glove-6b-300d-txt/glove.6B.300d.txt'
gensim_path = 'glove.6B.300d.gensim'

# Model hyperparameters
# One embedding matrix is built per entry; main() concatenates them column-wise.
embedding_paths = [gensim_path]
# Number of independently built and trained models whose predictions are blended.
num_models = 2
batch_size = 512
lstm_units = 128
# build_model concatenates max- and average-pooled bidirectional LSTM outputs,
# i.e. 2 poolings x 2 directions x lstm_units features, hence the factor 4.
dense_units = 4 * lstm_units
# Each model is fitted one epoch at a time, this many times.
epochs = 4
# Length every tokenised comment is padded or truncated to.
max_sequence_length = 220

# Columns for identity and auxiliary information
identity_cols = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
auxiliary_cols = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
text_column = 'comment_text'
target_column = 'target'
# Characters handed to the Tokenizer as `filters`. The backslash before the
# multiplication sign is written as an explicit `\\` escape: the previous
# `\×` was an invalid escape sequence that only produced the same two
# characters by accident and triggers a warning on current Python versions.
char_to_remove = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—'

# Load and convert GloVe to Gensim format
def convert_glove_to_gensim(glove_path, gensim_path):
    """Read GloVe text vectors and re-save them in gensim's own format.

    Args:
        glove_path: Path of the GloVe text file. It is parsed as a
            non-binary word2vec-style file with ``no_header=True``.
        gensim_path: Destination path passed to ``KeyedVectors.save``.
    """
    load_text_vectors = gensim.models.KeyedVectors.load_word2vec_format
    vectors = load_text_vectors(glove_path, binary=False, no_header=True)
    vectors.save(gensim_path)

# Create the embedding matrix from GloVe embeddings
def create_embedding_matrix(word_index, embedding_path):
    """Build an embedding matrix whose rows line up with tokenizer indices.

    Args:
        word_index: Mapping of word -> integer index. Row ``index`` of the
            result holds the vector for that word.
        embedding_path: Path of a saved gensim ``KeyedVectors`` file; it is
            opened memory-mapped and read-only.

    Returns:
        A float array of shape ``(len(word_index) + 1, vector_size)``. Rows
        for words that are found neither verbatim nor lower-cased stay all
        zero, as does any row no word maps to (presumably row 0, which
        Keras tokenizers do not assign to a word).
    """
    embeddings = KeyedVectors.load(embedding_path, mmap='r')

    # Take the width from the loaded vectors rather than assuming 300, so the
    # function also works for embedding files of a different dimensionality.
    embedding_matrix = np.zeros((len(word_index) + 1, embeddings.vector_size))

    for word, index in word_index.items():
        # Prefer the exact spelling, then fall back to the lower-cased form.
        for variant in (word, word.lower()):
            if variant in embeddings:
                embedding_matrix[index] = embeddings[variant]
                break
    return embedding_matrix

# Define the model architecture
def build_model(embedding_matrix, num_aux_targets):
    """Assemble and compile the two-headed bidirectional-LSTM classifier.

    Args:
        embedding_matrix: 2-D array used as the frozen weights of the
            embedding layer; its shape gives the vocabulary size and the
            embedding width.
        num_aux_targets: Number of units in the auxiliary output head.

    Returns:
        A compiled Keras ``Model`` with outputs ``main_output`` (one sigmoid
        unit) and ``aux_output`` (``num_aux_targets`` sigmoid units).
    """
    vocab_size, embedding_dim = embedding_matrix.shape

    token_ids = Input(shape=(None,))
    features = Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)
    features = SpatialDropout1D(0.2)(features)

    # Two stacked bidirectional LSTMs, each returning the full sequence.
    for _ in range(2):
        features = Bidirectional(LSTM(lstm_units, return_sequences=True))(features)

    # Summarise the sequence with both max and average pooling.
    max_pooled = GlobalMaxPooling1D()(features)
    avg_pooled = GlobalAveragePooling1D()(features)
    hidden = concatenate([max_pooled, avg_pooled])

    # Two fully connected ReLU layers shared by both heads.
    for _ in range(2):
        hidden = Dense(dense_units, activation='relu')(hidden)

    heads = [
        Dense(1, activation='sigmoid', name='main_output')(hidden),
        Dense(num_aux_targets, activation='sigmoid', name='aux_output')(hidden),
    ]

    model = Model(inputs=token_ids, outputs=heads)
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics={'main_output': ['accuracy'], 'aux_output': ['accuracy']},
    )
    return model

# Load train and test datasets
def load_data():
    """Read the competition's train and test CSV files.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.
    """
    csv_paths = (
        '/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/train.csv',
        '/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/test.csv',
    )
    train_df, test_df = (pd.read_csv(csv_path) for csv_path in csv_paths)
    return train_df, test_df

# Preprocess the data (tokenization and padding)
def preprocess_data(train_df, test_df):
    """Tokenise and pad the comment text and pull out the training labels.

    Side effect: the identity columns and the target column of ``train_df``
    are overwritten in place with booleans (value >= 0.5).

    Args:
        train_df: Training frame holding the text, target, auxiliary and
            identity columns.
        test_df: Test frame holding the text column.

    Returns:
        ``(x_train_data, y_train_data, y_aux_train_data, x_test_data,
        text_tokenizer)``: the padded train sequences, the main labels, the
        auxiliary labels, the padded test sequences and the fitted tokenizer.
    """
    train_texts = train_df[text_column].astype(str)
    test_texts = test_df[text_column].astype(str)

    # Labels are captured before the thresholding below, so they keep the
    # values that were originally in the frame.
    y_train_data = train_df[target_column].values
    y_aux_train_data = train_df[auxiliary_cols].values

    # Convert target and identity columns to binary (True/False)
    for col in [*identity_cols, target_column]:
        train_df[col] = train_df[col].ge(0.5).astype(bool)

    # The vocabulary is fitted on train and test text together.
    text_tokenizer = Tokenizer(filters=char_to_remove, lower=False)
    text_tokenizer.fit_on_texts([*train_texts, *test_texts])

    train_sequences = text_tokenizer.texts_to_sequences(train_texts)
    test_sequences = text_tokenizer.texts_to_sequences(test_texts)
    x_train_data = sequence.pad_sequences(train_sequences, maxlen=max_sequence_length)
    x_test_data = sequence.pad_sequences(test_sequences, maxlen=max_sequence_length)

    return x_train_data, y_train_data, y_aux_train_data, x_test_data, text_tokenizer

# Calculate sample weights for the training data
def calculate_sample_weights(train_df, identity_columns=None, target=None):
    """Compute per-row training weights from identity mentions and toxicity.

    Every row starts at weight 1 and then gains:
      * the number of identity columns that are set,
      * for toxic rows, the number of identity columns that are not set,
      * for non-toxic rows, five times the number of identity columns set.
    The weights are finally divided by their mean.

    Columns are thresholded at 0.5 here, so the function gives the same
    result for raw scores and for columns that were already converted to
    booleans; it no longer depends on another step having binarised the
    frame first. Missing values count as "not set" / "not toxic".

    Args:
        train_df: Frame holding the identity and target columns. Not modified.
        identity_columns: Identity column names; defaults to the module-level
            ``identity_cols``.
        target: Name of the target column; defaults to the module-level
            ``target_column``.

    Returns:
        A float32 pandas Series indexed like ``train_df``.
    """
    if identity_columns is None:
        identity_columns = identity_cols
    if target is None:
        target = target_column

    # Comparing with 0.5 is idempotent for boolean input and maps NaN to False.
    identity_flags = train_df[list(identity_columns)] >= 0.5
    is_toxic = train_df[target] >= 0.5

    identity_hits = identity_flags.sum(axis=1)
    identity_misses = (~identity_flags).sum(axis=1)

    # Build the result as a Series explicitly so the return type does not
    # depend on how an in-place ndarray/Series addition happens to dispatch.
    weights = pd.Series(1.0, index=train_df.index, dtype=np.float64)
    weights = weights + identity_hits
    weights = weights + is_toxic * identity_misses
    weights = weights + (~is_toxic) * identity_hits * 5

    return (weights / weights.mean()).astype(np.float32)

# Train the model one epoch at a time and collect test predictions after each epoch
def train_and_evaluate_model(model, x_train_data, y_train_data, y_aux_train_data, sample_weights, x_test_data):
    """Fit ``model`` epoch by epoch and predict the test set after each one.

    Args:
        model: Compiled two-output Keras model (main head first).
        x_train_data: Padded training sequences.
        y_train_data: Labels for the main output.
        y_aux_train_data: Labels for the auxiliary output.
        sample_weights: Per-row weights for the main output; a pandas Series
            or any array-like is accepted. The auxiliary output is weighted
            uniformly with ones.
        x_test_data: Padded test sequences to predict after every epoch.

    Returns:
        ``(model_predictions, model_weights)``: one flattened main-output
        prediction array per epoch, and the matching blend weights
        ``2 ** epoch_index`` so that later epochs count more.
    """
    model_predictions = []
    model_weights = []

    # Convert once, outside the loop. np.asarray handles both a Series and a
    # plain ndarray, whereas `.values` only exists on pandas objects.
    main_sample_weights = np.asarray(sample_weights)
    aux_sample_weights = np.ones_like(main_sample_weights)

    missing_metric = [float('nan')]

    for epoch_index in range(epochs):
        # Fit the model for 1 epoch
        history = model.fit(
            x_train_data,
            [y_train_data, y_aux_train_data],
            batch_size=batch_size,
            epochs=1,
            verbose=2,
            sample_weight=[main_sample_weights, aux_sample_weights]
        )

        # Print training accuracy for main and auxiliary outputs. The lookup
        # is tolerant because metric key names differ between Keras versions
        # and a log line must not abort a long training run.
        epoch_metrics = history.history
        main_acc = epoch_metrics.get('main_output_accuracy', missing_metric)[-1]
        aux_acc = epoch_metrics.get('aux_output_accuracy', missing_metric)[-1]

        print(f"Epoch {epoch_index + 1}/{epochs}, Main Accuracy: {main_acc}, Auxiliary Accuracy: {aux_acc}")

        # Save model predictions for the current checkpoint (main head only)
        model_predictions.append(model.predict(x_test_data, batch_size=2048)[0].flatten())
        model_weights.append(2 ** epoch_index)

    return model_predictions, model_weights

# Finalize the predictions
def finalize_predictions(model_predictions, model_weights):
    """Blend checkpoint predictions into a single weighted average.

    Args:
        model_predictions: Sequence of equally long prediction arrays, one
            per checkpoint.
        model_weights: One blend weight per entry of ``model_predictions``.

    Returns:
        The element-wise weighted average across checkpoints.
    """
    return np.average(model_predictions, axis=0, weights=model_weights)

# Prepare and save the submission file
def save_submission(final_prediction, test_df):
    """Write the predictions to ``submission.csv`` in the working directory.

    Args:
        final_prediction: One prediction per row of ``test_df``.
        test_df: Test frame; only its ``id`` column is used.
    """
    submission_columns = {'id': test_df.id, 'prediction': final_prediction}
    pd.DataFrame(submission_columns).to_csv('submission.csv', index=False)

# Main process
def main():
    """Run the whole pipeline: convert vectors, train, blend and save.

    Steps, in order: convert the GloVe file to gensim format, load and
    preprocess the data, compute sample weights, build the embedding matrix,
    train ``num_models`` models while collecting per-epoch test predictions,
    then average those predictions and write the submission file.
    """
    convert_glove_to_gensim(glove_path, gensim_path)

    # preprocess_data thresholds the identity/target columns of train_df in
    # place before the sample weights are computed from them.
    train_df, test_df = load_data()
    (
        x_train_data,
        y_train_data,
        y_aux_train_data,
        x_test_data,
        text_tokenizer,
    ) = preprocess_data(train_df, test_df)
    sample_weights = calculate_sample_weights(train_df)

    # One matrix per embedding file, joined along the feature axis.
    per_file_matrices = [
        create_embedding_matrix(text_tokenizer.word_index, file)
        for file in embedding_paths
    ]
    embedding_matrix = np.concatenate(per_file_matrices, axis=-1)

    num_aux_targets = y_aux_train_data.shape[-1]
    model_predictions = []
    model_weights = []

    # Every model is built from scratch and contributes one prediction per epoch.
    for _ in range(num_models):
        model = build_model(embedding_matrix, num_aux_targets)
        checkpoint_predictions, checkpoint_weights = train_and_evaluate_model(
            model, x_train_data, y_train_data, y_aux_train_data, sample_weights, x_test_data
        )
        model_predictions.extend(checkpoint_predictions)
        model_weights.extend(checkpoint_weights)

    final_prediction = finalize_predictions(model_predictions, model_weights)
    save_submission(final_prediction, test_df)

# Run the main process
# Start the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Loading GloVe vectors...
Saving Gensim vectors...
Done!
3526/3526 - 11048s - 3s/step - dense_2_loss: 0.4296 - dense_3_loss: 0.1095 - loss: 0.5391
48/48 ━━━━━━━━━━━━━━━━━━━━ 253s 5s/step
3526/3526 - 10526s - 3s/step - dense_2_loss: 0.4130 - dense_3_loss: 0.1042 - loss: 0.5172
48/48 ━━━━━━━━━━━━━━━━━━━━ 249s 5s/step
3526/3526 - 10770s - 3s/step - dense_2_loss: 0.4081 - dense_3_loss: 0.1030 - loss: 0.5111
48/48 ━━━━━━━━━━━━━━━━━━━━ 255s 5s/step
3526/3526 - 10623s - 3s/step - dense_2_loss: 0.4047 - dense_3_loss: 0.1024 - loss: 0.5070
48/48 ━━━━━━━━━━━━━━━━━━━━ 267s 6s/step
3526/3526 - 10837s - 3s/step - dense_6_loss: 0.4297 - dense_7_loss: 0.1097 - loss: 0.5393
48/48 ━━━━━━━━━━━━━━━━━━━━ 250s 5s/step
3526/3526 - 10388s - 3s/step - dense_6_loss: 0.4132 - dense_7_loss: 0.1042 - loss: 0.5173
[1m48/48[0m [32m━━━━━━━━━━━━━━━