# Toxic Text - Simple Dense NN Embeddings CV
_by Nick Brooks, January 2020_


In [None]:
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Embedding, SpatialDropout1D, concatenate, Dropout, BatchNormalization, Activation
from keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras import optimizers
from gensim.models import KeyedVectors

from sklearn.model_selection import KFold
from tensorflow.keras import callbacks
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pprint

import seaborn as sns
# Apply seaborn's "whitegrid" style to the figures drawn later in the notebook.
sns.set_style("whitegrid")
# Wall-clock start time; the last cell subtracts it to report total runtime.
notebookstart = time.time()
# Let pandas show up to 1500 characters per cell so long comments stay readable
# when data frames are displayed.
pd.options.display.max_colwidth = 1500

# Print the library versions so a run can be reproduced later.
import keras
print("Keras Version: ",keras.__version__)
import tensorflow
print("Tensorflow Version: ", tensorflow.__version__)

# Paths of the pre-trained embedding files, each loaded with gensim's
# KeyedVectors.load (judging by the file names: fastText crawl and GloVe 840B).
EMBEDDING_FILES = [
    '../input/gensim-embeddings-dataset/crawl-300d-2M.gensim',
    '../input/gensim-embeddings-dataset/glove.840B.300d.gensim'
]

# Random state handed to the K-fold splitter.
seed = 25

# Number of CSV rows to read; None reads the whole file (handy to lower for debugging).
N_ROWS = None
# Mini-batch size and maximum number of epochs used for every fold.
BATCH_SIZE = 64
EPOCHS = 10

# Every token sequence is padded / truncated to this many tokens.
MAX_LEN = 512
# Characters stripped by the tokenizer. The backslash before '×' is written as
# an explicit '\\' so the literal no longer relies on the invalid escape
# sequence '\×' (a DeprecationWarning/SyntaxWarning); the runtime value is unchanged.
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—'

In [None]:
def build_matrix(word_index, path):
    """Build an embedding matrix aligned with a tokenizer vocabulary.

    Args:
        word_index: Mapping of token -> integer row index.
        path: Location of a saved gensim ``KeyedVectors`` file.

    Returns:
        A ``(len(word_index) + 1, vector_size)`` float array. Row ``i`` holds
        the vector of the token whose index is ``i``; tokens missing from the
        embedding file keep an all-zero row.
    """
    # mmap='r' memory-maps the vectors read-only instead of copying them into RAM.
    embedding_index = KeyedVectors.load(path, mmap='r')
    # Take the width from the loaded vectors rather than hard-coding 300, so
    # embeddings of any dimensionality work. The extra row leaves space for
    # index 0 (presumably the padding index -- verify against the tokenizer).
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_index.vector_size))
    for word, i in word_index.items():
        # Prefer the exact-case token, then fall back to its lower-cased form.
        for candidate in (word, word.lower()):
            if candidate in embedding_index:
                embedding_matrix[i] = embedding_index[candidate]
                break
    return embedding_matrix

In [None]:
# The six binary labels predicted for every comment.
TARGET_COLUMN = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

print("Read Data")
DATA_PATH = "jigsaw-toxic-comment-classification-challenge"
TEXT_COLUMN = 'comment_text'
N_CLASSES = len(TARGET_COLUMN)

train_df = pd.read_csv('../input/{}/train.csv.zip'.format(DATA_PATH), nrows = N_ROWS)
test_df = pd.read_csv('../input/{}/test.csv.zip'.format(DATA_PATH), nrows = N_ROWS)

# Fill missing comments BEFORE casting to str: astype(str) turns NaN into the
# literal string 'nan', after which fillna has nothing left to replace.
X = train_df[TEXT_COLUMN].fillna('None').astype(str)
y = train_df[TARGET_COLUMN].values
# Give the test text the same missing-value treatment as the training text.
test = test_df[TEXT_COLUMN].fillna('None').astype(str)

print("Train Shape: {} Rows".format(X.shape[0]))
print("Test Shape: {} Rows".format(test.shape[0]))
print('Dependent Variable Factor Ratio: ',train_df[TARGET_COLUMN].sum().to_dict())

# Case is preserved (lower=False); build_matrix falls back to the lower-cased
# token when the exact-case form has no embedding. The vocabulary is fitted on
# train and test text together.
tokenizer = text.Tokenizer(filters=CHARS_TO_REMOVE, lower=False)
tokenizer.fit_on_texts(list(X) + list(test))

# From here on X and test hold integer token sequences instead of raw strings.
X = tokenizer.texts_to_sequences(X)
test = tokenizer.texts_to_sequences(test)

# Report sequence-length statistics before padding / truncation.
length_info = [len(x) for x in X]
print("Train Sequence Length - Mean {:.1f} +/- {:.1f}, Max {:.1f}, Min {:.1f}".format(
    np.mean(length_info), np.std(length_info), np.max(length_info), np.min(length_info)))

# Bring every sequence to exactly MAX_LEN tokens.
X = sequence.pad_sequences(X, maxlen=MAX_LEN)
test = sequence.pad_sequences(test, maxlen=MAX_LEN)

# One matrix per embedding file, joined side by side along the feature axis.
embedding_matrix = np.concatenate(
    [build_matrix(tokenizer.word_index, f) for f in EMBEDDING_FILES], axis=-1)

print("Embeddings Matrix Shape:", embedding_matrix.shape)

# NOTE(review): neither list is used by the cells visible in this notebook.
checkpoint_predictions = []
weights = []

In [None]:
# Eyeball two randomly drawn training rows.
train_df.sample(n=2)

In [None]:
def build_model(embedding_matrix, n_classes):
    """Build and compile a dense multi-label classifier on frozen embeddings.

    Architecture: frozen Embedding -> Flatten -> Dropout(0.4) ->
    Dense(512, relu) -> Dropout(0.4) -> Dense(n_classes, sigmoid).

    Args:
        embedding_matrix: 2-D array of shape (vocabulary size, embedding
            width) used to initialise the non-trainable Embedding layer.
        n_classes: Number of independent sigmoid outputs.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam with a
        learning rate of 4e-5, accuracy reported under the key ``'acc'``).
    """
    # Flatten followed by Dense needs a statically known sequence length, so
    # declare it on the input instead of leaving it as None and relying on the
    # Embedding layer's input_length to recover it.
    words = Input(shape=(MAX_LEN,))
    vocab_size, embedding_dim = embedding_matrix.shape
    x = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix],
                  trainable=False, input_length=MAX_LEN)(words)
    x = Flatten()(x)
    x = Dropout(.4)(x)
    x = Dense(512, activation = 'relu')(x)
    x = Dropout(.4)(x)
    # One sigmoid per label: the targets are scored independently.
    result = Dense(n_classes, activation='sigmoid')(x)
    model = Model(inputs=words, outputs=result)
    opt = optimizers.Adam(learning_rate=0.00004, beta_1=0.9, beta_2=0.999, amsgrad=False)
    # Keep the metric name 'acc': the training loop reads history['acc'] / ['val_acc'].
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['acc'])

    return model

# Instantiate the network once up front purely to print its layer summary.
model = build_model(embedding_matrix=embedding_matrix, n_classes=N_CLASSES)
model.summary()

In [None]:
# Out-of-fold predictions for every training row, and the running SUM of the
# per-fold test predictions (divided by n_splits when the submission is built).
oof_preds = np.zeros([X.shape[0], N_CLASSES])
test_preds = np.zeros([test.shape[0], N_CLASSES])

n_splits = 3
folds = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
plot_metrics = ['loss', 'acc']

# One [per-label AUC dict, mean AUC] entry per fold.
score_list = []

# Keras History object of each fold, keyed by fold number.
fold_hist = {}
for i, (trn_idx, val_idx) in enumerate(folds.split(X)):
    modelstart = time.time()
    # Fresh, untrained model for every fold.
    model = build_model(embedding_matrix, N_CLASSES)

    es = callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=4, verbose=1,
                                 mode='min', baseline=None, restore_best_weights=True)
    # Built but deliberately NOT passed to fit(); add it to the callbacks list
    # below to enable learning-rate reduction on plateau.
    rlr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-7,
                                      mode='min', verbose=1)

    history = model.fit(
            X[trn_idx],
            y[trn_idx],
            validation_data=(X[val_idx], y[val_idx]),
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            verbose=0,
            callbacks=[es]
        )

    # best_index is the 0-based list position of the lowest validation loss;
    # best_epoch is the matching 1-based epoch number used in all reporting.
    best_index = np.argmin(history.history['val_loss'])
    best_epoch = best_index + 1
    fold_hist[i] = history

    oof_preds[val_idx] = model.predict(X[val_idx])
    test_preds += model.predict(test)

    # Per-label ROC AUC on this fold's validation rows.
    roc_auc = {}
    for ci, target in enumerate(TARGET_COLUMN):
        roc_auc[target] = roc_auc_score(y[val_idx, ci], oof_preds[val_idx, ci])
    # Store the actual mean (the sum divided by the label count), so the value
    # kept in score_list matches the number that is printed.
    mean_score = sum(roc_auc.values()) / N_CLASSES
    print(roc_auc)
    print("Mean ROC AUC: {:.4f}".format(mean_score))
    score_list.append([roc_auc, mean_score])

    print("\nFOLD {} COMPLETE in {:.1f} Minutes - Best Epoch {}".format(i, (time.time() - modelstart)/60, best_epoch))
    best_metrics = {metric: scores[best_index] for metric, scores in history.history.items()}
    pprint.pprint(best_metrics)

    # Plot against 1-based epoch numbers so the title, the vertical marker and
    # the log line above all refer to the same epoch.
    epoch_axis = range(1, len(history.history['val_loss']) + 1)
    f, ax = plt.subplots(1,len(plot_metrics),figsize = [12,4])
    for p_i,metric in enumerate(plot_metrics):
        ax[p_i].plot(epoch_axis, history.history[metric], label='Train ' + metric)
        ax[p_i].plot(epoch_axis, history.history['val_' + metric], label='Val ' + metric)
        ax[p_i].set_title("{} Fold Loss Curve - {}\nBest Epoch {}".format(i, metric, best_epoch))
        ax[p_i].legend()
        ax[p_i].axvline(x=best_epoch, c='black')
    plt.show()

In [None]:
# ROC AUC of the out-of-fold predictions over the full training set, per label.
roc_auc = {
    target: roc_auc_score(y[:, ci], oof_preds[:, ci])
    for ci, target in enumerate(TARGET_COLUMN)
}
# Running total of the per-label scores; divided by the label count when printed.
mean_score = sum(roc_auc.values())
print(roc_auc)
print("Mean ROC AUC: {:.4f}".format(mean_score / N_CLASSES))

In [None]:
# Signed error summed over all labels: strongly positive rows were
# under-predicted, strongly negative rows were over-predicted.
residuals = y - oof_preds
train_df['error'] = residuals.sum(axis=1)

print("Look at False Negative")
worst_under_predicted = train_df.sort_values(by='error', ascending=False)
display(worst_under_predicted.head(20))

print("Look at False Positives")
worst_over_predicted = train_df.sort_values(by='error', ascending=True)
display(worst_over_predicted.head(20))

#### Submit

In [None]:
# test_preds holds the sum over folds; dividing by the fold count averages them.
averaged_test_preds = test_preds / n_splits
submission = pd.DataFrame(averaged_test_preds, columns=TARGET_COLUMN)
submission['id'] = test_df['id']

submission.to_csv('submission_NN.csv', index=False)
# Mean predicted probability per label as a quick sanity check.
print(submission[TARGET_COLUMN].mean().to_dict())
submission.head()

In [None]:
# Persist the out-of-fold predictions together with the comment ids.
oof_pd = pd.DataFrame(oof_preds, columns=TARGET_COLUMN).assign(id=train_df['id'])
oof_pd.to_csv("oof_dense_nn.csv")
oof_pd.shape

In [None]:
# Total wall-clock time since the first cell started, in minutes.
elapsed_minutes = (time.time() - notebookstart) / 60
print("Notebook Runtime: %0.2f Minutes" % elapsed_minutes)