In [1]:
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.optimizers import Adam
from keras.preprocessing import text
from keras.utils.data_utils import pad_sequences
from keras.callbacks import LearningRateScheduler

Download and unzip the 300-dimensional Common Crawl word vectors (fastText `crawl-300d-2M.vec` and GloVe `glove.840B.300d.txt`) into `../word_vectors/` from:

https://fasttext.cc/docs/en/english-vectors.html

https://nlp.stanford.edu/projects/glove/

This notebook was adapted from:

https://www.kaggle.com/thousandvoices/simple-lstm

In [2]:
# NOTE(review): SEED is not referenced by any cell shown here - confirm whether RNG seeding was intended
SEED = 14

# The data lives in <parent of the working directory>/data/jigsaw_unintended_bias/.
# os.path is used instead of splitting os.getcwd() on "/" so the result does not depend on
# the platform's path separator.  The trailing '' keeps the final separator, which the
# `DATA_PATH + 'train.csv'` style concatenations rely on.
DATA_PATH = os.path.join(os.path.dirname(os.getcwd()), 'data', 'jigsaw_unintended_bias', '')

# pre-trained word vector files, keyed by the name of the embedding family
WORD_EMBEDDINGS = {
    'fasttext': '../word_vectors/crawl-300d-2M.vec',
    'glove': '../word_vectors/glove.840B.300d.txt'
}

# column holding the main label and the columns holding the identity annotations
TARGET_COLUMN = 'target'
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]

NUM_MODELS = 1  # number of independently built models whose predictions are combined
EMBED_DIM = 300  # dimensions of each pre-trained word vector
LSTM_UNITS = 128
# must stay at 4 * LSTM_UNITS: the dense layers are added element-wise to the pooled LSTM output
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
MAX_LEN = 220  # max word embeddings per document
VOCAB_SIZE = 100000  # total distinct words or features - this limits the words to be embedded
EPOCHS = 1
BATCH_SIZE = 512

In [3]:
def preprocess(data: pd.Series):
    """
    Cleans the text by replacing special characters with a space and returning a pd.Series of string type.
    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    :param data: the raw documents; every element is cast to str before cleaning
    :return: a pd.Series of the same length where each character listed in punct became a single space
    """
    # The backslash in front of × is escaped explicitly: '\×' is not a valid escape sequence and
    # triggers an invalid-escape warning at compile time.  The set of characters is unchanged.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

    # One translation table lets every document be cleaned in a single pass instead of one
    # str.replace scan per punctuation character.
    to_space = str.maketrans(dict.fromkeys(punct, ' '))

    return data.astype(str).apply(lambda document: document.translate(to_space))


def get_coefs(word, *arr):
    """
    Builds one embedding-dictionary entry from the fields of a single embedding-file line.

    :param word: the token, i.e. the first field of the line
    :param arr: the remaining fields, i.e. the components of the embedding
    :return: a tuple of (word, 32-bit float numpy array holding the components)
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector


def load_embeddings(path: str):
    """
    Utility function to load word embeddings.  Each word embedding looks like:
    word 0.3 0.4 0.5 0.6 ...
    This function converts the embeddings to a dictionary of {word: numpy array}

    :param path: location of a UTF-8 text file with one space-separated embedding per line
    :return: dict mapping each word to the array built by get_coefs from the rest of its line
    """
    embeddings = {}
    with open(path, 'r', encoding='UTF-8') as f:
        for line_number, line in enumerate(tqdm(f)):
            fields = line.strip().split(' ')
            # fastText .vec files start with a "<number of words> <dimensions>" header line.
            # Parsed like an embedding it would become a "word" with a 1-element vector, so
            # a two-field first line is skipped.
            if line_number == 0 and len(fields) == 2:
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings


def get_word_embeddings(word_index: dict, path: str):
    """
    Maps words found in the text (word_index) to their corresponding word embeddings from the
    pre-trained model loaded from (path).  If any words cannot be found in the pre-trained model,
    they are tracked in unknown_words and their row in the matrix stays all zeros.

    :param word_index: dict of {word: row index in the embedding matrix}
    :param path: location of the pre-trained embedding file, passed to load_embeddings
    :return: a tuple of (embedding matrix of shape (len(word_index) + 1, EMBED_DIM), list of unknown words)
    """
    embedding_index = load_embeddings(path)
    # create an empty matrix of shape (nbr_words + 1, embed_dim)
    # float32 matches the dtype of the vectors produced by get_coefs and halves the memory of
    # numpy's float64 default
    # presumably the indices in word_index start at 1, leaving row 0 as zeros - confirm against the tokenizer
    embedding_matrix = np.zeros((len(word_index) + 1, EMBED_DIM), dtype=np.float32)
    unknown_words = []

    # map all words from the text to their embeddings, if they exist in the embedding index
    for word, i in word_index.items():
        vector = embedding_index.get(word)
        # A vector of the wrong length must not be assigned: numpy would silently broadcast a
        # 1-element vector (e.g. one parsed from a file header line) across the whole row.
        if vector is None or vector.shape != (EMBED_DIM,):
            unknown_words.append(word)
            continue
        embedding_matrix[i] = vector
    return embedding_matrix, unknown_words


def build_model(embedding_matrix, num_aux_targets):
    """
    Builds and compiles a two-output model: frozen pre-trained embeddings -> spatial dropout ->
    two stacked bidirectional LSTMs -> max/average pooling -> two residual dense layers ->
    a main sigmoid output and an auxiliary sigmoid output.

    :param embedding_matrix: 2-d array of pre-trained word vectors, one row per word index;
        its shape defines the input and output dimensions of the embedding layer
    :param num_aux_targets: number of units in the auxiliary output layer
    :return: the compiled keras Model with outputs [main_output, aux output]
    """
    word_vectors = Input(shape=(MAX_LEN,))

    # create an embedding layer whose width is the number of columns of embedding_matrix
    # this layer uses the pre-trained word embeddings and is frozen (trainable=False)
    # instead of passing input_dim, output_dim explicitly, could also just pass *embedding_matrix.shape
    x = Embedding(
        input_dim = embedding_matrix.shape[0], 
        output_dim = embedding_matrix.shape[1], 
        weights=[embedding_matrix], 
        trainable=False
    )(word_vectors)

    # randomly drop features, i.e. [[1, 1, 1], [2, 1, 2]] -> [[1, 0, 1], [2, 0, 2]]
    x = SpatialDropout1D(0.25)(x)

    # each bidirectional layer outputs 2 sequences: 1 forward, 1 backward, and concatenates them
    # so stacking 2 enriches the sequence features
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

    # pool over the sequence axis and concatenate both pooled views:
    # x is (batch_size, MAX_LEN, 2 * LSTM_UNITS), hidden is (batch_size, 4 * LSTM_UNITS)
    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])

    # skip connections...
    # add the output of a dense layer applied to hidden back onto hidden itself; this only works
    # because DENSE_HIDDEN_UNITS equals the width of hidden (4 * LSTM_UNITS)
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='tanh')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(1, activation='sigmoid', name='main_output')(hidden)

    # auxiliary outputs predicted alongside the main output from the same hidden features
    # NOTE: the layer name is spelled 'aux_ouput' (sic); it is kept unchanged so that anything
    # referring to the layer by name keeps working
    aux_result = Dense(num_aux_targets, activation='sigmoid', name='aux_ouput')(hidden)

    model = Model(inputs=word_vectors, outputs=[result, aux_result])
    # summary() prints the table itself and returns None, so wrapping it in print() would
    # append a stray "None" line
    model.summary()

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(clipnorm=0.1),
        metrics=['accuracy']
    )

    return model


In [4]:
# expect ~2 Gb RAM for the data
train_df = pd.read_csv(DATA_PATH + 'train.csv')
test_df = pd.read_csv(DATA_PATH + 'test.csv')

# model inputs: the cleaned comment text of both data sets
x_train = preprocess(train_df['comment_text'])
x_test = preprocess(test_df['comment_text'])

# model targets: taken BEFORE the columns are binarized below, so they keep their original values
y_train = train_df[TARGET_COLUMN]
y_aux_train = train_df[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]

# turn the target and identity columns into boolean flags (True when >= 0.5) for sample weighting later
# this conversion must stay after the target extraction above or it will give TF trouble
# converting the boolean array to a tensor
for col in [TARGET_COLUMN] + IDENTITY_COLUMNS:
    train_df[col] = train_df[col] >= 0.5

In [5]:
# fit one tokenizer on the train and test text together, keeping at most VOCAB_SIZE words
tokenizer = text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts([*x_train, *x_test])

# replace each document by its sequence of word indices, padded/truncated to MAX_LEN entries
# NOTE: x_train / x_test are overwritten here, so this cell cannot be re-run on its own
x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=MAX_LEN)
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=MAX_LEN)

max_features = min(len(tokenizer.word_index) + 1, VOCAB_SIZE)
print(f"Max Features (Vocab Size) {max_features}")

Max Features (Vocab Size) 100000


In [6]:
print("Average of target variable:", train_df[TARGET_COLUMN].mean())
print("Average of identity columns:\n", train_df[IDENTITY_COLUMNS].mean(), "\n")

# the target and identity columns were binarized to booleans when the data was loaded,
# which is what makes the `~` negations below valid
identity_flags = train_df[IDENTITY_COLUMNS]
is_toxic = train_df[TARGET_COLUMN]

# start off with uniform weights of 1
# NOTE(review): depending on the pandas version, `ndarray += Series` may leave sample_weights
# as an ndarray or rebind it to a Series - confirm before relying on Series-only attributes
sample_weights = np.ones(len(x_train), dtype=np.float32)

# Each term is computed once and reused for both the weight update and the printed total,
# so the reported number is exactly what was added.

# sum the binary identity columns to give rows with identity values more weight (they contain more information)
identity_count = identity_flags.sum(axis=1)
sample_weights += identity_count
print(identity_count.sum())

# when the target is 1, increase the weights further by counting the identity columns with 0 values
# this increases the weights for the positive class (toxic comment)
toxic_without_identity = is_toxic * (~identity_flags).sum(axis=1)
sample_weights += toxic_without_identity
print(toxic_without_identity.sum())

# when the target is 0, increase the weights by counting the identity columns with value of 1
# multiply by some constant (5) to give this weighting more impact
# this increases the weights for the negative class (not a toxic comment)
nontoxic_with_identity = (~is_toxic) * identity_count * 5
sample_weights += nontoxic_with_identity
print(nontoxic_with_identity.sum())

# normalize the weights so that their mean is 1
sample_weights /= sample_weights.mean()

print("\n", sample_weights.shape, sample_weights.min(), sample_weights.max())

Average of target variable: 0.0799690172277954
Average of identity columns:
 male                             0.024647
female                           0.029603
homosexual_gay_or_lesbian        0.006093
christian                        0.022397
jewish                           0.004239
muslim                           0.011638
black                            0.008256
white                            0.013897
psychiatric_or_mental_illness    0.002709
dtype: float64 

222862
1259449
916525

 (1804874,) 0.42935264 21.03828


The target is imbalanced.  The identity columns are sparse.  This weighting scheme makes 3 things happen:
1. Observations with identity information are given a little more weight.

2. Toxic comments are given a lot more weight. The amount of extra weight is the number of identity columns that are not set, which is large for most rows; this is just a convenient and arbitrary way of weighting toxic comments more heavily, since most rows are missing identity information.

3. Non-toxic comments are weighted even more when they do contain identity information.  This increase in weighting is scaled by 5, which is another arbitrary value.

Weighting observations like this eliminates the need to re-sample the training data to balance the target and makes certain observations impact the model's error more than others.  

In [7]:
# create word embeddings
# every call yields a matrix with one row per index in tokenizer.word_index plus the list of
# words that have no pre-trained vector

fasttext_embeddings, fasttext_unknown_words = get_word_embeddings(
    word_index=tokenizer.word_index,
    path=WORD_EMBEDDINGS['fasttext'],
)
print('Unknown words (fast text): ', len(fasttext_unknown_words))

glove_embeddings, glove_unknown_words = get_word_embeddings(
    word_index=tokenizer.word_index,
    path=WORD_EMBEDDINGS['glove'],
)
print('Unknown words (glove): ', len(glove_unknown_words))

1999996it [01:00, 33327.60it/s]


Unknown words (fast text):  173678


2196017it [01:06, 32896.80it/s]


Unknown words (glove):  170383


In [8]:
# put the fastText and GloVe vectors of each word side by side, doubling the embedding width
embedding_matrix = np.concatenate((fasttext_embeddings, glove_embeddings), axis=1)
print("Embedding matrix shape: ", embedding_matrix.shape)

# the per-source matrices are now redundant; release them before building the model
del fasttext_embeddings, glove_embeddings
gc.collect()

Embedding matrix shape:  (327009, 600)


0

In [9]:
def train_model(model, preds, weights):
    """
    Trains a model for EPOCHS rounds of one epoch each.  Every round sets the learning rate to
    1e-3 * 0.6 ** round through a learning rate scheduler, then predicts on the test set.

    Reads the module-level x_train, y_train, y_aux_train, sample_weights and x_test.

    :param model: compiled model with two outputs (main output first, auxiliary output second)
    :param preds: list that receives one flattened main-output prediction array per round
    :param weights: list that receives the averaging weight (2 ** round) of each appended prediction
    """
    # np.asarray accepts both a numpy array and a pandas Series, whereas `.values` only exists on
    # the latter; the auxiliary output gets uniform weights
    main_sample_weights = np.asarray(sample_weights)
    aux_sample_weights = np.ones_like(main_sample_weights)

    for global_epoch in range(EPOCHS):
        model.fit(
            x_train,
            [y_train, y_aux_train],
            batch_size=BATCH_SIZE,
            epochs=1,
            verbose=1,
            sample_weight=[main_sample_weights, aux_sample_weights],
            callbacks=[
                # every fit() call runs a single epoch, so the decay is driven by the outer
                # global_epoch and the scheduler's own epoch argument is ignored
                LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** global_epoch), verbose=1)
            ]
        )
        # although model has main_output and aux_output, only keep main_output (index 0)
        # append to the list handed in by the caller rather than to a global
        preds.append(
            model.predict(x_test, batch_size=BATCH_SIZE)[0].flatten()
        )
        # later rounds count more when the predictions are averaged
        weights.append(2 ** global_epoch)


# filled by train_model: one prediction array and one averaging weight per trained epoch
checkpoint_predictions = []
weights = []

num_aux_targets = y_aux_train.shape[-1]
for model_idx in range(NUM_MODELS):
    train_model(
        model=build_model(embedding_matrix, num_aux_targets),
        preds=checkpoint_predictions,
        weights=weights,
    )

# the final predictions are the weighted average over every collected checkpoint
# (one per epoch of each of the NUM_MODELS models)
predictions = np.average(checkpoint_predictions, weights=weights, axis=0)

submission = pd.DataFrame({
    'id': test_df['id'],
    'prediction': predictions,
})
submission.to_csv('submission.csv', index=False)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 220)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 220, 600)     196205400   ['input_1[0][0]']                
                                                                                                  
 spatial_dropout1d (SpatialDrop  (None, 220, 600)    0           ['embedding[0][0]']              
 out1D)                                                                                           
                                                                                                  
 bidirectional (Bidirectional)  (None, 220, 256)     747520      ['spatial_dropout1d[0][0]']  

NameError: name 'test' is not defined

In [15]:
# show the first test comments predicted as toxic; a boolean mask selects rows by position,
# whereas np.where yields positions that .loc would interpret as index labels
test_df.loc[predictions > 0.5].head(25)

Unnamed: 0,id,comment_text
6,7097326,Our oils read; President IS taking different ...
19,7097339,Well here we go again. Let's continue to subs...
26,7097346,"Ignorance is bliss, ain't it?"
33,7097353,this is *&^%ing outrageous. The prosecutor sho...
35,7097355,The profoundly stupid have spoken.
36,7097356,The ignorance and bigotry comes from your post!
49,7097369,"An ""abject lesson"" is a lesson that is painful..."
51,7097371,Right on the money Gary Crum. And if they hide...
82,7097402,"Hey Dallas, Don't let the Iditarod get the bes..."
83,7097403,I can't believe this country was so stupid.. -...
