In [1]:
import os
import pandas as pd
import numpy as np
import random
import time
import gc
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from tqdm.notebook import tqdm_notebook as tqdm
from keras.preprocessing import text
from keras.utils.data_utils import pad_sequences
from sklearn.metrics import f1_score

Download and unzip the 300-dimensional Common Crawl word vectors (fastText `crawl-300d-2M.vec` and GloVe `glove.840B.300d.txt`) into `../word_vectors/` from:

https://fasttext.cc/docs/en/english-vectors.html

https://nlp.stanford.edu/projects/glove/

This notebook was adapted from:

https://www.kaggle.com/code/bminixhofer/deterministic-neural-networks-using-pytorch/notebook

https://www.kaggle.com/code/bminixhofer/simple-lstm-pytorch-version/notebook

https://www.kaggle.com/code/shujian/single-rnn-with-4-folds-clr/notebook

In [2]:
# --- Configuration: everything a reader might want to tune lives in this cell ---
SEED = 14

# The data directory is a sibling of the notebook's folder: <parent of cwd>/data/jigsaw_unintended_bias/
# Built with os.path so the separator is right on every platform; the trailing '' keeps the
# trailing separator so that `DATA_PATH + 'train.csv'` continues to work.
DATA_PATH = os.path.join(os.path.dirname(os.getcwd()), 'data', 'jigsaw_unintended_bias', '')

# pre-trained word-vector files (see the download links above)
WORD_EMBEDDINGS = {
    'fasttext': '../word_vectors/crawl-300d-2M.vec',
    'glove': '../word_vectors/glove.840B.300d.txt'
}
TARGET_COLUMN = 'target'
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]
NUM_MODELS = 1  # number of independently seeded models to average
EMBED_DIM = 300  # width of EACH pre-trained embedding source (fastText and GloVe are both 300-d)
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS  # max-pool + mean-pool of a bidirectional LSTM output
MAX_LEN = 220  # max word embeddings per document
VOCAB_SIZE = 100000  # total distinct words or features - this limits the words to be embedded
EPOCHS = 1
BATCH_SIZE = 512

In [3]:
def seed_everything(seed: int):
    """
    Seeds every random number generator used by this notebook so that a run is repeatable.

    :param seed: the seed applied to Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices)
    """
    random.seed(seed)
    # NOTE: this only affects hash randomisation in processes started after this point
    # (e.g. DataLoader workers); the running interpreter has already fixed its hash seed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # seed every visible GPU, not just the current one; silently ignored when CUDA is unavailable
    torch.cuda.manual_seed_all(seed)
    # force deterministic cuDNN kernels and disable the autotuner, which may otherwise pick
    # different (non-deterministic) algorithms from run to run
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def preprocess(data: pd.Series):
    """
    Cleans the text by replacing special characters with spaces.
    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    :param data: a pd.Series of documents; every value is cast to `str` first
    :return: a pd.Series of strings in which each character listed in `punct` is replaced by
        a single space (string lengths are unchanged)
    """
    # The backslash before the multiplication sign is now written as an explicit `\\`:
    # the original `\×` was an invalid escape sequence (same characters, but a SyntaxWarning on
    # recent Python versions).
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

    # One translation table applied in a single pass per document, instead of one
    # `str.replace` pass per punctuation character.
    to_space = str.maketrans({char: ' ' for char in punct})

    return data.astype(str).apply(lambda doc: doc.translate(to_space))


def get_coefs(word, *arr):
    """
    Splits one parsed embedding-file line into its word and its vector.

    :param word: the token, i.e. the first field of the line
    :param arr: the remaining fields, i.e. the embedding values (numbers or numeric strings)
    :return: a tuple of (word, float32 numpy array built from `arr`)
    """
    vector = np.array(arr, dtype=np.float32)
    return (word, vector)


def load_embeddings(path: str):
    """
    Utility function to load word embeddings.  Each word embedding line looks like:
    word 0.3 0.4 0.5 0.6 ...
    This function converts the embeddings to a dictionary of {word: numpy array}

    fastText `.vec` files start with a header line "<number of words> <dimension>".  That
    line is skipped; otherwise it would be stored as a "word" with a one-element vector, which
    numpy would silently broadcast across a whole embedding row if the word occurred in the text.

    :param path: path to a UTF-8, space-separated embedding text file
    :return: dict mapping each word to its float32 vector
    """
    embeddings = {}
    with open(path, 'r', encoding='UTF-8') as f:
        for line_number, line in enumerate(tqdm(f)):
            fields = line.strip().split(' ')
            is_header = (
                line_number == 0
                and len(fields) == 2
                and all(field.isdigit() for field in fields)
            )
            if is_header:
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings


def get_word_embeddings(word_index: dict, path: str):
    """
    Builds the embedding matrix for the words found in the text.

    :param word_index: dict of {word: row index} for the words found in the text
    :param path: location of the pre-trained embedding file, read with load_embeddings
    :return: a tuple of (matrix, missing) where matrix has shape (len(word_index) + 1, EMBED_DIM)
        and row i holds the pre-trained vector of the word whose index is i (all zeros when the
        word has no pre-trained vector), and missing lists the words without a pre-trained vector
    """
    pretrained = load_embeddings(path)

    # one row per index in word_index plus one extra row; rows that are never assigned stay zero
    matrix = np.zeros((len(word_index) + 1, EMBED_DIM))
    missing = []

    for token, row in word_index.items():
        if token in pretrained:
            matrix[row] = pretrained[token]
        else:
            missing.append(token)

    return matrix, missing


def sigmoid(x: np.ndarray):
    """
    Numerically stable sigmoid activation function, 1 / (1 + exp(-x)).

    Computed as exp(-log(1 + exp(-x))) via np.logaddexp so that large negative inputs do not
    overflow in exp() (the naive form raises an overflow RuntimeWarning for them).

    :param x: array (or scalar) of logits
    :return: element-wise sigmoid of x, with values in [0, 1]
    """
    return np.exp(-np.logaddexp(0, -x))


def train_model(
    model, 
    train, 
    test, 
    loss_fn, 
    output_dim: int, 
    lr: float = 0.001,
    batch_size: int = 512, 
    n_epochs: int = 4, 
    enable_checkpoint_ensemble: bool = True
):
    """
    Trains a model on the training set and predicts the test set after every epoch.

    :param model: the network to train; called as model(*inputs)
    :param train: dataset whose items are (*inputs, target) - the target is the last element
    :param test: dataset whose items contain only the inputs
    :param loss_fn: callable loss_fn(y_pred, y_batch) returning a scalar tensor
    :param output_dim: number of columns the model outputs per sample
    :param lr: initial Adam learning rate; multiplied by 0.6 after every epoch
    :param batch_size: batch size used for both training and prediction
    :param n_epochs: number of passes over the training set (must be >= 1)
    :param enable_checkpoint_ensemble: if True, return the average of the per-epoch test
        predictions weighted by 2^epoch (later epochs count more); otherwise return only the
        predictions made after the last epoch
    :return: numpy array of shape (len(test), output_dim) holding sigmoid probabilities
    :raises ValueError: if n_epochs < 1 (there would be no predictions to return)
    """
    if n_epochs < 1:
        raise ValueError(f'n_epochs must be >= 1, got {n_epochs}')

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # decay the learning rate using a schedule of 0.6^epoch
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    n_train_batches = len(train_loader)
    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
    
    for epoch in range(n_epochs):
        start_time = time.time()
        
        model.train()
        avg_loss = 0.
        
        # `batch` (not `data`) so the imported torch.utils.data module is not shadowed
        for batch in tqdm(train_loader, disable=False):
            # the target is the final element of each batch
            x_batch = batch[:-1]
            y_batch = batch[-1]

            # forward pass and calculate loss
            y_pred = model(*x_batch)
            loss = loss_fn(y_pred, y_batch)

            # clear gradients left over from the previous step, backpropagate, update weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # track the mean loss over all batches in this epoch
            avg_loss += loss.item() / n_train_batches

        # Advance the schedule only AFTER this epoch's optimizer steps.  Stepping the scheduler
        # before the first optimizer.step() skips the first value of the schedule (epoch 0 would
        # train at lr * 0.6 instead of lr) and triggers a PyTorch warning.
        scheduler.step()
            
        model.eval()
        test_preds = np.zeros((len(test), output_dim))

        # run each batch of the test data through the model; no_grad avoids building the
        # autograd graph during inference
        with torch.no_grad():
            for i, x_batch in enumerate(test_loader):
                y_pred = sigmoid(model(*x_batch).cpu().numpy())
                test_preds[i * batch_size:(i + 1) * batch_size, :] = y_pred

        # keep this epoch's test predictions for the checkpoint ensemble
        all_test_preds.append(test_preds)
        elapsed_time = time.time() - start_time
        print(f'Epoch {epoch + 1}/{n_epochs} \t loss={avg_loss:.4f} \t time={elapsed_time:.2f}s')

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)    
    else:
        test_preds = all_test_preds[-1]
        
    return test_preds


def threshold_search(y_true, y_proba):
    """
    Scans the cut-offs 0.00, 0.01, ..., 0.99 and keeps the one with the highest F1 score.

    :param y_true: the true binary labels
    :param y_proba: the predicted probabilities; a sample is predicted positive when its
        probability is strictly greater than the cut-off
    :return: dict with the keys 'threshold' and 'f1'; both stay 0 if no cut-off scores above 0
    """
    candidates = [i * 0.01 for i in range(100)]
    best = {'threshold': 0, 'f1': 0}

    for cutoff in tqdm(candidates):
        f1 = f1_score(y_true=y_true, y_pred=y_proba > cutoff)
        # strict comparison: on ties the earliest (lowest) cut-off wins
        if f1 > best['f1']:
            best = {'threshold': cutoff, 'f1': f1}

    return best


class Attention(nn.Module):
    """
    Attention pooling over the time dimension of a sequence.

    Each timestep gets a scalar score tanh(x_t . weight + b_t); the exponentiated scores are
    normalised over the sequence and used to take a weighted sum of the timesteps, reducing
    an input of shape (batch, step_dim, feature_dim) to (batch, feature_dim).
    """
    def __init__(self, feature_dim: int, step_dim: int, bias: bool = True, **kwargs):
        """
        Creates the learnable scoring parameters.

        :param feature_dim: the number of features per timestep (size of the last input dimension)
        :param step_dim: the sequence length; inputs must have exactly this many timesteps
        :param bias: whether to add a learnable per-timestep bias to the scores
        :param kwargs: forwarded to nn.Module.__init__
        """
        super(Attention, self).__init__(**kwargs)
        
        # `supports_masking` and `features_dim` are not read anywhere in this class; they are
        # kept only so the instance's attributes are unchanged.
        self.supports_masking = True
        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        # scoring vector, Xavier-initialised
        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
        
    def forward(self, x, mask=None):
        """
        Implements forward pass through attention module

        :param x: tensor of shape (batch, step_dim, feature_dim), e.g. the concatenated forward
            and backward hidden states, h_j, of a bi-directional RNN.  For background,
            see: https://bgg.medium.com/seq2seq-pay-attention-to-self-attention-part-1-d332e85e9aad
        :param mask: optional tensor broadcastable to (batch, step_dim); the un-normalised
            attention weights are multiplied by it, so zeros exclude timesteps
        :return: tensor of shape (batch, feature_dim), the attention-weighted sum over timesteps
        """
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        # e_j: one scalar score per timestep, computed as a single matrix product over the
        # flattened (batch * step, feature) input and then folded back to (batch, step)
        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)

        # un-normalised attention weights (the numerator of the softmax)
        a = torch.exp(eij)
        
        # zero out the weights of masked timesteps
        if mask is not None:
            a = a * mask

        # Normalise over the sequence.  The epsilon belongs in the DENOMINATOR: it guards against
        # division by zero when every timestep of a row is masked.  (Written without the
        # parentheses, operator precedence adds it to the result instead, which leaves 0 / 0 = NaN.)
        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)

        # weight each timestep by its attention score, a_j * h_j
        weighted_input = x * torch.unsqueeze(a, -1)
        
        # sum over timesteps to obtain the context vector
        return torch.sum(weighted_input, 1)


class SpatialDropout(nn.Dropout2d):
    """
    Drop-in equivalent of Keras' SpatialDropout1D for (N, T, K) inputs.
    A whole feature is dropped for every timestep of a sample,
    i.e. [[1, 1, 1], [2, 1, 2]] -> [[1, 0, 1], [2, 0, 2]],
    whereas ordinary dropout zeroes individual elements,
    i.e. [[1, 1, 1], [2, 1, 2]] -> [[1, 0, 1], [0, 1, 2]]
    """
    def forward(self, x):
        # Dropout2d drops whole channels (dimension 1), so present the features as channels:
        # (N, T, K) -> (N, K, T) -> (N, K, 1, T)
        as_channels = x.transpose(1, 2).unsqueeze(2)
        dropped = super().forward(as_channels)
        # undo the reshaping: (N, K, 1, T) -> (N, K, T) -> (N, T, K)
        return dropped.squeeze(2).transpose(1, 2)


class NeuralNet(nn.Module):
    """
    Frozen pre-trained embeddings -> spatial dropout -> two stacked bidirectional LSTMs ->
    global max/mean pooling -> two residual dense layers -> main and auxiliary logits.
    """
    def __init__(self, embedding_matrix: np.ndarray, num_aux_targets: int):
        """
        Sets up neural network architecture

        :param embedding_matrix: 2-D array of shape (number of rows, embedding width); row i is
            the pre-trained vector of token id i
        :param num_aux_targets: number of auxiliary outputs predicted alongside the main target
        """
        super(NeuralNet, self).__init__()
        
        # The embedding width is taken from the matrix itself rather than from EMBED_DIM: the
        # matrix passed in is the concatenation of two 300-d sources (600 columns), so sizing
        # the LSTM input with EMBED_DIM makes the forward pass fail with a shape mismatch.
        embed_dim = embedding_matrix.shape[1]

        # non-trainable, pre-trained embedding layer built directly from the matrix (no
        # throwaway randomly initialised table is allocated first)
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            freeze=True
        )
        self.embedding_dropout = SpatialDropout(0.3)  # randomly drop this fraction of features
        
        # each bidirectional layer concatenates its forward and backward outputs, so it emits
        # LSTM_UNITS * 2 features per timestep; stacking 2 enriches the sequence features
        self.lstm1 = nn.LSTM(
            input_size=embed_dim, 
            hidden_size=LSTM_UNITS, 
            bidirectional=True, 
            batch_first=True
        )
        self.lstm2 = nn.LSTM(
            input_size=LSTM_UNITS * 2, 
            hidden_size=LSTM_UNITS, 
            bidirectional=True, 
            batch_first=True
        )
    
        # skip connections: the outputs of these two dense layers are added to their own input
        self.linear1 = nn.Linear(in_features=DENSE_HIDDEN_UNITS, out_features=DENSE_HIDDEN_UNITS, bias=True)
        self.linear2 = nn.Linear(in_features=DENSE_HIDDEN_UNITS, out_features=DENSE_HIDDEN_UNITS, bias=True)
        
        # main output (a single logit)
        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        
        # auxiliary outputs predicted in addition to the main output
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)
        
    def forward(self, x):
        """
        Implements forward pass

        :param x: integer tensor of token ids, shape (batch, sequence length)
        :return: tensor of shape (batch, 1 + num_aux_targets) holding raw logits (no sigmoid is
            applied here); column 0 is the main output, the rest are the auxiliary outputs
        """
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)
        
        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)
        
        # NOTE: an attention module could be inserted here.  It would have to be created once in
        # __init__ (so its weights are registered and trained), and linear1 / linear2 /
        # linear_out / linear_aux_out would have to be widened to match the larger h_conc.

        avg_pool = torch.mean(h_lstm2, 1)  # global mean pooling over the sequence
        max_pool, _ = torch.max(h_lstm2, 1)  # global max pooling over the sequence

        # concatenate the two (batch, LSTM_UNITS * 2) pooled vectors into (batch, LSTM_UNITS * 4)
        h_conc = torch.cat((max_pool, avg_pool), 1)
        h_conc_linear1 = F.relu(self.linear1(h_conc))
        h_conc_linear2 = F.relu(self.linear2(h_conc))
        
        hidden = h_conc + h_conc_linear1 + h_conc_linear2
        
        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)
        
        return out


In [4]:
# seed every RNG once, before any data is loaded or any model is built
seed_everything(seed=SEED)

In [5]:
# expect ~2 Gb RAM for the data
train_df = pd.read_csv(DATA_PATH + 'train.csv')
test_df = pd.read_csv(DATA_PATH + 'test.csv')

# the target column name comes from the TARGET_COLUMN constant so it is defined in one place
x_train = preprocess(train_df['comment_text'])
y_train = np.where(train_df[TARGET_COLUMN] >= 0.5, 1, 0)  # binarize the target
# auxiliary targets: the raw (un-binarized) target plus the toxicity sub-type scores
y_aux_train = train_df[[TARGET_COLUMN, 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
x_test = preprocess(test_df['comment_text'])

In [6]:
# Fit the vocabulary on the train and test comments together.
tokenizer = text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(list(x_train) + list(x_test))

# Turn each comment into a sequence of word ids, then pad/cut every sequence to MAX_LEN.
# NOTE: x_train / x_test are overwritten here (text -> id matrix), so this cell cannot be
# re-run on its own; re-run the data loading cell first.
x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=MAX_LEN)
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=MAX_LEN)

max_features = min(VOCAB_SIZE, len(tokenizer.word_index) + 1)
print(f"Max Features (Vocab Size) {max_features}")

Max Features (Vocab Size) 100000


In [7]:
# create word embeddings
# Each call reads one pre-trained vector file and returns a matrix with one row per entry of
# tokenizer.word_index (plus one extra row), together with the list of words that have no
# pre-trained vector.  The two matrices are concatenated column-wise in the next cell.

fasttext_embeddings, fasttext_unknown_words = get_word_embeddings(tokenizer.word_index, WORD_EMBEDDINGS['fasttext'])
print('Unknown words (fast text): ', len(fasttext_unknown_words))

glove_embeddings, glove_unknown_words = get_word_embeddings(tokenizer.word_index, WORD_EMBEDDINGS['glove'])
print('Unknown words (glove): ', len(glove_unknown_words))

0it [00:00, ?it/s]

Unknown words (fast text):  173678


0it [00:00, ?it/s]

Unknown words (glove):  170383


In [8]:
# Put the fastText and GloVe vectors of each word side by side (columns are concatenated).
embedding_matrix = np.concatenate((fasttext_embeddings, glove_embeddings), axis=1)
print("Embedding matrix shape: ", embedding_matrix.shape)

# The per-source matrices are no longer needed; release them to free memory.
del fasttext_embeddings, glove_embeddings
gc.collect()

Embedding matrix shape:  (327009, 600)


42

In [10]:
# build the tensors on the CPU (the .cuda() calls are commented out, so nothing is moved to the GPU)
x_train_torch = torch.tensor(x_train, dtype=torch.long)#.cuda()
x_test_torch = torch.tensor(x_test, dtype=torch.long)#.cuda()
# targets: column 0 is the binarized target, followed by the auxiliary target columns
y_train_torch = torch.tensor(np.hstack([y_train[:, np.newaxis], y_aux_train]), dtype=torch.float32)#.cuda()

# convert to tensor datasets (the test dataset has inputs only, no targets)
train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
test_dataset = data.TensorDataset(x_test_torch)

In [None]:
all_test_preds = []

# train NUM_MODELS models and average their output for the final predictions
for model_idx in range(NUM_MODELS):
    print('\nModel ', model_idx)

    # fit each model with a different seed, otherwise they will be identical
    seed_everything(SEED + model_idx)
    
    model = NeuralNet(embedding_matrix, y_aux_train.shape[-1])
    #model.cuda()
    
    # BATCH_SIZE and EPOCHS come from the configuration cell; without them train_model falls
    # back to its own defaults (4 epochs) and the configured values are silently ignored
    test_preds = train_model(
        model, 
        train_dataset, 
        test_dataset, 
        output_dim=y_train_torch.shape[-1],
        loss_fn=nn.BCEWithLogitsLoss(reduction='mean'),
        batch_size=BATCH_SIZE,
        n_epochs=EPOCHS
    )
    all_test_preds.append(test_preds)


Model  0


In [None]:
# Average the predictions of all trained models; column 0 is the main target.
submission = pd.DataFrame({
    'id': test_df['id'],
    'prediction': np.asarray(all_test_preds).mean(axis=0)[:, 0],
})

submission.to_csv('pt_submission.csv', index=False)

In [None]:
# Show the first 25 test comments whose averaged main prediction exceeds 0.5.  A boolean mask
# selects rows by position, so it does not depend on the index labels matching row numbers.
test_df[np.mean(all_test_preds, axis=0)[:, 0] > 0.5].head(25)

Sample weighting was not used here.

## Notes from the author

Note that the solution is not validated in this kernel. So for tuning anything, you should build a validation framework using e.g. KFold CV. If you just check what works best by submitting, you are very likely to overfit to the public leaderboard.


### Ways to improve this kernel

This kernel is just a simple baseline kernel, so there are many ways to improve it. Some ideas to get you started:

* Add a contraction mapping. E.g. mapping "isn't" to "is not" can help the network because "not" is explicitly mentioned. They were very popular in the recent quora competition, see for example this kernel: https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing.
* Try to reduce the number of words that are not found in the embeddings. At the moment, around 170k words are not found. We can take some steps to decrease this amount, for example trying to find a vector for a processed (capitalized, stemmed, ...) version of the word when the vector for the regular word cannot be found. See the 3rd place solution of the quora competition (https://www.kaggle.com/wowfattie/3rd-place) for an excellent implementation of this.
* Try cyclic learning rate (CLR). I have found CLR to almost always improve my network recently compared to the default parameters for Adam. In this case, we are already using a learning rate scheduler, so this might not be the case. But it is still worth trying. See for example my other PyTorch kernel (https://www.kaggle.com/bminixhofer/deterministic-neural-networks-using-pytorch) for an implementation of CLR in PyTorch.
* Use sequence bucketing to train faster and fit more networks into the two hours. The winning team of the quora competition (https://www.kaggle.com/c/quora-insincere-questions-classification/discussion/80568#latest-487092) successfully used sequence bucketing to drastically reduce the time it took to train RNNs. An excerpt from their solution summary:

"
We aimed at combining as many models as possible. To do this, we needed to improve runtime and the most important thing to achieve this was the following. We do not pad sequences to the same length based on the whole data, but just on a batch level. That means we conduct padding and truncation on the data generator level for each batch separately, so that length of the sentences in a batch can vary in size. Additionally, we further improved this by not truncating based on the length of the longest sequence in the batch, but based on the 95% percentile of lengths within the sequence. This improved runtime heavily and kept accuracy quite robust on single model level, and improved it by being able to average more models.
"

* Try a (weighted) average of embeddings instead of concatenating them. A 600d vector for each word is a lot, it might work better to average them instead. See this paper for why this even works (https://www.aclweb.org/anthology/N18-2031).
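* Limit the maximum number of words used to train the NN. The original kernel set no limit on the number of words in the tokenizer, so it used every word that occurs in the training data, even if it is only mentioned once, which could lead to overfitting. This notebook already caps the tokenizer at `VOCAB_SIZE = 100000` words. Note, however, that the embedding matrix is still built from the full `tokenizer.word_index` (327,009 rows in the run above), so it could be truncated to the first `VOCAB_SIZE` rows to save memory.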