# Toxic Comments Classification

In [None]:
import argparse

In [4]:
import  torch
import os
import itertools, functools
import pickle

In [5]:
import scipy, matplotlib, tqdm
import numpy as np
import pandas as pd
import nltk, re
from multiprocessing import Pool

NLTK stuff needed for lemmatizing.

In [4]:
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tsypi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tsypi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Basic editing. Removing non-letters and putting letters to lower register.

In [5]:
def edit_text(sentence):
    return ' '.join([elem for elem in re.split('\W+', str.lower(sentence))])

Get word tags from nltk for lemmatizing.

In [6]:
def get_wordnet_pos(tag):
    if tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        return nltk.corpus.wordnet.NOUN
    elif tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return nltk.corpus.wordnet.VERB
    elif tag in ['RB', 'RBR', 'RBS']:
        return nltk.corpus.wordnet.ADV
    elif tag in ['JJ', 'JJR', 'JJS']:
        return nltk.corpus.wordnet.ADJ
    else:
        return nltk.corpus.wordnet.NOUN

Given a mapping from words to their indices transform a sentence to indices sequence.

In [7]:
def get_indices(sentence, emb_dict):
    res = []
    for word in sentence.split(' '):
        ind = emb_dict.get(word)
        if ind is None:
            continue
        else:
            res.append(ind)
    return res

Defining simple lemmatizer.

In [8]:
lemmatizer = nltk.WordNetLemmatizer()

def simple_lemmatizer(sentence):
    tokenized_sent = sentence.split()
    pos_taged = [(word, get_wordnet_pos(tag)) for word, tag in nltk.pos_tag(tokenized_sent)]
    return " ".join([lemmatizer.lemmatize(word, tag) for word, tag in pos_taged])

Uploading train set and test set.

In [9]:
training_set = pd.read_csv("./data/train.csv", sep=',')
training_labels = training_set.iloc[:, 2:]
test_set = pd.read_csv("./data/test.csv", sep=',')
test_labels = pd.read_csv("./data/test_labels.csv", sep=',')

Delete all symbols from training set except for letters and numbers, convert to lowercase:

In [70]:
X_train = list(map(edit_text, training_set.iloc[:, 1].values))
X_test = list(map(edit_text, test_set.iloc[:, 1].values))

Lemmatize words:

In [71]:
X_train_lemmatized = list(map(simple_lemmatizer, X_train))
X_test_lemmatized = list(map(simple_lemmatizer, X_test))

Save lemmatized datasets due to extensive computation time.

In [73]:
with open("./data/train_lem.pickle", 'wb') as f:
    pickle.dump(X_train_lemmatized, f)

with open("./data/test_lem.pickle", 'wb') as f:
    pickle.dump(X_test_lemmatized, f)

Load lemmatized datasets if needed:

In [10]:
#with open("./data/train_lem.pickle", 'rb') as f:
#    X_train_lemmatized = pickle.load(f)

#with open("./data/test_lem.pickle", 'rb') as f:
#    X_test_lemmatized = pickle.load(f)

Paths to word embeddings.

In [1]:
FASTTEXT_path = "./embeddings/crawl-300d-2M.vec"
GLOVE_path = "./embeddings/glove.twitter.27B.200d.txt"

Get embeddings of one million most popular words:

In [16]:
embedding_size = 200

In [17]:
glove_text_emb = []
glove_text_to_ix = {}

for i, line in tqdm.tqdm(enumerate(open(os.getcwd() + GLOVE_path, encoding='utf-8'))):
    if i == 0:
        continue
    if i == 1000001:
        break
    line = line.rstrip('\n').split(' ')
    glove_text_to_ix.update({line[0]:i - 1})
    glove_text_emb.append(np.array(list(map(lambda x:float(x), line[1:]))))

glove_text_emb.append(np.zeros(embedding_size))
glove_text_emb = torch.from_numpy(np.array(glove_text_emb))

999668it [01:28, 10082.20it/s]

In [17]:
embedding_size_ft = 300

In [81]:
fast_text_emb = []
fast_text_to_ix = {}

for i, line in tqdm.tqdm(enumerate(open(os.getcwd() + FASTTEXT_path, encoding='utf-8'))):
    if i == 0:
        continue
    if i == 1000001:
        break
    line = line.strip('\n').split(' ')
    fast_text_to_ix.update({line[0]:i - 1})
    fast_text_emb.append(np.array(list(map(lambda x:float(x), line[1:-1]))))

fast_text_emb.append(np.zeros(embedding_size_ft))
fast_text_emb = torch.from_numpy(np.array(fast_text_emb))


0it [00:00, ?it/s]
799it [00:00, 7932.10it/s]
1616it [00:00, 7984.87it/s]
2446it [00:00, 8060.07it/s]
3276it [00:00, 8108.63it/s]
4078it [00:00, 8058.08it/s]
4867it [00:00, 7988.50it/s]
5631it [00:00, 7862.82it/s]
6400it [00:00, 7793.51it/s]
7200it [00:00, 7836.72it/s]
8004it [00:01, 7879.76it/s]
8829it [00:01, 7970.18it/s]
9664it [00:01, 8064.16it/s]
10484it [00:01, 8086.12it/s]
11286it [00:01, 7906.18it/s]
12077it [00:01, 7893.70it/s]
12912it [00:01, 8011.79it/s]
13712it [00:01, 7944.14it/s]
14524it [00:01, 7978.86it/s]
15355it [00:01, 8053.06it/s]
16161it [00:02, 7770.63it/s]
16940it [00:02, 7759.45it/s]
17718it [00:02, 7702.03it/s]
18492it [00:02, 7697.29it/s]
19263it [00:02, 7615.56it/s]
20051it [00:02, 7677.18it/s]
20870it [00:02, 7807.80it/s]
21652it [00:02, 7770.47it/s]
22430it [00:02, 7733.27it/s]
23204it [00:02, 7704.00it/s]
23975it [00:03, 7689.00it/s]
24761it [00:03, 7730.06it/s]
25562it [00:03, 7799.70it/s]
26373it [00:03, 7867.98it/s]
27161it [00:03, 7866.71it/s]
27948it

Save embeddings:

In [82]:
with open('./embeddings/glove_emb.pickle', 'wb') as f:
    pickle.dump((glove_text_emb, glove_text_to_ix), f)
    
with open('./embeddings/fasttext_emb.pickle', 'wb') as f:
    pickle.dump((fast_text_emb, fast_text_to_ix), f)

Load embeddings if needed:

In [11]:
#with open('./embeddings/glove_emb.pickle', 'rb') as f:
#    glove_text_emb, glove_text_to_ix = pickle.load(f)
    
#with open('./embeddings/fasttext_emb.pickle', 'rb') as f:
#    fast_text_emb, fast_text_to_ix = pickle.load(f)

Transform datasets into sets of index sequences.

Each row of $X\_train$ is a sequence of indices corresponding to their respective GLOVE embeddings.
 
For $X\_train\_lemmatized$ each index corresponds to the index of FASTText embedding.

In [None]:
X_train = list(map(functools.partial(get_indices, emb_dict = glove_text_to_ix), X_train))
X_test = list(map(functools.partial(get_indices, emb_dict = glove_text_to_ix), X_test))

In [12]:
X_train_lemmatized = list(map(functools.partial(get_indices, emb_dict = fast_text_to_ix), X_train_lemmatized))
X_test_lemmatized = list(map(functools.partial(get_indices, emb_dict = fast_text_to_ix), X_test_lemmatized))

Batch generator which padds sequences to _maxlen_ in batch. 

In [13]:
def batch_generator(X_train, batch_size, ind):
    for i in range(0, len(X_train), batch_size):
        batch = X_train[i:i + batch_size]
        max_len = max(list(map(lambda x:len(x), batch)))
        yield torch.LongTensor(list(map(lambda x:x + [ind] * (max_len - len(x)), batch)))

Configure divece and initiate neural network:

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() is True else 'cpu')
#device = torch.device('cpu')

GLOVE embeddings with no lemmatization.

Define hyper parameters:

In [18]:
input_size = embedding_size
hidden_size = 128
num_layers = 2
num_classes = 6
batch_size = 64
num_epochs = 2
learning_rate = 0.005

Simple two-layer bidirectional LSTM with dropout:

In [19]:
class BLSTM(torch.nn.Module):
    
    def __init__(self, input_size, hidden_size, num_layers, num_classes, embeddings, dropout=0.5):
        super(BLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = torch.nn.Embedding.from_pretrained(embeddings)
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout, bidirectional=True).to(device)
        self.linear = torch.nn.Linear(hidden_size * 2, num_classes).to(device)
    
    def init_hidden(self, batch_size):
        h0 = torch.rand(self.num_layers * 2, batch_size, self.hidden_size).to(device)
        c0 = torch.rand(self.num_layers * 2, batch_size, self.hidden_size).to(device)
        return (h0, c0)

    def forward(self, x):
        h0, c0 = self.init_hidden(x.size(0))
        out = self.embedding(x).float().to(device)
        out, _ = self.lstm(out, (h0, c0))
        out = self.linear(out[:, -1, :])
        return torch.nn.functional.sigmoid(out)

In [28]:
model = BLSTM(input_size, hidden_size, num_layers, num_classes, glove_text_emb)

Load model from a file if needed:

In [75]:
#model.load_state_dict(torch.load(os.getcwd() + "./checkpoints/bilstm.ckpt"))

Using binary cross-entropy as loss-function ans Adam as optimizer.

In [29]:
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Training process:

In [30]:
acc_loss = 0
verbose_step = 12800
total_steps = len(X_train)

for epoch in range(num_epochs):
    gen = batch_generator(X_train, batch_size, len(glove_text_emb) - 1)
    for i, batch in enumerate(gen):
        labels = torch.FloatTensor(training_labels.iloc[batch_size * i:batch_size * (i + 1), :].values).to(device)
        
        outputs = model(batch)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        acc_loss += loss.item()
        if ((i + 1) * batch_size) % verbose_step == 0:
            print('Epoch [{:1d} / {:1d}], Step [{:6d} / {:6d}], Average loss: {:.4f}'
                  .format(epoch + 1, num_epochs, (i + 1) * batch_size, total_steps, acc_loss * batch_size / verbose_step))
            acc_loss = 0
        

Epoch [1 / 2], Step [ 12800 / 159571], Average loss: 0.1511
Epoch [1 / 2], Step [ 25600 / 159571], Average loss: 0.1477
Epoch [1 / 2], Step [ 38400 / 159571], Average loss: 0.1448
Epoch [1 / 2], Step [ 51200 / 159571], Average loss: 0.1420
Epoch [1 / 2], Step [ 64000 / 159571], Average loss: 0.1417
Epoch [1 / 2], Step [ 76800 / 159571], Average loss: 0.1462
Epoch [1 / 2], Step [ 89600 / 159571], Average loss: 0.1449
Epoch [1 / 2], Step [102400 / 159571], Average loss: 0.1360
Epoch [1 / 2], Step [115200 / 159571], Average loss: 0.1378
Epoch [1 / 2], Step [128000 / 159571], Average loss: 0.1438
Epoch [1 / 2], Step [140800 / 159571], Average loss: 0.1409
Epoch [1 / 2], Step [153600 / 159571], Average loss: 0.1428
Epoch [2 / 2], Step [ 12800 / 159571], Average loss: 0.2073
Epoch [2 / 2], Step [ 25600 / 159571], Average loss: 0.1032
Epoch [2 / 2], Step [ 38400 / 159571], Average loss: 0.0798
Epoch [2 / 2], Step [ 51200 / 159571], Average loss: 0.0616
Epoch [2 / 2], Step [ 64000 / 159571], A

Saving model:

In [31]:
torch.save(model.state_dict(), './checkpoints/bilstm.ckpt')

Evaluation:

In [29]:
def predict(model, X_test, emb):
    res = []
    gen = batch_generator(X_test, batch_size, len(emb) - 1)
    for batch in gen:
        output = model(batch).to(torch.device('cpu'))
        res.append(output)
    return res

In [43]:
with torch.no_grad():
    res = predict(model, X_test, glove_text_emb)

In [58]:
last = res[-1]
res = torch.stack(res[:-1])
res = res.reshape(-1, 6)
res = torch.cat((res, last), 0)

Saving predictions to csv file.

In [None]:
submission = pd.DataFrame(res.numpy(), columns=training_set.columns[2:])
submission = pd.concat((test_set.id, submission), axis=1)
submission.to_csv("blstm_submission.csv", index=False)

In [22]:
input_size_ft = embedding_size_ft

Fasttext embeddings + lemmatization:

In [23]:
model_lem = BLSTM(input_size_ft, hidden_size, num_layers, num_classes, fast_text_emb)

Load model from a file if needed:

In [24]:
#model_lem.load_state_dict(torch.load(os.getcwd() + "./checkpoints/bilstm_lemm.ckpt"))

In [95]:
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model_lem.parameters(), lr=learning_rate)

Training process:

In [97]:
acc_loss = 0
verbose_step = 12800
total_steps = len(X_train_lemmatized)

for epoch in range(1):
    gen = batch_generator(X_train_lemmatized, batch_size, len(fast_text_emb) - 1)
    for i, batch in enumerate(gen):
        #sentence = sentence.reshape(batch_size, sentence.size(0))
        labels = torch.FloatTensor(training_labels.iloc[batch_size * i:batch_size * (i + 1), :].values).to(device)
        
        outputs = model_lem(batch)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        acc_loss += loss.item()
        if ((i + 1) * batch_size) % verbose_step == 0:
            print('Epoch [{:1d} / {:1d}], Step [{:6d} / {:6d}], Average loss: {:.4f}'
                  .format(epoch + 1, num_epochs, (i + 1) * batch_size, total_steps, acc_loss * batch_size / verbose_step))
            acc_loss = 0
        

Epoch [1 / 2], Step [ 12800 / 159571], Average loss: 0.0545
Epoch [1 / 2], Step [ 25600 / 159571], Average loss: 0.0566
Epoch [1 / 2], Step [ 38400 / 159571], Average loss: 0.0536
Epoch [1 / 2], Step [ 51200 / 159571], Average loss: 0.0545
Epoch [1 / 2], Step [ 64000 / 159571], Average loss: 0.0582
Epoch [1 / 2], Step [ 76800 / 159571], Average loss: 0.0529
Epoch [1 / 2], Step [ 89600 / 159571], Average loss: 0.0544
Epoch [1 / 2], Step [102400 / 159571], Average loss: 0.0492
Epoch [1 / 2], Step [115200 / 159571], Average loss: 0.0501
Epoch [1 / 2], Step [128000 / 159571], Average loss: 0.0528
Epoch [1 / 2], Step [140800 / 159571], Average loss: 0.0503
Epoch [1 / 2], Step [153600 / 159571], Average loss: 0.0526


Saving model.

In [98]:
torch.save(model_lem.state_dict(), './checkpoints/bilstm_lemm.ckpt')

Evaluation:

In [31]:
with torch.no_grad():
    res_lem = predict(model_lem, X_test_lemmatized, fast_text_emb)

In [32]:
last_lem = res_lem[-1]
res_lem = torch.stack(res_lem[:-1])
res_lem = res_lem.reshape(-1, 6)
res_lem = torch.cat((res_lem, last_lem), 0)

Saving submission to csv file.

In [34]:
submission_lem = pd.DataFrame(res_lem.numpy(), columns=training_set.columns[2:])
submission_lem = pd.concat((test_set.id, submission_lem), axis=1)
submission_lem.to_csv("./submissions/blstm_lem_submission.csv", index=False)

Upload first submission if needed.

In [36]:
submission = pd.read_csv(os.getcwd() + "./submissions/blstm_submission.csv", sep=',')

Averaging two submissions to make a simple ensemble.

In [45]:
stacked_submission = (submission.iloc[:, 1:].values + submission_lem.iloc[:, 1:].values) / 2
stacked_submission = pd.DataFrame(stacked_submission, columns=training_set.columns[2:])
stacked_submission = pd.concat((test_set.id, stacked_submission), axis=1)
stacked_submission.to_csv("./submissions/stacked_submission.csv", index=False)