In [1]:
13

13

In [2]:
import sklearn
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors

from tensorflow import keras
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.regularizers import l2

In [3]:
# Column holding the comment text, and the target columns used as labels.
COMMENT = 'comment_text'
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [4]:
# Directory that contains the competition CSV files (relative to the notebook).
base_path = 'jigsaw-toxic-comment-classification-challenge'

# Read the four CSV files, in this order, into DataFrames.
train, test, submission, test_labels = map(
    pd.read_csv,
    (
        f'{base_path}/train.csv',
        f'{base_path}/test.csv',
        f'{base_path}/sample_submission.csv',
        f'{base_path}/test_labels.csv',
    ),
)

In [5]:
EMBEDDING_FILE = f'{base_path}/glove.6B.300d.txt'
def get_coefs(word, *arr):
    """Turn the fields of one embedding line into a ``(key, vector)`` pair.

    Args:
        word: first field of the line, i.e. the token.
        *arr: remaining fields, i.e. the vector components.

    Returns:
        Tuple of the lower-cased token and a float32 NumPy array built from
        ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word.lower(), vector

# Build the token -> vector lookup from the GloVe text file.
# The file is decoded explicitly as UTF-8 so that parsing does not depend on
# the platform's default encoding.
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    embeddings_index_1 = dict(get_coefs(*o.strip().split()) for o in f)

# np.stack requires a real sequence: passing the dict view directly emits a
# FutureWarning on older NumPy and raises TypeError on newer releases.
all_embs_1 = np.stack(list(embeddings_index_1.values()))
# Global mean / std over all components; used later to draw random vectors
# for words that have no pretrained embedding.
emb_mean_1 = all_embs_1.mean()
emb_std_1 = all_embs_1.std()
emb_mean_1, emb_std_1

  if self.run_code(code, result):


(-0.0039050116, 0.38177028)

In [6]:
EMBEDDING_FILE = f'{base_path}/GoogleNews-vectors-negative300.bin'

wv = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# gensim 4 renamed ``KeyedVectors.index2word`` to ``index_to_key``; look up the
# new name first and fall back to the old one so both versions work.
word2vec_vocab = getattr(wv, 'index_to_key', None)
if word2vec_vocab is None:
    word2vec_vocab = wv.index2word

# NOTE(review): lower-casing can map several vocabulary entries onto the same
# key; the dict keeps whichever entry comes last in iteration order.
embeddings_index_2 = {w.lower(): wv.get_vector(w) for w in word2vec_vocab}

# np.stack requires a real sequence: passing the dict view directly emits a
# FutureWarning on older NumPy and raises TypeError on newer releases.
all_embs_2 = np.stack(list(embeddings_index_2.values()))
# Global mean / std over all components; used later to draw random vectors
# for words that have no pretrained embedding.
emb_mean_2 = all_embs_2.mean()
emb_std_2 = all_embs_2.std()
emb_mean_2, emb_std_2

  if self.run_code(code, result):


(-0.0033393223, 0.1248688)

# Problem understanding

# Dataset generation

# Metric definition

# Validation strategy

# Data processing (extract useful information)

In [7]:
import os
import sys

# Directory containing the ``spellcheck`` module and its frequency dictionary.
# The original absolute path stays the default; set SHANGER_UTILITY_DIR to run
# the notebook on another machine without editing this cell.
SHANGER_UTILITY_DIR = os.environ.get(
    'SHANGER_UTILITY_DIR', '/home/shanger_lin/python3.6/ShangerUtility/')

# Avoid piling up duplicate entries when the cell is re-run.
if SHANGER_UTILITY_DIR not in sys.path:
    sys.path.append(SHANGER_UTILITY_DIR)
from spellcheck import Corrector

# Word-frequency dictionary handed to the corrector.
word_freq = os.path.join(SHANGER_UTILITY_DIR, 'frequency_dictionary_en_82_765.txt')
C = Corrector(word_freq)

In [8]:
import re
import textwrap
import unicodedata
import warnings
from string import ascii_letters, digits, punctuation

from bs4 import BeautifulSoup

warnings.filterwarnings("ignore", category=UserWarning, module='bs4')


class BaseFilter:
    """Identity filter: calling it returns the text unchanged.

    Its main purpose is to give subclasses a compact ``__repr__`` that is
    convenient in an interactive prompt.
    """

    def __call__(self, text: str) -> str:
        """Return ``text`` as is."""
        return text

    def __repr__(self):
        """Return the class name wrapped in angle brackets."""
        return '<{}>'.format(type(self).__name__)


class GeneralFilter(BaseFilter):
    """Regex-substitution filter.

    This class reads ``self.pattern`` (a compiled regex) and ``self.repl``
    (the replacement) but sets neither; subclasses assign them in
    ``__init__``.
    """

    def __call__(self, s: str, count: int = 0):
        """Substitute matches of ``self.pattern`` in ``s`` with ``self.repl``.

        ``count`` is forwarded to ``Pattern.sub``.
        """
        return self.pattern.sub(self.repl, s, count=count)

    def __repr__(self):
        """Return the class name together with the replacement value."""
        class_name = type(self).__name__
        return '<{clsn} repl=`{repl}`>'.format(clsn=class_name, repl=self.repl)


class EmailFilter(GeneralFilter):
    """Replace e-mail-address-like substrings.

    Args:
        replace: a string used for every match, or an iterator that is
            advanced once per match to obtain the replacement.
    """

    def __init__(self, replace=' '):
        self.expression = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
        self.pattern = re.compile(self.expression)
        # Non-string replacements are treated as iterators of tags.
        self.repl = replace if isinstance(replace, str) else (lambda match: next(replace))


class URLFilter(GeneralFilter):
    """Replace URL-like substrings.

    A URL is recognised as one of the scheme prefixes below (or ``www.``)
    followed by one or more characters from ``valid_uri_characters``.

    Args:
        replace: a string used for every match, or an iterator that is
            advanced once per match to obtain the replacement.
        allow_common_typoes: when true, use ``schemes_lenient`` so prefixes
            with a missing leading ``h`` (``ttp://``, ``ttps://``) are matched
            too; when false, only the exact prefixes in ``schemes`` match.
    """

    # copied from string_constants.py
    # Raw strings keep the regex text ``:\/\/`` without relying on an invalid
    # string escape, and ``www\.`` matches a literal dot rather than any
    # character.
    schemes = [r'{}:\/\/'.format(x) for x in ("http", "https", "ftp", "sftp", "irc", "magnet", "file", "data")] + [r'www\.']
    schemes_lenient = [r'{}:\/\/'.format(x) for x in ("h?ttp", "h?ttps", "ftp", "sftp", "irc", "magnet", "file", "data")] + [r'www\.']

    # from RFC
    gen_delims = ":/?#[]@"
    sub_delims = "!$&'()*+;=%"
    reserved = gen_delims + sub_delims
    unreserved = ascii_letters + digits + "-._~"
    valid_uri_characters = reserved + unreserved

    def __init__(self, replace=' ', allow_common_typoes=True):

        if isinstance(replace, str):
            self.repl = replace
        else:
            self.repl = lambda match: next(replace) # for iterator tag

        # The lenient list is the one that tolerates typos; the original code
        # had the two branches swapped.
        scheme_alternatives = self.schemes_lenient if allow_common_typoes else self.schemes
        self.expression = r"((?:{})[{}]+)".format("|".join(scheme_alternatives), re.escape(self.valid_uri_characters))
        self.pattern = re.compile(self.expression)


class EscapeCharFilter(GeneralFilter):
    """Replace each carriage return, newline, tab or vertical tab with ``replace``."""

    def __init__(self, replace:str=' '):
        control_chars = '\r\n\t\v'
        self.expression = r'([{}])'.format(control_chars)
        self.pattern = re.compile(self.expression)
        self.repl = replace


class InvisableCharFilter(GeneralFilter):
    """Replace U+00A0, U+00AD and U+200C with ``replace`` (removed by default)."""

    def __init__(self, replace:str=''):
        # Kept as regex escapes inside a raw string; ``re`` resolves them.
        hidden_chars = r'\xa0\xad\u200c'
        self.expression = r'([{}])'.format(hidden_chars)
        self.pattern = re.compile(self.expression)
        self.repl = replace


class KeyWordsFilter(GeneralFilter):
    """Replace occurrences of any of the given keywords with ``replace``.

    Each keyword is inserted into the regex as written (not escaped) and
    wrapped in its own group.
    """

    def __init__(self, replace:str=' ', keywords=('mailto',)):
        alternatives = '|'.join('({})'.format(keyword) for keyword in keywords)
        self.expression = r'({})'.format(alternatives)
        self.pattern = re.compile(self.expression)
        self.repl = replace


class MultiSpacesFilter(GeneralFilter):
    """Collapse every run of whitespace into ``replace`` (one space by default)."""

    def __init__(self, replace:str=' '):
        self.expression = r'(\s+)'
        self.pattern = re.compile(self.expression)
        self.repl = replace


class FilterNonChars(GeneralFilter):
    """Replace every run of non-word characters (regex ``\\W+``) with ``replace``."""

    def __init__(self, replace:str=' '):
        self.expression = r'(\W+)'
        self.pattern = re.compile(self.expression)
        self.repl = replace
        
        
class HTMLFilter(GeneralFilter):
    """Replace anything from ``<`` up to the next ``>`` with ``replace``."""

    def __init__(self, replace:str=''):
        self.expression = "<[^>]*>"
        self.pattern = re.compile(self.expression)
        self.repl = replace

class HTML2Text:
    """Callable that extracts the text content of an HTML string.

    Args:
        filters: tag names whose elements are removed from the parsed
            document before the text is extracted.
    """

    def __init__(self, filters=('script', 'style', 'meta', 'noscript')):
        self.filters = filters

    def extract_text_from_html(self, html:str):
        """Parse ``html`` with BeautifulSoup (lxml) and return its text.

        Elements whose tag is listed in ``self.filters`` are decomposed first.
        """
        soup = BeautifulSoup(html, features='lxml')
        for script in soup(self.filters): # remove all javascript and stylesheet code
            script.decompose()
        return soup.get_text()

    def __call__(self, html:str):
        """Return the extracted text, or an empty string if extraction fails.

        Only ``Exception`` subclasses are handled, so ``KeyboardInterrupt`` and
        ``SystemExit`` still propagate. A ``RuntimeWarning`` is emitted on
        failure so that a systematic problem (for example a missing parser)
        does not silently turn every input into ``''``.
        """
        try:
            return self.extract_text_from_html(html)
        except Exception as exc:
            warnings.warn(
                'HTML2Text failed with {}; returning an empty string'.format(type(exc).__name__),
                RuntimeWarning,
            )
            return str()



class UnicodeStandarizer:
    """Callable that applies a Unicode normalization form to text.

    Args:
        method: normalization form passed to ``unicodedata.normalize``.
    """

    def __init__(self, method='NFKC'): # NFD, NFC, NFKD, NFKC
        self.method = method

    def __call__(self, text:str):
        normalized = unicodedata.normalize(self.method, text)
        return normalized


class Stripper:
    """Callable that removes leading and trailing whitespace."""

    @staticmethod
    def __call__(text:str):
        stripped = text.strip()
        return stripped


class Lowercasting:
    """Callable that lower-cases text."""

    @staticmethod
    def __call__(text:str):
        lowered = text.lower()
        return lowered


class SequentialProcessor(BaseFilter):
    """Chain several text processors into a single callable.

    The processors are applied in the order given; each one receives the
    output of the previous one.
    Usage::
        text_preprocessor = SequentialProcessor(
            UnicodeStandarizer(),
            HTML2Text(),
            InvisableCharFilter(),
            LIDFilter(),
            Lowercasting(),
            KeyWordsFilter(),
            MultiSpacesFilter(),
            Stripper(),
        )
    """

    def __init__(self, *processors):
        self.pipeline = processors

    def __call__(self, text: str):
        """Run ``text`` through every processor and return the final result."""
        result = text
        for step in self.pipeline:
            result = step(result)
        return result

    def __repr__(self):
        """Describe the processor together with its children."""
        if len(self.pipeline) != 1:
            # use textwrap to support nested processors
            indented = [textwrap.indent(repr(step), '    ') for step in self.pipeline]
            children_desc = '\n' + ',\n'.join(indented)
        else:
            children_desc = repr(self.pipeline)
        class_name = type(self).__name__
        return '<{clsn} [{child}]>'.format(clsn=class_name, child=children_desc)

class GeneralProcessor(SequentialProcessor):
    """Common base of the concrete pipelines; adds nothing to ``SequentialProcessor``."""

class KeepWordSpace(GeneralFilter):
    """Replace each character that is neither a word character nor whitespace."""

    def __init__(self, replace:str=' '):
        self.expression = r'([^\w\s])'
        self.pattern = re.compile(self.expression)
        self.repl = replace
        
class WordModelPreprocessor(GeneralProcessor):
    """Pipeline that keeps only word characters and whitespace.

    The steps run in the order listed below.
    """

    def __init__(self):
        self.pipeline = (
            UnicodeStandarizer(),
            HTML2Text(),
            URLFilter(),
            EscapeCharFilter(),
            KeepWordSpace(),
            Lowercasting(),
            MultiSpacesFilter(),
            Stripper(),
        )

class LIDFilter(GeneralFilter):
    """Single regex filter built from four other filters' expressions.

    The expressions of ``EmailFilter``, ``URLFilter``, ``EscapeCharFilter``
    and ``FilterNonChars`` are joined with ``|`` in that order; any match is
    replaced with ``replace``.
    """

    def __init__(self, replace=' '):
        self.repl = replace
        parts = (EmailFilter(), URLFilter(), EscapeCharFilter(), FilterNonChars())
        self.expression = '|'.join(part.expression for part in parts)
        self.pattern = re.compile(self.expression)

class TextPreprocessor(GeneralProcessor):
    """Cleaning pipeline built around ``LIDFilter``.

    NOTE: a class with the same name is defined again in a later cell, and
    that later definition replaces this one once it has run.
    """

    def __init__(self):
        self.pipeline = (
            UnicodeStandarizer(),
            HTML2Text(),
            InvisableCharFilter(),
            LIDFilter(),
            Lowercasting(),
            KeyWordsFilter(),
            MultiSpacesFilter(),
            Stripper(),
        )

In [9]:
class Segmanent:
    """Pipeline step that delegates to ``C.sentence_segmanent``."""

    @staticmethod
    def __call__(text:str):
        # ``C`` is the module-level Corrector instance.
        return C.sentence_segmanent(text, return_details=False)
class SentenceCheck:
    """Pipeline step that delegates to ``C.sentence_check``."""

    @staticmethod
    def __call__(text:str):
        # ``C`` is the module-level Corrector instance; only the first element
        # of what it returns is kept.
        checked = C.sentence_check(text, return_details=False)
        return checked[0]

class TextPreprocessor(GeneralProcessor):
    """Cleaning pipeline that ends with segmentation and sentence checking.

    This redefines the ``TextPreprocessor`` declared in the previous cell.
    """

    def __init__(self):
        self.pipeline = (
            UnicodeStandarizer(),
            HTML2Text(),
            URLFilter(),
            EmailFilter(),
            InvisableCharFilter(),
            LIDFilter(),
            Lowercasting(),
            KeyWordsFilter(),
            MultiSpacesFilter(),
            Stripper(),
            Segmanent(),
            SentenceCheck(),
        )
        
class SimpleTextPreprocessor(GeneralProcessor):
    """Same steps as ``TextPreprocessor`` in this cell, minus ``Segmanent``."""

    def __init__(self):
        self.pipeline = (
            UnicodeStandarizer(),
            HTML2Text(),
            URLFilter(),
            EmailFilter(),
            InvisableCharFilter(),
            LIDFilter(),
            Lowercasting(),
            KeyWordsFilter(),
            MultiSpacesFilter(),
            Stripper(),
            SentenceCheck(),
        )

In [62]:
# One instance of each pipeline (full and simple), created in that order.
text_preprocessor, simple_text_preprocessor = TextPreprocessor(), SimpleTextPreprocessor()

In [63]:
from multiprocess import Pool

# Rebuilds the full pipeline, but the map calls below only use
# ``simple_text_preprocessor``; this object is not used in this cell.
text_preprocessor = TextPreprocessor()
# Run the simple pipeline over every comment in a worker pool and overwrite
# the comment column of both frames with the cleaned text.
with Pool() as pool:
    train[COMMENT] = pool.map(simple_text_preprocessor, train[COMMENT])
    test[COMMENT] = pool.map(simple_text_preprocessor, test[COMMENT])

In [64]:
# Comment text of both frames as NumPy arrays (reassigned again in a later cell).
x_train, x_test = train["comment_text"].values, test["comment_text"].values

In [65]:
# Vocabulary cap passed to the tokenizer and used as the embedding input size.
MAX_TOKENS = 30000
# Length every token sequence is padded / truncated to.
MAX_SENTENSE_LEN = 100

# Vector width of each pretrained table, and of the two concatenated.
EMBEDDING_SIZE_1, EMBEDDING_SIZE_2 = all_embs_1.shape[1], all_embs_2.shape[1]
EMBEDDING_SIZE = EMBEDDING_SIZE_1 + EMBEDDING_SIZE_2

In [66]:
# Targets: one column per label.
y_train = train[LABELS].values
train_texts = train["comment_text"].values
test_texts = test["comment_text"].values

# The vocabulary is fitted on train and test text together
# (open question from the original author: is it a proper trick?).
tokenizer = text.Tokenizer(num_words=MAX_TOKENS)
tokenizer.fit_on_texts(list(train_texts) + list(test_texts))

# Texts -> integer id sequences -> arrays padded to MAX_SENTENSE_LEN.
x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=MAX_SENTENSE_LEN)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=MAX_SENTENSE_LEN)


In [67]:
word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is the padding id), so a vocabulary of
# len(word_index) words needs len(word_index) + 1 rows. Without the +1 the
# largest index overflows the matrix whenever the vocabulary is smaller than
# MAX_TOKENS.
nb_words = min(MAX_TOKENS, len(word_index) + 1)
# Start from random vectors drawn with each table's global mean / std; rows of
# words found in a pretrained table are overwritten below.
embedding_matrix_1 = np.random.normal(emb_mean_1, emb_std_1, (nb_words, EMBEDDING_SIZE_1))
embedding_matrix_2 = np.random.normal(emb_mean_2, emb_std_2, (nb_words, EMBEDDING_SIZE_2))
for word, i in word_index.items():
    # Skip words whose index falls outside the matrix.
    if i >= nb_words:
        continue

    embedding_vector_1 = embeddings_index_1.get(word)
    if embedding_vector_1 is not None:
        embedding_matrix_1[i] = embedding_vector_1

    embedding_vector_2 = embeddings_index_2.get(word)
    if embedding_vector_2 is not None:
        embedding_matrix_2[i] = embedding_vector_2

In [68]:
embedding_matrix_1.shape

(30000, 300)

In [69]:
embedding_matrix_2.shape

(30000, 300)

In [70]:
# Place the two embedding tables side by side: one row per token id, with the
# columns of the first table followed by those of the second.
embedding_matrix = np.concatenate([embedding_matrix_1, embedding_matrix_2], axis=1)
embedding_matrix.shape

(30000, 600)

# Data understanding & visualization

# De-noise (no drop data)

# Feature engineering

# Offline augmentation

# Standardization

# Scaling

# Normalization

# Feature selection

# Data selection

# Optimization

In [71]:
# Two stacked unidirectional GRUs on top of the pretrained embedding matrix.
model_input = keras.Input(shape=(MAX_SENTENSE_LEN, ))
# Embedding weights start from ``embedding_matrix``; mask_zero=True treats
# token id 0 as padding. An L2 penalty is applied to the embeddings.
x = keras.layers.Embedding(MAX_TOKENS, EMBEDDING_SIZE, mask_zero=True, weights=[embedding_matrix], embeddings_regularizer=l2(1e-5))(model_input)
x = keras.layers.GRU(64, return_sequences=True, activation="relu", kernel_regularizer=l2(1e-5))(x)
x = keras.layers.SpatialDropout1D(0.2)(x)
# return_sequences=False: only the last output of this GRU is passed on.
x = keras.layers.GRU(32, return_sequences=False, activation="relu", kernel_regularizer=l2(1e-5))(x)
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(32, activation="relu")(x)
# One sigmoid output per label.
model_output = keras.layers.Dense(len(LABELS), activation="sigmoid")(x)

model = keras.Model(inputs=model_input, outputs=model_output)
model.compile(keras.optimizers.Adam(3e-4), loss='binary_crossentropy', metrics=['acc'])

# validation_split=0.05 holds out 5% of the training arrays; training stops
# once val_loss has gone one epoch without improving (patience=1).
hist = model.fit(x_train, 
                 y_train, 
                 batch_size=256,
                 shuffle=True,
                 epochs=100, 
                 validation_split=0.05,
                 callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          patience=1, 
                                                          verbose=1)], 
                 verbose=1)

# Store the predictions in the label columns of the submission frame and save it.
y_pred = model.predict(x_test)
submission[LABELS] = y_pred
submission.to_csv('131.submission.csv', index=False)

# 0.96379

Epoch 1/100


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 00016: early stopping


In [None]:
# Two stacked bidirectional GRUs followed by global max pooling over time.
model_input = keras.Input(shape=(MAX_SENTENSE_LEN, ))
# Embedding weights start from ``embedding_matrix``; mask_zero=True treats
# token id 0 as padding. An L2 penalty is applied to the embeddings.
x = keras.layers.Embedding(MAX_TOKENS, EMBEDDING_SIZE, mask_zero=True, weights=[embedding_matrix], embeddings_regularizer=l2(1e-5))(model_input)
x = keras.layers.Bidirectional(keras.layers.GRU(64, return_sequences=True, activation="relu", recurrent_dropout=0.1))(x)
x = keras.layers.SpatialDropout1D(0.2)(x)
# Both recurrent layers return full sequences so the pooling layer can reduce
# over the time axis.
x = keras.layers.Bidirectional(keras.layers.GRU(32, return_sequences=True, activation="relu", recurrent_dropout=0.1))(x)
x = keras.layers.GlobalMaxPool1D()(x)
x = keras.layers.Dense(32, activation="relu")(x)
# One sigmoid output per label.
model_output = keras.layers.Dense(len(LABELS), activation="sigmoid")(x)

model = keras.Model(inputs=model_input, outputs=model_output)
model.compile(keras.optimizers.Adam(3e-4), loss='binary_crossentropy', metrics=['acc'])

# validation_split=0.05 holds out 5% of the training arrays; training stops
# once val_loss has gone one epoch without improving (patience=1).
hist = model.fit(x_train, 
                 y_train, 
                 batch_size=256,
                 shuffle=True,
                 epochs=100, 
                 validation_split=0.05,
                 callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          patience=1, 
                                                          verbose=1)], 
                 verbose=1)

# Store the predictions in the label columns of the submission frame and save it.
y_pred = model.predict(x_test)
submission[LABELS] = y_pred
submission.to_csv('132.submission.csv', index=False)

# 0.97495

Epoch 1/100


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

In [None]:
# Two stacked bidirectional GRUs with global max pooling; unlike the previous
# cell, the embedding has no mask_zero and no regularizer, and the spatial
# dropout rate is 0.1.
model_input = keras.Input(shape=(MAX_SENTENSE_LEN, ))
# Embedding weights start from ``embedding_matrix``.
x = keras.layers.Embedding(MAX_TOKENS, EMBEDDING_SIZE, weights=[embedding_matrix])(model_input)
x = keras.layers.Bidirectional(keras.layers.GRU(64, return_sequences=True, activation="relu", recurrent_dropout=0.1))(x)
x = keras.layers.SpatialDropout1D(0.1)(x)
x = keras.layers.Bidirectional(keras.layers.GRU(32, return_sequences=True, activation="relu", recurrent_dropout=0.1))(x)
x = keras.layers.GlobalMaxPool1D()(x)
x = keras.layers.Dense(32, activation="relu")(x)
# One sigmoid output per label.
model_output = keras.layers.Dense(len(LABELS), activation="sigmoid")(x)

model = keras.Model(inputs=model_input, outputs=model_output)
model.compile(keras.optimizers.Adam(3e-4), loss='binary_crossentropy', metrics=['acc'])

# validation_split=0.05 holds out 5% of the training arrays; training stops
# once val_loss has gone one epoch without improving (patience=1).
hist = model.fit(x_train, 
                 y_train, 
                 batch_size=256,
                 shuffle=True,
                 epochs=100, 
                 validation_split=0.05,
                 callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          patience=1, 
                                                          verbose=1)], 
                 verbose=1)

# Store the predictions in the label columns of the submission frame and save it.
y_pred = model.predict(x_test)
submission[LABELS] = y_pred
submission.to_csv('133.submission.csv', index=False)

# 0.97868

In [None]:
# Same layer stack as the previous cell, trained with batch_size=32 instead
# of 256.
model_input = keras.Input(shape=(MAX_SENTENSE_LEN, ))
# Embedding weights start from ``embedding_matrix``.
x = keras.layers.Embedding(MAX_TOKENS, EMBEDDING_SIZE, weights=[embedding_matrix])(model_input)
x = keras.layers.Bidirectional(keras.layers.GRU(64, return_sequences=True, activation="relu", recurrent_dropout=0.1))(x)
x = keras.layers.SpatialDropout1D(0.1)(x)
x = keras.layers.Bidirectional(keras.layers.GRU(32, return_sequences=True, activation="relu", recurrent_dropout=0.1))(x)
x = keras.layers.GlobalMaxPool1D()(x)
x = keras.layers.Dense(32, activation="relu")(x)
# One sigmoid output per label.
model_output = keras.layers.Dense(len(LABELS), activation="sigmoid")(x)

model = keras.Model(inputs=model_input, outputs=model_output)
model.compile(keras.optimizers.Adam(3e-4), loss='binary_crossentropy', metrics=['acc'])

# validation_split=0.05 holds out 5% of the training arrays; training stops
# once val_loss has gone one epoch without improving (patience=1).
hist = model.fit(x_train, 
                 y_train, 
                 batch_size=32,
                 shuffle=True,
                 epochs=100, 
                 validation_split=0.05,
                 callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          patience=1, 
                                                          verbose=1)], 
                 verbose=1)

# Store the predictions in the label columns of the submission frame and save it.
y_pred = model.predict(x_test)
submission[LABELS] = y_pred
submission.to_csv('134.submission.csv', index=False)

# 0.98280

In [None]:
# Single bidirectional LSTM with global max pooling over time.
model_input = keras.Input(shape=(MAX_SENTENSE_LEN,))
# Embedding weights start from ``embedding_matrix``.
x = keras.layers.Embedding(MAX_TOKENS, EMBEDDING_SIZE, weights=[embedding_matrix])(model_input)
x = keras.layers.Bidirectional(keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = keras.layers.GlobalMaxPool1D()(x)
x = keras.layers.Dense(50, activation="relu")(x)
x = keras.layers.Dropout(0.1)(x)
# Output width follows LABELS (as in the GRU cells) instead of a hard-coded 6,
# so it always matches the columns written by ``submission[LABELS] = y_pred``.
model_output = keras.layers.Dense(len(LABELS), activation="sigmoid")(x)
model = keras.Model(inputs=model_input, outputs=model_output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# validation_split=0.05 holds out 5% of the training arrays; training stops
# once val_loss has gone one epoch without improving (patience=1).
hist = model.fit(x_train,
                 y_train,
                 batch_size=256,
                 shuffle=True,
                 epochs=100,
                 validation_split=0.05,
                 callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          patience=1,
                                                          verbose=1)],
                 verbose=1)

# Store the predictions in the label columns of the submission frame and save it.
y_pred = model.predict(x_test)
submission[LABELS] = y_pred
submission.to_csv('135.submission.csv', index=False)

# 0.98082

In [81]:
1

1

In [80]:
# Single bidirectional LSTM with global max pooling over time, trained with
# batch_size=32.
model_input = keras.Input(shape=(MAX_SENTENSE_LEN,))
# Embedding weights start from ``embedding_matrix``.
x = keras.layers.Embedding(MAX_TOKENS, EMBEDDING_SIZE, weights=[embedding_matrix])(model_input)
x = keras.layers.Bidirectional(keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = keras.layers.GlobalMaxPool1D()(x)
x = keras.layers.Dense(50, activation="relu")(x)
x = keras.layers.Dropout(0.1)(x)
# Output width follows LABELS (as in the GRU cells) instead of a hard-coded 6,
# so it always matches the columns written by ``submission[LABELS] = y_pred``.
model_output = keras.layers.Dense(len(LABELS), activation="sigmoid")(x)
model = keras.Model(inputs=model_input, outputs=model_output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# validation_split=0.05 holds out 5% of the training arrays; training stops
# once val_loss has gone one epoch without improving (patience=1).
hist = model.fit(x_train,
                 y_train,
                 batch_size=32,
                 shuffle=True,
                 epochs=100,
                 validation_split=0.05,
                 callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss',
                                                          patience=1,
                                                          verbose=1)],
                 verbose=1)

# Store the predictions in the label columns of the submission frame and save it.
y_pred = model.predict(x_test)
submission[LABELS] = y_pred
submission.to_csv('136.submission.csv', index=False)
# 0.98166

Epoch 1/100
Epoch 2/100
Epoch 00002: early stopping


# Parameter tuning

# Online augmentation

# Model selection / blending

# Post-processing

# Evaluation

# Reasoning

# Monitoring