In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
#print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [2]:
#EMBEDDING_FILE = '../input/glove840b300dtxt/glove.840B.300d.txt'
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train["comment_text"].fillna("fillna")
test["comment_text"].fillna("fillna")
X_train = train["comment_text"].str.lower()
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

X_test = test["comment_text"].str.lower()

In [4]:
max_features=100000
maxlen=150
embed_size=300

In [5]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [6]:
tok_word=text.Tokenizer(num_words=max_features,lower=True)
tok_word.fit_on_texts(list(X_train)+list(X_test))
X_train_word=tok_word.texts_to_sequences(X_train)
X_test_word=tok_word.texts_to_sequences(X_test)
x_train_word=sequence.pad_sequences(X_train_word,maxlen=maxlen)
x_test_word=sequence.pad_sequences(X_test_word,maxlen=maxlen)

In [7]:
x_train_word

array([[    0,     0,     0, ...,  6368,  2687,  1183],
       [    0,     0,     0, ...,   616,  9983,   216],
       [    0,     0,     0, ...,     1,   743,   487],
       ..., 
       [    0,     0,     0, ...,  3471, 16988,  4655],
       [    0,     0,     0, ...,   147,    33,    11],
       [    0,     0,     0, ...,  1650,  2052,    88]])

In [8]:
char_max_len = 400
char_vocab_len = 256
tok_char = text.Tokenizer(num_words=char_vocab_len, filters=None, lower=True, split=' ', char_level=True, oov_token=None)
tok_char.fit_on_texts(list(X_train)+list(X_test))
X_train_char=tok_char.texts_to_sequences(X_train)
X_test_char=tok_char.texts_to_sequences(X_test)


In [9]:
X_test_word

[[2030,
  378,
  4878,
  723,
  8,
  58,
  20911,
  84,
  888,
  349,
  16,
  3439,
  73,
  21,
  6,
  5,
  6226,
  6,
  1555,
  7,
  56,
  378,
  5462,
  1488,
  578,
  5869,
  5,
  94,
  6,
  2,
  3771,
  30,
  340,
  6,
  742,
  33992,
  37,
  4878,
  723,
  8,
  35,
  4222,
  10,
  1205,
  653,
  400,
  476,
  17214,
  9,
  227,
  15,
  154,
  5,
  20074,
  8,
  247,
  23545,
  48,
  4329,
  52,
  24,
  4,
  2108,
  155,
  2432,
  578,
  2428,
  94,
  218,
  143,
  490,
  85],
 [31, 1144, 1, 338, 8, 665, 17, 11, 8, 2722],
 [106, 10637, 15, 66070, 294],
 [22,
  6,
  19,
  4,
  147,
  157,
  33,
  1,
  120,
  1,
  105,
  7,
  1526,
  24,
  1,
  374,
  641,
  7,
  39,
  77,
  650,
  1,
  120,
  3062,
  1526,
  7,
  1149,
  1351,
  1,
  105,
  408,
  127,
  26,
  134,
  6,
  12,
  20,
  377],
 [7, 59, 8673, 79, 83, 33, 40],
 [134, 6, 12, 1116, 7, 65, 102, 1145, 3, 6, 5, 43, 14, 398, 177, 138],
 [50,
  34,
  14,
  152,
  644,
  2,
  28,
  111,
  140,
  18,
  419,
  217,
  5,
  1733,
  4

In [10]:
x_train_char=sequence.pad_sequences(X_train_char,maxlen=400)
x_test_char=sequence.pad_sequences(X_test_char,maxlen=400)

In [12]:
#x_train_char.shape
import pickle

# saving
with open('tokenizer_word.pickle', 'wb') as handle:
    pickle.dump(tok_word, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('tokenizer_char.pickle', 'wb') as handle:
    pickle.dump(tok_char, handle, protocol=pickle.HIGHEST_PROTOCOL)

# # loading
# with open('tokenizer.pickle', 'rb') as handle:
#     tokenizer = pickle.load(handle)

In [16]:
with open('tokenizer_word.pickle', 'rb') as handle:
    tok_word2 = pickle.load(handle)

In [None]:
embeddings_index = {}
with open(EMBEDDING_FILE,encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [None]:
word_index = tok_word.word_index
#prepare embedding matrix
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [17]:
char_index = tok_char.word_index
#https://minimaxir.com/2017/04/char-embeddings/
#print(char_index)
char_embed_size = 16

In [19]:
#model:
sequence_input = Input(shape=(maxlen, ))
x = Embedding(max_features, embed_size,trainable = True)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(LSTM(128, return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
w_conv1 = Conv1D(32, kernel_size = 1, padding = "same", kernel_initializer = "glorot_uniform")(x)
w_conv2 = Conv1D(32, kernel_size = 2, padding = "same", kernel_initializer = "glorot_uniform")(x)
w_conv3 = Conv1D(32, kernel_size = 3, padding = "same", kernel_initializer = "glorot_uniform")(x)
w_conv4 = Conv1D(32, kernel_size = 4, padding = "same", kernel_initializer = "glorot_uniform")(x)
w_conv5 = Conv1D(32, kernel_size = 5, padding = "same", kernel_initializer = "glorot_uniform")(x)
w_conv = concatenate([w_conv1, w_conv2, w_conv3, w_conv4, w_conv5]) 
w_avg_pool = GlobalAveragePooling1D()(w_conv)
w_max_pool = GlobalMaxPooling1D()(w_conv)

char_input_init = Input(shape=(char_max_len, ))
char_input = Embedding(len(char_index)+1, char_embed_size, trainable = True)(char_input_init)
char_input = SpatialDropout1D(0.2)(char_input)
#char_input = Bidirectional(LSTM(128, return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(char_input)
# do characters have long term dependencies?????
c_conv1 = Conv1D(32, kernel_size = 3, padding = "same", kernel_initializer = "glorot_uniform")(char_input)
c_conv2 = Conv1D(32, kernel_size = 4, padding = "same", kernel_initializer = "glorot_uniform")(char_input)
c_conv3 = Conv1D(32, kernel_size = 5, padding = "same", kernel_initializer = "glorot_uniform")(char_input)
c_conv4 = Conv1D(32, kernel_size = 6, padding = "same", kernel_initializer = "glorot_uniform")(char_input)
c_conv5 = Conv1D(32, kernel_size = 7, padding = "same", kernel_initializer = "glorot_uniform")(char_input)
c_conv = concatenate([c_conv1, c_conv2, c_conv3, c_conv4, c_conv5]) 
c_avg_pool = GlobalAveragePooling1D()(c_conv)
c_max_pool = GlobalMaxPooling1D()(c_conv)

x = concatenate([w_avg_pool, w_max_pool, c_avg_pool, c_max_pool])

# x = Dense(128, activation='relu')(x)
# x = Dropout(0.1)(x)
preds = Dense(6, activation="sigmoid")(x)
model = Model(inputs=[sequence_input, char_input_init], outputs=preds)
model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3),metrics=['accuracy'])
model.summary()




__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 150, 300)     30000000    input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, 150, 300)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
embedding_

In [20]:
batch_size = 1024
epochs = 4
X_tra_w, X_val_w, y_tra_w, y_val_w = train_test_split(x_train_word, y_train, train_size=0.9, random_state=233)
X_tra_c, X_val_c, _, _ = train_test_split(x_train_char, y_train, train_size=0.9, random_state=233)



In [None]:
del x_train_word
del y_train
del x_train_char
del X_train
del char_index
del word_index
del train
del test
import gc
gc.collect()

In [21]:
# filepath="../input/best-model/best.hdf5"
filepath="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)
ra_val = RocAucEvaluation(validation_data=([X_val_w, X_val_c], y_val_w), interval = 1)
callbacks_list = [ra_val,checkpoint, early]

In [None]:
#https://stackoverflow.com/questions/44615420/multiple-inputs-with-keras-functional-api
#model.fit([train_X_hour, train_X_port], [train_Y_hour, train_Y_port] epochs=10 batch_size=1, verbose=2, shuffle=False)
model.fit([X_tra_w, X_tra_c], y_tra_w, batch_size=batch_size, epochs=100,validation_data=([X_val_w, X_val_c], y_val_w),callbacks = callbacks_list,verbose=1)
#Loading model weights
model.load_weights(filepath)
#print('Predicting....')
#y_pred = model.predict(x_test,batch_size=1024,verbose=1)

In [22]:
filepath="weights_base.best.cnn_word_cnn_char.hdf5"
model.load_weights(filepath)

In [23]:
# [X_test[0]] 

In [24]:
def single_sentence_tokenizer(tok_word=tok_word, x = X_test[0]):
    print('given sentence:', x)
    X_test_word=tok_word.texts_to_sequences(np.asarray([x]))
    print('word tokens:', X_test_word)
    print('word tokens length:', X_test_word[0].__len__())
    X_test_word=sequence.pad_sequences(X_test_word,maxlen=maxlen)
    
    X_test_char=tok_char.texts_to_sequences(np.asarray([x]))
    print('char tokens:', X_test_char)
    print('char tokens length:',X_test_char[0].__len__())
    X_test_char=sequence.pad_sequences(X_test_char,maxlen=400)
    
    return [X_test_word, X_test_char]
    
testSentence = single_sentence_tokenizer(x='hello world! its been a longe time')


given sentence: hello world! its been a longe time
word tokens: [[320, 244, 104, 57, 4, 85]]
word tokens length: 6
char tokens: [[10, 2, 11, 11, 6, 1, 20, 6, 9, 11, 12, 30, 1, 5, 3, 8, 1, 21, 2, 2, 7, 1, 4, 1, 11, 6, 7, 18, 2, 1, 3, 5, 15, 2]]
char tokens length: 34


In [25]:
x_test_word.shape

(153164, 150)

In [26]:
testSentence = single_sentence_tokenizer(x='hate you')
testPrediction = model.predict(testSentence)
abuseDimensions = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
print()
for i in range(6):
    print(abuseDimensions[i], ':' , testPrediction[0][i]*100.0, '%')

given sentence: hate you
word tokens: [[360, 6]]
word tokens length: 2
char tokens: [[10, 4, 3, 2, 1, 17, 6, 13]]
char tokens length: 8

toxic : 86.803239584 %
severe_toxic : 1.4201676473 %
obscene : 12.7983868122 %
threat : 1.15623837337 %
insult : 16.1546051502 %
identity_hate : 2.70609539002 %


In [15]:
html_as_string = """
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa&gt;bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb&gt;cccccc
ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc&gt;ddddddd
"""

In [16]:
html_as_string.__len__()

269

In [17]:
from html.parser import HTMLParser

class HTMLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = HTMLStripper()
    s.feed(html)
    return s.get_data()

In [29]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_as_string, "lxml")

def stripHtmlTags(htmlTxt):
    if htmlTxt is None:
        return None
    else:
        return ''.join(BeautifulSoup(htmlTxt).findAll(text=True)) 

#print(stripHtmlTags(html_as_string))
t = soup.get_text()
t = t.split('\n')
print(t)

['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb>cccccc', 'ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc>ddddddd', '']


In [28]:
print(strip_tags(html_as_string).split('\n'))
len(strip_tags(html_as_string))
#strip_tags(html_as_string)

['', 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa>bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb>cccccc', 'ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc>ddddddd', '']


260

In [55]:
import gc
gc.collect()

12470

In [None]:
#del model