In [1]:
# read data from data set
from utils.data_utils import jigsaw_toxix_ds_get_df
import numpy as np
from config import *
df = jigsaw_toxix_ds_get_df()
comments = df["comment_text"].tolist()
comments = [x.replace('\n', ' ') for x in comments]
dump_text = '\n'.join(comments)
total_classes = 6
class_matrix = df[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]].astype('int')
label_matrix = class_matrix.values



In [2]:
# load word2vec model
import os 
from config import model_folder
from model_factory.embeddings.w2v import w2v_load_from_keyedvectors, build_embedding_layer
w2v_name = 'google_keyed_vector_format.bin'
model_path = os.path.join(model_folder, w2v_name)
    
w2v_model = w2v_load_from_keyedvectors(model_path)
vocab = w2v_model.vocab

word_2_idx = dict(zip(vocab.keys(), range(len(vocab))))
max_sent_length = 80
trainable = False
embedding_layer = build_embedding_layer(w2v_model, word_2_idx, len(vocab), max_sent_length, trainable)
print(embedding_layer)

Using TensorFlow backend.


<keras.layers.embeddings.Embedding object at 0x7fdd34c8b080>


In [6]:
# tokenization
import nltk
from keras.preprocessing.sequence import pad_sequences



def tokenizer(text, word2idx, max_len=80, total=5000):
    text = text[:total]
    for sentence_idx in range(len(text)):
        sentence = text[sentence_idx]
        text[sentence_idx] = nltk.word_tokenize(sentence)
 
    def _sent_to_idx(s, w2i):
        for word_idx in range(len(s)):
            word = s[word_idx]
            idx = w2i.get(word, w2i['null'])
            s[word_idx] = idx
        return s
        
    for sentence_idx in range(len(text)):
        sentence = text[sentence_idx]
        sequence = _sent_to_idx(sentence, word2idx)
        text[sentence_idx] = sequence
    
    text = pad_sequences(text, maxlen=max_len, value=word2idx['null'])   
    return text

num_samples = 5000
tokenized_sequence = tokenizer(comments, word_2_idx, max_sent_length, total=num_samples)
label_matrix = label_matrix[:num_samples]

In [9]:
# build model
from keras import models, layers, objectives, metrics
h_dim = 300
m = models.Sequential()
m.add(embedding_layer)
m.add(layers.LSTM(
    units=h_dim, 
    return_sequences=False,
))
m.add(layers.Dense(total_classes, activation='sigmoid'))
m.compile(optimizer='adam', loss=objectives.binary_crossentropy, metrics=[metrics.top_k_categorical_accuracy])


In [10]:
# join embeddings with data set dataframe
m.fit(tokenized_sequence, label_matrix)


Epoch 1/1


  32/5000 [..............................] - ETA: 1:49 - loss: 0.7036 - top_k_categorical_accuracy: 0.9688

  64/5000 [..............................] - ETA: 1:01 - loss: 0.6756 - top_k_categorical_accuracy: 0.9844

  96/5000 [..............................] - ETA: 44s - loss: 0.6479 - top_k_categorical_accuracy: 0.9896 

 128/5000 [..............................] - ETA: 36s - loss: 0.6088 - top_k_categorical_accuracy: 0.9922

 160/5000 [..............................] - ETA: 31s - loss: 0.5577 - top_k_categorical_accuracy: 0.9938

 192/5000 [>.............................] - ETA: 28s - loss: 0.4939 - top_k_categorical_accuracy: 0.9948

 224/5000 [>.............................] - ETA: 25s - loss: 0.4538 - top_k_categorical_accuracy: 0.9955

 256/5000 [>.............................] - ETA: 23s - loss: 0.4239 - top_k_categorical_accuracy: 0.9961

 288/5000 [>.............................] - ETA: 22s - loss: 0.4073 - top_k_categorical_accuracy: 0.9965

 320/5000 [>.............................] - ETA: 21s - loss: 0.3834 - top_k_categorical_accuracy: 0.9969

 352/5000 [=>............................] - ETA: 20s - loss: 0.3519 - top_k_categorical_accuracy: 0.9972

 384/5000 [=>............................] - ETA: 19s - loss: 0.3314 - top_k_categorical_accuracy: 0.9974

 416/5000 [=>............................] - ETA: 18s - loss: 0.3104 - top_k_categorical_accuracy: 0.9976

 448/5000 [=>............................] - ETA: 18s - loss: 0.2997 - top_k_categorical_accuracy: 0.9978

 480/5000 [=>............................] - ETA: 17s - loss: 0.2883 - top_k_categorical_accuracy: 0.9979

 512/5000 [==>...........................] - ETA: 17s - loss: 0.2741 - top_k_categorical_accuracy: 0.9980

 544/5000 [==>...........................] - ETA: 17s - loss: 0.2625 - top_k_categorical_accuracy: 0.9982

 576/5000 [==>...........................] - ETA: 16s - loss: 0.2573 - top_k_categorical_accuracy: 0.9983

 608/5000 [==>...........................] - ETA: 16s - loss: 0.2510 - top_k_categorical_accuracy: 0.9984

 640/5000 [==>...........................] - ETA: 16s - loss: 0.2453 - top_k_categorical_accuracy: 0.9984

 672/5000 [===>..........................] - ETA: 15s - loss: 0.2426 - top_k_categorical_accuracy: 0.9985

 704/5000 [===>..........................] - ETA: 15s - loss: 0.2454 - top_k_categorical_accuracy: 0.9986

 736/5000 [===>..........................] - ETA: 15s - loss: 0.2445 - top_k_categorical_accuracy: 0.9986

 768/5000 [===>..........................] - ETA: 14s - loss: 0.2388 - top_k_categorical_accuracy: 0.9987

 800/5000 [===>..........................] - ETA: 14s - loss: 0.2386 - top_k_categorical_accuracy: 0.9988

 832/5000 [===>..........................] - ETA: 14s - loss: 0.2396 - top_k_categorical_accuracy: 0.9988

 864/5000 [====>.........................] - ETA: 14s - loss: 0.2383 - top_k_categorical_accuracy: 0.9988

 896/5000 [====>.........................] - ETA: 14s - loss: 0.2332 - top_k_categorical_accuracy: 0.9989

 928/5000 [====>.........................] - ETA: 13s - loss: 0.2340 - top_k_categorical_accuracy: 0.9989

 960/5000 [====>.........................] - ETA: 13s - loss: 0.2312 - top_k_categorical_accuracy: 0.9990

 992/5000 [====>.........................] - ETA: 13s - loss: 0.2269 - top_k_categorical_accuracy: 0.9990

1024/5000 [=====>........................] - ETA: 13s - loss: 0.2224 - top_k_categorical_accuracy: 0.9990

1056/5000 [=====>........................] - ETA: 13s - loss: 0.2180 - top_k_categorical_accuracy: 0.9991

1088/5000 [=====>........................] - ETA: 13s - loss: 0.2140 - top_k_categorical_accuracy: 0.9991

1120/5000 [=====>........................] - ETA: 12s - loss: 0.2155 - top_k_categorical_accuracy: 0.9991

1152/5000 [=====>........................] - ETA: 12s - loss: 0.2121 - top_k_categorical_accuracy: 0.9991



















































































































































































































































<keras.callbacks.History at 0x7fdc6214bc50>

In [None]:
# extract embedding column and label columns

In [None]:
# convert to X, Y

In [None]:
# fit model and test