In [1]:
# read data from data set
from utils.data_utils import jigsaw_toxix_ds_get_df
import numpy as np
from config import *
import seaborn as sns
import matplotlib.pyplot as plt
df = jigsaw_toxix_ds_get_df()
comments = df["comment_text"].tolist()
comments = [x.replace('\n', ' ') for x in comments]

classes = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
dump_text = '\n'.join(comments)
total_classes = 6
class_matrix = df[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]].astype('int')
label_matrix = class_matrix.values
h_dim = 300


In [2]:
# load word2vec model
import os 
from config import model_folder
from model_factory.embeddings.w2v import w2v_load_from_keyedvectors, build_embedding_layer
w2v_name = 'google_keyed_vector_format.bin'
model_path = os.path.join(model_folder, w2v_name)
print('loading pretrained w2v', end='...')    
w2v_model = w2v_load_from_keyedvectors(model_path)
vocab = w2v_model.vocab
print('done.')
word_2_idx = dict(zip(vocab.keys(), range(len(vocab))))
max_sent_length = 80
trainable = False



(array([ 6.66666667,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  3.33333333]), array([ 1. ,  1.1,  1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ]))


Using TensorFlow backend.


loading pretrained w2v...done.


In [3]:
# tokenization
import nltk
from keras.preprocessing.sequence import pad_sequences
def tokenizer(text, word2idx, max_len=80, total=None):
    if total is not None:
        text = text[:total]
    for sentence_idx in range(len(text)):
        sentence = text[sentence_idx]
        text[sentence_idx] = nltk.word_tokenize(sentence)
 
    def _sent_to_idx(s, w2i):
        for word_idx in range(len(s)):
            word = s[word_idx]
            idx = w2i.get(word, w2i['null'])
            s[word_idx] = idx
        return s
        
    for sentence_idx in range(len(text)):
        sentence = text[sentence_idx]
        sequence = _sent_to_idx(sentence, word2idx)
        text[sentence_idx] = sequence
    
    text = pad_sequences(text, maxlen=max_len, value=word2idx['null'])   
    return text

num_samples = 5000
print('tokenizing data', end='...')
tokenized_sequence = tokenizer(comments, word_2_idx, max_sent_length, total=num_samples)
if num_samples is not None:
    label_matrix = class_matrix.values[:num_samples]
print('done.')

tokenizing data...done.


In [4]:
'''
   Created by Yubo Zhou on 28/03/19
'''

import keras
import keras.backend as K
import tensorflow as tf
from keras.losses import binary_crossentropy
import numpy as np

class SeqCLS(object):
    def __init__(self):
        self.m = None
        self.model_t = None
        self.num_classes = 0

    def configure(self, input_dim, seq_len, output_dim, h_dim, dropout=0.5,
                  loss=binary_crossentropy, pretrained_embedding=None,
                  verbose=0,
                  ):
        self.num_classes = output_dim
        # with tf.device('/cpu:0'):
        if True:
            m = keras.models.Sequential()
            if pretrained_embedding is None:
                lstm_layer = keras.layers.LSTM(
                    input_shape=(seq_len, input_dim),
                    return_sequences=False,
                    units=h_dim,
                    dropout=dropout, recurrent_dropout=dropout,
                )
                m.add(lstm_layer)
                if verbose:
                    m.summary()
            else:
                m.add(pretrained_embedding)
                m.add(
                    keras.layers.LSTM(
                        return_sequences=False,
                        units=h_dim,
                        dropout=dropout, recurrent_dropout=dropout,
                    )
                )

        dense_h = keras.layers.Dense(
            units=h_dim,
            activation='selu',
        )
        m.add(dense_h); m.add(keras.layers.AlphaDropout(0.5))
        m.add(
            keras.layers.Dense(
                units=self.num_classes,
                activation='sigmoid',

            )
        )
        m.compile(loss=loss, optimizer='adam')
        self.m = m
        tensors = K.function([self.m.layers[0].input, K.learning_phase()],
                                          [self.m.layers[-1].output])
        self.model_t = tensors
        if verbose:
            self.m.summary()

    def fit(self, X, Y, epochs=50, batch_size=32, validation_split=.0, shuffle=True, verbose=2):
        self.m.fit(X, Y, epochs=epochs, batch_size=batch_size, validation_split=validation_split,
                   shuffle=shuffle, verbose=verbose)

        tensors = K.function([self.m.layers[0].input, K.learning_phase()],
                                          [self.m.layers[-1].output])
        self.model_t = tensors


    def predict_with_uncertainty(self, X, sim=1):
        result = self.sample_output(X, n_iter=sim)
        prediction = self.m.predict(X)
        result_cpy = np.swapaxes(result, 0, 1)
        result_cpy = np.swapaxes(result_cpy, 1, 2)
        certainties = np.zeros((result_cpy.shape[0], result_cpy.shape[1]))
        for data_idx in range(result_cpy.shape[0]):
            for topic_idx in range(result_cpy[data_idx].shape[0]):
                samples = result_cpy[data_idx][topic_idx]
                ret = np.histogram(samples, normed=True)
                bins, bin_edges = ret
                norm_bins = bins/np.sum(bins)
                
                certainty_score = 0
                for idx in range(len(norm_bins)):
                    if bin_edges[idx] < prediction[data_idx][topic_idx] <= bin_edges[idx+1]:
                        certainty_score = norm_bins[idx]
                certainties[data_idx][topic_idx] = certainty_score
                print(prediction[data_idx][topic_idx], bin_edges, certainty_score)
                break
            break
        return prediction, uncertainties

    def sample_output(self, X, n_iter=1):
        result = np.zeros((n_iter,) + (X.shape[0], self.num_classes))
        for i in range(n_iter):
            result[i, :, :] = self.model_t((X, 1))[0]
        return result

    def summary(self):
        self.m.summary()
        
def eval_model(m, test_X, test_Y, sim=10, threshold = 0.5):
    pred_Y, uncertainty_Y = m.predict_with_uncertainty(test_X, sim=sim)
    topic_Y = test_Y.T
    pred_topic_Y = pred_Y.T
    # evaluate performance
    
    print(threshold)
    print(','.join(['sample size','precision','recall', 'prior']))
    # print(pred_topic_Y[0].tolist())
    for topic_idx in range(len(pred_topic_Y)):
        true_topic = topic_Y[topic_idx]
        pred_topic = pred_topic_Y[topic_idx]
        pred_topic[np.where(pred_topic >= threshold)] = 1
        pred_topic[np.where(pred_topic < threshold)] = 0
        print('%d, %.2f, %.2f, %.2f' 
              % (sum(true_topic), 
                 precision_score(true_topic, pred_topic, average='binary'), 
                 recall_score(true_topic, pred_topic), 
                 sum(true_topic)/len(true_topic)))

In [5]:
# bayes network

from keras import backend as K

def get_new_model(w2v_model):
    print('initialize embedding layer', end='...')
    embedding_layer = build_embedding_layer(w2v_model, word_2_idx, 
                                            len(vocab), max_sent_length, trainable)
    print('done.')
    m = SeqCLS()
    m.configure(None, 
                None, 
                total_classes, 300, 
                pretrained_embedding=embedding_layer,
                verbose=1,
                )
    return m


            
model_copy = get_new_model(w2v_model)

initialize embedding layer...done.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 80, 300)           900000000 
_________________________________________________________________
lstm_1 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dense_1 (Dense)              (None, 300)               90300     
_________________________________________________________________
alpha_dropout_1 (AlphaDropou (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 1806      
Total params: 900,813,306
Trainable params: 813,306
Non-trainable params: 900,000,000
_________________________________________________________________


In [6]:
# importance sampling

import keras
from keras import backend as K
from sklearn.metrics.classification import precision_score, recall_score

batch_size = 5000
batch_idx = 0
train_test_split = 0.9
divider = int(len(tokenized_sequence) * train_test_split)
train_X, train_Y = tokenized_sequence[:divider], label_matrix[:divider]
test_X, test_Y = tokenized_sequence[divider:], label_matrix[divider:]
total_data = train_X.shape[0]
sim = 10
total_X = []
total_Y = []


print(batch_size)
target_topic_idx = 0

while True:
    l, r = batch_idx*batch_size, min((batch_idx+1)*batch_size, total_data-1)
    if l > r:
        batch_idx = 0
        break
    this_batch_indices = np.array(range(l, r))
    selected_batch = train_X[this_batch_indices]
#     reset_weights()
    if len(total_X) > 0:        
        model_copy.fit(
            np.array(total_X), 
            np.array(total_Y), 
            epochs=20, batch_size=100, verbose=0)
    pred_Y, certainty_Y = model_copy.predict_with_uncertainty(selected_batch, sim=sim)
    

    certainty_Y_cpy = np.swapaxes(certainty_Y, 0, -1)
    
    for topic_index in range(certainty_Y_cpy.shape[0]):
        if topic_index == target_topic_idx:
            topic_certainties = certainty_Y_cpy[topic_index]
            percentile = np.percentile(topic_certainties, 10)
            print('precentile', percentile,)
            selected = np.where(topic_certainties <= percentile)
            selected_batch_indices = this_batch_indices[selected]
            selected_X = train_X[selected_batch_indices].tolist()
            selected_Y = train_Y[selected_batch_indices].tolist()
            total_X += selected_X
            total_Y += selected_Y

    eval_model(model_copy, test_X, test_Y)
    batch_idx += 1

    

print('Done!')
# print(uncertainty)

0.0221306 [ 0.08812507  0.09960809  0.1110911   0.12257411  0.13405712  0.14554014
  0.15702315  0.16850616  0.17998918  0.19147219  0.2029552 ]


UnboundLocalError: local variable 'uncertainty_score' referenced before assignment

In [7]:
print(uncertainty_Y)

[[ 0.   0.1  0.2  0.2  0.   0.2]
 [ 0.   0.3  0.2  0.   0.   0. ]
 [ 0.   0.1  0.   0.2  0.1  0.1]
 ..., 
 [ 0.1  0.   0.1  0.3  0.1  0.1]
 [ 0.   0.   0.1  0.   0.1  0.2]
 [ 0.   0.   0.3  0.   0.   0.1]]


In [6]:
# random selection strategy
import keras
from keras import backend as K
from sklearn.metrics.classification import precision_score, recall_score

batch_size = 5000
batch_idx = 0
train_test_split = 0.9
divider = int(len(tokenized_sequence) * train_test_split)
train_X, train_Y = tokenized_sequence[:divider], label_matrix[:divider]
test_X, test_Y = tokenized_sequence[divider:], label_matrix[divider:]
total_data = train_X.shape[0]
random_selected_idx = np.random.choice(range(total_data), num_samples//10, replace=False)
sim = 10

model_copy.fit(
    train_X[random_selected_idx], 
    train_Y[random_selected_idx], 
    epochs=50, batch_size=100, verbose=0)
eval_model(model_copy, test_X, test_Y)


0.5
sample size,precision,recall,prior
455, 0.66, 0.27, 0.09
41, 0.00, 0.00, 0.01
243, 0.63, 0.30, 0.05
13, 0.00, 0.00, 0.00
209, 0.57, 0.24, 0.04
40, 1.00, 0.03, 0.01


  'precision', 'predicted', average, warn_for)
