In [1]:
# read data from data set
from utils.data_utils import jigsaw_toxix_ds_get_df
import numpy as np
from config import *
import seaborn as sns
import matplotlib.pyplot as plt
# Pull the Jigsaw toxic-comment frame and collapse every comment onto a single
# line so that newline can be used as a record separator below.
df = jigsaw_toxix_ds_get_df()
comments = [raw.replace('\n', ' ') for raw in df["comment_text"].tolist()]

# Label columns, in the order used for every label matrix in this notebook.
classes = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
total_classes = len(classes)

# One comment per line, as a single string.
dump_text = '\n'.join(comments)

# Integer 0/1 label table and its raw ndarray view.
class_matrix = df[classes].astype('int')
label_matrix = class_matrix.values

h_dim = 300


In [2]:
# load word2vec model
import os 
from config import model_folder
from model_factory.embeddings.w2v import w2v_load_from_keyedvectors, build_embedding_layer
# Settings consumed by the tokenizer and the embedding layer further down.
max_sent_length = 80
trainable = False

# Locate and load the pretrained keyed-vector file from the configured folder.
w2v_name = 'google_keyed_vector_format.bin'
model_path = os.path.join(model_folder, w2v_name)
print('loading pretrained w2v', end='...')
w2v_model = w2v_load_from_keyedvectors(model_path)
vocab = w2v_model.vocab
print('done.')

# Map every vocabulary word to its position in the vocabulary's iteration order.
word_2_idx = {word: position for position, word in enumerate(vocab.keys())}



Using TensorFlow backend.


loading pretrained w2v...done.


In [None]:
# tokenization
import nltk
from keras.preprocessing.sequence import pad_sequences
def tokenizer(text, word2idx, max_len=80, total=None):
    """Convert raw sentences into a padded matrix of vocabulary indices.

    Each sentence is split with ``nltk.word_tokenize``; every token is looked
    up in ``word2idx`` and tokens missing from the mapping fall back to the
    index stored under the ``'null'`` key. The resulting index sequences are
    handed to ``pad_sequences`` with ``maxlen=max_len`` and the same ``'null'``
    index as the fill value.

    Args:
        text: sequence of sentence strings. It is never modified (the previous
            implementation overwrote the caller's list when ``total`` was None).
        word2idx: mapping from token to integer index; must contain ``'null'``.
        max_len: ``maxlen`` forwarded to ``pad_sequences``.
        total: if given, only the first ``total`` sentences are processed.

    Returns:
        Whatever ``pad_sequences`` returns for the index sequences.

    Raises:
        KeyError: if ``word2idx`` has no ``'null'`` entry.
    """
    sentences = text if total is None else text[:total]

    # Looked up once: it serves both as the out-of-vocabulary index and as the
    # padding value.
    null_idx = word2idx['null']

    sequences = [
        [word2idx.get(token, null_idx) for token in nltk.word_tokenize(sentence)]
        for sentence in sentences
    ]
    return pad_sequences(sequences, maxlen=max_len, value=null_idx)

# Cap on how many comments are tokenized; None means "use everything".
num_samples = 50000

print('tokenizing data', end='...')
tokenized_sequence = tokenizer(
    comments,
    word_2_idx,
    max_len=max_sent_length,
    total=num_samples,
)
# Keep the label rows aligned with the truncated comment list.
if num_samples is not None:
    label_matrix = class_matrix.values[:num_samples]
print('done.')

tokenizing data...

In [None]:
# bayes network

from model_factory.lstm_mc_dropout import SeqCLS
from keras import backend as K

def get_new_model(w2v_model):
    """Create and configure a fresh ``SeqCLS`` model around a pretrained embedding.

    The embedding layer is built from ``w2v_model`` together with the notebook
    globals ``word_2_idx``, ``vocab``, ``max_sent_length`` and ``trainable``.
    The model is then configured with ``total_classes`` outputs.

    Args:
        w2v_model: pretrained word-vector model passed to ``build_embedding_layer``.

    Returns:
        The configured ``SeqCLS`` instance.
    """
    print('initialize embedding layer', end='...')
    pretrained = build_embedding_layer(
        w2v_model,
        word_2_idx,
        len(vocab),
        max_sent_length,
        trainable,
    )
    print('done.')

    classifier = SeqCLS()
    # NOTE(review): the positional 300 presumably is a layer/vector width and
    # duplicates the literal in `h_dim` — confirm against SeqCLS.configure.
    configure_kwargs = dict(pretrained_embedding=pretrained, verbose=1)
    classifier.configure(None, None, total_classes, 300, **configure_kwargs)
    return classifier


            
# Single shared classifier instance; the selection-loop and random-baseline
# cells below both call fit()/predict on this same object.
model_copy = get_new_model(w2v_model)

In [None]:
def eval_model(m, test_X, test_Y, sim=10, threshold = 0.5):
    """Print per-class sample count, precision, recall and prior for a model.

    The model's scores are binarised with ``score >= threshold`` and compared
    against the ground truth one label column at a time. Results are printed,
    nothing is returned.

    The binarisation is done on a fresh array. The previous implementation
    wrote 0/1 back into the array returned by ``predict_with_uncertainty``
    (through its transposed view), and its two-pass overwrite turned every
    prediction into 0 whenever ``threshold`` was greater than 1.

    ``precision_score`` and ``recall_score`` must already be imported in the
    notebook namespace when this function is called.

    Args:
        m: object exposing ``predict_with_uncertainty(X, sim=...)`` that returns
            a ``(predictions, uncertainties)`` pair.
        test_X: inputs forwarded unchanged to ``m``.
        test_Y: ground-truth labels; transposed so that each row is one class
            (assumes a 2-D samples-by-classes array — TODO confirm).
        sim: forwarded to ``predict_with_uncertainty`` as ``sim``.
        threshold: scores greater than or equal to this count as positive.
    """
    pred_Y, uncertainty_Y = m.predict_with_uncertainty(test_X, sim=sim)
    topic_Y = np.asarray(test_Y).T
    pred_topic_Y = np.asarray(pred_Y).T

    print(threshold)
    print(','.join(['sample size','precision','recall', 'prior']))
    for true_topic, topic_scores in zip(topic_Y, pred_topic_Y):
        # New 0/1 array; the model's score array stays intact.
        pred_topic = (topic_scores >= threshold).astype(int)
        positives = true_topic.sum()
        print('%d, %.2f, %.2f, %.2f'
              % (positives,
                 precision_score(true_topic, pred_topic, average='binary'),
                 recall_score(true_topic, pred_topic),
                 positives/len(true_topic)))

In [6]:
# importance sampling
# The training pool is walked in fixed-size batches. Each batch is scored with
# `predict_with_uncertainty`, and the samples whose uncertainty for
# `target_topic_idx` lies above the batch's 90th percentile are appended to the
# cumulative training set used to (re)fit the model on the next iteration.

import keras
from keras import backend as K
# Public import path; `sklearn.metrics.classification` was a private module
# that newer scikit-learn releases no longer ship.
from sklearn.metrics import precision_score, recall_score

batch_size = 5000
batch_idx = 0
train_test_split = 0.9
divider = int(len(tokenized_sequence) * train_test_split)
train_X, train_Y = tokenized_sequence[:divider], label_matrix[:divider]
test_X, test_Y = tokenized_sequence[divider:], label_matrix[divider:]
total_data = train_X.shape[0]
sim = 10
total_X = []
total_Y = []


print(batch_size)
target_topic_idx = 0

while True:
    # Half-open window [l, r). The upper bound is clamped to `total_data`
    # (not `total_data-1`) so the final training row is scored as well.
    l, r = batch_idx*batch_size, min((batch_idx+1)*batch_size, total_data)
    if l >= r:
        # Pool exhausted (an empty window would break np.percentile below).
        batch_idx = 0
        break
    this_batch_indices = np.arange(l, r)
    selected_batch = train_X[this_batch_indices]
    # NOTE(review): weights are not reset between rounds (a `reset_weights()`
    # call was commented out here), so every fit continues from the previous one.
    if len(total_X) > 0:
        model_copy.fit(
            np.array(total_X),
            np.array(total_Y),
            epochs=20, batch_size=100, verbose=0)
    pred_Y, uncertainty_Y = model_copy.predict_with_uncertainty(selected_batch, sim=sim)

    # Swap first and last axes so the array can be indexed by topic first
    # (assumes uncertainty_Y is samples-by-topics — TODO confirm).
    uncertainty_Y_cpy = np.swapaxes(uncertainty_Y, 0, -1)

    topic_uncertainties = uncertainty_Y_cpy[target_topic_idx]
    percentile = np.percentile(topic_uncertainties, 90)
    print(percentile, end=',')
    # Positions inside the batch -> absolute row indices in train_X / train_Y.
    selected = np.where(topic_uncertainties > percentile)
    selected_batch_indices = this_batch_indices[selected]
    selected_X = train_X[selected_batch_indices].tolist()
    selected_Y = train_Y[selected_batch_indices].tolist()
    total_X += selected_X
    total_Y += selected_Y

    eval_model(model_copy, test_X, test_Y)
    batch_idx += 1


print('Done!')

5000
0.259034812883,0.5
sample size,precision,recall,prior
455, 0.09, 0.53, 0.09
41, 0.01, 0.44, 0.01
243, 0.05, 0.48, 0.05
13, 0.00, 0.77, 0.00
209, 0.04, 0.51, 0.04
40, 0.01, 0.45, 0.01
0.0614055494291,0.5
sample size,precision,recall,prior
455, 0.00, 0.00, 0.09
41, 0.00, 0.00, 0.01
243, 0.00, 0.00, 0.05
13, 0.00, 0.00, 0.00
209, 0.00, 0.00, 0.04
40, 0.00, 0.00, 0.01


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.0733085202926,0.5
sample size,precision,recall,prior
455, 0.29, 0.01, 0.09
41, 0.00, 0.00, 0.01
243, 1.00, 0.00, 0.05
13, 0.00, 0.00, 0.00
209, 0.00, 0.00, 0.04
40, 0.00, 0.00, 0.01


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.138568742691,0.5
sample size,precision,recall,prior
455, 0.29, 0.04, 0.09
41, 0.00, 0.00, 0.01
243, 0.37, 0.04, 0.05
13, 0.00, 0.00, 0.00
209, 0.50, 0.04, 0.04
40, 0.00, 0.00, 0.01


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.0955973832715,0.5
sample size,precision,recall,prior
455, 0.62, 0.04, 0.09
41, 0.00, 0.00, 0.01
243, 0.64, 0.04, 0.05
13, 0.00, 0.00, 0.00
209, 0.71, 0.02, 0.04
40, 0.00, 0.00, 0.01


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.0997819102574,0.5
sample size,precision,recall,prior
455, 0.67, 0.07, 0.09
41, 0.00, 0.00, 0.01
243, 0.67, 0.08, 0.05
13, 0.00, 0.00, 0.00
209, 0.74, 0.08, 0.04
40, 0.00, 0.00, 0.01


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.152532349041,0.5
sample size,precision,recall,prior
455, 0.69, 0.14, 0.09
41, 0.50, 0.02, 0.01
243, 0.67, 0.14, 0.05
13, 0.00, 0.00, 0.00
209, 0.68, 0.17, 0.04
40, 0.00, 0.00, 0.01


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.178650489239,0.5
sample size,precision,recall,prior
455, 0.63, 0.19, 0.09
41, 0.20, 0.02, 0.01
243, 0.62, 0.16, 0.05
13, 0.00, 0.00, 0.00
209, 0.70, 0.18, 0.04
40, 0.00, 0.00, 0.01


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.171107297787,0.5
sample size,precision,recall,prior
455, 0.73, 0.22, 0.09
41, 0.00, 0.00, 0.01
243, 0.71, 0.19, 0.05
13, 0.00, 0.00, 0.00
209, 0.67, 0.19, 0.04
40, 0.00, 0.00, 0.01


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [7]:
# NOTE(review): leftover scratch cell — it only prints a fixed string and does
# not feed anything below; candidate for deletion.
print('hello')

hello


In [None]:
# random selection strategy
# Baseline: fit on a uniformly random subset of the training pool and evaluate
# on the same held-out split used by the uncertainty-based loop.
import keras
from keras import backend as K
# Public import path; `sklearn.metrics.classification` was a private module
# that newer scikit-learn releases no longer ship.
from sklearn.metrics import precision_score, recall_score

batch_size = 5000
batch_idx = 0
train_test_split = 0.9
divider = int(len(tokenized_sequence) * train_test_split)
train_X, train_Y = tokenized_sequence[:divider], label_matrix[:divider]
test_X, test_Y = tokenized_sequence[divider:], label_matrix[divider:]
total_data = train_X.shape[0]
# Draw `batch_size` distinct rows (previously a hard-coded 5000), clamped to the
# pool size because sampling without replacement cannot exceed it.
random_selected_idx = np.random.choice(
    total_data, min(batch_size, total_data), replace=False)
sim = 10

# NOTE(review): `model_copy` is the same object the selection loop trains; if
# that cell ran earlier in this kernel, this fit continues from those weights
# rather than from a fresh model — confirm this is the intended baseline.
model_copy.fit(
    train_X[random_selected_idx],
    train_Y[random_selected_idx],
    epochs=50, batch_size=100, verbose=0)
eval_model(model_copy, test_X, test_Y)
