# Detecting toxicity in Wikipedia comments

This notebook develops a model to tackle the problem posed by the [toxic comment classification challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) hosted on Kaggle.

We will make use of a pre-trained GloVe embedding available on the [GloVe website](https://nlp.stanford.edu/projects/glove/).

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
TRAIN_CSV = 'data/train.csv'
TEST_CSV = 'data/test.csv'

In [3]:
GLOVE_EMBEDDING = 'data/glove.6B.50d.txt'

In [4]:
MAX_DOCUMENT_LENGTH = 100
EMBEDDING_SIZE = 50
RNN_CELL_SIZE = 128
BATCH_SIZE = 256
ATTENTION_RELU_LAYERS = [32, 16]
EMBEDDING_TRAINABLE = True
LABEL_KEYS = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
LEARNING_RATE=0.01

In [5]:
UNKOWN_TOKEN = '<UNK>'
UNKNOWN_TOKEN_EMBEDDING = [-1.0 for _ in range(EMBEDDING_SIZE)]

In [6]:
with tf.gfile.Open(GLOVE_EMBEDDING) as glove_file:
    glove_lines = glove_file.readlines()

words = []
embeddings_list = [UNKNOWN_TOKEN_EMBEDDING]

for line in glove_lines:
    items = line.strip().split(' ')
    words.append(items[0])
    embeddings_list.append([float(x) for x in items[1:]])

embeddings = np.asarray(embeddings_list)

In [7]:
embeddings.shape

(400001, 50)

In [8]:
training_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [9]:
training_data.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [10]:
test_data.head(5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [11]:
preprocessor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
preprocessor.fit(words)

<tensorflow.contrib.learn.python.learn.preprocessing.text.VocabularyProcessor at 0x7f6153aa3668>

In [12]:
preprocessor.vocabulary_.get(UNKOWN_TOKEN)

0

In [13]:
def generate_training_input_fn(dataframe, preprocessor, shuffle=True):
    comments = {'comments': np.asarray(list(preprocessor.transform(dataframe.comment_text)))}
    labels = {
        key:np.reshape(np.asarray(dataframe[key], dtype=np.float64), (-1, 1))
        for key in dataframe.columns[2:]
    }
    
    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((comments, labels))
        dataset = dataset.batch(BATCH_SIZE)
        if shuffle:
            dataset = dataset.shuffle(2048)
        iterator = dataset.make_one_shot_iterator()
        inputs = iterator.get_next()
        return inputs
    
    return input_fn

In [38]:
def generate_model_fn(initial_embedding, embedding_trainable=True):
    def model_fn(features, labels, mode, params=None, config=None):
        if params is None:
            params = {}
        rnn_cell_size = params.get('rnn_cell_size', RNN_CELL_SIZE)
        attention_relu_layers = params.get('attention_relu_layers', ATTENTION_RELU_LAYERS)
        label_keys = params.get('label_keys', LABEL_KEYS)
        learning_rate = params.get('learning_rate', LEARNING_RATE)
        
        comments = features.get('comments')
        
        with tf.variable_scope('embed'):
            embedding = tf.Variable(initial_embedding, trainable=embedding_trainable, name='embedding')
            embedded_comments = tf.nn.embedding_lookup(embedding, comments, name='embedded_documents')
        
        with tf.variable_scope('encode'):
            forward_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_cell_size, name='forward_rnn_cell')
            backward_rnn_cell = tf.nn.rnn_cell.GRUCell(rnn_cell_size, name='backward_rnn_cell')
            encoder_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(
                forward_rnn_cell,
                backward_rnn_cell,
                embedded_comments,
                dtype=tf.float64,
                time_major=False
            )
        
        with tf.variable_scope('attend'):
            attention_inputs = tf.concat(encoder_outputs, axis=2, name='inputs')
            comment_length = attention_inputs.shape[1].value
            final_layer_size = attention_inputs.shape[2].value
            current_layer = tf.reshape(attention_inputs, [-1, final_layer_size], name='wtf')
            for layer_out_size in attention_relu_layers:
                current_layer = tf.layers.dense(current_layer, layer_out_size, activation=tf.nn.relu)
            current_layer = tf.layers.dense(current_layer, 1, activation=tf.nn.relu)
            attention_logits = tf.reshape(current_layer, [-1, comment_length, 1], name='logits')
            alphas = tf.nn.softmax(attention_logits, name='alphas', axis=1)
            attention_outputs = tf.reduce_sum(
                tf.multiply(attention_inputs, alphas),
                axis=1,
                name='outputs'
            )
        
        with tf.variable_scope('predict'):
            logits_dict = {}
            for key in label_keys:
                logits_dict[key] = tf.layers.dense(attention_outputs, 1, name='{}_logits'.format(key))
            
            predictions_dict = {
                key:tf.sigmoid(logits_dict[key], name='{}_probability'.format(key))
                for key in logits_dict
            }
            
            prediction_outputs = tf.estimator.export.PredictOutput(predictions_dict)
            export_outputs = {
                tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: prediction_outputs
            }
            
        loss = None
        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            losses_dict = {}
            for key in logits_dict:
                losses_dict[key] = tf.nn.sigmoid_cross_entropy_with_logits(
                    labels = labels[key],
                    logits = logits_dict[key],
                    name='{}_cross_entropy_loss'.format(key)
                )
            loss = tf.add_n(list(losses_dict.values()))
        
        train_op = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
            global_step = tf.train.get_global_step()
            train_op = optimizer.minimize(loss, global_step=global_step)
        
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions_dict,
            loss=loss,
            train_op=train_op,
            export_outputs=export_outputs
        )
    
    return model_fn

In [39]:
training_input_fn = generate_training_input_fn(training_data, preprocessor)
eval_input_fn = generate_training_input_fn(training_data, preprocessor)

In [40]:
estimator = tf.estimator.Estimator(
    model_fn=generate_model_fn(embeddings),
    model_dir='toxicity-model'
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_num_worker_replicas': 1, '_evaluation_master': '', '_num_ps_replicas': 0, '_is_chief': True, '_service': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_global_id_in_cluster': 0, '_save_summary_steps': 100, '_keep_checkpoint_max': 5, '_task_id': 0, '_task_type': 'worker', '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_master': '', '_log_step_count_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f614d354c50>, '_tf_random_seed': None, '_model_dir': 'toxicity-model'}


In [41]:
train_spec = tf.estimator.TrainSpec(training_input_fn, max_steps=10)
eval_spec = tf.estimator.EvalSpec(eval_input_fn)

In [42]:
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 600 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


InvalidArgumentError: Input to reshape is a tensor with 256 values, but the requested shape has 1
	 [[Node: Reshape = Reshape[T=DT_DOUBLE, Tshape=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"](AddN, Reshape/shape)]]
	 [[Node: Reshape/_161 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1540_Reshape", tensor_type=DT_DOUBLE, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'Reshape', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tornado/ioloop.py", line 832, in start
    self._run_callback(self._callbacks.popleft())
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tornado/ioloop.py", line 605, in _run_callback
    ret = callback()
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 536, in <lambda>
    self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2856, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-42-30df9d6a3d21>", line 1, in <module>
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tensorflow/python/estimator/training.py", line 421, in train_and_evaluate
    executor.run()
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tensorflow/python/estimator/training.py", line 494, in run
    self.run_local()
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tensorflow/python/estimator/training.py", line 626, in run_local
    hooks=train_hooks)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py", line 352, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py", line 812, in _train_model
    features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tensorflow/python/estimator/estimator.py", line 793, in _call_model_fn
    model_fn_results = self._model_fn(features=features, **kwargs)
  File "<ipython-input-38-5145b17477be>", line 80, in model_fn
    export_outputs=export_outputs
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tensorflow/python/estimator/model_fn.py", line 185, in __new__
    loss = array_ops.reshape(loss, [])
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 3903, in reshape
    "Reshape", tensor=tensor, shape=shape, name=name)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3271, in create_op
    op_def=op_def)
  File "/home/fuzzyfroghunter/.virtualenvs/toxicity/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1650, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): Input to reshape is a tensor with 256 values, but the requested shape has 1
	 [[Node: Reshape = Reshape[T=DT_DOUBLE, Tshape=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"](AddN, Reshape/shape)]]
	 [[Node: Reshape/_161 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1540_Reshape", tensor_type=DT_DOUBLE, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [37]:
import gc

tf.reset_default_graph()
gc.collect()

0