In [1]:
import pandas as pd
from keras.optimizers import RMSprop
import tensorflow as tf

Using TensorFlow backend.


In [2]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
data_path     = 'drive/My Drive/Colab Notebooks/adaptHAN_TCNN/data'
codes_path    = 'drive/My Drive/Colab Notebooks/adaptHAN_TCNN/codes'

In [0]:
train           = pd.read_csv(f'{data_path}/train_cleaned_no_punkt.csv')
test_labelled   = pd.read_csv(f'{data_path}/test_labelled_cleaned_no_punkt.csv')
test_unlabelled = pd.read_csv(f'{data_path}/test_unlabelled_cleaned_no_punkt.csv')

In [0]:
train['mal'] = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) >= 1  
train.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
train.comment_text.fillna("empty", inplace=True)

test_labelled['mal'] = test_labelled[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) >= 1  
test_labelled.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
test_labelled.comment_text.fillna("empty", inplace=True)

test_unlabelled.comment_text.fillna("empty", inplace=True)


In [0]:
train['len_words'] = train.comment_text.apply(lambda x: len(x.split()))
train['len_chars'] = train.comment_text.apply(lambda x: len(x))

In [0]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import CuDNNGRU, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras import backend as K
from tqdm import tqdm_notebook as tqdm
import pickle
import gc

In [0]:
# from https://github.com/philipperemy/keras-tcn


import keras.backend as K
import keras.layers
from keras import optimizers
from keras.engine.topology import Layer
from keras.layers import Activation, Lambda
from keras.layers import Conv1D, SpatialDropout1D
from keras.layers import Convolution1D, Dense
from keras.models import Input, Model
from typing import List, Tuple


def channel_normalization(x):
    # type: (Layer) -> Layer
    """ Normalize a layer to the maximum activation
    This keeps a layers values between zero and one.
    It helps with relu's unbounded activation
    Args:
        x: The layer to normalize
    Returns:
        A maximal normalized layer
    """
    max_values = K.max(K.abs(x), 2, keepdims=True) + 1e-5
    out = x / max_values
    return out


def wave_net_activation(x):
    # type: (Layer) -> Layer
    """This method defines the activation used for WaveNet
    described in https://deepmind.com/blog/wavenet-generative-model-raw-audio/
    Args:
        x: The layer we want to apply the activation to
    Returns:
        A new layer with the wavenet activation applied
    """
    tanh_out = Activation('tanh')(x)
    sigm_out = Activation('sigmoid')(x)
    return keras.layers.multiply([tanh_out, sigm_out])


def residual_block(x, s, i, activation, nb_filters, kernel_size, padding, dropout_rate=0, name=''):
    # type: (Layer, int, int, str, int, int, float, str) -> Tuple[Layer, Layer]
    """Defines the residual block for the WaveNet TCN
    Args:
        x: The previous layer in the model
        s: The stack index i.e. which stack in the overall TCN
        i: The dilation power of 2 we are using for this residual block
        activation: The name of the type of activation to use
        nb_filters: The number of convolutional filters to use in this block
        kernel_size: The size of the convolutional kernel
        padding: The padding used in the convolutional layers, 'same' or 'causal'.
        dropout_rate: Float between 0 and 1. Fraction of the input units to drop.
        name: Name of the model. Useful when having multiple TCN.
    Returns:
        A tuple where the first element is the residual model layer, and the second
        is the skip connection.
    """

    original_x = x
    conv = Conv1D(filters=nb_filters, kernel_size=kernel_size,
                  dilation_rate=i, padding=padding,
                  name=name + '_dilated_conv_%d_tanh_s%d' % (i, s))(x)
    if activation == 'norm_relu':
        x = Activation('relu')(conv)
        x = Lambda(channel_normalization)(x)
    elif activation == 'wavenet':
        x = wave_net_activation(conv)
    else:
        x = Activation(activation)(conv)

    x = SpatialDropout1D(dropout_rate, name=name + '_spatial_dropout1d_%d_s%d_%f' % (i, s, dropout_rate))(x)

    # 1x1 conv.
    x = Convolution1D(nb_filters, 1, padding='same')(x)
    res_x = keras.layers.add([original_x, x])
    return res_x, x


def process_dilations(dilations):
    def is_power_of_two(num):
        return num != 0 and ((num & (num - 1)) == 0)

    if all([is_power_of_two(i) for i in dilations]):
        return dilations

    else:
        new_dilations = [2 ** i for i in dilations]
        # print(f'Updated dilations from {dilations} to {new_dilations} because of backwards compatibility.')
        return new_dilations


class TCN(Layer):
    """Creates a TCN layer.
        Args:
            input_layer: A tensor of shape (batch_size, timesteps, input_dim).
            nb_filters: The number of filters to use in the convolutional layers.
            kernel_size: The size of the kernel to use in each convolutional layer.
            dilations: The list of the dilations. Example is: [1, 2, 4, 8, 16, 32, 64].
            nb_stacks : The number of stacks of residual blocks to use.
            activation: The activations to use (norm_relu, wavenet, relu...).
            padding: The padding to use in the convolutional layers, 'causal' or 'same'.
            use_skip_connections: Boolean. If we want to add skip connections from input to each residual block.
            return_sequences: Boolean. Whether to return the last output in the output sequence, or the full sequence.
            dropout_rate: Float between 0 and 1. Fraction of the input units to drop.
            name: Name of the model. Useful when having multiple TCN.
        Returns:
            A TCN layer.
        """

    def __init__(self,
                 nb_filters=64,
                 kernel_size=2,
                 nb_stacks=1,
                 dilations=None,
                 activation='norm_relu',
                 padding='causal',
                 use_skip_connections=True,
                 dropout_rate=0.0,
                 return_sequences=True,
                 name='tcn'):
        super().__init__()
        self.name = name
        self.return_sequences = return_sequences
        self.dropout_rate = dropout_rate
        self.use_skip_connections = use_skip_connections
        self.activation = activation
        self.dilations = dilations
        self.nb_stacks = nb_stacks
        self.kernel_size = kernel_size
        self.nb_filters = nb_filters
        self.padding = padding

        # backwards incompatibility warning.
        # o = tcn.TCN(i, return_sequences=False) =>
        # o = tcn.TCN(return_sequences=False)(i)

        if padding != 'causal' and padding != 'same':
            raise ValueError("Only 'causal' or 'same' paddings are compatible for this layer.")

        if not isinstance(nb_filters, int):
            print('An interface change occurred after the version 2.1.2.')
            print('Before: tcn.TCN(i, return_sequences=False, ...)')
            print('Now should be: tcn.TCN(return_sequences=False, ...)(i)')
            print('Second solution is to pip install keras-tcn==2.1.2 to downgrade.')
            raise Exception()

    def __call__(self, inputs):
        if self.dilations is None:
            self.dilations = [1, 2, 4, 8, 16, 32]
        x = inputs
        #x = Convolution1D(self.nb_filters, 1, padding=self.padding, name=self.name + '_initial_conv')(x)
        x = Dense(self.nb_filters, name=self.name + '_initial_conv')(x)
        skip_connections = []
        for s in range(self.nb_stacks):
            for i in self.dilations:
                x, skip_out = residual_block(x, s, i, self.activation, self.nb_filters,
                                             self.kernel_size, self.padding, self.dropout_rate, name=self.name)
                skip_connections.append(skip_out)
        if self.use_skip_connections:
            x = keras.layers.add(skip_connections)
        x = Activation('relu')(x)

        if not self.return_sequences:
            output_slice_index = -1
            x = Lambda(lambda tt: tt[:, output_slice_index, :])(x)
        return x

In [0]:
# CHANGE TRAIN AND TEST, MIX TO GET SIMILAR DISTRIBUTION
from sklearn.model_selection import train_test_split
rs = 43
X_train1, X_test1, y_train1, y_test1  = train_test_split(train.drop('mal', axis=1), train.mal, stratify=train.mal, test_size=0.29, random_state=rs )
X_train2, X_test2, y_train2, y_test2  = train_test_split(test_labelled.drop('mal', axis=1), test_labelled.mal, stratify=test_labelled.mal, test_size=0.29, random_state=rs)

X = np.concatenate((X_train1.comment_text, X_train2.comment_text))
y = np.concatenate((y_train1, y_train2))

X_test = np.concatenate((X_test1.comment_text, X_test2.comment_text))
y_test = np.concatenate((y_test1, y_test2))

In [0]:
max_features       = 50000
maxlen             = 400
dropout_rate       = 0.25
rs                 = 42
epochs             = 8
batch_size         = 128
embed_dim          = 50
rec_units          = 150
TCN_UNITS          = 75
DENSE_HIDDEN_UNITS = TCN_UNITS*2

In [0]:
def make_hat(maxlen=maxlen, embed_dim=embed_dim, max_features=max_features, dropout_rate=dropout_rate, 
            DENSE_HIDDEN_UNITS=DENSE_HIDDEN_UNITS, TCN_UNITS=TCN_UNITS):
    words = Input(shape=(maxlen,))
    x = Embedding(max_features+1, embed_dim, trainable=True)(words)
    x = SpatialDropout1D(dropout_rate)(x)
    x1 = TCN(TCN_UNITS, return_sequences=True, dilations = [1, 2, 4, 8, 16], name = 'tnc1_forward')(x) #, activation = 'wavenet'
    x2 = Lambda(lambda z: K.reverse(z,axes=-1))(x)
    x2 = TCN(TCN_UNITS, return_sequences=True, dilations = [1, 2, 4, 8, 16],name = 'tnc1_backward')(x2) #,dilations = [1, 2, 4]
    x = add([x1,x2])
    x1 = TCN(TCN_UNITS, return_sequences=True, dilations = [1, 2, 4, 8, 16], name = 'tnc2_forward')(x)
    x2 = Lambda(lambda z: K.reverse(z,axes=-1))(x)
    x2 = TCN(TCN_UNITS, return_sequences=True, dilations = [1, 2, 4, 8, 16],name = 'tnc2_backward')(x2)
    x = add([x1,x2])
    #x = concatenate([GlobalMaxPooling1D()(x),GlobalAveragePooling1D()(x)])
    hidden = concatenate([GlobalMaxPooling1D()(x),GlobalAveragePooling1D()(x)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(1, activation='sigmoid')(hidden)


    model = Model(inputs=words, outputs=result)
    model.compile(loss='binary_crossentropy', optimizer=RMSprop(clipvalue=1, clipnorm=1), metrics=['acc'])
    return model

In [0]:
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=5, random_state=rs)
auc = []
roc = []
c = 0
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X)

In [0]:
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import RMSprop, Adam
from keras.engine.topology import Layer, InputSpec
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from nltk import tokenize 

from keras.models import Model
from keras import backend as K

from keras.engine.topology import Layer, InputSpec
from keras import initializers as initializers, regularizers, constraints
from keras.callbacks import Callback

class AucPrRecEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = average_precision_score(self.y_val, y_pred)
            print("\n AUC-Precision Recall - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [0]:
from sklearn.metrics import average_precision_score, roc_auc_score
from keras.preprocessing.sequence import pad_sequences

for train_index, val_index in kf.split(X, y):
    print(f' fold {c}')
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index] 
    
    X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen, padding='post')
    X_val = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=maxlen, padding='post')
    #exipdb.set_trace()
    #X_train = np.array([line.reshape(max_sent_amount,max_sen_len) for line in X_train])
    #X_val = np.array([line.reshape(max_sent_amount,max_sen_len) for line in X_val])
    
    PrAuc = AucPrRecEvaluation(validation_data=(X_val, y_val), interval=1)
    callbacks_list = [PrAuc]
    
    model = make_hat()
    #print(model.summary())
    # model = tf.contrib.tpu.keras_to_tpu_model(
    #          model,
    #          strategy=tf.contrib.tpu.TPUDistributionStrategy(
    #              tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)))
    #break
    
    #X_train = np.array(train_posts)
    y_train = np.array(y_train)
    #X_val =  np.array(val_posts)
    y_val = np.array(y_val)
   
    print('Fitting')
    model.fit(X_train, y_train, batch_size=batch_size, epochs=4, validation_data=(X_val, y_val), callbacks=callbacks_list, shuffle=False, verbose=1)
    probs = model.predict(X_val, batch_size=batch_size, verbose=1)
    
    average_avpr = []
    average_auc = []
    
    
    auc_f = average_precision_score(y_val, probs)
    auc.append(auc_f)
    roc_f = roc_auc_score(y_val, probs)
    roc.append(roc_f)
    print(f' average precision {auc_f}')
    print(f' roc auc {roc_f}')
    c += 1
    del model

 fold 0
Fitting
Train on 126974 samples, validate on 31745 samples
Epoch 1/4

 AUC-Precision Recall - epoch: 1 - score: 0.842843 

Epoch 2/4

 AUC-Precision Recall - epoch: 2 - score: 0.855718 

Epoch 3/4

 AUC-Precision Recall - epoch: 3 - score: 0.856684 

Epoch 4/4

 AUC-Precision Recall - epoch: 4 - score: 0.856008 

 average precision 0.8560081496917137
 roc auc 0.963142978671079
 fold 1
Fitting
Train on 126974 samples, validate on 31745 samples
Epoch 1/4

 AUC-Precision Recall - epoch: 1 - score: 0.852918 

Epoch 2/4

 AUC-Precision Recall - epoch: 2 - score: 0.847828 

Epoch 3/4
  4480/126974 [>.............................] - ETA: 3:20 - loss: 0.1220 - acc: 0.9589

In [0]:
np.array(auc).mean()