In [None]:
import numpy as np
import pandas as pd
import time, re

from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, Flatten, LSTM, Bidirectional, Conv1D
from keras.layers import Dropout, Activation, MaxPooling1D, SpatialDropout1D
from keras.layers.embeddings import Embedding

from keras.callbacks import EarlyStopping
from keras import regularizers, optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

from google.colab import drive
drive.mount('/content/gdrive')

!pip install ktrain
import ktrain
from ktrain import text

# **Preprocessing**

**Source Set**

In [None]:
# Load the source-domain tweets: CSV columns 5 and 6 are read as
# 'sentiment' and 'text'. The file is read with header=None and row 0 is
# discarded further down (presumably the CSV's own header line - confirm).
cols = ['sentiment','text']
df = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Data/hate25000.csv', encoding='latin-1',
                 header=None, names=cols, usecols=[5,6])
df = df[['text','sentiment']]
df.insert(0,'source','hate data')

# Text clean-up: drop @mentions and the stand-alone retweet marker "RT".
# Raw strings keep "\s" / "\b" as regex escapes instead of (invalid) Python
# string escapes, and the word boundaries stop "RT" from being cut out of the
# middle of other upper-case words.
X = df['text'].fillna('').tolist()
X = [str(i) for i in X]
X = [re.sub(r'@[^\s]+','',i) for i in X]
X = [re.sub(r'\bRT\b','',i) for i in X]
X = X[1:]

# Labels, aligned with X by dropping the same first row.
y = df['sentiment'].values
y = y[1:]

print('Source text:',X[0])
print('Source label:',y[0])

Source text: !!!   As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
Source label: 2


In [None]:
print('Loading data...')
# Balance classes: per-class weights from the label frequencies in y.
# Keyword arguments are mandatory in recent scikit-learn releases and are
# accepted by older ones too.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights = dict(enumerate(class_weights))

# Change for each model and iteration!
# np.random.seed() returns None, so its result must not be used as the seed
# value; keep the integer and seed NumPy's global generator separately.
seed = 42
np.random.seed(seed)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

# Build the vocabulary from the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert text to integer sequences
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# One-hot encode the three classes
y_train = to_categorical(y_train, 3)
y_test = to_categorical(y_test, 3)

# +1 because the tokenizer's word indices start at 1 (0 is the padding value)
vocab_size = len(tokenizer.word_index) + 1
print('Found %s unique tokens.' % vocab_size)
maxlen = max(len(seq) for seq in X_train)

# Sequences that are shorter than the max length are padded with value
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Loading data...
Found 19333 unique tokens.
X_train shape: (16604, 75)
X_test shape: (8179, 75)


**Out-of-Sample Set**

In [None]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The substitutions run in the listed order, so the specific forms
    ("won't", "can't") are rewritten before the generic "n't" / "'t" rules.
    The second group of rules targets the same contractions written with the
    character "Õ" in place of the apostrophe (presumably a typographic
    apostrophe that was mis-decoded when the file was read - confirm against
    the data).

    Args:
        phrase: text to expand.

    Returns:
        The text with every matching contraction replaced.
    """
    rules = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        (r"it\Õs", "it is"),
        (r"don\Õt", "do not"),
        (r"isn\Õt", "is not"),
        (r"I\Õm", "I am"),
        (r"can\Õt", "can not"),
        # Leading space added so "IÕve" becomes "I have" (not "Ihave"),
        # matching the "'ve" rule above.
        (r"\Õve", " have"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

# Out-of-sample (target) set: the first three CSV columns are read, without a
# header row, as 'target', 'text' and 'label'.
cols_target = ["target", "text", "label"]
target_df = pd.read_csv(
    '/content/gdrive/My Drive/Colab Notebooks/Data/multiclass_manifesto.csv',
    encoding='latin-1',
    header=None,
    names=cols_target,
    usecols=[0, 1, 2],
)
target_df = target_df.loc[:, cols_target]

# Labels and titles are taken as-is (missing titles become '').
y_target = target_df["label"].values
targ_title = target_df['target'].fillna('').tolist()

# Texts: missing values become '', everything is stringified, and
# contractions are expanded.
X_target = [decontracted(str(raw)) for raw in target_df['text'].fillna('').tolist()]

In [None]:
print("Loading data...")
# Change for each model and iteration!
# np.random.seed() returns None, so keep the integer and seed NumPy's global
# generator separately.
seed = 7
np.random.seed(seed)

# Balance class weight (keyword arguments are required by recent scikit-learn
# releases and accepted by older ones)
class_weights_B = compute_class_weight(class_weight='balanced', classes=np.unique(y_target), y=y_target)
class_weights_B = dict(enumerate(class_weights_B))

# The tokenizer fitted on the source training texts is reused unchanged.
# Fitting it again on X_target would renumber tokenizer.word_index (indices
# are assigned by frequency rank), which would no longer match the already
# encoded source sequences, and could emit indices >= vocab_size that the
# Embedding layers cannot look up. Target words outside the source vocabulary
# are therefore dropped by texts_to_sequences().

# Split data
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split(X_target, y_target, test_size=0.10, random_state=seed)

# Convert text to integer sequences
X_train_B = tokenizer.texts_to_sequences(X_train_B)
X_test_B = tokenizer.texts_to_sequences(X_test_B)

# One-hot encode the three classes
y_train_B = to_categorical(y_train_B, 3)
y_test_B = to_categorical(y_test_B, 3)

# Sequences that are shorter than the max length are padded with value;
# maxlen is the one computed on the source training set.
X_train_B = pad_sequences(X_train_B, padding='post', maxlen=maxlen)
X_test_B = pad_sequences(X_test_B, padding='post', maxlen=maxlen)

print('X_train shape:', X_train_B.shape)
print('X_test shape:', X_test_B.shape)

print('y_train shape:', y_train_B.shape)
print('y_test shape:', y_test_B.shape)

# **Embeddings**

**GloVe**

In [None]:
glove_path = '/content/gdrive/My Drive/Colab Notebooks/Data/glove.twitter.27B/glove.twitter.27B.200d.txt'

# Parse the GloVe text file into {token: float32 vector}. Each line is split
# on whitespace: the first field is the token, the rest are its coefficients.
# The `with` block closes the file on exit, so no explicit close() is needed.
embeddings_index = dict()
with open(glove_path,
          encoding="utf8") as glove:
  for line in glove:
    values = line.split()
    if not values:
      # Blank line: nothing to index (would otherwise raise IndexError).
      continue
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

In [None]:
# Lookup table with one 200-d row per vocabulary index. Rows stay all-zero for
# tokens without a GloVe vector. Filling stops at the first index that falls
# outside the table.
embedding_matrix = np.zeros((vocab_size, 200))
for token, row in tokenizer.word_index.items():
    if row >= vocab_size:
        break
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

# **TextCNN**

**Pipeline**

In [None]:
%%time

#Parameters
# (maxlen is the padded sequence length computed in the preprocessing cell)
num_filters = 64
weight_decay = 1e-4
embedding_dim = 200
batch_size = 64
epochs = 20

print('Build CNN model...')
model = Sequential()
# First layer: frozen embedding initialised from embedding_matrix, then conv + pool
model.add(Embedding(vocab_size, embedding_dim,
                    weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D())
# Second layer
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D())
# Third layer
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D())
model.add(Flatten())
# CLASSIFICATION
# Fully connected layer
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dropout(0.5))
# Output layer w/ softmax
model.add(Dense(3, activation='softmax'))

# Compile the model with the optimizer instance configured here. Passing the
# string 'adam' would build a fresh default optimizer and ignore these settings.
adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=adam,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Stop once val_loss has not improved by at least 0.01 for 3 epochs
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=3, verbose=0)
callbacks = [early_stopping]

cnn_history = model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_data=(X_test, y_test),
              class_weight=class_weights)

model.save('my_cnn.h5')

**Retraining**

In [None]:
# Fine-tune the CNN saved by the pipeline cell on the out-of-sample data.
cnn_model_A = load_model('my_cnn.h5')

# Reuse every layer except the old softmax head and attach a fresh one.
cnn_model_B_on_A = Sequential(cnn_model_A.layers[:-1])
cnn_model_B_on_A.add(Dense(3, activation="softmax"))

# Unfreeze the transferred layers. The flag has to be set on each layer (a
# bare `trainable = True` only rebinds a throwaway variable) and before
# compile() so that it takes effect.
for layer in cnn_model_B_on_A.layers[:-1]:
    layer.trainable = True

adam = optimizers.Adam(lr=0.007, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
cnn_model_B_on_A.compile(loss="categorical_crossentropy", optimizer=adam,
                     metrics=["accuracy"])

early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=3, verbose=0)
callbacks = [early_stopping]

cnn_B_history = cnn_model_B_on_A.fit(X_train_B, y_train_B, epochs=30,
                           validation_data=(X_test_B, y_test_B),
                           batch_size=16,
                           callbacks=callbacks,
                           class_weight=class_weights_B)

# Score the model that was just fine-tuned (not whatever `model` currently
# refers to from an earlier cell).
loss, acc = cnn_model_B_on_A.evaluate(X_train_B, y_train_B, verbose=True)
print("Training Accuracy: {:.4f}".format(acc))
loss, acc = cnn_model_B_on_A.evaluate(X_test_B, y_test_B, verbose=False)
print("Testing Accuracy:  {:.4f}".format(acc))

cnn_model_B_on_A.save('my_cnn_B.h5')

**Evaluation**

In [None]:
cnn_model = load_model('my_cnn_B.h5')

# Class index = argmax over the softmax output. This is what
# Sequential.predict_classes() returned for multi-class models, and it keeps
# working on Keras versions where that method has been removed.
cnn_pred = np.argmax(cnn_model.predict(X_test_B), axis=-1)

# Show the inputs and predicted outputs (at most the first 850 rows).
# The sequences are decoded once, not once per printed row, and slicing
# avoids an IndexError when the test set is smaller than 850.
decoded_inputs = tokenizer.sequences_to_texts(X_test_B)
for sentence, predicted in zip(decoded_inputs[:850], cnn_pred):
  print("X=%s, Predicted=%s\n" % (sentence, predicted))

In [None]:
def argmax_keepdims(x, axis):
    """Return the argmax of ``x`` along ``axis``, keeping that axis with length 1.

    Args:
        x: NumPy array.
        axis: axis to reduce.

    Returns:
        Integer array shaped like ``x`` except that ``axis`` has size 1.
    """
    kept_shape = list(x.shape)
    kept_shape[axis] = 1
    winners = np.argmax(x, axis=axis)
    return winners.reshape(kept_shape)

# One-hot test labels -> class indices (column vector)
y_test_B_fit = argmax_keepdims(y_test_B, axis=1)

target_names = ['hate speech', 'offensive language','neither']
print('----------------------EVALUATION----------------------\n')
# `labels` pins the three class ids so the three target_names still line up
# when a class is absent from this (small) test split; without it the report
# raises on the length mismatch.
print(classification_report(y_test_B_fit.ravel(), cnn_pred, labels=[0, 1, 2],
                            target_names=target_names))

# **BiLSTM w/ Attention**

In [None]:
from keras.engine import Layer, InputSpec
from keras import backend as K
from keras import initializers
from keras import regularizers
from keras import constraints


def dot_product(x, kernel):
    """
    Backend-agnostic dot product, compatible with both Theano and Tensorflow.

    On the Tensorflow backend the kernel gets a trailing axis before the dot
    and that axis is squeezed away again afterwards; any other backend uses a
    plain ``K.dot``.

    Args:
        x (): input
        kernel (): weights
    Returns:
        The product of ``x`` and ``kernel`` as a backend tensor.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)
    expanded_kernel = K.expand_dims(kernel)
    return K.squeeze(K.dot(x, expanded_kernel), axis=-1)

class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Store the regularizers/constraints for W, u and b.

        Args:
            W_regularizer, u_regularizer, b_regularizer: regularizer
                identifiers resolved through ``regularizers.get``.
            W_constraint, u_constraint, b_constraint: constraint identifiers
                resolved through ``constraints.get``.
            bias: whether the bias vector ``b`` is created and added.
            **kwargs: forwarded to the base ``Layer`` constructor.
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W (features x features), optional b (features,) and u (features,).

        The shape is passed by keyword: given positionally it lands in the
        ``name`` slot of ``add_weight`` and clashes with the explicit
        ``name=`` argument on Keras versions without the legacy shim.
        """
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        """Return None so that no mask is propagated to the next layers."""
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        Scores are ``exp(tanh(x.W + b) . u)``, zeroed where ``mask`` is off,
        and normalised by their sum along axis 1.
        """
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Give the weights a trailing axis so they broadcast against x
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is (first input dimension, last input dimension)."""
        return input_shape[0], input_shape[-1]


def create_custom_objects():
    """Build the ``custom_objects`` mapping passed to ``load_model``.

    Both "ClassWrapper" and "AttentionWithContext" map to a subclass of
    AttentionWithContext that records the most recently constructed instance.
    "loss" and "acc" are forwarders that look up the attribute of the same
    name on that instance and call it.

    NOTE(review): AttentionWithContext as written in this file declares no
    ``loss`` or ``acc`` attribute of its own; confirm these forwarders are
    ever invoked.

    Returns:
        dict with the keys "ClassWrapper", "AttentionWithContext", "loss"
        and "acc".
    """
    latest = {"instance": None}

    class ClassWrapper(AttentionWithContext):
        def __init__(self, *args, **kwargs):
            # Remember this instance before handing over to the real constructor.
            latest["instance"] = self
            super(ClassWrapper, self).__init__(*args, **kwargs)

    def loss(*args):
        return getattr(latest["instance"], "loss")(*args)

    def acc(*args):
        return getattr(latest["instance"], "acc")(*args)

    custom_objects = {
        "ClassWrapper": ClassWrapper,
        "AttentionWithContext": ClassWrapper,
        "loss": loss,
        "acc": acc,
    }
    return custom_objects

**Pipeline**

In [None]:
%%time

#Parameters
# (maxlen is the padded sequence length computed in the preprocessing cell)
lstm_output_size = 70
embedding_dim = 200
batch_size = 256
kernel_size = 4
epochs = 10

print('Build LSTM model...')
model = Sequential()
# Frozen embedding initialised from embedding_matrix
model.add(Embedding(vocab_size, embedding_dim,
                    weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False))
model.add(SpatialDropout1D(0.2))
# return_sequences=True: the attention layer needs the full sequence of states
model.add(Bidirectional(LSTM(lstm_output_size, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(AttentionWithContext())
# Output layer w/ softmax
model.add(Dense(3, activation='softmax'))

# Compile with the optimizer instance configured here. Passing the string
# 'adam' would build a fresh default optimizer and ignore these settings.
adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])

# Stop once val_loss has not improved by at least 0.01 for 3 epochs
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=3, verbose=0)
callbacks = [early_stopping]

att_history = model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_data=(X_test, y_test),
              class_weight=class_weights)
loss, acc = model.evaluate(X_train, y_train, verbose=1)
print("Training Accuracy: {:.4f}".format(acc))
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print("Testing Accuracy:  {:.4f}".format(acc))

model.save('my_att.h5')

**Retraining**

In [None]:
# Fine-tune the attention model saved by the pipeline cell on the
# out-of-sample data.
# X_train_B / X_test_B / y_train_B / y_test_B are the padded sequences and
# one-hot labels produced by the out-of-sample preprocessing cell. The earlier
# re-split read X_oos / y_oos, which are not defined anywhere in the visible
# notebook.
# NOTE(review): if a separate X_oos / y_oos set was intended, define it in the
# preprocessing section and restore the split here.

# The custom attention layer has to be registered to deserialise the model.
att_model_A = load_model('my_att.h5', custom_objects=create_custom_objects())

# Reuse every layer except the old softmax head and attach a fresh one.
att_model_B_on_A = Sequential(att_model_A.layers[:-1])
att_model_B_on_A.add(Dense(3, activation="softmax"))

# Unfreeze the transferred layers. The flag has to be set on each layer, and
# before compile() so that it takes effect.
for layer in att_model_B_on_A.layers[:-1]:
    layer.trainable = True

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
att_model_B_on_A.compile(loss="categorical_crossentropy", optimizer=adam,
                     metrics=["accuracy"])

early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=3, verbose=0)
callbacks = [early_stopping]

att_B_history = att_model_B_on_A.fit(X_train_B, y_train_B, epochs=10,
                           validation_data=(X_test_B, y_test_B),
                           callbacks=callbacks)

# Score the model that was just fine-tuned (not whatever `model` currently
# refers to from an earlier cell).
loss, acc = att_model_B_on_A.evaluate(X_train_B, y_train_B, verbose=True)
print("Training Accuracy: {:.4f}".format(acc))
loss, acc = att_model_B_on_A.evaluate(X_test_B, y_test_B, verbose=False)
print("Testing Accuracy:  {:.4f}".format(acc))

att_model_B_on_A.save('my_att_B.h5')

**Evaluation**

In [None]:
# NOTE(review): this loads 'my_att.h5' (the model saved by the pipeline cell),
# whereas the CNN section evaluates its retrained model - confirm whether
# 'my_att_B.h5' was intended here.
att_model = load_model('my_att.h5', custom_objects=create_custom_objects())

# Predict on the out-of-sample test split. X_oos is not defined anywhere in
# the visible notebook, so X_test_B is used instead.
# Class index = argmax over the softmax output, which is what
# Sequential.predict_classes() returned for multi-class models.
att_pred = np.argmax(att_model.predict(X_test_B), axis=-1)

# Show the inputs and predicted outputs (first 5 rows). The sequences are
# decoded once, not once per printed row.
decoded_inputs = tokenizer.sequences_to_texts(X_test_B)
for sentence, predicted in zip(decoded_inputs[:5], att_pred):
  print("X=%s, Predicted=%s" % (sentence, predicted))

In [None]:
def argmax_keepdims(x, axis):
    """Return the argmax of ``x`` along ``axis``, keeping that axis with length 1.

    Same definition as the one in the TextCNN evaluation section.

    Args:
        x: NumPy array.
        axis: axis to reduce.

    Returns:
        Integer array shaped like ``x`` except that ``axis`` has size 1.
    """
    kept_shape = list(x.shape)
    kept_shape[axis] = 1
    winners = np.argmax(x, axis=axis)
    return winners.reshape(kept_shape)

# Class probabilities for the test split. att_pred_prob was read here without
# ever being assigned, so it is computed in this cell.
att_pred_prob = att_model.predict(X_test_B)
att_pred_fit = argmax_keepdims(att_pred_prob, axis=1)

# y_test_B is one-hot; the report needs class indices on both sides.
y_test_B_fit = argmax_keepdims(y_test_B, axis=1)

target_names = ['hate speech', 'offensive language','neither']
print('----------------------EVALUATION----------------------\n')
# `labels` pins the three class ids so the three target_names still line up
# when a class is absent from the test split.
print(classification_report(y_test_B_fit.ravel(), att_pred_fit.ravel(), labels=[0, 1, 2],
                            target_names=target_names))