In [67]:
### Imports

import copy
import os
import requests
import tempfile
import urllib
import zipfile
import string
import re

import tensorflow_model_remediation.min_diff as md
from google.protobuf import text_format
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_model_analysis as tfma
import tensorflow_data_validation as tfdv
from tensorflow_model_analysis.addons.fairness.post_export_metrics import fairness_indicators
from tensorflow_model_analysis.addons.fairness.view import widget_view
import sys
import os
from sklearn.feature_extraction.text import TfidfVectorizer,  CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from tensorflow.python.keras import models
# from transformers import BertTokenizer, glue_convert_examples_to_features
# from transformers import TFBertForSequenceClassification
# import tfrecorder #https://github.com/google/tensorflow-recorder

In [47]:
sys.path.insert(0, os.path.abspath('../')) # needed to import src

is_local = True #@param {type:"boolean"}
local_data_path = '../data/twitter_datasets/combined_harassment/'
gdrive_data_path = 'drive/My\ Drive/Hate\ Speech\ Research/contextualizing-hate-speech-models-with-explanations-master/data/twitter/combined_harassment/'
np.random.seed(1)
tf.random.set_seed(1)

data_path = local_data_path if is_local else gdrive_data_path

dev_pd = pd.read_csv(f'{data_path}dev.csv', index_col=None).dropna()
train_pd = pd.read_csv(f'{data_path}train.csv', index_col=None).dropna()
test_pd = pd.read_csv(f'{data_path}test.csv', index_col=None).dropna()

In [48]:
LABEL = 'is_harassment'
TEXT_FEATURE = 'cleaned_tweet'
BATCH_SIZE = 512

In [4]:
# Convert the pandas DataFrames to Datasets.
dataset_train = tf.data.Dataset.from_tensor_slices(
    (train_pd[TEXT_FEATURE].values,
     train_pd.pop(LABEL).values.reshape(-1,1) * 1.0)).batch(BATCH_SIZE)
dataset_dev = tf.data.Dataset.from_tensor_slices(
    (dev_pd[TEXT_FEATURE].values,
     dev_pd.pop(LABEL).values.reshape(-1,1) * 1.0)).batch(BATCH_SIZE)
dataset_test = tf.data.Dataset.from_tensor_slices(
    (test_pd[TEXT_FEATURE].values,
     test_pd.pop(LABEL).values.reshape(-1,1) * 1.0)).batch(BATCH_SIZE)

In [5]:
#todo
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


NameError: name 'TFBertForSequenceClassification' is not defined

In [121]:
# todo put in util
## tokenize to unigram + bigram

def strip_punc_hp(s):
    return str(s).translate(str.maketrans('', '', string.punctuation))

def remove_punctuation_tweet(text_array):
    # get rid of punctuation (except periods!)
    punctuation_no_period = "[" + re.sub("\.","",string.punctuation) + "]"
    return np.array([re.sub(punctuation_no_period, "", text) for text in text_array])

def tfidf_vectorize(train_texts: np.ndarray,
                    train_labels: np.ndarray,
                    val_texts: np.ndarray,
                    test_texts: np.ndarray,
                    ngram_range: tuple = (1,2),
                    top_k: int = 20000,
                    token_mode: str = 'word',
                    min_document_frequency: int = 2,
                    tf_idf: bool = True) -> tuple:
    """
    Vectorizes texts as n-gram vectors.

    1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams.

    # Arguments
        @:param train_texts: list, training text strings.
        @:param train_labels: np.ndarray, training labels.
        @:param val_texts: list, validation text strings.
        @:param ngram_range Range: (inclusive) of n-gram sizes for tokenizing text.
        @:param top_k: Limit on the number of features. We use the top 20K features.
        @:param token_mode:  Whether text should be split into word or character n-grams. One of 'word', 'char'.
        @:param min_document_frequency: Minimum document/corpus frequency below which a token will be discarded.

    # Returns
        x_train, x_val: vectorized training and validation texts

    # adapted from: https://developers.google.com/machine-learning/guides/text-classification/step-3
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': ngram_range,
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': token_mode,
            'min_df': min_document_frequency,
    }

    vectorizer = TfidfVectorizer(**kwargs) if tf_idf else CountVectorizer(**kwargs)
    train_texts = remove_punctuation_tweet(train_texts)
    val_texts = remove_punctuation_tweet(val_texts)
    test_texts = remove_punctuation_tweet(test_texts)
    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation and test texts.
    x_val = vectorizer.transform(val_texts)
    x_test = vectorizer.transform(test_texts)


    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(top_k, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    x_test = selector.transform(x_test).astype('float32')

    return x_train, x_val, x_test

In [122]:
def logistic_regression_model():
    return models.Sequential([
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

def train_ngram_logreg(train_data,
                       validation_data,
                       test_data,
                       learning_rate=1e-3,
                       epochs=1000,
                       batch_size=512,
                       tf_idf=True,
                       ngram_range=(1,2)):
    """

    :param train_data: pandas dataframe of the training data
    :param validation_data: pandas dataframe of the validation data
    :param test_data: pandas dataframe of the test data
    :param learning_rate: float, learning rate for training model.
    :param epochs: int, number of epochs.
    :param batch_size: int, number of samples per batch.
    :param tf_idf: bool, whether to encode tf-idf or n-gram
    :return:
    """
    train_texts, y_train = train_data[TEXT_FEATURE].values, train_data[LABEL].values
    dev_texts, y_dev = validation_data[TEXT_FEATURE].values, validation_data[LABEL].values
    test_texts, y_test = test_data[TEXT_FEATURE].values, test_data[LABEL].values

    x_train, x_dev, x_test = tfidf_vectorize(train_texts=train_texts,
                                             train_labels=y_train,
                                             val_texts=dev_texts,
                                             test_texts=test_texts,
                                             tf_idf=tf_idf,
                                             ngram_range=ngram_range)
    model = logistic_regression_model()
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    model.compile(loss='bce', optimizer=optimizer,metrics=['acc'])
    # early stopping if validation loss does not decrease in 2 consecutive tries.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]
    history = model.fit(x_train,
              y_train,
              epochs=epochs,
              callbacks=callbacks,
              validation_data=(x_dev, y_dev),
              verbose=2, #once per epoch
              batch_size=batch_size,
              )

    # Print results.
    history = history.history
    val_acc = history["val_acc"][-1]
    print(f'Validation accuracy: {val_acc}, loss: {history["val_loss"][-1]}')

    y_hat_test = model.predict(x_test)
    test_acc = np.mean(y_hat_test == y_test)
    print(f'Test accuracy: {test_acc}')

    # Save model.
    model.save(f'../models/mindiff/n_gram_logreg_lr_{strip_punc_hp(learning_rate)}_batch_{str(batch_size)}_valacc_{str(strip_punc_hp(val_acc))}_testacc_{str(strip_punc_hp(test_acc))}.h5')
    return val_acc, history['val_loss'][-1]

In [123]:
train_ngram_logreg(train_data=train_pd, validation_data=dev_pd, test_data=test_pd, tf_idf=False, ngram_range=(1,2))
train_ngram_logreg(train_data=train_pd, validation_data=dev_pd, test_data=test_pd, tf_idf=True, ngram_range=(1,2))


Epoch 1/1000
211/211 - 1s - loss: 0.5917 - acc: 0.7499 - val_loss: 0.5303 - val_acc: 0.8054
Epoch 2/1000
211/211 - 1s - loss: 0.4853 - acc: 0.8300 - val_loss: 0.4668 - val_acc: 0.8372
Epoch 3/1000
211/211 - 1s - loss: 0.4330 - acc: 0.8503 - val_loss: 0.4305 - val_acc: 0.8489
Epoch 4/1000
211/211 - 1s - loss: 0.3998 - acc: 0.8615 - val_loss: 0.4064 - val_acc: 0.8562
Epoch 5/1000
211/211 - 1s - loss: 0.3760 - acc: 0.8690 - val_loss: 0.3888 - val_acc: 0.8621
Epoch 6/1000
211/211 - 1s - loss: 0.3580 - acc: 0.8755 - val_loss: 0.3754 - val_acc: 0.8655
Epoch 7/1000
211/211 - 1s - loss: 0.3436 - acc: 0.8804 - val_loss: 0.3646 - val_acc: 0.8680
Epoch 8/1000
211/211 - 1s - loss: 0.3317 - acc: 0.8843 - val_loss: 0.3560 - val_acc: 0.8707
Epoch 9/1000
211/211 - 1s - loss: 0.3217 - acc: 0.8877 - val_loss: 0.3485 - val_acc: 0.8743
Epoch 10/1000
211/211 - 1s - loss: 0.3130 - acc: 0.8904 - val_loss: 0.3424 - val_acc: 0.8767
Epoch 11/1000
211/211 - 2s - loss: 0.3055 - acc: 0.8931 - val_loss: 0.3371 - va

(0.8923420310020447, 0.3042904734611511)

In [65]:
def n_gram_vectorize(train_texts: np.ndarray,
                     train_labels: np.ndarray,
                     val_texts: np.ndarray,
                     ngrams: tuple =(1,1),
                     top_k: int=20000) -> tuple:

    # Instantiate TextVectorization with "binary" output_mode (multi-hot)
    # todo experiment with bigram/unigram
    vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(ngrams=ngrams, output_mode='binary', max_tokens=top_k)
    vectorize_layer.adapt(train_texts)

    x_train = vectorize_layer(train_texts).numpy()
    x_dev = vectorize_layer(val_texts).numpy()

    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(top_k, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_dev).astype('float32')
    x_train, x_val

n_gram_vectorize(train_texts=train_pd[TEXT_FEATURE].values,
                 train_labels=train_pd[LABEL].values,
                 val_texts=dev_pd[TEXT_FEATURE].values,)