In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import nltk
import numpy as np
import re
import string

In [None]:
# Relative path of the labelled CSV that the rest of the notebook works from.
PATH = 'jigsaw_train_hateoffensive.csv'
# Read the whole file into a DataFrame; later cells use its "text" and
# "hateoffensive_class" columns.
df = pd.read_csv(PATH)

In [None]:
def remove_punct(text):
    """Strip punctuation and underscores from ``text``.

    ``text`` is walked element by element (character by character when it is
    a ``str``). Each element keeps only word characters and whitespace, loses
    any underscore, and the cleaned pieces are concatenated with no separator.
    """
    def _scrub(piece):
        # Drop everything that is neither a word character nor whitespace,
        # then drop "_" as well, since \w would otherwise keep it.
        without_symbols = re.sub(r'[^\w\s]', '', piece)
        return re.sub(r'_', '', without_symbols)

    return "".join(_scrub(piece) for piece in text)

def remove_extras(text):
    """Normalise one raw text for tokenisation.

    Steps, in order:
      1. delete URLs (any run of non-whitespace characters starting with
         ``http``),
      2. strip punctuation and underscores via ``remove_punct``,
      3. collapse every whitespace run, newlines included, to a single space
         and trim both ends.

    URLs are removed before whitespace is normalised so that deleting one
    cannot leave a double or trailing space behind. Newlines become a space
    rather than being deleted, so words on adjacent lines are not glued
    together.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Raw strings keep "\S" / "\s" as regex escapes, not string escapes.
    text = re.sub(r"http\S+", "", text)
    text = remove_punct(text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
# Integer id for each class name; a name's position in the tuple is its id.
label_to_num = {
    name: index
    for index, name in enumerate(("neutral", "offensive", "hate_speech"))
}

In [None]:
# Clean every text in the "text" column. The expression must not end with a
# comma: that would wrap the Series in a 1-tuple, so `.shape` would raise and
# `len(texts)` would be 1 instead of the number of rows.
texts = df["text"].map(remove_extras)
texts.shape

In [None]:
# Translate each class name to its integer id (an unknown name raises
# KeyError), then one-hot encode. `num_classes` is pinned to the size of the
# mapping so the encoded width always matches the 3-unit softmax head, even
# if some class is absent from this particular file. The column is selected
# by name so the cell does not depend on column order.
labels = tf.keras.utils.to_categorical(
    df["hateoffensive_class"].map(lambda name: label_to_num[name]),
    num_classes=len(label_to_num),
)


In [None]:
# Fixed length that every example is padded or truncated to by the tokenizer.
seq_len = 512
num_samples = len(texts)

# One row per example, filled in by the tokenisation loop. Token ids and
# attention masks are integers, so the buffers are allocated as int32: this
# matches the int32 Keras inputs and uses a quarter of the memory of the
# float64 default.
Xids = np.zeros((num_samples, seq_len), dtype=np.int32)
Xmask = np.zeros((num_samples, seq_len), dtype=np.int32)

Xids.shape, labels.shape

In [None]:
from transformers import BertTokenizer
from tqdm import tqdm

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Encode the texts one at a time, padding/truncating each to `seq_len`, and
# copy the resulting ids and mask into the matching row of the buffers.
for row, text in enumerate(tqdm(texts)):
    encoded = tokenizer.encode_plus(
        text,
        max_length=seq_len,
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        return_tensors='tf',
    )
    Xids[row] = encoded['input_ids']
    Xmask[row] = encoded['attention_mask']

In [None]:
def map_func(inputs_ids, masks, labels):
    """Repack one ``(ids, mask, label)`` triple as ``(features, label)``.

    The features dict is keyed ``'input_ids'`` / ``'attention_mask'``; the
    label is passed through unchanged.
    """
    features = {'input_ids': inputs_ids, 'attention_mask': masks}
    return features, labels

In [None]:
batch_size = 32
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))
dataset = dataset.map(map_func)
# Shuffle once and keep that order for every pass over the data. With the
# default (reshuffle on each iteration) a take()/skip() split of this dataset
# would contain different examples every epoch, letting validation examples
# leak into training. Incomplete final batches are dropped.
dataset = dataset.shuffle(
    buffer_size=1000, reshuffle_each_iteration=False
).batch(batch_size, drop_remainder=True)
dataset.take(1)

In [None]:
# Fraction of the batches assigned to the training split.
split = 0.9
# Number of batches in the training split, rounded down.
size = int(num_samples / batch_size * split)

In [None]:
# The first `size` batches form the training set; all remaining batches form
# the validation set.
train_ds, val_ds = dataset.take(size), dataset.skip(size)

# Left disabled: del [dataset, Xids, Xmask, labels]

In [None]:
from transformers import TFAutoModel

# Load the pretrained 'bert-base-cased' checkpoint as a TensorFlow model.
bert = TFAutoModel.from_pretrained('bert-base-cased')

# Freeze the encoder: its weights are excluded from training, so only the
# layers stacked on top of it are updated.
bert.trainable = False

In [None]:
# Two integer inputs of length `seq_len`; their names match the keys of the
# feature dict produced by `map_func`.
input_ids = layers.Input(shape=(seq_len,), dtype="int32", name="input_ids")
attention_mask = layers.Input(shape=(seq_len,), dtype="int32", name="attention_mask")

# Element 1 of the encoder output feeds the classifier
# (presumably the pooled sentence representation; confirm against the
# transformers documentation).
embeddings = bert.bert(input_ids, attention_mask=attention_mask)[1]

# A wider head (Dense(1024, relu) followed by Dropout(0.5)) is left disabled;
# the classifier is a single 3-way softmax layer.
classifier_head = layers.Dense(3, activation="softmax")
x = classifier_head(embeddings)

In [None]:
# Wire the two inputs to the softmax output to obtain the trainable classifier.
model = keras.Model(inputs=[input_ids, attention_mask], outputs=x)

In [None]:
# Print the layer-by-layer overview of the assembled model.
model.summary()

In [None]:
# Adam with its default settings; categorical cross-entropy pairs with the
# one-hot labels and the softmax output. Accuracy is tracked as the metric.
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
import numpy as np
def get_bert_embedding(texts):

    if type(texts) == str:
        texts = [texts]

    num_samples = len(texts)

    Xids = np.zeros((num_samples, seq_len))
    Xmask = np.zeros((num_samples, seq_len))

    for i, phrase in enumerate(texts):
        token = tokenizer.encode_plus(
        phrase, max_length=seq_len, add_special_tokens=True, 
        padding="max_length", truncation=True, return_tensors='tf')

    Xids[i, :] = token['input_ids']
    Xmask[i, :] = token['attention_mask']

    return Xids, Xmask

In [None]:
# Smoke-test the helper on a single sentence; `ids` and `mask` are the two
# arrays it returns.
ids, mask = get_bert_embedding("There is such a violent earthquake here in my city of Berlin")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
### from models.py
# Import the one class that is used instead of `*`, so it is clear where the
# name comes from and nothing else in the notebook namespace is overwritten.
from models import Model_Rational_Label

# The HateXplain tokenizer/model get their own names. Rebinding the
# notebook-level `tokenizer` and `model` here would silently change what
# `get_bert_embedding` and the Keras training cells use when cells are
# re-run.
HATEXPLAIN_CHECKPOINT = "Hate-speech-CNERG/bert-base-uncased-hatexplain-rationale-two"
hatexplain_tokenizer = AutoTokenizer.from_pretrained(HATEXPLAIN_CHECKPOINT)
hatexplain_model = Model_Rational_Label.from_pretrained(HATEXPLAIN_CHECKPOINT)

# Encode one example sentence as PyTorch tensors and run it through the model.
inputs = hatexplain_tokenizer('He is a great guy', return_tensors="pt")
# The call returns two values; the first is kept as the prediction logits and
# the second is bound to `_`.
prediction_logits, _ = hatexplain_model(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
)


In [None]:
# Display the object bound to `_` by the tuple unpacking in the cell above,
# i.e. the second value returned by the model call.
_