In [18]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import tensorflow as tf

In [61]:
# The following functions can be used to convert a value to a type compatible
# with tf.Example.

def _bytes_feature(value):
  """Returns a tf.train.Feature holding a one-element bytes_list.

  Args:
    value: the payload to store. Accepted forms are raw `bytes`, a Python
      `str` (encoded as UTF-8), or an eager string tensor (unwrapped with
      `.numpy()`).

  Returns:
    A `tf.train.Feature` whose `bytes_list` contains exactly `value`.
  """
  if isinstance(value, type(tf.constant(0))):
    value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
  if isinstance(value, str):
    value = value.encode("utf-8") # BytesList accepts bytes only; a str would raise.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Wraps a single float / double in a tf.train.Feature backed by a float_list."""
  single_float = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=single_float)

def _int64_feature(value):
  """Wraps a single bool / enum / int / uint in a tf.train.Feature backed by an int64_list."""
  single_int = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=single_int)

In [66]:
def serialize(text, label):
    """Pack one (text, label) pair into a serialized tf.train.Example.

    The example has two features: "text" (bytes) and "label" (int64).
    Returns the protobuf message serialized to a byte string.
    """
    features = tf.train.Features(
        feature={
            "text": _bytes_feature(text),
            "label": _int64_feature(label),
        }
    )
    return tf.train.Example(features=features).SerializeToString()

def tf_serialize_example(text, label):
    """Dataset-mappable wrapper around `serialize`.

    `serialize` is plain Python, so it is run through `tf.py_function`;
    the result is then reshaped to a scalar `tf.string` tensor.
    """
    serialized = tf.py_function(
        func=serialize,
        inp=(text, label),  # forwarded positionally to `serialize`
        Tout=tf.string,     # `serialize` returns a single byte string
    )
    return tf.reshape(serialized, ())

# Zeerak hatespeech

In [9]:
# Load the NAACL SRW 2016 hate-speech CSV and show its `racism` column.
# NOTE(review): the saved output below (1969) is not a Series repr, so this cell
# was probably edited after it last ran -- re-run to refresh the output.
zeerak_csv = "data/hatespeech/NAACL_SRW_2016.csv"
df = pd.read_csv(zeerak_csv)
df["racism"]

1969

# Supremacist

In [16]:
# Map each comment file's stem (looked up later via the annotations' `file_id`)
# to the file's full text. The encoding is pinned to UTF-8 so the result does not
# depend on the platform's default locale encoding.
# NOTE(review): assumes the dataset files are UTF-8 -- confirm.
comments = {
    p.stem: p.read_text(encoding="utf-8")
    for p in tqdm(Path("data/hate-speech-dataset/all_files/").glob("*.txt"))
}


10944it [00:00, 29155.12it/s]


In [20]:
# Load the per-comment annotations (one row per `file_id`, with a `label` column)
# and display the frame.
annotations_csv = "data/hate-speech-dataset/annotations_metadata.csv"
df = pd.read_csv(annotations_csv)
df


Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label
0,12834217_1,572066,1346,0,noHate
1,12834217_2,572066,1346,0,noHate
2,12834217_3,572066,1346,0,noHate
3,12834217_4,572066,1346,0,hate
4,12834217_5,572066,1346,0,noHate
...,...,...,...,...,...
10939,33676864_5,734541,1388,0,noHate
10940,33677019_1,735154,1388,0,noHate
10941,33677019_2,735154,1388,0,noHate
10942,33677053_1,572266,1388,0,hate


In [72]:
# Sequential hold-out split (no shuffling): the first 8000 rows are used for
# training, everything after that for validation.
n_train_rows = 8000
df_train, df_val = df.iloc[:n_train_rows], df.iloc[n_train_rows:]

In [73]:
def iter_comments(df):
    """Build a zero-argument generator function over (comment, label) pairs.

    Args:
        df: annotations frame with a `file_id` column (keys into the
            module-level `comments` dict) and a `label` column.

    Returns:
        A callable that, each time it is called, returns a fresh generator
        yielding `(comment_text, label)` in row order, where `label` is 1 when
        the row's label equals "hate" and 0 otherwise.

    Raises:
        KeyError: while iterating, if a column is missing or a `file_id` has
            no entry in `comments`.
    """
    def iterator():
        # Zip only the two columns that are used; `iterrows()` would build a
        # full Series for every row, which is much slower.
        for file_id, raw_label in zip(df["file_id"], df["label"]):
            yield (comments[file_id], int(raw_label == "hate"))
    return iterator

In [74]:
# Wrap the row generators as tf.data pipelines; each element is a
# (comment, label) pair typed as (tf.string, tf.int32).
element_types = (tf.string, tf.int32)
ds_train = tf.data.Dataset.from_generator(iter_comments(df_train), output_types=element_types)
ds_val = tf.data.Dataset.from_generator(iter_comments(df_val), output_types=element_types)


In [75]:
# Serialize every training example and write the result to one TFRecord file.
train_records = ds_train.map(tf_serialize_example)
writer_train = tf.data.experimental.TFRecordWriter("supremacists_train.tfrecord")
writer_train.write(train_records)

In [76]:
# Serialize every validation example and write the result to one TFRecord file.
val_records = ds_val.map(tf_serialize_example)
writer_val = tf.data.experimental.TFRecordWriter("supremacists_val.tfrecord")
writer_val.write(val_records)

In [87]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nltk

In [90]:
# Download NLTK's `averaged_perceptron_tagger` package (see the log below).
# Presumably required by the nlpaug word augmenters used later -- confirm.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mprzewie/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [99]:
# Take the first validation comment as a plain Python string for the
# augmentation demos. `next()` raises StopIteration on an empty dataset, whereas
# the for/break form silently left `text` unset (or stale from an earlier run).
first_comment, _first_label = next(iter(ds_val))
text = first_comment.numpy().decode()

In [101]:

# Demo: WordNet synonym substitution on the sample comment, original vs. augmented.
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
Thats what I tried to tell those nuts over on the health a fitness thread and was attacked for it in post ????
Augmented Text:
Thats what I tried to tell those nuts terminated on the health a fitness yarn and be attacked for it in mail service ? ? ? ?


In [133]:
# Demo: antonym substitution on the same sample comment.
# The module-level name `aug` is rebound here; `augment()` further down reads it.
aug = naw.AntonymAug()
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
Thats what I tried to tell those nuts over on the health a fitness thread and was attacked for it in post ????
Augmented Text:
Thats what I tried to tell those nuts over off the health a fitness thread and was attacked for it in post ? ? ? ?


In [141]:
def augment(x):
    """Augment one comment with the module-level augmenter `aug`.

    Args:
        x: object exposing `.numpy()` that returns the comment as bytes
            (e.g. an eager string tensor handed in by `tf.py_function`).

    Returns:
        A single augmented string. If the augmenter returns a list/tuple, its
        first element is used; if that sequence is empty, the decoded input
        text is returned unchanged.
    """
    text = x.numpy().decode()
    augmented = aug.augment(text)
    # Some nlpaug versions return a list of strings rather than one string.
    # Collapse it so callers expecting a scalar string keep working.
    if isinstance(augmented, (list, tuple)):
        augmented = augmented[0] if augmented else text
    return augmented

In [144]:
def _augment_comment(comment, label):
    """Run `augment` on the comment through tf.py_function; pass the label through."""
    return tf.py_function(augment, [comment], tf.string), label

# Print only the first augmented (comment, label) element as a smoke test.
for example in ds_train.map(_augment_comment).take(1):
    print(example)

(<tf.Tensor: shape=(), dtype=string, numpy=b'As of March 13th , 2014 , the booklet lack been downloaded over 18 , 300 times and counting .'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
