In [27]:
import tensorflow as tf
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [45]:
# Maximum number of lines (one example per line) kept from each class file.
subset_size = 10 * 10**2  # = 1000 lines per class (a line count, not a byte size)


def _read_first_lines(path, limit):
    """Return at most `limit` leading lines of the text file at `path`.

    Lines are returned exactly as read, i.e. with their trailing newline.
    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding, and reading stops after `limit`
    lines instead of loading the whole file into memory first.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # zip() stops as soon as range(limit) is exhausted, so no extra line is consumed.
        return [line for _, line in zip(range(limit), f)]


hate_speech_data = _read_first_lines('hate.txt', subset_size)
non_hate_speech_data = _read_first_lines('non.txt', subset_size)

# Label convention: 1 for lines from hate.txt, 0 for lines from non.txt.
hate_speech_labels = [1] * len(hate_speech_data)
non_hate_speech_labels = [0] * len(non_hate_speech_data)

# Sentences and labels stay index-aligned: all hate examples first, then all non-hate.
all_data = hate_speech_data + non_hate_speech_data
all_labels = hate_speech_labels + non_hate_speech_labels


In [46]:
# Hold out 20% of the examples for testing; the fixed seed makes the split repeatable.
(train_sentences, test_sentences,
 train_labels, test_labels) = train_test_split(
    all_data,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [47]:
# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same options: truncation and padding
# enabled, with a maximum length of 256.
encode_options = dict(truncation=True, padding=True, max_length=256)

train_encodings = tokenizer(train_sentences, **encode_options)
test_encodings = tokenizer(test_sentences, **encode_options)

In [48]:
# Run tf.function-wrapped code eagerly. This uses the supported API name;
# `tf.config.experimental_run_functions_eagerly` is its deprecated alias.
tf.config.run_functions_eagerly(True)

# Training pipeline: (encoding dict, label) pairs, shuffled and batched.
# The shuffle buffer covers the whole training set so every example can land
# in any position; a fixed buffer smaller than the dataset only shuffles
# within a sliding window.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(len(train_labels)).batch(16)

# Evaluation pipeline: deliberately NOT shuffled, so predictions made from it
# come back in the same order as `test_labels` / `test_sentences`.
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(16)


In [49]:
# TFBertForSequenceClassification is already imported in the notebook's import
# cell, so it is not imported again here.

# Pretrained BERT with a 2-class classification head (num_labels=2).
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# Labels are integer class ids and the loss is applied to raw logits
# (from_logits=True), so no softmax is applied beforehand.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# NOTE(review): the test split is also used as validation data here, so the
# per-epoch validation metrics are measured on the same examples as the final
# test accuracy -- consider carving out a separate validation split.
model.fit(train_dataset, epochs=3, validation_data=test_dataset)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f093c3623a0>

In [50]:
# Write the trained weights to disk, then load them back from the same file.
weights_path = 'hate_speech_model_weights.h5'
model.save_weights(weights_path)
model.load_weights(weights_path)

In [52]:
# Score the test batches and take the highest-logit class for each example.
predictions = model.predict(test_dataset)
logits = predictions.logits
predicted_labels = np.argmax(logits, axis=1)

# Accuracy = fraction of examples whose predicted class equals the true label.
is_correct = predicted_labels == np.asarray(test_labels)
accuracy = is_correct.mean()
print("Test Accuracy:", accuracy)

Test Accuracy: 0.88


In [53]:
# Print every test sentence whose predicted label is 1.
flagged_sentences = [
    text
    for text, predicted in zip(test_sentences, predicted_labels)
    if predicted == 1
]
for text in flagged_sentences:
    print(text)

The fact is that I made this subreddit 18 days ago and it has absolutely nothing to do with the deletion of another subreddit.

&gt;Any questions?  Yes I have a few.  I am 5ft1 - How can I be taller?  I have the body/frame of a child - How do I appear more manly?  My face is 3/10 - How do I become more attractive?  I am a mentalcel and medicine makes me worse - How do I become normal?  What woman would ever date me?  

Basically *all* women believe in that insanely stupid shit. Almost no men do. 

Interesting. None of those results strike me as particularly surprising.   Red hair does seem universally unattractive on men, though. I have red hair and I'm very unhappy about it, tbh. I've thought about dyeing it, but that would take too much work.

Well, OK, so if a guy only rubs up against a woman, that's OK, because he's not entering her?  You're either being purposefully obtuse or you don't understand how state coercion works.  (1) You jaywalk (2) Constable says "stop!" (3) You continu