In [None]:
import tensorflow as tf
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
subset_size = 1_000  # maximum number of lines (one sample per line) taken from each file


def _read_first_lines(path, limit):
    """Return at most `limit` leading lines of the text file at `path`.

    Lines are returned verbatim, trailing newline included. The file is
    iterated lazily, so lines beyond `limit` are never collected into a list.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as f:
        # zip() stops as soon as range(limit) is exhausted, so no more than
        # `limit` lines are pulled from the file.
        return [line for _, line in zip(range(limit), f)]


hate_speech_data = _read_first_lines('hate.txt', subset_size)
non_hate_speech_data = _read_first_lines('non.txt', subset_size)

# Label convention used by the rest of the notebook: 1 = hate, 0 = non-hate.
hate_speech_labels = [1] * len(hate_speech_data)
non_hate_speech_labels = [0] * len(non_hate_speech_data)

# Concatenate in the same order so all_data[i] is labelled by all_labels[i].
all_data = hate_speech_data + non_hate_speech_data
all_labels = hate_speech_labels + non_hate_speech_labels


In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
(train_sentences, test_sentences,
 train_labels, test_labels) = train_test_split(
    all_data,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenizer belonging to the same 'bert-base-uncased' checkpoint the model uses.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Options shared by both splits: cut sequences off at 256 tokens and pad the
# shorter ones. Each split is encoded in its own call.
encoding_options = dict(truncation=True, padding=True, max_length=256)

train_encodings = tokenizer(train_sentences, **encoding_options)
test_encodings = tokenizer(test_sentences, **encoding_options)

In [None]:
# Run tf.function-wrapped code eagerly (op by op). This is mainly a debugging
# aid; tf.config.run_functions_eagerly is the stable replacement for the
# deprecated tf.config.experimental_run_functions_eagerly.
tf.config.run_functions_eagerly(True)

# A shuffle buffer covering the whole training split gives a uniform shuffle
# each epoch; a smaller buffer only shuffles within a sliding window.
# max(..., 1) keeps the buffer size valid (it must be positive).
shuffle_buffer_size = max(len(train_labels), 1)

# Each element is (dict of tokenizer outputs, integer label), batched by 16.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(shuffle_buffer_size).batch(16)

# The test pipeline is left unshuffled so predictions stay aligned with
# test_labels / test_sentences.
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(16)


In [None]:
from transformers import TFBertForSequenceClassification

# Pretrained BERT with a 2-class sequence-classification head
# (the notebook labels hate speech as 1 and everything else as 0).
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# from_logits=True: the loss receives unnormalised scores and applies the
# softmax itself.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation numbers are computed on the same data as the final
# test accuracy.
model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3,
)


In [None]:
# Write the fine-tuned weights to disk, then read them straight back from the
# same file.
weights_path = 'hate_speech_model_weights.h5'
model.save_weights(weights_path)
model.load_weights(weights_path)

In [None]:
# Run inference over the test batches and take, for every example, the index
# of the highest-scoring class along axis 1.
predictions = model.predict(test_dataset)
logits = predictions.logits
predicted_labels = np.argmax(logits, axis=1)

# Accuracy = fraction of examples whose predicted class equals the true label.
is_correct = np.array(test_labels) == predicted_labels
accuracy = np.mean(is_correct)
print("Test Accuracy:", accuracy)

In [None]:
# Collect the test sentences whose predicted class is 1 (hate speech),
# preserving their original order, then print them one per call.
flagged_sentences = [
    text
    for text, predicted in zip(test_sentences, predicted_labels)
    if predicted == 1
]
for sentence in flagged_sentences:
    print(sentence)