# Hate Speech Detection (BERT) and Topic Modelling (BERTopic)

Import relevant libraries

In [2]:
"""Author: Melwyn D Souza, Reg No: R00209495"""
!pip install tensorflow-text
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
# Later cells rely on this to show several DataFrames from a single cell.
InteractiveShell.ast_node_interactivity = "all"

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow import keras
from tensorflow.keras.layers import *

import matplotlib.pyplot as plt
# Use 100 DPI as the default resolution both for figures created in the
# notebook and for figures written to disk with savefig.
plt.rcParams['figure.dpi'] = 100
plt.rcParams['savefig.dpi'] = 100

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text
  Downloading tensorflow_text-2.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.11.0


Data Pre-Processing

The data is collected from 5 different online sources and consists of tweets from different users all over the world. The sources are not formatted the way the hate speech detection (HSD) model needs (some files have multi-labelled data, some are labelled with sentiments, etc.), so the data first has to be reduced to two columns:
1.   Tweets (string format)
2.   Label - Hate/Non-Hate

The data is manually prepared by merging data from the 5 sources; the resulting dataset is only roughly balanced:
1.   #Total instances: 76679
2.   #Hate tweets: 30147
3.   #Non-Hate tweets: 46532

In [3]:
# Mount Google Drive into the Colab runtime at /content/gdrive/ so that the
# dataset can later be read from a path under gdrive/.
from google.colab import drive
drive.mount("/content/gdrive/")
!ls

Mounted at /content/gdrive/
gdrive	sample_data


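The data is imbalanced, so we randomly sample the same number of tweets from each of the two labels, Hate and Non-Hate. The full balanced set would use 30k tweets per label; the run shown below uses 1,000 tweets per label.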

We will use two separate dataframes for sampling (called df_hs and df_nhs) and then merge them into one.

In [4]:
# Build a class-balanced subset of the merged tweet corpus.
SEED = 42                 # fixed seed so the sampled subset is the same on every run
SAMPLES_PER_CLASS = 1000  # number of tweets drawn from each label

# `names=` is passed without `header=`, so pandas treats every row of the CSV
# as data and labels the two columns "tweet" and "label".
df = pd.read_csv("gdrive/My Drive/Thesis/dataset/data.csv", names=["tweet", "label"])

# Per-label summary and raw class counts before balancing. The bare expression
# is displayed because ast_node_interactivity is set to "all" in the imports cell.
df.groupby("label").describe()
print(" - - "*20)
print(f"Imbalanced data:\n{df['label'].value_counts()}")
print(" - - "*20)

# Label 1 = hate speech, label 0 = non-hate speech. Draw the same number of
# rows from each class in a single seeded draw; an intermediate larger sample
# followed by a second sample would give the same distribution and is not needed.
df_hs = df[df['label'] == 1].sample(SAMPLES_PER_CLASS, random_state=SEED)
df_nhs = df[df['label'] == 0].sample(SAMPLES_PER_CLASS, random_state=SEED)

# Stack the two samples; the original row index is kept, so index values
# still point back to rows of `df`.
df_balanced = pd.concat([df_hs, df_nhs])

# Sanity checks: class counts plus the first and last rows of the balanced frame.
df_balanced['label'].value_counts()
df_balanced.head()
df_balanced.tail()

Unnamed: 0_level_0,tweet,tweet,tweet,tweet
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,46532,44225,#model i love u take with u all the time in ...,319
1,30147,28303,@user you might be a libtard if... #libtard #...,40


 - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - - 
Imbalanced data:
0    46532
1    30147
Name: label, dtype: int64
 - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - - 


1    1000
0    1000
Name: label, dtype: int64

Unnamed: 0,tweet,label
55385,You still can't turn a hoe into a house wife !,1
38395,@kieffer_jason bitch am not your fam and wtf i...,1
9119,@user #allahsoil the media distos the threat o...,1
69322,"@exJizyacolector @abunaseeha2 Yes, only Islam ...",1
48626,RT @NoBeeetch: *comes home late*\n\nBae: Was h...,1


Unnamed: 0,tweet,label
76344,@RealRobBrydon can't wait for Friday.. I'm com...,0
66853,"Kat and Andre are behaving in such a spiteful,...",0
73446,Well I called it #evilwins #ratings #MKR,0
5321,this wednesday is killing me. @user #malamia b...,0
546,#space place color #blue #fabricsourcing o...,0


In [5]:
# Split the balanced tweets into train and test sets. No test_size is given,
# so scikit-learn's default split proportion applies. `stratify` keeps the
# hate / non-hate ratio the same in both parts, and `random_state` makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['tweet'],
    df_balanced['label'],
    stratify=df_balanced['label'],
    random_state=42,
)

In [6]:
# TF Hub handles for the uncased English BERT: the text preprocessing model
# and the matching 12-layer, 768-hidden, 12-head encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Wrap both Hub models as Keras layers so they can be called inside a Keras model.
bert_pre_process = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)



In [7]:
def get_txt_embeddings(txt):
  """Return the BERT "pooled_output" embedding for `txt`.

  `txt` is first passed through the module-level `bert_pre_process` layer and
  the result is fed to the module-level `bert_encoder` layer; the entry stored
  under the "pooled_output" key of the encoder's output is returned.
  """
  encoder_outputs = bert_encoder(bert_pre_process(txt))
  return encoder_outputs["pooled_output"]

The BERT models return a map with 3 important keys: pooled_output, sequence_output, encoder_outputs:

pooled_output represents each input sequence as a whole. The shape is [batch_size, H]. You can think of this as an embedding for the entire tweet.

sequence_output represents each input token in the context. The shape is [batch_size, seq_length, H]. You can think of this as a contextual embedding for every token in the tweet.

encoder_outputs are the intermediate activations of the L Transformer blocks. outputs["encoder_outputs"][i] is a Tensor of shape [batch_size, seq_length, H] (H = 768 for the encoder used here) with the outputs of the i-th Transformer block, for 0 <= i < L. The last value of the list is equal to sequence_output.

In [8]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
txt_embeddings = get_txt_embeddings(text_input)

# Neural network layers
l1 = tf.keras.layers.Dropout(0.1, name="dropout")(txt_embeddings)
l2 = tf.keras.layers.Dense(1, activation='sigmoid', name="final_layer")(l1)
# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l2])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=2, batch_size = 32)

y_predicted = model.predict(X_test)
y_predicted = y_predicted.flatten()
print(y_predicted)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f76e92e24c0>

[0.6976831  0.6244147  0.38245067 0.56253386 0.43730146 0.65674883
 0.5116924  0.43742466 0.5742295  0.616698   0.4112667  0.53899944
 0.7043219  0.44537598 0.6906271  0.5185923  0.6579576  0.42180577
 0.45116162 0.4954587  0.48395735 0.71991116 0.5756991  0.46698722
 0.4168948  0.5872016  0.7689416  0.4755058  0.5847362  0.52751
 0.53566337 0.5676436  0.57102764 0.56139964 0.44211614 0.4824658
 0.2862225  0.5195576  0.69221365 0.6575153  0.3533399  0.5007141
 0.55344874 0.44343495 0.55547917 0.69727874 0.44654605 0.5809901
 0.57709765 0.5327927  0.41905972 0.44267422 0.65541786 0.37237582
 0.6808014  0.444547   0.7956441  0.57922745 0.751202   0.3669289
 0.56513286 0.42160672 0.5853885  0.37112105 0.5864686  0.6541376
 0.37565553 0.67221737 0.54681927 0.45480567 0.5332786  0.43234766
 0.5078274  0.4791039  0.42099717 0.5791958  0.44054863 0.36581758
 0.41577896 0.4891329  0.6544523  0.76919067 0.73705184 0.47374654
 0.51355743 0.59517145 0.5248911  0.5113375  0.43184182 0.63960373
 0.