<a href="https://colab.research.google.com/github/oriolao/AbusiveLanguage/blob/master/BERT_Ensemble_Hate_Speech_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install bert-for-tf2

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/a5/a1/acb891630749c56901e770a34d6bac8a509a367dd74a05daf7306952e910/bert-for-tf2-0.14.9.tar.gz (41kB)
[K     |████████                        | 10kB 17.4MB/s eta 0:00:01[K     |████████████████                | 20kB 13.3MB/s eta 0:00:01[K     |███████████████████████▉        | 30kB 9.7MB/s eta 0:00:01[K     |███████████████████████████████▉| 40kB 8.9MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 3.9MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/aa/e0/4f663d8abf83c8084b75b995bd2ab3a9512ebc5b97206fde38cef906ab07/py-params-0.10.2.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [

In [None]:
import pandas as pd
import numpy as np
import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import  Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tqdm import tqdm
import matplotlib.pyplot as plt

# Record the library versions used for this run in the cell output.
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)
# Allow up to 1000 characters per column when pandas displays a frame,
# so long tweets are not cut off.
pd.set_option('display.max_colwidth',1000)

TensorFlow Version: 2.4.1
Hub version:  0.12.0


In [None]:
# Functions for constructing BERT Embeddings: input_ids, input_masks, input_segments and Inputs

MAX_SEQ_LEN = 512  # length every feature vector is padded to

def get_masks(tokens):
    """Return the attention mask: one 1 per token, then 0s up to MAX_SEQ_LEN."""
    real_count = len(tokens)
    pad_count = MAX_SEQ_LEN - real_count
    return [1] * real_count + [0] * pad_count
 
def get_segments(tokens):
    """Return segment ids padded with 0s to MAX_SEQ_LEN.

    Tokens up to and including the first "[SEP]" get id 0; every token
    after it gets id 1.
    """
    seen_sep = False
    segment_ids = []
    for tok in tokens:
        segment_ids.append(1 if seen_sep else 0)
        # The switch happens after the separator itself has been labelled.
        seen_sep = seen_sep or tok == "[SEP]"
    padding = [0] * (MAX_SEQ_LEN - len(tokens))
    return segment_ids + padding

def get_ids(tokens, tokenizer):
    """Map tokens to vocabulary ids via the tokenizer and pad with 0s to MAX_SEQ_LEN."""
    ids = tokenizer.convert_tokens_to_ids(tokens)
    pad_len = MAX_SEQ_LEN - len(ids)
    return ids + [0] * pad_len

def create_single_input(sentence, tokenizer, max_len):
    """Turn one sentence into the (ids, masks, segments) triple.

    The sentence is tokenized, cut to at most ``max_len`` tokens, and then
    wrapped in "[CLS]" ... "[SEP]" before the three feature lists are built.
    """
    wordpieces = tokenizer.tokenize(sentence)[:max_len]
    stokens = ["[CLS]", *wordpieces, "[SEP]"]

    ids = get_ids(stokens, tokenizer)
    masks = get_masks(stokens)
    segments = get_segments(stokens)
    return ids, masks, segments
 
def convert_sentences_to_features(sentences, tokenizer):
    """Build the model inputs for a collection of sentences.

    Returns a list of three int32 arrays, in order: input ids, input masks
    and input segments, with one row per sentence.
    """
    columns = ([], [], [])  # ids, masks, segments

    for sentence in tqdm(sentences, position=0, leave=True):
        # Two slots are reserved for the [CLS] and [SEP] markers.
        triple = create_single_input(sentence, tokenizer, MAX_SEQ_LEN - 2)
        for feature in triple:
            assert len(feature) == MAX_SEQ_LEN
        for column, feature in zip(columns, triple):
            column.append(feature)

    return [np.asarray(column, dtype=np.int32) for column in columns]

In [None]:
def create_tonkenizer(bert_layer):
    """Build a FullTokenizer from the vocab file and casing flag stored on the hub layer."""
    resolved = bert_layer.resolved_object
    return bert.bert_tokenization.FullTokenizer(
        resolved.vocab_file.asset_path.numpy(),
        resolved.do_lower_case.numpy(),
    )

In [None]:
def nlp_model(callable_object):
    """Assemble the classifier: hub BERT layer -> Dense(768, relu) -> Dropout(0.1) -> Dense(3, softmax).

    ``callable_object`` is passed to ``hub.KerasLayer`` as its handle; the
    layer is created with ``trainable=True``.
    """
    bert_layer = hub.KerasLayer(handle=callable_object, trainable=True)

    # The three model inputs, in the order they are passed to the BERT layer.
    inputs = [
        Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name=input_name)
        for input_name in ("input_ids", "input_masks", "segment_ids")
    ]

    # Only the pooled output feeds the classification head.
    pooled_output, _ = bert_layer(inputs)

    hidden = Dense(units=768, activation='relu')(pooled_output)
    hidden = Dropout(0.1)(hidden)
    outputs = Dense(3, activation="softmax")(hidden)

    return Model(inputs=inputs, outputs=outputs)

# Build the classifier on top of the uncased BERT-base (L-12, H-768, A-12) TF Hub module
# and print its layer table.
model = nlp_model("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1")
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_ids[0][0]                  
                                                                 input_masks[0][0]            

In [None]:
from google.colab import files
# Colab-only: prompts for a file upload. The recorded run uploaded Fast_N-Gram.csv,
# which the next cell reads by that name.
uploaded = files.upload()

Saving Fast_N-Gram.csv to Fast_N-Gram.csv


In [None]:
# Read the uploaded CSV (first row is the header) and keep only the tweet text
# and its class label.
FastNgram = "Fast_N-Gram.csv"
df = pd.read_csv(FastNgram, encoding='ISO-8859-1', header=0)
XFast = pd.DataFrame(data=df, columns=['Tweet', 'Class'])
XFast.head()

Unnamed: 0,Tweet,Class
0,"Orania,the whites-only town did the unfathomable in 2019 Elections: EFF secure 3.73% of the vote in Orania None of us expected this one: There was a...",1
1,"Apart from maybe one small assessment of hers I disagree with (with regards Orania), this is an incredible opinion piece from Helen Zille and she hits many nails right on their heads . Everyone should read it!!! RT @dailymaverick: OPINIONISTA: From the Inside: The DA and the ANC took a knock  both require some soul-searching By Helen Zille @helenzille",1
2,Did #Orania participate on #Elections2019 ?,1
3,#DateMyFamily No wonder he's single coz his attitude is a TTO. He'll get a sbanxa girl or e Orania called Desperate van M'sunu,2
4,The EFF in the Northern Cape says it is unacceptable to allow areas like Orania to be exclusive to a particular race group. The EFF bussed about 50 supporters to Orania on May eighth and 21 of them voted for the party in Orania. #sabcnews,1


In [None]:
import collections
# Count rows per class label to see how imbalanced the dataset is.
c = collections.Counter(XFast.Class.values)
c

Counter({1: 13238, 2: 1290, 3: 368})

In [None]:
import sklearn
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report



In [None]:
import nltk
# Fetch the NLTK stop-word corpus, which stopwords.words('english') reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

# Load the English stop-word list and print it for inspection.
stoplist = stopwords.words('english')
print(stoplist)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Fetch the WordNet corpus for the WordNetLemmatizer used in the cleaning step.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
Lemmatizer = WordNetLemmatizer()
tokenizer= TweetTokenizer()  # not used by the cleaning steps in this cell

# Build the stop-word set once. The previous version rebuilt it from the corpus
# for every single word of every tweet.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Drop every character that is not an ASCII letter or a space.
XFast['Tweet'] = XFast['Tweet'].replace('[^a-zA-Z ]', '', regex=True)
# Lower-case a string copy of the frame; XFast itself keeps its original dtypes.
XF = XFast.apply(lambda x: x.astype(str).str.lower())

def lemmatize_text(Tweet):
    """Split a tweet on whitespace, drop English stop words and lemmatize the rest.

    Parameters: Tweet -- the (already lower-cased) tweet text.
    Returns: a list of lemmatized tokens in their original order.
    """
    return [
        Lemmatizer.lemmatize(w)
        for w in w_tokenizer.tokenize(Tweet)
        if w not in ENGLISH_STOPWORDS
    ]

XF['Tweet']=XF.Tweet.apply(lemmatize_text)
fast1= XF['Tweet']
fast1

0                                                                                                                                                                        [oraniathe, whitesonly, town, unfathomable, election, eff, secure, vote, orania, none, u, expected, one]
1        [apart, maybe, one, small, assessment, disagree, regard, orania, incredible, opinion, piece, helen, zille, hit, many, nail, right, head, everyone, read, rt, dailymaverick, opinionista, inside, da, anc, took, knock, require, soulsearching, helen, zille, helenzille]
2                                                                                                                                                                                                                                                 [orania, participate, election]
3                                                                                                                                                       [datemyfamily, wonder, he,

In [None]:
# Collapse each token list back into a single space-separated string.
fas1 = list(map(' '.join, fast1))
fas1

['oraniathe whitesonly town unfathomable election eff secure vote orania none u expected one',
 'apart maybe one small assessment disagree regard orania incredible opinion piece helen zille hit many nail right head everyone read rt dailymaverick opinionista inside da anc took knock require soulsearching helen zille helenzille',
 'orania participate election',
 'datemyfamily wonder he single coz attitude tto hell get sbanxa girl e orania called desperate van msunu',
 'eff northern cape say unacceptable allow area like orania exclusive particular race group eff bussed supporter orania may eighth voted party orania sabcnews',
 'support effsouthafrica one voting district doesnt sound like amazing feat take new meaning knowing party saw support whitesonly settlement oraniahomeland election',
 'special resident orania much free airtime people interesting south african medium tiring',
 'racist really want preferential treatment admission government school honestlyfampk right tax must fk orani

In [None]:
# Labels are taken from XFast (original values), not from XF, whose columns
# were all converted to lower-cased strings.
y=XFast['Class']
y

0        1
1        1
2        1
3        2
4        1
        ..
14891    1
14892    1
14893    1
14894    1
14895    1
Name: Class, Length: 14896, dtype: int64

In [None]:
# Re-check the class distribution on the label series that goes into the split.
c = collections.Counter(y)
c

Counter({1: 13238, 2: 1290, 3: 368})

In [None]:
# Hold out 25% of the cleaned tweets for testing; random_state fixes the shuffle.
# NOTE(review): the split is not stratified although the classes are heavily
# imbalanced -- consider passing stratify=y.
fas_train, fas_test, y_train, y_test = train_test_split(fas1, y, test_size =0.25, random_state = 0)

In [None]:
import imblearn
from imblearn.over_sampling import SMOTE

In [None]:
#fas_train = fas1[:11172]
#fas_train

['oraniathe whitesonly town unfathomable election eff secure vote orania none u expected one',
 'apart maybe one small assessment disagree regard orania incredible opinion piece helen zille hit many nail right head everyone read rt dailymaverick opinionista inside da anc took knock require soulsearching helen zille helenzille',
 'orania participate election',
 'datemyfamily wonder he single coz attitude tto hell get sbanxa girl e orania called desperate van msunu',
 'eff northern cape say unacceptable allow area like orania exclusive particular race group eff bussed supporter orania may eighth voted party orania sabcnews',
 'support effsouthafrica one voting district doesnt sound like amazing feat take new meaning knowing party saw support whitesonly settlement oraniahomeland election',
 'special resident orania much free airtime people interesting south african medium tiring',
 'racist really want preferential treatment admission government school honestlyfampk right tax must fk orani

In [None]:
#y_train = y[:11172]
#y_train

0        1
1        1
2        1
3        2
4        1
        ..
11167    1
11168    2
11169    1
11170    1
11171    1
Name: Class, Length: 11172, dtype: int64

In [None]:
#fas_test = fas1[11172:]
#fas_test

['man quit playin wit em rt jarond oooooo leg still fromthegunline',
 'lrt see man gotta love hate ting wit droogs',
 'man yall pussy yall almost man yall want gotta go beyond chance wit yall talkin bout cakewalk rt libgyal know many woman want man man job man kind heart man claim men cakewalk there surplus woman majority socialised desperate',
 'wouldnt even try girl relationship cause shes wit thats much even cause care man way good liar orchestrate',
 'aye man whats wrong wit melly',
 'lrt da whole dmv took flock flow ran wit smh free dat man freebigflock',
 'day started man tap wit',
 'cant hang wit nigga constantly askin wea da hoe fuck dem hoe man get bag',
 'man as fat u couldnt palm wit shaquille hand rt notoriousel scarface x fuck face one best love song history',
 'successfully kicked cold day without man made medicine currently fierce battle wit allergy prevail dont need pill need body build resistance allergen',
 'gay man wit facial hair',
 'aye greenbillshawty street need 

In [None]:
#y_test = y[11172:]
#y_test

11172    1
11173    1
11174    2
11175    1
11176    1
        ..
14891    1
14892    1
14893    1
14894    1
14895    1
Name: Class, Length: 3724, dtype: int64

In [None]:
from pandas import DataFrame
# Wrap the train/test sentence lists in single-column frames; the 'sentence'
# column is what the feature-conversion cells read.
df_x_train = DataFrame (fas_train,columns=['sentence']) 
df_x_test = DataFrame (fas_test,columns=['sentence']) 
# Label frames are not built here; the labels are one-hot encoded in a later cell.
#df_y_train = DataFrame (y_train,columns=['class']) 
#df_y_test = DataFrame (y_test,columns=['class'])

In [None]:
# Find the BERT hub layer by type rather than by a hard-coded position
# (it was model.layers[3]), so this cell keeps working if the model's inputs
# or layer order change.
bert_layer = next(
    (layer for layer in model.layers if isinstance(layer, hub.KerasLayer)),
    None,
)
if bert_layer is None:
    raise ValueError("model contains no hub.KerasLayer to build the tokenizer from")

tokenizer = create_tonkenizer(bert_layer)
# Quick sanity check of the WordPiece tokenization.
print(tokenizer.tokenize("This here's an example of using the BERT tokenizer"))
# Convert the training sentences into [ids, masks, segments] arrays.
X1_train = convert_sentences_to_features(df_x_train['sentence'].values, tokenizer)

  2%|▏         | 243/11172 [00:00<00:04, 2418.83it/s]

['this', 'here', "'", 's', 'an', 'example', 'of', 'using', 'the', 'bert', 'token', '##izer']


100%|██████████| 11172/11172 [00:04<00:00, 2354.89it/s]


In [None]:
# Convert the held-out sentences with the same tokenizer used for the training set.
X1_test = convert_sentences_to_features(df_x_test['sentence'].values, tokenizer)

100%|██████████| 3724/3724 [00:01<00:00, 2366.80it/s]


In [None]:
# One-hot encode both splits against one shared, sorted list of class labels.
# Encoding each split independently would give a different column count or
# order whenever a class is missing from one split. An explicit float32 dtype
# also keeps the arrays numeric regardless of the pandas default.
label_dtype = pd.CategoricalDtype(categories=sorted(set(y_train) | set(y_test)))

y_train_df = pd.get_dummies(pd.Series(y_train).astype(label_dtype), dtype=np.float32).values
print(y_train_df.shape)

y_test_df = pd.get_dummies(pd.Series(y_test).astype(label_dtype), dtype=np.float32).values
print(y_test_df.shape)

(11172, 3)
(3724, 3)


In [None]:
%%time

# Fine-tune the whole model (the BERT layer was created with trainable=True).
BATCH_SIZE = 16
EPOCHS = 1

# Adam with a small learning rate, minimizing categorical cross-entropy on the
# one-hot labels.
opt = Adam(learning_rate=1e-5)
model.compile(optimizer=opt, 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

# NOTE(review): validation_data is the test split, so the reported validation
# metrics are computed on the same data used for the final evaluation.
# NOTE(review): no class weighting or resampling is applied here although the
# classes are heavily imbalanced.
history = model.fit(X1_train, y_train_df,
                    validation_data=(X1_test, y_test_df),
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    verbose = 1)

# Persist the fine-tuned model to nlp_model.h5; a later cell reloads it from there.
model.save('nlp_model.h5')

CPU times: user 2d 20h 35min 37s, sys: 1h 20min 13s, total: 2d 21h 55min 50s
Wall time: 2h 41min 43s


In [None]:
# Reload the fine-tuned model from nlp_model.h5. hub.KerasLayer is supplied
# through custom_objects under the name 'KerasLayer' for the loader to use.
from tensorflow.keras.models import load_model
new_model = load_model('nlp_model.h5',custom_objects={'KerasLayer':hub.KerasLayer})

In [None]:
# Predict on the test features and take the highest-scoring output per row.
# Both predictions and ground truth are column indices of the one-hot labels,
# so the report's classes are 0/1/2 (presumably Class 1/2/3 in the CSV -- confirm).
from sklearn.metrics import classification_report
pred_test = np.argmax(new_model.predict(X1_test), axis=1)
print(classification_report(np.argmax(y_test_df,axis=1), pred_test))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97      3292
           1       0.81      0.76      0.78       347
           2       0.00      0.00      0.00        85

    accuracy                           0.94      3724
   macro avg       0.59      0.58      0.58      3724
weighted avg       0.92      0.94      0.93      3724



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Show arrays in full and without line wrapping. The line width is set through
# the public print option; assigning np.core.arrayprint._line_width had no
# effect on how arrays were printed.
np.set_printoptions(edgeitems=50000, linewidth=50000)

In [None]:
# Keep the BERT predictions as an array under a model-specific name and display them.
pred_test_Bert = np.array(pred_test)
pred_test_Bert

array([0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
# Confusion matrix with the true label index (argmax of the one-hot test labels)
# passed first and the predicted index second.
print(metrics.confusion_matrix(np.argmax(y_test_df,axis=1),pred_test))

[[2953  172    0]
 [ 164  155    0]
 [ 268   12    0]]
