# Training a Post Classification Model Using BERT

In [192]:
import tensorflow as tf
import tensorflow_hub as hub
import bert
from tensorflow.keras.models import  Model
from tqdm import tqdm
import numpy as np
from collections import namedtuple
import pandas as pd
import re
import random
from sklearn.preprocessing import OneHotEncoder

## Load Labeled Data

In [250]:
# Read the labeled posts with pandas' Python parsing engine.
df = pd.read_csv('/home3/r05322021/Desktop/FB_hatecrime/Data/label/immigration_label.csv', encoding='utf-8', engine='python')
# Keep only rows whose three relevance flags are all valid binary values (0 or 1).
# Each condition is its own term so the parentheses stay balanced.
df = df[
    df.immigration_related.isin([0, 1])
    & df.Mexican_related.isin([0, 1])
    & df.Muslim_related.isin([0, 1])
]
# Replace every remaining missing value with an empty string.
# NOTE(review): this applies to all columns, so missing labels also become ''
# and will no longer be caught by a later notnull() filter - confirm intended.
df = df.fillna('')

## Data Preprocessing

In [197]:
def text_clean(df, target_col):
    """Drop unlabeled rows and strip noise from the three post text columns.

    Rows whose ``target_col`` is null are removed and remaining missing values
    become empty strings. Every value in ``post_name``, ``post_message`` and
    ``post_description`` is then converted to ``str`` and stripped, in order,
    of URLs, ``@mentions``, ``#hashtags``, ``[bracketed]`` words, newlines and
    carriage returns.

    Cleaning is applied column-wise instead of through per-row
    ``df.loc[index, column]`` lookups. A label lookup returns a whole Series
    when index labels repeat, which used to write the Series repr into the
    cells; the column-wise form is unaffected by the index.

    Args:
        df: DataFrame containing ``target_col`` and the three text columns.
        target_col: Name of the label column; rows where it is null are dropped.

    Returns:
        A new, cleaned DataFrame. The frame passed in is not modified.

    Raises:
        KeyError: If ``target_col`` or one of the text columns is missing.
    """
    # Applied one after another, in this order, to each value.
    noise_patterns = (r'http\S+', r'\@\w+', r'#\w+', r'\[\w+\]', r'\n', r'\r')

    def strip_noise(value):
        text = str(value)
        for pattern in noise_patterns:
            text = re.sub(pattern, '', text)
        return text

    # fillna returns a new frame, so the column assignments below never touch
    # the caller's data.
    cleaned = df[df[target_col].notnull()].fillna('')
    for column in ['post_name', 'post_message', 'post_description']:
        cleaned[column] = cleaned[column].map(strip_noise)

    return cleaned

In [198]:
def get_masks(tokens, max_seq_length):
    """Build the padding mask: 1 for every kept token, 0 for every padding slot.

    Token lists longer than ``max_seq_length`` are cut down to that length
    first, in which case the mask contains only ones.
    """
    kept = tokens[:max_seq_length] if len(tokens) > max_seq_length else tokens
    real_positions = [1] * len(kept)
    padding_positions = [0] * (max_seq_length - len(kept))
    return real_positions + padding_positions

def get_segments(tokens, max_seq_length):
    """Assign a segment id to each token and pad the result with zeros.

    Tokens up to and including the first ``"[SEP]"`` get id 0; every token
    after it gets id 1. Token lists longer than ``max_seq_length`` are cut
    down to that length before ids are assigned; shorter ones are padded
    with 0 up to ``max_seq_length``.
    """
    kept = tokens[:max_seq_length] if len(tokens) > max_seq_length else tokens

    segment_ids = []
    active_id = 0
    for tok in kept:
        segment_ids.append(active_id)
        # The separator itself keeps the current id; only what follows switches.
        if tok == "[SEP]":
            active_id = 1

    return segment_ids + [0] * (max_seq_length - len(kept))

def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids, truncated or zero-padded to ``max_seq_length``.

    ``tokenizer`` must provide ``convert_tokens_to_ids``. Over-long token
    lists are cut to ``max_seq_length`` before conversion and returned as-is;
    shorter ones are right-padded with id 0.
    """
    # Over-long input: truncate, convert, and return without padding.
    if len(tokens) > max_seq_length:
        return tokenizer.convert_tokens_to_ids(tokens[:max_seq_length])

    ids = tokenizer.convert_tokens_to_ids(tokens)
    return ids + [0] * (max_seq_length - len(ids))

def preprocess_text(s, tokenizer, max_length):
    """Tokenize one text and build the three BERT input sequences.

    The word pieces are wrapped as ``[CLS] ... [SEP]``. Two positions of
    ``max_length`` are reserved for those markers *before* wrapping, so a long
    text is shortened in its body and still ends with ``[SEP]``. Previously
    the markers were added first and the helpers then cut the tail, which
    dropped the closing ``[SEP]`` for every over-long text.

    Args:
        s: Text to encode.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_length: Target sequence length passed to the helper functions.

    Returns:
        Tuple ``(input_ids, input_masks, input_segments)`` as produced by
        ``get_ids``, ``get_masks`` and ``get_segments``.
    """
    # Room left for real word pieces once [CLS] and [SEP] are accounted for.
    body_budget = max(max_length - 2, 0)
    stokens = tokenizer.tokenize(s)[:body_budget]
    stokens = ["[CLS]"] + stokens + ["[SEP]"]
    input_ids = get_ids(stokens, tokenizer, max_length)
    input_masks = get_masks(stokens, max_length)
    input_segments = get_segments(stokens, max_length)
    return input_ids, input_masks, input_segments

def preprocess_whole(df, tokenizer, max_length, y_col):
    """Clean a frame of posts and encode every row into BERT input arrays.

    The frame is passed through ``text_clean`` (using ``y_col`` as the label
    column). For each remaining row, ``post_name``, ``post_message`` and
    ``post_description`` are joined with single spaces and encoded with
    ``preprocess_text``.

    Rows are read by position rather than by ``.loc[index]``, so the result
    does not depend on index labels being unique.

    Args:
        df: DataFrame with the three text columns and ``y_col``.
        tokenizer: Tokenizer forwarded to ``preprocess_text``.
        max_length: Sequence length forwarded to ``preprocess_text``.
        y_col: Label column name forwarded to ``text_clean``.

    Returns:
        List of three ``int32`` arrays ``[input_ids, input_masks,
        input_segments]``, each holding one entry per cleaned row.
    """
    df = text_clean(df, y_col)
    input_id, input_mask, input_segment = [], [], []

    # Walk the three text columns in lockstep; tqdm needs the explicit total
    # because zip has no length.
    rows = zip(df['post_name'], df['post_message'], df['post_description'])
    for name, message, description in tqdm(rows, total=len(df)):
        text = name + ' ' + message + ' ' + description
        ids, masks, segments = preprocess_text(text, tokenizer, max_length)

        input_id.append(ids)
        input_mask.append(masks)
        input_segment.append(segments)

    return [np.array(input_id, dtype=np.int32),
            np.array(input_mask, dtype=np.int32),
            np.array(input_segment, dtype=np.int32)]

In [199]:
import random
def train_test_split(df, sample_num=100, seed=None):
    """Randomly split ``df`` into train, validation and test frames.

    ``sample_num`` rows are drawn for the test set, another ``sample_num``
    from the rest for the validation set, and all remaining rows form the
    training set. The three parts are disjoint and together cover ``df``.

    Sampling works on row positions held in lists. Sampling from a ``set``
    is rejected by ``random.sample`` on recent Python versions, and indexing
    a frame with a ``set`` is rejected by recent pandas versions. Using
    positions also keeps the training rows in their original order and makes
    the split independent of index labels.

    Args:
        df: DataFrame to split.
        sample_num: Number of rows in each of the validation and test sets.
        seed: Optional seed for a private random generator. When ``None``
            (default) the module-level ``random`` state is used, so an
            earlier ``random.seed(...)`` call still controls the split.

    Returns:
        Tuple ``(train, val, test)`` of DataFrames.

    Raises:
        ValueError: If ``sample_num`` is negative or ``df`` has fewer than
            ``2 * sample_num`` rows.
    """
    if sample_num < 0 or 2 * sample_num > len(df):
        raise ValueError(
            'sample_num must be non-negative and at most half the number of rows'
        )

    rng = random if seed is None else random.Random(seed)

    positions = list(range(len(df)))
    test_pos = rng.sample(positions, sample_num)
    held_out = set(test_pos)

    remaining = [pos for pos in positions if pos not in held_out]
    val_pos = rng.sample(remaining, sample_num)
    held_out.update(val_pos)

    train_pos = [pos for pos in positions if pos not in held_out]

    train = df.iloc[train_pos]
    val = df.iloc[val_pos]
    test = df.iloc[test_pos]
    return train, val, test

def training_form(df, sample_num, max_length, y_col):
    """Split ``df`` and turn each split into BERT inputs and one-hot labels.

    A tokenizer is built from the vocabulary file and lower-casing flag stored
    on the module-level ``bert_layer``. The frame is split with
    ``train_test_split`` and each split is encoded with ``preprocess_whole``.

    Two alignment issues of the earlier version are fixed:

    * Rows with a null ``y_col`` are dropped *before* splitting.
      ``preprocess_whole`` drops them too (via ``text_clean``), so labels
      taken from an unfiltered split could be longer than the feature arrays.
    * The one-hot encoder is fitted on the training labels only and reused
      for validation and test. Re-fitting per split could give different
      column counts or orders when a split lacks a class.

    Args:
        df: Labeled DataFrame with the text columns and ``y_col``.
        sample_num: Size of the validation and of the test split.
        max_length: Sequence length for the encoded inputs.
        y_col: Name of the label column.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``; each ``X``
        is the list of arrays from ``preprocess_whole`` and each ``y`` is a
        dense one-hot array.

    Raises:
        ValueError: From the encoder if validation or test contain a label
            value never seen in the training split.
    """
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

    labelled = df[df[y_col].notnull()]
    train, val, test = train_test_split(labelled, sample_num)

    X_train = preprocess_whole(train, tokenizer, max_length, y_col)
    X_val = preprocess_whole(val, tokenizer, max_length, y_col)
    X_test = preprocess_whole(test, tokenizer, max_length, y_col)

    def label_column(part):
        # The encoder expects a 2-D array with a single feature column.
        return np.array(part[y_col]).reshape(-1, 1)

    onehotencoder = OneHotEncoder()
    y_train = onehotencoder.fit_transform(label_column(train)).toarray()
    # transform (not fit_transform): reuse the category layout learned on train.
    y_val = onehotencoder.transform(label_column(val)).toarray()
    y_test = onehotencoder.transform(label_column(test)).toarray()

    return X_train, y_train, X_val, y_val, X_test, y_test

In [248]:
# Length every encoded post is padded or truncated to.
MAX_SEQ_LEN = 500
# English uncased BERT (12 layers, hidden size 768, 12 heads) from TF Hub.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=True,
)

In [253]:
# Build train / validation / test inputs and one-hot labels for the
# 'immigration_negative' target, holding out 100 rows each for validation and test.
X_train, Y_train, X_val, Y_val, X_test, Y_test = training_form(
    df,
    sample_num=100,
    max_length=MAX_SEQ_LEN,
    y_col='immigration_negative',
)

100%|██████████| 4557/4557 [00:08<00:00, 540.23it/s]
100%|██████████| 100/100 [00:00<00:00, 539.96it/s]
100%|██████████| 100/100 [00:00<00:00, 612.09it/s]


## Model Building and Training

In [254]:
# Three int32 inputs of length MAX_SEQ_LEN: token ids, padding mask, segment ids.
# They are created in this order and share everything but their name.
input_ids, input_mask, input_segment = (
    tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32, name=input_name)
    for input_name in ("input_ids", "input_mask", "input_segment")
)

In [255]:
# Apply the Hub BERT layer to the three inputs; the call yields two outputs,
# unpacked here as a pooled output and a sequence output.
# NOTE(review): presumably (batch, 768) and (batch, MAX_SEQ_LEN, 768) - confirm
# against the Hub module documentation.
pooled_output, sequence_output = bert_layer([input_ids, input_mask, input_segment])
# Mark the BERT layer as non-trainable. This runs after the layer has been
# called but before model.compile, and overrides the trainable=True it was
# loaded with. NOTE(review): presumably so only the layers stacked on top are
# trained - confirm this is the intent.
bert_layer.trainable = False

In [257]:
# Classification head on top of BERT's sequence output:
# 1-D convolution -> global max pooling -> dropout -> 2-way softmax.
conv_features = tf.keras.layers.Conv1D(
    filters=64,
    kernel_size=5,
    strides=1,
    activation='relu',
)(sequence_output)
pooled_features = tf.keras.layers.GlobalMaxPooling1D()(conv_features)
x = tf.keras.layers.Dropout(0.2)(pooled_features)
output = tf.keras.layers.Dense(
    2,
    activation="softmax",
    name="dense_output",
)(x)

In [266]:
# Assemble the end-to-end model from the three inputs to the softmax output and
# compile it for one-hot targets.
model = Model(inputs=[input_ids, input_mask, input_segment], outputs=output)
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [267]:
# Train for 3 epochs, weighting class 1 twice as heavily as class 0.
# validation_data is passed as an (inputs, targets) tuple, the form Keras
# documents. NOTE(review): a list may instead be read as inputs only by some
# TensorFlow versions - confirm against the installed version.
model.fit(
    X_train,
    Y_train,
    epochs=3,
    batch_size=128,
    verbose=2,
    validation_data=(X_val, Y_val),
    class_weight={0: 1, 1: 2},
)

  ...
    to  
  ['...']


  ...
    to  
  ['...']


  ...
    to  
  ['...']


  ...
    to  
  ['...']


Train on 4557 samples, validate on 100 samples
Epoch 1/3
4557/4557 - 2959s - loss: 0.6583 - accuracy: 0.8111 - val_loss: 1.1380 - val_accuracy: 0.7100
Epoch 2/3
4557/4557 - 2815s - loss: 0.4069 - accuracy: 0.9078 - val_loss: 1.0515 - val_accuracy: 0.6900
Epoch 3/3
4557/4557 - 2649s - loss: 0.3187 - accuracy: 0.9359 - val_loss: 1.2849 - val_accuracy: 0.7400


<tensorflow.python.keras.callbacks.History at 0x7efab8e5c048>

## Model Evaluation

In [166]:
from sklearn import metrics
from sklearn.metrics import classification_report
def predict(original_df, threshold, model):
    """Return hard 0/1 predictions from ``model`` for ``original_df``.

    ``model.predict`` is called with ``batch_size=128``. A row is labeled 1
    when its second output (index 1) is at least ``threshold``, else 0.
    """
    scores = model.predict(original_df, batch_size=128)
    return [int(row[1] >= threshold) for row in scores]

# Evaluation
def Evaluation(original_df, original_label, model, threshold=0.5, target_names=None):
    """Score ``model`` on a labeled set and build a classification report.

    One-hot labels are collapsed to class ids (a row whose first entry equals
    1 is class 0, anything else class 1), predictions come from ``predict``,
    and accuracy plus a text report are computed with scikit-learn.

    The report is pinned to ``labels=[0, 1]``. Without it,
    ``classification_report`` raises when only one class occurs in the data,
    because the two display names no longer match the classes found.

    Args:
        original_df: Model inputs, forwarded to ``predict``.
        original_label: One-hot labels, one row per sample.
        model: Model forwarded to ``predict``.
        threshold: Decision threshold forwarded to ``predict``.
        target_names: Optional display names for classes 0 and 1. Defaults to
            ``['ACTIVE', 'CHURN']`` for backward compatibility.

    Returns:
        List ``[accuracy, report]`` where ``report`` is the text report.
    """
    if target_names is None:
        # NOTE(review): these default names look carried over from a different
        # (churn) task; pass target_names to label the report for this data.
        target_names = ['ACTIVE', 'CHURN']

    Y = [0 if element[0] == 1 else 1 for element in original_label]
    prediction = predict(original_df, threshold=threshold, model=model)
    accuracy = metrics.accuracy_score(Y, prediction)

    report = classification_report(
        Y, prediction, labels=[0, 1], target_names=target_names
    )

    return [accuracy, report]

In [268]:
# Loss and accuracy on the held-out test split (metrics as set in model.compile).
model.evaluate(x=X_test, y=Y_test)



[0.6422328320145607, 0.68]

In [270]:
# Evaluation returns [accuracy, report]; show the per-class text report.
r = Evaluation(original_df=X_test, original_label=Y_test, model=model)
print(r[1])

              precision    recall  f1-score   support

      ACTIVE       0.81      0.74      0.77        74
       CHURN       0.41      0.50      0.45        26

    accuracy                           0.68       100
   macro avg       0.61      0.62      0.61       100
weighted avg       0.70      0.68      0.69       100



## Save Model

In [271]:
# Persist the trained model to disk as a single .h5 file.
model.save(
    r'/home3/r05322021/Desktop/FB_hatecrime/model/immigration_sentiment.h5'
)