In [23]:
import tensorflow as tf
import tensorflow_hub as hub
import bert
from tensorflow.keras.models import  Model
from tqdm import tqdm
import numpy as np
from collections import namedtuple
import pandas as pd
import re
import random
from sklearn.preprocessing import OneHotEncoder

In [24]:
# Record the library versions this notebook was executed with.
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)

TensorFlow Version: 2.1.0
Hub version:  0.8.0


In [25]:
# Load the pre-trained uncased English BERT encoder from TF-Hub as a Keras layer.
# NOTE(review): the layer is created with trainable=True, but a later cell sets
# `bert_layer.trainable = False` before the model is compiled -- confirm which
# of the two is intended.
bert_layer=hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=True)

In [26]:
# Load the labelled posts and keep only the rows flagged as ethnicity-related
# (Ethnics_related == 1).
# NOTE(review): the path is absolute and machine-specific; the notebook cannot
# be re-run elsewhere without editing it.
df = pd.read_csv('/home3/r05322021/Desktop/FB_hatecrime/Data/label/race_label.csv', encoding='utf-8', engine='python')
df = df[df.Ethnics_related == 1]

In [54]:
# Binary target column: 1 where Ethnic_sentiment equals -1, otherwise 0
# (missing sentiment values compare unequal and therefore map to 0).
df['Ethnic_negative'] = (df['Ethnic_sentiment'] == -1).astype('int64')

In [55]:
# Fraction of rows whose Ethnic_negative flag is 1 (class-balance check).
len(df[df.Ethnic_negative == 1])/len(df)

0.15763288819062582

In [56]:
# Replace every NaN in the frame (all columns, not only the text ones) with an
# empty string.
df = df.fillna('')
# df.loc[df[df.immigration_related == 4].index, 'immigration_related'] = 1

In [37]:
def text_clean(df, target_col):
    """Return a cleaned copy of ``df`` restricted to rows that have a label.

    Rows whose ``target_col`` is null are dropped, every remaining null is
    replaced by an empty string, and the three text columns (``post_name``,
    ``post_message``, ``post_description``) are stripped of URLs, @mentions,
    #hashtags, ``[word]`` placeholders and newline / carriage-return
    characters.

    The substitutions are applied column-wise rather than cell-by-cell through
    ``df.loc[index, column]``. That avoids one label lookup per cell and stays
    correct when the index contains duplicate labels (a per-label lookup
    returns a whole Series in that case, which would be stringified and written
    back to every duplicate row).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target_col`` and the three text columns.
    target_col : str
        Label column; rows where it is null are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    # Applied one after another, in this order, to every text cell.
    patterns = [
        re.compile(r'http\S+'),    # URLs
        re.compile(r'\@\w+'),      # @mentions
        re.compile(r'#\w+'),       # hashtags
        re.compile(r'\[\w+\]'),    # bracketed placeholders such as [photo]
        re.compile(r'\n'),         # newlines
        re.compile(r'\r'),         # carriage returns
    ]

    def _scrub(value):
        """Apply every pattern to a single cell, coercing it to ``str`` first."""
        text = str(value)
        for pattern in patterns:
            text = pattern.sub('', text)
        return text

    # Boolean filtering followed by fillna yields a fresh frame, so the column
    # assignments below never write into the caller's data.
    cleaned = df[df[target_col].notnull()].fillna('')
    for column in ['post_name', 'post_message', 'post_description']:
        cleaned[column] = cleaned[column].map(_scrub)

    return cleaned

In [38]:
def get_masks(tokens, max_seq_length):
    """Build the padding mask for one tokenized example.

    The mask holds a 1 for every token that is kept and a 0 for every padding
    slot. A token list longer than ``max_seq_length`` is cut down to that
    length (mask of all ones); a shorter one is right-padded with zeros up to
    ``max_seq_length``.
    """
    too_long = len(tokens) > max_seq_length
    kept = tokens[0:max_seq_length] if too_long else tokens
    padding = max_seq_length - len(kept)
    return [1] * len(kept) + [0] * padding

def get_segments(tokens, max_seq_length):
    """Build the segment ids for one tokenized example.

    Tokens up to and including the first ``"[SEP]"`` get id 0; every token
    after a ``"[SEP]"`` gets id 1. A token list longer than ``max_seq_length``
    is truncated to that length first; a shorter one has its ids right-padded
    with zeros up to ``max_seq_length``.
    """
    if len(tokens) > max_seq_length:
        # Drop the excess tokens before assigning ids.
        tokens = tokens[:max_seq_length]

    segment_ids = []
    active_segment = 0
    for tok in tokens:
        segment_ids.append(active_segment)
        if tok == "[SEP]":
            # Everything after a separator belongs to segment 1.
            active_segment = 1

    # After truncation the pad count is never positive, so this only adds
    # zeros for inputs shorter than max_seq_length.
    pad_count = max_seq_length - len(tokens)
    return segment_ids + [0] * pad_count

def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids with a fixed output length.

    ``tokenizer`` must provide ``convert_tokens_to_ids``. A token list longer
    than ``max_seq_length`` is truncated before conversion and the converted
    ids are returned as-is; otherwise the ids are right-padded with 0 up to
    ``max_seq_length``.
    """
    needs_truncation = len(tokens) > max_seq_length
    kept = tokens[:max_seq_length] if needs_truncation else tokens
    ids = tokenizer.convert_tokens_to_ids(kept)
    if needs_truncation:
        return ids
    return ids + [0] * (max_seq_length - len(ids))

def preprocess_text(s, tokenizer, max_length):
    """Turn one raw string into the three fixed-length BERT input lists.

    The text is tokenized, wrapped in ``[CLS]`` ... ``[SEP]`` and converted by
    ``get_ids`` / ``get_masks`` / ``get_segments`` into lists padded to
    ``max_length``.

    Parameters
    ----------
    s : str
        Text to encode.
    tokenizer
        Object providing ``tokenize`` and ``convert_tokens_to_ids``.
    max_length : int
        Target sequence length, special tokens included.

    Returns
    -------
    tuple
        ``(input_ids, input_masks, input_segments)``.
    """
    # Reserve two slots for [CLS] and [SEP] and shorten the word pieces
    # *before* adding them. Truncating afterwards would cut the closing [SEP]
    # off every over-long text.
    budget = max(max_length - 2, 0)
    stokens = tokenizer.tokenize(s)[:budget]
    stokens = ["[CLS]"] + stokens + ["[SEP]"]
    input_ids = get_ids(stokens, tokenizer, max_length)
    input_masks = get_masks(stokens, max_length)
    input_segments = get_segments(stokens, max_length)
    return input_ids, input_masks, input_segments

def preprocess_whole(df, tokenizer, max_length, y_col):
    """Clean a frame of posts and encode every row as BERT inputs.

    The frame is first passed through ``text_clean`` (which drops rows whose
    ``y_col`` is null). For each remaining row the ``post_name``,
    ``post_message`` and ``post_description`` texts are joined with single
    spaces and encoded with ``preprocess_text``.

    Rows are walked positionally instead of being looked up by index label one
    at a time, so duplicate index labels cannot turn a cell lookup into a
    Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Posts with the three text columns and ``y_col``.
    tokenizer
        Tokenizer forwarded to ``preprocess_text``.
    max_length : int
        Sequence length forwarded to ``preprocess_text``.
    y_col : str
        Label column forwarded to ``text_clean``.

    Returns
    -------
    list of numpy.ndarray
        ``[input_ids, input_masks, input_segments]``, each of dtype int32 with
        one row per cleaned post.
    """
    df = text_clean(df, y_col)
    input_id, input_mask, input_segment = [], [], []

    rows = zip(df['post_name'], df['post_message'], df['post_description'])
    for name, message, description in tqdm(rows, total=len(df)):
        text = name + ' ' + message + ' ' + description
        ids, masks, segments = preprocess_text(text, tokenizer, max_length)

        input_id.append(ids)
        input_mask.append(masks)
        input_segment.append(segments)

    return [np.array(input_id, dtype=np.int32),
            np.array(input_mask, dtype=np.int32),
            np.array(input_segment, dtype=np.int32)]

In [39]:
import random
def train_test_split(df, sample_num=100, seed=None):
    """Randomly split ``df`` into train, validation and test frames.

    ``sample_num`` rows go to the test set, another ``sample_num`` rows to the
    validation set, and all remaining rows to the training set. The three
    parts are disjoint and together cover every row of ``df``.

    Rows are selected by position, so the split is also correct when the index
    contains duplicate labels. The population is sampled from a list: sampling
    from a ``set`` is rejected by ``random.sample`` on recent Python versions,
    and recent pandas versions reject a ``set`` as a ``.loc`` indexer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    sample_num : int, default 100
        Size of the validation set and of the test set.
    seed : int or None, default None
        When given, a dedicated ``random.Random(seed)`` is used so the split
        is reproducible. When ``None`` the module-level ``random`` generator
        is used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``; ``train`` keeps the row order of ``df``.

    Raises
    ------
    ValueError
        If ``sample_num`` is negative or ``df`` has fewer than
        ``2 * sample_num`` rows.
    """
    n_rows = len(df)
    if sample_num < 0 or 2 * sample_num > n_rows:
        raise ValueError(
            "sample_num=%r needs 0 <= 2 * sample_num <= len(df) (=%d)"
            % (sample_num, n_rows)
        )

    rng = random if seed is None else random.Random(seed)

    # One draw without replacement; its two halves are disjoint random subsets.
    held_out = rng.sample(range(n_rows), 2 * sample_num)
    test_pos = held_out[:sample_num]
    val_pos = held_out[sample_num:]

    held_out_set = set(held_out)
    train_pos = [pos for pos in range(n_rows) if pos not in held_out_set]

    train = df.iloc[train_pos]
    val = df.iloc[val_pos]
    test = df.iloc[test_pos]
    return train, val, test

def training_form(df, sample_num, max_length, y_col):
    """Split ``df`` and turn each split into BERT inputs plus one-hot labels.

    A tokenizer is built from the vocabulary shipped with the module-level
    ``bert_layer``. The frame is split with ``train_test_split`` and each split
    is encoded with ``preprocess_whole``.

    Parameters
    ----------
    df : pandas.DataFrame
        Posts with the three text columns and ``y_col``.
    sample_num : int
        Number of rows in the validation set and in the test set.
    max_length : int
        Token sequence length passed to ``preprocess_whole``.
    y_col : str
        Name of the label column.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``. Each ``X_*`` is
        the three-array list returned by ``preprocess_whole``; each ``y_*`` is
        a dense one-hot array.
    """
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

    # preprocess_whole drops rows whose label is null; remove them up front so
    # the feature arrays and the label arrays always have matching rows.
    df = df[df[y_col].notnull()]
    train, val, test = train_test_split(df, sample_num)

    X_train = preprocess_whole(train, tokenizer, max_length, y_col)
    X_val = preprocess_whole(val, tokenizer, max_length, y_col)
    X_test = preprocess_whole(test, tokenizer, max_length, y_col)

    # Fit the encoder once on every label value and only *transform* the
    # splits. Re-fitting per split would give a split that happens to miss a
    # class fewer columns, or a different class-to-column mapping.
    onehotencoder = OneHotEncoder()
    onehotencoder.fit(np.array(df[y_col]).reshape(-1, 1))
    y_train = onehotencoder.transform(np.array(train[y_col]).reshape(-1, 1)).toarray()
    y_val = onehotencoder.transform(np.array(val[y_col]).reshape(-1, 1)).toarray()
    y_test = onehotencoder.transform(np.array(test[y_col]).reshape(-1, 1)).toarray()

    return X_train, y_train, X_val, y_val, X_test, y_test

In [40]:
# Token sequence length used for preprocessing and for the Keras Input shapes.
MAX_SEQ_LEN = 500

In [57]:
# Hold out 100 rows for validation and 100 for testing, train on the rest;
# the label is the binary Ethnic_negative column.
X_train, Y_train, X_val, Y_val, X_test, Y_test = training_form(df, 100, MAX_SEQ_LEN, 'Ethnic_negative')

100%|██████████| 3619/3619 [00:15<00:00, 239.29it/s]
100%|██████████| 100/100 [00:00<00:00, 757.52it/s]
100%|██████████| 100/100 [00:00<00:00, 573.38it/s]


In [58]:
# Three int32 inputs of length MAX_SEQ_LEN: token ids, padding mask and
# segment ids.
input_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                       name="input_ids")
input_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                   name="input_mask")
input_segment = tf.keras.layers.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32,
                                    name="input_segment")

In [59]:
# Feed the inputs through BERT in the order ids, mask, segment. Only
# sequence_output is used by the classification head below.
pooled_output, sequence_output = bert_layer([input_ids, input_mask, input_segment])

In [60]:
# Mark the BERT layer as non-trainable (this runs before the model is compiled
# in a later cell).
bert_layer.trainable = False

In [65]:
# Classification head on top of BERT's sequence output: a width-5 Conv1D with
# 64 filters, global max pooling, dropout of 0.2, and a 2-way softmax.
# NOTE(review): the convolution and pooling do not use input_mask, so padded
# positions take part in the max -- confirm this is acceptable.
x = tf.keras.layers.Conv1D(filters=64, activation='relu', kernel_size=5, strides=1)(sequence_output)
x = tf.keras.layers.GlobalMaxPooling1D()(x)
x = tf.keras.layers.Dropout(0.2)(x)
# x = tf.keras.layers.Dense(64, activation='relu')(x)
output = tf.keras.layers.Dense(2, activation="softmax", name="dense_output")(x)

In [73]:
# Assemble the end-to-end model and compile it with categorical cross-entropy
# (the labels are one-hot encoded), the Adam optimizer and accuracy as metric.
model = Model(
      inputs=[input_ids, input_mask, input_segment], outputs=output)
model.compile(loss='categorical_crossentropy',
                  optimizer='Adam',
                  metrics=['accuracy'])

In [74]:
# Inspect the one-hot encoded training labels.
Y_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [48]:
# Same ratio computed from the raw Ethnic_sentiment column; it should match
# the share of Ethnic_negative == 1.
len(df[df.Ethnic_sentiment == -1])/len(df)

0.15763288819062582

In [75]:
# Train for 5 epochs. class_weight gives class 1 eight times the weight of
# class 0. validation_data is passed as a tuple (x_val, y_val), the documented
# form; a list can be confused with a list of model inputs.
model.fit(X_train,Y_train,epochs=5,batch_size=128,verbose=2,validation_data=(X_val, Y_val), class_weight={0:1, 1:8})

  ...
    to  
  ['...']


  ...
    to  
  ['...']


  ...
    to  
  ['...']


  ...
    to  
  ['...']


Train on 3619 samples, validate on 100 samples
Epoch 1/5
3619/3619 - 1556s - loss: 0.5800 - accuracy: 0.8450 - val_loss: 1.8940 - val_accuracy: 0.7100
Epoch 2/5
3619/3619 - 1611s - loss: 0.3536 - accuracy: 0.9301 - val_loss: 2.3451 - val_accuracy: 0.7400
Epoch 3/5
3619/3619 - 1639s - loss: 0.2350 - accuracy: 0.9693 - val_loss: 3.1542 - val_accuracy: 0.7700
Epoch 4/5
3619/3619 - 1604s - loss: 0.1777 - accuracy: 0.9754 - val_loss: 2.8220 - val_accuracy: 0.7700
Epoch 5/5
3619/3619 - 1648s - loss: 0.1437 - accuracy: 0.9831 - val_loss: 2.4181 - val_accuracy: 0.7700


<tensorflow.python.keras.callbacks.History at 0x7f3908614390>

In [79]:
# Loss and accuracy (the metric set in model.compile) on the held-out test split.
model.evaluate(X_test, Y_test)



[0.6243823659420014, 0.76]

In [70]:
from sklearn import metrics
from sklearn.metrics import classification_report
def predict(original_df, threshold, model):
    """Return hard 0/1 predictions for the given model inputs.

    ``model.predict`` is called on ``original_df`` with a batch size of 128.
    A row is labelled 1 when its second output (index 1) is at least
    ``threshold``, otherwise 0.
    """
    scores = model.predict(original_df, batch_size=128)

    labels = []
    for row in scores:
        if row[1] >= threshold:
            labels.append(1)
        else:
            labels.append(0)

    return labels

# Evaluation
def Evaluation(original_df, original_label, model, threshold=0.5,
               target_names=('NOT_NEGATIVE', 'NEGATIVE')):
    """Score ``model`` on one data split.

    Parameters
    ----------
    original_df
        Model inputs, forwarded unchanged to ``predict``.
    original_label
        One-hot label rows; a row whose first entry equals 1 is class 0,
        any other row is class 1.
    model
        Model object forwarded to ``predict``.
    threshold : float, default 0.5
        Decision threshold forwarded to ``predict``.
    target_names : sequence of two str
        Display names for class 0 and class 1 in the report. The defaults
        describe the negative-sentiment task; the previous hard-coded names
        ('ACTIVE', 'CHURN') belonged to an unrelated churn task.

    Returns
    -------
    list
        ``[accuracy, report]`` where ``report`` is the text produced by
        ``classification_report``.
    """
    Y = [0 if element[0] == 1 else 1 for element in original_label]
    prediction = predict(original_df, threshold=threshold, model=model)
    accuracy = metrics.accuracy_score(Y, prediction)

    # Naming both labels explicitly keeps the report well-formed even when a
    # class is absent from both the labels and the predictions of a split.
    report = classification_report(Y, prediction, labels=[0, 1],
                                   target_names=list(target_names))

    return [accuracy, report]

In [81]:
# Per-class precision / recall / F1 on the test split at the default 0.5
# threshold; r[0] is the accuracy, r[1] the text report.
r = Evaluation(X_test, Y_test, model)
print(r[1])

              precision    recall  f1-score   support

      ACTIVE       0.87      0.85      0.86        85
       CHURN       0.24      0.27      0.25        15

    accuracy                           0.76       100
   macro avg       0.55      0.56      0.55       100
weighted avg       0.77      0.76      0.77       100



In [82]:
# Persist the trained model to an .h5 file.
# NOTE(review): absolute, machine-specific output path.
model.save(r'/home3/r05322021/Desktop/FB_hatecrime/model/race_negative.h5')