# Multi-label Text Classification Model based on BERT with CNN Classification layer. Trained on Posting Emotions Dataset [Script D]

*This notebook contains the script used to build our main multi-label classification model, which recognizes emotions from job postings. In it, we build a BERTBase model with a CNN classification layer. Note that this notebook's code was written following a tutorial on multi-label text classification for tagging questions posted on Q&A sites such as Stack Overflow. However, the content of the code was written to serve our own model goals.*

---
*References: https://github.com/Moradnejad/Bert-Based-Tag-Recommendation*


## Package Installation, Imports & Setup

In [None]:
pip install bert-tensorflow --quiet

In [None]:
pip install bert-for-tf2 --quiet

In [None]:
pip install sentencepiece

In [None]:
pip install tensorflow --quiet

In [None]:
pip install tensorflow_hub --quiet

In [None]:
try:
    %tensorflow_version 2.x
except Exception:
    pass

In [None]:
import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers

import pandas as pd
import numpy as np
import gc
import collections
import random
import re
import nltk

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


from bs4 import BeautifulSoup as bs
from collections import Counter
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download("popular")
nltk.download("stopwords")
from nltk import word_tokenize
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")

## Data Cleaning Methods

The text-cleaning routines in this section were taken directly from the code in the reference listed at the top of this notebook.

In [None]:
def transliterate(line):
    """Replace a small, fixed set of accented letters with plain Latin ones.

    Only the six characters listed in the mapping below are rewritten; every
    other character of ``line`` is copied through unchanged.
    """
    replacements = {
        u'Á': u'A', u'á': u'a',
        u'Č': u'C', u'č': u'c',
        u'Š': u'S', u'š': u's',
    }
    return "".join(replacements.get(letter, letter) for letter in line)

In [None]:
# Ordered HTML clean-up rules as (compiled pattern, replacement) pairs.
# They are applied one after another, so the order is significant.
_TEXT_CLEANER_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'>\s+', u'>'),  # remove spaces after a tag opens or closes
        (r'\s+', u' '),  # replace consecutive spaces
        (r'\s*<br\s*/?>\s*', u'\n'),  # newline after a <br>
        (r'</(div)\s*>\s*', u'\n'),  # newline after </div>
        (r'</(p|h\d)\s*>\s*', u'\n\n'),  # blank line after </p> and </h1>...
        (r'<head>.*<\s*(/head|body)[^>]*>', u''),  # remove <head> to </head>
        (r'<a\s+href="([^"]+)"[^>]*>.*</a>', r'\1'),  # show links instead of texts
        (r'[ \t]*<[^<]*?/?>', u''),  # remove remaining tags
        (r'^\s+', u''),  # remove spaces at the beginning
    )
)

# Punctuation handled before the HTML rules: '.' and '"' are deleted, the
# remaining characters become a single space.
_PRE_RULE_PUNCTUATION = str.maketrans("[,]()-=?!", " " * 9, ".\"")

# Punctuation turned into spaces after the HTML rules have run.
_POST_RULE_PUNCTUATION = str.maketrans("+.,:", " " * 4)

# A run of digits bounded by non-word characters (or the string edges).
# Raw string: "\W" and "\d" are invalid escapes in a normal string literal.
_STANDALONE_NUMBER = re.compile(r"(^|\W)\d+($|\W)")


def _apply_markup_rules(text):
    """Run every HTML clean-up rule in order, trimming whitespace after each."""
    for pattern, replacement in _TEXT_CLEANER_RULES:
        text = pattern.sub(replacement, text).strip()
    return text


def text_cleaner(text,
                 deep_clean=True,
                 stem= True,
                 stop_words=True,
                 translite_rate=True):
    """Strip HTML markup from ``text`` and return it lower-cased.

    Args:
        text: The raw string to clean.
        deep_clean: When true, additionally removes punctuation and
            standalone numbers and runs the NLTK steps below. When false,
            only the HTML rules are applied.
        stem: Deep clean only. Pass the text through ``PorterStemmer``.
        stop_words: Deep clean only. Drop English stop words (NLTK list).
        translite_rate: Deep clean only. Run ``transliterate`` on the text.

    Returns:
        The cleaned, lower-cased string.
    """
    if not deep_clean:
        return _apply_markup_rules(text).lower()

    text = text.translate(_PRE_RULE_PUNCTUATION)
    text = _apply_markup_rules(text)
    text = text.translate(_POST_RULE_PUNCTUATION)
    text = _STANDALONE_NUMBER.sub(" ", text)

    if translite_rate:
        text = transliterate(text)

    # NOTE(review): the stemmer and lemmatizer receive the whole string rather
    # than individual tokens, so they presumably treat it as one word. Kept
    # as-is because the models were trained on text cleaned this way.
    if stem:
        text = PorterStemmer().stem(text)
    text = WordNetLemmatizer().lemmatize(text)

    if stop_words:
        # Use a separate name so the boolean parameter is not overwritten.
        english_stop_words = set(stopwords.words('english'))
        text = ' '.join(
            str(token) for token in word_tokenize(text)
            if token not in english_stop_words
        )

    return text.lower()

In [None]:
# Convert html to regular text
def convert_html_to_text(data):
    """Parse ``data`` with BeautifulSoup's 'html.parser' and return its ``get_text()`` output."""
    return bs(data, 'html.parser').get_text()

## Data Import & Cleaning

In [None]:
# Import top sentiments dataset
# The path is relative, so "top_sentiments.csv" must sit in the working
# directory. The cells below read its 'listing' and 'tags' columns.
top_sentiments = pd.read_csv("top_sentiments.csv")

In [None]:
# Clean up text data
# Force the listing column to pandas' string dtype.
top_sentiments['listing'] = top_sentiments['listing'].astype('string')
# Stringify each tags cell and remove every literal "nan" substring.
# NOTE(review): this is a plain substring replace, so it would also alter any
# tag whose name contains "nan" — confirm none do.
top_sentiments['tags'] = top_sentiments['tags'].apply(lambda x: str(x).replace("nan", ""))
# Turn the stringified cell back into a Python object.
# NOTE(security): eval() executes whatever the CSV cell contains; this is only
# acceptable for a trusted file. Consider ast.literal_eval for literal lists.
top_sentiments["tags"] = top_sentiments["tags"].apply(eval)
# Collapse each tag collection into one space-separated string.
top_sentiments['tags'] = top_sentiments['tags'].apply(lambda x: ' '.join(x))
# Drop rows with missing values in any column.
top_sentiments.dropna(inplace=True)
print(top_sentiments['tags'])
# Cleaned listing text (text_cleaner defaults) used as the model input.
top_sentiments['Text'] = top_sentiments['listing'].apply(text_cleaner)

In [None]:
# Define labels

X = top_sentiments['Text'].tolist()

# Prepare tags: every 'tags' cell is a space-separated string, so collect
# each individual word across all rows.
# NOTE(review): "".split(" ") yields [""], so a row with an empty tags string
# contributes the empty string as a tag — confirm this is intended.
tag_list = [word
            for item in top_sentiments['tags']
            for word in item.split(" ")]

# Distinct tags; the position of a tag in this list is its label index.
# NOTE(review): set iteration order for strings can differ between Python
# processes unless PYTHONHASHSEED is fixed, while later cells address labels
# by index — confirm the order is stable in the environment used.
tags = list(set(tag_list))

# Build one multi-hot vector per row. Rows that match no tag are dropped from
# both X and y in a single pass, which keeps the two lists aligned without
# deleting from X while walking it.
kept_texts = []
y = []
for text, item in zip(X, top_sentiments['tags']):
    item_tags = set(item.split(" "))
    self_tags = [1 if word in item_tags else 0 for word in tags]
    if any(self_tags):
        kept_texts.append(text)
        y.append(np.array(self_tags))
X = kept_texts

# Plain-list copy of the label vectors for train_test_split.
y_list = [elem.tolist() for elem in y]

## Tokenization

In [None]:
# Import the BERT tokenizer class and the uncased BERT-Base layer from TensorFlow Hub
BertTokenizer = bert.bert_tokenization.FullTokenizer

# The hub layer is loaded frozen (trainable=False). In the code visible in
# this notebook it is only used to obtain the vocabulary file and the
# lower-casing flag below.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)

# Path of the vocabulary file shipped with the hub module.
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()

# Lower-casing setting stored in the hub module.
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

# Tokenizer shared by tokenize_data() and the training cell.
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [None]:
def tokenize_data(text):
    """Tokenize ``text`` with the module-level ``tokenizer`` and return the token ids."""
    word_pieces = tokenizer.tokenize(text)
    return tokenizer.convert_tokens_to_ids(word_pieces)

In [None]:
# Token-id sequence for every cleaned listing in X, in the same order as X.
tokenized_data = [tokenize_data(text) for text in X]

In [None]:
# Split the tokenized listings and their labels into train and test sets
# (20% held out, fixed random_state).
(tokenized_data_train,
 tokenized_data_test,
 y_list_train,
 y_list_test) = train_test_split(tokenized_data, y_list,
                                 test_size=.2, random_state=42)
print("Length of training data :", len(tokenized_data_train))
print("Length of test data :", len(tokenized_data_test))

# The evaluation code compares test label rows element-wise, so keep them as
# numpy arrays rather than plain lists.
y_list_test = [np.array(item) for item in y_list_test]

In [None]:
def column(matrix, i):
    """Return element ``i`` of every row in ``matrix`` as a list."""
    picked = []
    for row in matrix:
        picked.append(row[i])
    return picked

## Model Creation

In [None]:
#bert text model
class TEXT_MODEL(tf.keras.Model):
    """CNN text classifier over token ids.

    Token ids go through a trainable ``Embedding`` layer, then through three
    parallel ``Conv1D`` layers (kernel sizes 2, 3 and 4). Each convolution is
    global-max-pooled, the pooled vectors are concatenated, and the result
    passes through a dense layer, dropout and the output layer.

    Args:
        vocabulary_size: Size of the embedding's input vocabulary.
        embedding_dimensions: Width of each embedding vector.
        cnn_filters: Number of filters in each of the three convolutions.
        dnn_units: Units in the hidden dense layer.
        model_output_classes: 2 selects a single sigmoid output unit; any
            other value selects a softmax layer with that many units.
        dropout_rate: Rate passed to the dropout layer.
        training: Accepted but not used by the constructor.
        name: Keras model name.
    """

    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size, embedding_dimensions)

        # The three convolutions differ only in kernel size.
        conv_options = dict(filters=cnn_filters,
                            padding="valid",
                            activation="relu")
        self.cnn_layer1 = layers.Conv1D(kernel_size=2, **conv_options)
        self.cnn_layer2 = layers.Conv1D(kernel_size=3, **conv_options)
        self.cnn_layer3 = layers.Conv1D(kernel_size=4, **conv_options)

        # One pooling layer instance is shared by all three branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)

        is_binary = model_output_classes == 2
        self.last_dense = layers.Dense(
            units=1 if is_binary else model_output_classes,
            activation="sigmoid" if is_binary else "softmax")

    def call(self, inputs, training):
        """Run the forward pass; ``training`` is forwarded to the dropout layer."""
        embedded = self.embedding(inputs)

        # Convolve and pool each branch in order: kernel size 2, 3, then 4.
        branches = (self.cnn_layer1, self.cnn_layer2, self.cnn_layer3)
        pooled = [self.pool(conv(embedded)) for conv in branches]

        hidden = self.dense_1(tf.concat(pooled, axis=-1))
        hidden = self.dropout(hidden, training)
        return self.last_dense(hidden)

## Training

In [None]:
# Method to log training progress
def progress(count, total, status=''):
    """Write a 60-character text progress bar for ``count`` out of ``total`` to stdout.

    The line ends with a carriage return instead of a newline, so the next
    call overwrites it. ``status`` is appended after the percentage.
    """
    bar_len = 60
    denominator = float(total)

    filled_len = int(round(bar_len * count / denominator))
    percents = round(100.0 * count / denominator, 1)

    done = '=' * filled_len
    remaining = '-' * (bar_len - filled_len)

    sys.stdout.write(f"[{done + remaining}] {percents!s}% ...{status!s}\r")
    sys.stdout.flush()

In [None]:
import sys
import os
import nltk
from nltk.corpus import reuters
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import random
import math
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import hamming_loss
import statistics 


# Train one binary TEXT_MODEL per label column, score the held-out test items
# with it, and print Recall/Precision/F1 @K for K in (3, 5, 10).
whole_predictions = []
whole_real_predictions = []
whole_threshold_predictions = []
# One trained model per label; the list index equals the label column index.
text_model = []

# NOTE(review): `one` is not read anywhere in the visible code.
one=0

#predict for each label individualy

for i in range(len(y_list_train[0])):

    # NOTE(review): these accumulators are reset for every label, so after the
    # appends below they only ever hold the current label's predictions.
    whole_predictions = []
    whole_real_predictions = []
    whole_threshold_predictions = []

    print("\n" + str(i)+"\'th label prediction started")
    count_zero=0
    count_one=0
    new_label=[]
    new_tokenized_data_train=[]
    # Column i of the training labels: 1 where the example carries label i.
    label = column(y_list_train,i)
    count_one=sum(label)
    print("count_one",count_one)
    
    # Balance the classes: keep every positive example, and only the first
    # count_one negatives encountered in training order.
    for k in range(len(label)):
        if count_zero< count_one and label[k]==0:
            new_label.append(0)
            new_tokenized_data_train.append(tokenized_data_train[k])
            count_zero=count_zero+1
        if label[k]==1:
            new_label.append(1)
            new_tokenized_data_train.append(tokenized_data_train[k])

            
    print("count_zero",count_zero)        
    # [token ids, label, sequence length] per example.
    data_with_len = [[data,new_label[j],len(data)]
                     for j,data in enumerate(new_tokenized_data_train)]

    # Sort by sequence length before batching (presumably so each padded
    # batch holds sequences of similar length).
    data_with_len.sort(key=lambda x: x[2])
    sorted_data_labels = [(data_lab[0], data_lab[1]) for data_lab in data_with_len]
    processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_data_labels, output_types=(tf.int32, tf.int32))
    BATCH_SIZE = 14
    # Pad token sequences to the longest one in each batch; labels are scalars.
    batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
    TOTAL_BATCHES = math.ceil(len(sorted_data_labels) / BATCH_SIZE)
    # NOTE(review): x // x is 1 for any non-zero x (and raises
    # ZeroDivisionError for 0), so exactly one batch is split off — confirm
    # whether a different divisor was intended.
    TEST_BATCHES = TOTAL_BATCHES // TOTAL_BATCHES
    # NOTE(review): the return value is discarded; tf.data transformations
    # return a new dataset, so this call likely has no effect — confirm.
    batched_dataset.shuffle(TOTAL_BATCHES)
    # NOTE(review): test_data is not used afterwards in the visible code; the
    # evaluation below runs on tokenized_data_test instead.
    test_data = batched_dataset.take(TEST_BATCHES)
    train_data = batched_dataset.skip(TEST_BATCHES)
    
   
    VOCAB_LENGTH = len(tokenizer.vocab)
    EMB_DIM = 260
    CNN_FILTERS = 50
    DNN_UNITS = 256
    # 2 makes TEXT_MODEL use a single sigmoid output unit.
    OUTPUT_CLASSES = 2

    DROPOUT_RATE = 0.2

    NB_EPOCHS = 6

    text_model.append(TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE))

    if OUTPUT_CLASSES == 2:
        text_model[i].compile(loss="binary_crossentropy",
                           optimizer="adam",
                           metrics=["acc"])
    else:
        text_model[i].compile(loss="sparse_categorical_crossentropy",
                           optimizer="adam",
                           metrics=["sparse_categorical_acc"])

    text_model[i].fit(train_data, epochs=NB_EPOCHS)

    self_label_predictions = []
    self_threshold_predictions = []
    self_label_real_values = []
    print("Predicting " + str(i) + "th label...")
    
    # Score every test item individually with the model for label i.
    for e,item in enumerate(tokenized_data_test):
        if e%2==0:
            progress(e,len(tokenized_data_test))
        res = text_model[i].predict([item])
        self_label_real_values.append(res[0][0])
      
        # Keep the score only above the 0.93 threshold; otherwise record 0.0.
        if res[0][0] > 0.93:
            self_threshold_predictions.append(res[0][0])
        else :
            self_threshold_predictions.append(0.0)

    whole_threshold_predictions.append(self_threshold_predictions)
    whole_real_predictions.append(self_label_real_values)

    # Transpose from [label][item] to [item][label]. Because of the per-label
    # reset above, each item row holds a single value here.
    whole_threshold_predictions = list(map(list, zip(*whole_threshold_predictions)))
    whole_real_predictions = list(map(list, zip(*whole_real_predictions)))

            
    K_list= [3,5,10]
    for U in range(len(K_list)):
        K_tag_y_list_test = []
        # Rebinds k, which was the loop index of the balancing loop above.
        k = K_list[U]
        sigma_recalls = 0
        sigma_precisions = 0
        sigma_f1score = 0

        K_tag_y_list_test = y_list_test
        for f in range(len(K_tag_y_list_test)):
            progress(f,len(K_tag_y_list_test))
            currentitem = np.array(whole_threshold_predictions[f])

            # Indices of the k highest thresholded scores for this item.
            top_k_indexes = (-currentitem).argsort()[:k]
            
            # Drop indices whose score was zeroed by the threshold.
            for C in top_k_indexes:
                if whole_threshold_predictions[f][C] == 0.0 :
                    top_k_indexes = top_k_indexes[top_k_indexes != C]
          
            # Count predicted indices that are true labels for this item.
            # NOTE(review): each row of whole_threshold_predictions has one
            # entry at this point, so numb can only be 0 and the lookup below
            # appears to read label column 0 rather than column i — confirm.
            intercep = 0
            for numb in top_k_indexes:
                if K_tag_y_list_test[f][numb] == 1 :
                    intercep += 1
            num_of_exists_tags = np.count_nonzero(K_tag_y_list_test[f] == 1)

            # Recall divides by the number of true tags, unless fewer indices
            # than true tags were predicted, in which case it divides by the
            # number of predicted indices.
            if len(top_k_indexes) == 0 :
                self_recall_k=0
            elif len(top_k_indexes) >= num_of_exists_tags :
                self_recall_k = intercep / num_of_exists_tags
            elif len(top_k_indexes) < num_of_exists_tags :
                self_recall_k = intercep / len(top_k_indexes)
            if len(top_k_indexes)==0:
                self_precisions_k=0 
            else:    
                self_precisions_k = intercep / len(top_k_indexes)
            # Guard against 0/0 in the harmonic mean.
            if self_precisions_k==0 and self_recall_k==0:
                self_f1_score_k=0
            else:    
                self_f1_score_k = 2 * ((self_precisions_k*self_recall_k)/(self_precisions_k+self_recall_k))
            sigma_recalls += self_recall_k
            sigma_precisions += self_precisions_k
            sigma_f1score += self_f1_score_k

        
        # Average the per-item metrics over the test set.
        recall_k = sigma_recalls / len(K_tag_y_list_test)
        precisions_k = sigma_precisions / len(K_tag_y_list_test)
        f1score_k = sigma_f1score / len(K_tag_y_list_test)
        print("\n")
        print("\n" + str(i)+"\'th label metrics")
        print("Recall@"+ str(K_list[U])+" = " + str(recall_k))
        print("Precision@"+ str(K_list[U])+" = " + str(precisions_k))
        print("f1score@"+ str(K_list[U])+" = " + str(f1score_k))

## Save model

In [None]:
# Save model weights for reloading
for i, model in enumerate(text_model):
    # One checkpoint prefix per label model, named after its index in text_model.
    model_name = f'bert_tagger{i}'
    path = f'bert_tagger_weights/{model_name}/ckpt'
    print(path)
    model.save_weights(path)

## Model Reload (if necessary)

In [None]:
# Restores weights into the models already present in text_model, so the
# list must have been built before this cell runs.
for i, model in enumerate(text_model):
    model_name = f'bert_tagger{i}'
    path = f'/content/drive/MyDrive/bert_tagger_weights/{model_name}/ckpt'
    print(path)
    model.load_weights(path)

## Anvil Backend

In [None]:
# Download ANVIL package
!pip install anvil-uplink --quiet

In [None]:
# Connect to ANVIL app
# NOTE(security): the uplink key below is a credential committed in plain
# text. Consider loading it from an environment variable or secret store and
# rotating this key.
import anvil.server
anvil.server.connect("D2UKWJF75275SOMVEC36ILFX-OCJ22E76KFJVXWHW")

In [None]:
# Method to get listing interpretation
@anvil.server.callable
def predict_emotions(job_desc, sensitivity):
    """Score a job description line by line and list the emotions detected.

    Each line of ``job_desc``, plus the description as a whole, is cleaned
    with ``text_cleaner``, tokenized with ``tokenize_data`` and scored by the
    per-label models in ``text_model``.

    Args:
        job_desc: Raw job description; lines are separated by newlines.
        sensitivity: Score threshold. A label is reported for a text when its
            model's score is strictly greater than this value.

    Returns:
        A list with one dict per text that was scored, with keys
        ``'sentence_index'`` (position in the scored list), ``'emotions'``
        (list of label names above the threshold) and ``'sentence'`` (the
        cleaned text).

    Raises:
        ValueError: If ``text_model`` holds fewer models than there are
            emotion labels.
    """
    print('Reached backend')

    # Label names by model index; the order must match the training label order.
    emotion_labels = (
        'approval', 'disapproval', 'disappointment', 'annoyance', 'gratitude',
        'curiosity', 'amusement', 'caring', 'optimism', 'realization',
        'excitement', 'confusion', 'joy', 'anger', 'fear', 'nervousness',
        'sadness', 'desire',
    )
    if len(text_model) < len(emotion_labels):
        raise ValueError(
            'Expected at least %d trained models, found %d'
            % (len(emotion_labels), len(text_model)))

    # Texts shorter than this (in words or in tokens) are skipped. The widest
    # convolution in TEXT_MODEL uses kernel_size=4 with 'valid' padding, which
    # is presumably why 4 is the minimum.
    min_length = 4

    # Build a new list: rebinding the loop variable, as the old loop did,
    # never stored the cleaned text.
    candidates = [text_cleaner(line) for line in job_desc.split("\n")]
    candidates.append(text_cleaner(job_desc))

    # Filter into fresh, parallel lists instead of removing from a list while
    # iterating it, which skipped items and could misalign texts and tokens.
    postings = []
    tokenized_postings = []
    for candidate in candidates:
        if len(candidate.split()) < min_length:
            continue  # also covers empty and whitespace-only text
        token_ids = tokenize_data(candidate)
        if len(token_ids) < min_length:
            continue
        postings.append(candidate)
        tokenized_postings.append(token_ids)

    # predictions[label_index][posting_index] -> score
    predictions = [
        [model.predict([token_ids])[0][0] for token_ids in tokenized_postings]
        for model in text_model[:len(emotion_labels)]
    ]

    text_emotions = []
    for index, sentence in enumerate(postings):
        emotions = [
            label
            for label, scores in zip(emotion_labels, predictions)
            if scores[index] > sensitivity
        ]
        text_emotions.append({'sentence_index': index,
                              'emotions': emotions,
                              'sentence': sentence})

    return text_emotions

In [None]:
# Block this cell so the connected session keeps serving calls to the
# functions registered above with @anvil.server.callable.
anvil.server.wait_forever()