In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pickle
import re
from scipy.stats import spearmanr
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tokenization
from keras.models import load_model
import tensorflow as tf

In [None]:
def function_1(data):
    """Run the text-cleaning + BERT pipeline and return the model predictions.

    Parameters
    ----------
    data : pandas.DataFrame, path/buffer accepted by ``pd.read_csv``, or None
        Rows to score. Must provide the columns ``question_title``,
        ``question_body`` and ``answer``. A DataFrame is copied, so the
        caller's frame is never modified. ``None`` falls back to reading
        ``"data.csv"`` (the only source the previous version ever used).

    Returns
    -------
    The value returned by ``predict`` of the model stored in
    ``model_bert.h5`` when fed the three encoder outputs (question title,
    question body, answer). According to the original inline comment this
    holds 30 predicted features per row.

    Raises
    ------
    KeyError
        If one of the required text columns is missing.

    Notes
    -----
    Reads ``vocab_file.pkl``, ``do_lower_case.pkl``, ``bert_model_1.h5``,
    ``bert_model_2.h5`` and ``model_bert.h5`` from the working directory on
    every call.
    """
    # Function-scope import: ``hub`` is needed for ``custom_objects`` below but
    # is not imported in the notebook's visible import cell.
    import tensorflow_hub as hub

    # features required processing data
    text_columns = ["question_title", "question_body", "answer"]

    # Use the caller's data instead of unconditionally re-reading data.csv.
    if data is None:
        frame = pd.read_csv("data.csv")
    elif isinstance(data, pd.DataFrame):
        frame = data.copy()
    else:
        frame = pd.read_csv(data)

    missing = [col for col in text_columns if col not in frame.columns]
    if missing:
        raise KeyError("missing required text column(s): %s" % missing)

    tag_re = re.compile(r'<[^>]+>')  # html tags

    # Contraction -> expansion. Order matters: the specific forms ("won't",
    # "can't") must be rewritten before the generic "n't" / "'t" rules.
    contractions = {"won't": "will not",
                    "can't": "can not",
                    "n't": " not",
                    "'re": " are",
                    "'s": " is",
                    "'d": " would",
                    "'ll": " will",
                    "'t": " not",
                    "'ve": " have",
                    "'m": " am"}

    def clean_text(text):
        """Strip html tags, lower-case, expand contractions, drop symbols."""
        text = tag_re.sub('', text).lower()
        for pattern, expansion in contractions.items():
            text = re.sub(pattern, expansion, text)
        # Everything outside the whitelist (including '-' and newlines)
        # becomes a space ...
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+=]", " ", text)
        # ... and the whitelisted punctuation other than '!' is then removed.
        return re.sub(r"[\/.,'=+^]", "", text)

    for col in text_columns:
        # Missing values become empty strings so the regex helpers never
        # receive a float NaN.
        frame[col] = frame[col].fillna("").astype(str).map(clean_text)

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load these artefacts from a trusted location.
    with open('vocab_file.pkl', 'rb') as f:
        vocab_file = pickle.load(f)

    with open('do_lower_case.pkl', 'rb') as f:
        do_lower_case = pickle.load(f)

    # tokenizing the sentences
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

    def encode(texts, max_seq_length):
        """Return [token_ids, mask, segment_ids] int32 arrays for ``texts``.

        Each text is tokenized, truncated to ``max_seq_length - 2`` tokens,
        wrapped in '[CLS]' / '[SEP]' and padded with '[PAD]'. The mask is 1
        on real tokens and 0 on padding; segment ids are all 0.
        """
        shape = (len(texts), max_seq_length)
        token_ids = np.zeros(shape, dtype="int32")
        mask = np.zeros(shape, dtype="int32")
        segment_ids = np.zeros(shape, dtype="int32")
        for row, text in enumerate(texts):
            tokens = ['[CLS]',
                      *tokenizer.tokenize(text)[:max_seq_length - 2],
                      '[SEP]']
            n_real = len(tokens)
            padded = tokens + ['[PAD]'] * (max_seq_length - n_real)
            token_ids[row] = tokenizer.convert_tokens_to_ids(padded)
            mask[row, :n_real] = 1
        return [token_ids, mask, segment_ids]

    custom_objects = {'KerasLayer': hub.KerasLayer}

    # Processing Question title feature (columns are selected by name, not by
    # position, so extra or reordered columns in the input are harmless).
    bert_model_1 = tf.keras.models.load_model('bert_model_1.h5',
                                              custom_objects=custom_objects)
    data_qt_pooled_output = bert_model_1.predict(
        encode(frame["question_title"], 25))

    # Processing Question body and Answer features (same encoder for both)
    bert_model_2 = tf.keras.models.load_model('bert_model_2.h5',
                                              custom_objects=custom_objects)
    data_q_pooled_output = bert_model_2.predict(
        encode(frame["question_body"], 512))
    data_a_pooled_output = bert_model_2.predict(
        encode(frame["answer"], 512))

    data_comp = [data_qt_pooled_output, data_q_pooled_output, data_a_pooled_output]

    # The loaded model and the predicting object now share one name; the
    # previous code bound ``bert_model`` but called ``model_bert``.
    model_bert = tf.keras.models.load_model('model_bert.h5')

    # predicting the 30 features
    y_pred = model_bert.predict(data_comp)

    return y_pred

In [None]:
def function_2(data, output):
    """Score the BERT pipeline against known targets with mean Spearman rho.

    Parameters
    ----------
    data : pandas.DataFrame, path/buffer accepted by ``pd.read_csv``, or None
        Rows to score. Must provide the columns ``question_title``,
        ``question_body`` and ``answer``. A DataFrame is copied, so the
        caller's frame is never modified. ``None`` falls back to reading
        ``"data.csv"`` (the only source the previous version ever used).
    output : array-like
        True target values, one row per row of ``data`` and one column per
        predicted feature. Converted with ``np.asarray``, so a DataFrame or
        a 2-D numpy array both work.

    Returns
    -------
    float
        Mean over the target columns of the Spearman correlation between
        ``output`` and the predictions. The result is NaN when any column's
        correlation is NaN (``np.mean`` is used, not ``np.nanmean``).

    Raises
    ------
    KeyError
        If one of the required text columns is missing.
    ValueError
        If ``output`` and the predictions do not have the same 2-D shape.

    Notes
    -----
    Reads ``vocab_file.pkl``, ``do_lower_case.pkl``, ``bert_model_1.h5``,
    ``bert_model_2.h5`` and ``model_bert.h5`` from the working directory on
    every call.
    """
    # Function-scope import: ``hub`` is needed for ``custom_objects`` below but
    # is not imported in the notebook's visible import cell.
    import tensorflow_hub as hub

    # features required processing data
    text_columns = ["question_title", "question_body", "answer"]

    # Use the caller's data instead of unconditionally re-reading data.csv.
    if data is None:
        frame = pd.read_csv("data.csv")
    elif isinstance(data, pd.DataFrame):
        frame = data.copy()
    else:
        frame = pd.read_csv(data)

    missing = [col for col in text_columns if col not in frame.columns]
    if missing:
        raise KeyError("missing required text column(s): %s" % missing)

    tag_re = re.compile(r'<[^>]+>')  # html tags

    # Contraction -> expansion. Order matters: the specific forms ("won't",
    # "can't") must be rewritten before the generic "n't" / "'t" rules.
    contractions = {"won't": "will not",
                    "can't": "can not",
                    "n't": " not",
                    "'re": " are",
                    "'s": " is",
                    "'d": " would",
                    "'ll": " will",
                    "'t": " not",
                    "'ve": " have",
                    "'m": " am"}

    def clean_text(text):
        """Strip html tags, lower-case, expand contractions, drop symbols."""
        text = tag_re.sub('', text).lower()
        for pattern, expansion in contractions.items():
            text = re.sub(pattern, expansion, text)
        # Everything outside the whitelist (including '-' and newlines)
        # becomes a space ...
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+=]", " ", text)
        # ... and the whitelisted punctuation other than '!' is then removed.
        return re.sub(r"[\/.,'=+^]", "", text)

    for col in text_columns:
        # Missing values become empty strings so the regex helpers never
        # receive a float NaN.
        frame[col] = frame[col].fillna("").astype(str).map(clean_text)

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load these artefacts from a trusted location.
    with open('vocab_file.pkl', 'rb') as f:
        vocab_file = pickle.load(f)

    with open('do_lower_case.pkl', 'rb') as f:
        do_lower_case = pickle.load(f)

    # tokenizing the sentences
    tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

    def encode(texts, max_seq_length):
        """Return [token_ids, mask, segment_ids] int32 arrays for ``texts``.

        Each text is tokenized, truncated to ``max_seq_length - 2`` tokens,
        wrapped in '[CLS]' / '[SEP]' and padded with '[PAD]'. The mask is 1
        on real tokens and 0 on padding; segment ids are all 0.
        """
        shape = (len(texts), max_seq_length)
        token_ids = np.zeros(shape, dtype="int32")
        mask = np.zeros(shape, dtype="int32")
        segment_ids = np.zeros(shape, dtype="int32")
        for row, text in enumerate(texts):
            tokens = ['[CLS]',
                      *tokenizer.tokenize(text)[:max_seq_length - 2],
                      '[SEP]']
            n_real = len(tokens)
            padded = tokens + ['[PAD]'] * (max_seq_length - n_real)
            token_ids[row] = tokenizer.convert_tokens_to_ids(padded)
            mask[row, :n_real] = 1
        return [token_ids, mask, segment_ids]

    custom_objects = {'KerasLayer': hub.KerasLayer}

    # Processing Question title feature (columns are selected by name, not by
    # position, so extra or reordered columns in the input are harmless).
    bert_model_1 = tf.keras.models.load_model('bert_model_1.h5',
                                              custom_objects=custom_objects)
    data_qt_pooled_output = bert_model_1.predict(
        encode(frame["question_title"], 25))

    # Processing Question body and Answer features (same encoder for both)
    bert_model_2 = tf.keras.models.load_model('bert_model_2.h5',
                                              custom_objects=custom_objects)
    data_q_pooled_output = bert_model_2.predict(
        encode(frame["question_body"], 512))
    data_a_pooled_output = bert_model_2.predict(
        encode(frame["answer"], 512))

    data_comp = [data_qt_pooled_output, data_q_pooled_output, data_a_pooled_output]

    # The loaded model and the predicting object now share one name; the
    # previous code bound ``bert_model`` but called ``model_bert``.
    model_bert = tf.keras.models.load_model('model_bert.h5')

    # predicting the 30 features
    y_pred = np.asarray(model_bert.predict(data_comp))

    # Positional slicing below needs a plain ndarray (a DataFrame does not
    # support ``output[:, ind]``).
    y_true = np.asarray(output)
    if y_true.ndim != 2 or y_true.shape != y_pred.shape:
        raise ValueError(
            "output has shape %s but predictions have shape %s"
            % (y_true.shape, y_pred.shape))

    # Column-wise Spearman correlation, averaged over all predicted columns
    # (the previous code iterated over the undefined name ``y_pred_val``).
    metric_spearman = np.mean([spearmanr(y_true[:, ind], y_pred[:, ind]).correlation
                               for ind in range(y_pred.shape[1])])

    return metric_spearman