In [73]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

In [126]:
def read_data(fn, model_name):
    """Load a whitespace-separated score file into a DataFrame.

    Every line of ``fn`` is split on whitespace and each token is parsed as a
    float. The five resulting columns are named ``<model_name>_access``,
    ``<model_name>_costs``, ``<model_name>_delays``, ``<model_name>_errors``
    and ``<model_name>_trusts``.

    Args:
        fn: path of the text file to read.
        model_name: prefix prepended to every column name.

    Returns:
        pandas.DataFrame with one row per line of the file.
    """
    suffixes = ('_access', '_costs', '_delays', '_errors', '_trusts')
    header = [model_name + suffix for suffix in suffixes]
    with open(fn, 'r') as handle:
        parsed_rows = [
            [float(token) for token in line.strip().split()]
            for line in handle
        ]
    return pd.DataFrame(parsed_rows, columns=header)

def read_data_from_jaden(train_fn, dev_fn, test_fn, model_name):
    """Read the train, dev and test score files of one base model.

    Each path is loaded with ``read_data`` using the same ``model_name``
    column prefix.

    Returns:
        (train_data, dev_data, test_data) as DataFrames, in that order.
    """
    train_data, dev_data, test_data = (
        read_data(path, model_name) for path in (train_fn, dev_fn, test_fn)
    )
    return train_data, dev_data, test_data

def read_data_from_lya(train_fn, test_fn, dev_size=2500):
    """Read the BERT prediction CSVs and split the held-out file into dev/test.

    Both files are read with their first line as the header. The aspect
    columns are renamed with a ``BERT_`` prefix so they cannot be confused
    with the columns of the other base models once frames are concatenated.

    Args:
        train_fn: CSV file with the predictions on the training set.
        test_fn: CSV file with the predictions on the held-out set; its first
            ``dev_size`` rows become the dev split, the remainder the test split.
        dev_size: number of leading held-out rows used as the dev split.

    Returns:
        (train_data, dev_data, test_data); dev and test are re-indexed from 0.
    """
    bert_columns = {"access": "BERT_access", "costs": "BERT_costs", "delays": "BERT_delays",
                    "errors": "BERT_errors", "trusts": "BERT_trusts"}

    # DataFrame.rename returns a new frame; the result has to be kept,
    # otherwise the original column names silently survive.
    train_data = pd.read_csv(train_fn, header=0).rename(columns=bert_columns)
    held_out = pd.read_csv(test_fn, header=0).rename(columns=bert_columns)

    # reset_index(drop=True) builds fresh frames instead of mutating slices
    # of `held_out` in place.
    dev_data = held_out.iloc[:dev_size, :].reset_index(drop=True)
    test_data = held_out.iloc[dev_size:, :].reset_index(drop=True)

    return train_data, dev_data, test_data

def read_prediction_for_training():
    """Assemble the ensemble's input features for the train, dev and test splits.

    The predictions of the three base models (BERT, Pooled RNN, and
    Pooled RNN + attention + TextCNN) are loaded per split and concatenated
    column-wise, in that order.

    Returns:
        (train, dev, test) DataFrames holding the side-by-side predictions.
    """
    # NOTE(review): column-wise concat assumes all three models list the
    # samples of a split in the same row order -- confirm against the
    # scripts that produced these files.
    bert_splits = read_data_from_lya(
        "BERT_Aspect_training_prediction_value.txt",
        "BERT_Aspect_test_prediction_value.txt",
    )
    pooledrnn_splits = read_data_from_jaden(
        "Pooled_RNN_Aspect_training_prediction_value.txt",
        "Pooled_RNN_Aspect_dev_prediction_value.txt",
        "Pooled_RNN_Aspect_test_prediction_value.txt",
        "Pooled_RNN",
    )
    attention_splits = read_data_from_jaden(
        "Pooled_RNN_ATTENTION_TEXT_CNN_Aspect_training_prediction_value.txt",
        "Pooled_RNN_ATTENTION_TEXT_CNN_Aspect_dev_prediction_value.txt",
        "Pooled_RNN_ATTENTION_TEXT_CNN_Aspect_test_prediction_value.txt",
        "Pooled_RNN_ATTENTION_TEXT_CNN",
    )

    # zip regroups (train, dev, test) per model into one group of three
    # frames per split.
    train, dev, test = (
        pd.concat(list(frames_for_split), axis=1)
        for frames_for_split in zip(bert_splits, pooledrnn_splits, attention_splits)
    )
    return train, dev, test


import re
def process(x):
    """Extract the distinct aspect names from a stringified sub-aspect list.

    ``x`` is expected to look like ``"['access-foo', 'costs-bar']"``: the
    first and last characters are dropped, the remainder is split on commas,
    surrounding whitespace and quotes are stripped from each item, and the
    word characters preceding the first ``-`` are taken as the aspect.
    Items without a ``<word>-`` prefix are ignored.

    Args:
        x: string representation of a list of ``aspect-subaspect`` labels.
            Any non-string value (e.g. NaN for an empty spreadsheet cell)
            is treated as "no labels".

    Returns:
        Alphabetically sorted list of unique aspect names.
    """
    if not isinstance(x, str):
        return []

    aspect_prefix = re.compile(r'(\w+)-')  # raw string: '\w' is not a valid str escape
    ans = set()
    for each in x[1:-1].split(','):
        words = each.strip().strip('"').strip("'")
        m = aspect_prefix.match(words)
        if m is not None:
            ans.add(m.group(1))
    # Sorted so the result does not depend on set iteration order.
    return sorted(ans)



def read_true_label():
    """Load the gold labels and binarize them per aspect.

    The training and held-out spreadsheets are read, the
    ``ground_truth_subaspect`` column is reduced to aspect names with
    ``process`` and stored as ``ground_truth_aspect``. The held-out sheet is
    split into dev (first 2500 rows) and test (the rest). A
    ``MultiLabelBinarizer`` is fitted on the training labels only and then
    applied to all three splits.

    Returns:
        (aspect_vectors_train, aspect_vectors_dev, aspect_vectors_test,
        mlb_aspect): the three indicator matrices and the fitted binarizer.
    """
    train_frame = pd.read_excel('../medical_sieve_training_set2_lower.xlsx')
    heldout_frame = pd.read_excel('../medical_sieve_test_set_lower.xlsx')

    for frame in (train_frame, heldout_frame):
        frame['ground_truth_aspect'] = frame['ground_truth_subaspect'].apply(process)

    dev_frame = heldout_frame[:2500]
    test_frame = heldout_frame[2500:]

    mlb_aspect = MultiLabelBinarizer()
    mlb_aspect.fit(train_frame['ground_truth_aspect'])
    aspect_vectors_train, aspect_vectors_dev, aspect_vectors_test = (
        mlb_aspect.transform(frame['ground_truth_aspect'])
        for frame in (train_frame, dev_frame, test_frame)
    )
    print("Unique types of compliant:\n",mlb_aspect.classes_)

    return aspect_vectors_train, aspect_vectors_dev, aspect_vectors_test, mlb_aspect

def read_corpus_prediction():
    """Assemble the ensemble's input features for the whole corpus.

    The corpus-level predictions of the three base models are loaded and
    concatenated column-wise in the same order as the training features
    (BERT, Pooled RNN, Pooled RNN + attention + TextCNN).

    Returns:
        DataFrame with the three models' predictions side by side.
    """
    corpus_bert = pd.read_csv("BERT_Aspect_corpus_prediction_value.txt", header=0)
    corpus_pooledrnn = read_data("Pooled_RNN_Aspect_corpus_prediction_value.txt", "Pooled_RNN")
    # Fixed: this used to re-read the Pooled_RNN file, so the attention
    # model's columns were a copy of the Pooled_RNN predictions. The file name
    # follows the pattern of the train/dev/test files of the same model.
    # NOTE(review): confirm the corpus file exists under this name.
    corpus_attention = read_data("Pooled_RNN_ATTENTION_TEXT_CNN_Aspect_corpus_prediction_value.txt",
                                 "Pooled_RNN_ATTENTION_TEXT_CNN")

    corpus_data = pd.concat([corpus_bert, corpus_pooledrnn, corpus_attention], axis=1)
    return corpus_data

In [110]:
# Load the concatenated base-model predictions (ensemble inputs) and the
# binarized gold aspect labels (ensemble targets) for train/dev/test.
train_pre, dev_pre, test_pre = read_prediction_for_training()
train_true, dev_true, test_true, mlb_aspect = read_true_label()

Unique types of compliant:
 ['access' 'costs' 'delays' 'errors' 'trusts']


In [111]:
# Network sizes are derived from the data: one input per base-model score
# column, one output per aspect column of the binarized labels.
INPUT_DIM = train_pre.shape[1]
OUTPUT_DIM = train_true.shape[1]
print("input dim: {}, output dim: {}".format(INPUT_DIM, OUTPUT_DIM))

input dim: 15, output dim: 5


In [112]:
from keras import backend as K
from keras import optimizers
from keras.layers import Input, Embedding, Concatenate,  Dense
from keras.models import Model

def build_model(input_dim, output_dim, verbose = True, compile = True):
    """Build the stacking network that combines the base-model predictions.

    Architecture: input -> Dense(input_dim) with no activation ->
    Dense(output_dim) with sigmoid, i.e. one independent probability per
    output.

    Args:
        input_dim: number of input features.
        output_dim: number of output units.
        verbose: when true, print the Keras model summary.
        compile: when true, compile the model with binary cross-entropy loss,
            an Adam optimizer (lr=1e-3, decay=1e-6, clipvalue=5) and the
            accuracy metric. Pass False to get the bare architecture.

    Returns:
        The Keras ``Model`` (compiled unless ``compile`` is false).
    """
    input_layer = Input(shape=(input_dim,))
    x = Dense(input_dim)(input_layer)
    output_layer = Dense(output_dim, activation="sigmoid")(x)
    model = Model(inputs=input_layer, outputs=output_layer)

    # Both flags used to be accepted but ignored; they default to True, so
    # existing callers see the same behaviour as before.
    if compile:
        adam_optimizer = optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)
        model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    if verbose:
        model.summary()
    return model

In [113]:
"""
Training Session
"""
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.callbacks import ReduceLROnPlateau

BATCH_SIZE = 256
NUM_EPOCHS = 100

# Drop any graph/state left over from a previous run of this cell.
K.clear_session()
# Stop once val_loss has not improved for 5 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# Multiply the learning rate by 0.3 after 3 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=3, mode='min', verbose=1)
# Keep only the best weights seen so far on disk (save_best_only=True).
checkpointer = ModelCheckpoint(filepath="Ensemble_Model_"+str(BATCH_SIZE)+"_Final.hdf5", verbose=1, save_best_only=True)
model = build_model(input_dim=INPUT_DIM, output_dim=OUTPUT_DIM)
# Train on the base-model predictions, validating on the dev split.
model.fit(x = train_pre,
          y = train_true,
          batch_size = BATCH_SIZE,
          epochs = NUM_EPOCHS,
          validation_data = (dev_pre,
                             dev_true),
          callbacks = [reduce_lr, es, checkpointer]
         )
# Reload the checkpointed weights so `model` holds the best epoch rather
# than the last one trained.
model.load_weights("Ensemble_Model_"+str(BATCH_SIZE)+"_Final.hdf5")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 15)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 15)                240       
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 80        
Total params: 320
Trainable params: 320
Non-trainable params: 0
_________________________________________________________________
Train on 43863 samples, validate on 2500 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.32620, saving model to Ensemble_Model_256_Final.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 0.32620 to 0.16333, saving model to Ensemble_Model_256_Final.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 0.16333 to 0.11374, saving model to Ensemble_Model_256_Final.hdf5
Epoch 4/100

Epoch

In [93]:
import copy
def combinations(nums):
    """Return the cartesian product of the rows of ``nums``.

    Args:
        nums: iterable of iterables; row ``i`` holds the candidate values for
            position ``i``.

    Returns:
        List of lists, one per way of picking a single element from every
        row, in row-major order (the last row varies fastest). An empty
        ``nums`` yields ``[[]]``; any empty row yields ``[]``.
    """
    # itertools.product replaces the hand-rolled expansion, which deep-copied
    # every partial combination once per candidate element.
    from itertools import product

    return [list(combo) for combo in product(*nums)]
# Candidate cut-offs: the same seven values are offered for each of the
# five output columns.
thresholds = [
    [0.2, 0.25, 0.3, 0.4, 0.45, 0.5, 0.55]
    for _ in range(5)
]

# Every way of choosing one cut-off per column (7**5 candidate sets).
thresholds_set = combinations(thresholds)

In [121]:
# Score the dev split; the evaluation cell below reads `val_preds` and
# `aspect_vectors`.
val_preds = model.predict(dev_pre)
aspect_vectors = dev_true

In [122]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score

def f1(matrix):
    """F1 score of the positive class from one 2x2 confusion matrix.

    Args:
        matrix: ``[[tn, fp], [fn, tp]]``, the per-label layout produced by
            ``multilabel_confusion_matrix``.

    Returns:
        The harmonic mean of precision and recall, or 0.0 when there are no
        true positives (precision and/or recall would otherwise be 0/0).
    """
    false_pos = matrix[0][1]
    false_neg = matrix[1][0]
    true_pos = matrix[1][1]
    # Without true positives every path below ends in a division by zero
    # (or NaN with numpy integers), so score the label as 0.
    if true_pos == 0:
        return 0.0
    precision = true_pos * 1.0 / (false_pos + true_pos)
    recall = true_pos * 1.0 / (false_neg + true_pos)
    return 2 * ((precision * recall) / (precision + recall))

# Evaluate the scores in `val_preds` against the gold labels in
# `aspect_vectors` for every candidate threshold set, and keep the metrics of
# the set with the highest mean per-aspect F1.
max_avg_f1 = 0
max_hamming_score = 0
max_exact_accuracy = 0
max_fuzzy_accuracy = 0
max_fuzzy_accuracy_pos = 0
max_exact_accuracy_pos = 0
max_avg_rocauc = 0
max_confusion_matrix = None
max_threshold_set = []
# A single fixed set of per-aspect cut-offs; this overrides any previously
# built `thresholds_set`.
thresholds_set = [[0.4, 0.2, 0.25, 0.4, 0.3]]

# ROC AUC is computed from the raw scores, so it is the same for every
# threshold set. `average` is passed by keyword because recent scikit-learn
# versions reject it as a positional argument.
rocauc_score = roc_auc_score(aspect_vectors, val_preds, average='weighted')
# The gold label tuples do not depend on the thresholds either.
true_labels = mlb_aspect.inverse_transform(aspect_vectors)

for threshold_set in thresholds_set:
    # Column i of the scores is compared with threshold_set[i] (broadcast).
    predict_softmax = (np.asarray(val_preds) >= np.asarray(threshold_set)).astype(int)

    hamming_score = 1 - hamming_loss(aspect_vectors, predict_softmax)
    num_fuzzy_match = 0
    num_fuzzy_match_pos = 0
    num_exact_match_pos = 0
    num_pos = 0
    for true, pre in zip(true_labels, mlb_aspect.inverse_transform(predict_softmax)):
        has_true = len(true) > 0
        # A non-empty intersection implies both label sets are non-empty.
        overlaps = len(set(pre).intersection(set(true))) > 0
        if has_true:
            num_pos += 1
        # Fuzzy match: at least one shared aspect, or both sides empty.
        if overlaps or (not has_true and len(pre) == 0):
            num_fuzzy_match += 1
        if overlaps:
            num_fuzzy_match_pos += 1
        if has_true and pre == true:
            num_exact_match_pos += 1
    fuzzy_accuracy = num_fuzzy_match*1.0/len(predict_softmax)
    exact_accuracy = accuracy_score(aspect_vectors, predict_softmax)
    # The "exclude negative" variants are normalised by the number of samples
    # that carry at least one gold aspect; guard against a split with none.
    fuzzy_accuracy_pos = num_fuzzy_match_pos*1.0/num_pos if num_pos else 0.0
    exact_accuracy_pos = num_exact_match_pos*1.0/num_pos if num_pos else 0.0

    # One 2x2 matrix per aspect, computed once and reused below.
    confusion_matrices = multilabel_confusion_matrix(aspect_vectors, predict_softmax)
    class_f1 = [f1(aspect_matrix) for aspect_matrix in confusion_matrices]
    avg_f1 = np.mean(class_f1)

    if avg_f1 > max_avg_f1:
        max_threshold_set = threshold_set
        max_avg_f1 = avg_f1
        max_hamming_score = hamming_score
        max_exact_accuracy = exact_accuracy
        max_fuzzy_accuracy = fuzzy_accuracy
        max_exact_accuracy_pos = exact_accuracy_pos
        max_fuzzy_accuracy_pos = fuzzy_accuracy_pos
        max_avg_rocauc = rocauc_score
        max_confusion_matrix = confusion_matrices

print("threshold set:", max_threshold_set)
print("Confusion Matrix for Each Aspect:\n" + "="*60)
print(max_confusion_matrix)
print("Result of Metrics for Evaluation:\n" + "="*60)
print("Hamming score:", max_hamming_score)
print("Exact accuracy:", max_exact_accuracy)
print("Fuzzy accuracy:", max_fuzzy_accuracy)
print("Exact accuracy (exclude negative):", max_exact_accuracy_pos )
print("Fuzzy accuracy (exclude negative):", max_fuzzy_accuracy_pos)
print("Average F1 Score: ", max_avg_f1)
print("ROC AUC Score: ", max_avg_rocauc)

threshold set: [0.4, 0.2, 0.25, 0.4, 0.3]
Confusion Matrix for Each Aspect:
[[[2215   96]
  [  54  135]]

 [[2322   49]
  [  29  100]]

 [[2380   34]
  [  33   53]]

 [[2427   20]
  [  22   31]]

 [[2391   28]
  [  31   50]]]
Result of Metrics for Evaluation:
Hamming score: 0.96832
Exact accuracy: 0.8468
Fuzzy accuracy: 0.8468
Exact accuracy (exclude negative): 0.6858736059479554
Fuzzy accuracy (exclude negative): 0.6858736059479554
Average F1 Score:  0.6400166060116974
ROC AUC Score:  0.9677987766472353


In [119]:
# Score the test split; `val_preds` / `aspect_vectors` are reused as the
# inputs of the evaluation cell below.
val_preds = model.predict(test_pre)
aspect_vectors = test_true

In [120]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score

def f1(matrix):
    """F1 score of the positive class from one 2x2 confusion matrix.

    Args:
        matrix: ``[[tn, fp], [fn, tp]]``, the per-label layout produced by
            ``multilabel_confusion_matrix``.

    Returns:
        The harmonic mean of precision and recall, or 0.0 when there are no
        true positives (precision and/or recall would otherwise be 0/0).
    """
    false_pos = matrix[0][1]
    false_neg = matrix[1][0]
    true_pos = matrix[1][1]
    # Without true positives every path below ends in a division by zero
    # (or NaN with numpy integers), so score the label as 0.
    if true_pos == 0:
        return 0.0
    precision = true_pos * 1.0 / (false_pos + true_pos)
    recall = true_pos * 1.0 / (false_neg + true_pos)
    return 2 * ((precision * recall) / (precision + recall))

# Evaluate the scores in `val_preds` against the gold labels in
# `aspect_vectors` for every candidate threshold set, and keep the metrics of
# the set with the highest mean per-aspect F1.
max_avg_f1 = 0
max_hamming_score = 0
max_exact_accuracy = 0
max_fuzzy_accuracy = 0
max_fuzzy_accuracy_pos = 0
max_exact_accuracy_pos = 0
max_avg_rocauc = 0
max_confusion_matrix = None
max_threshold_set = []
# A single fixed set of per-aspect cut-offs; this overrides any previously
# built `thresholds_set`.
thresholds_set = [[0.4, 0.2, 0.25, 0.4, 0.3]]

# ROC AUC is computed from the raw scores, so it is the same for every
# threshold set. `average` is passed by keyword because recent scikit-learn
# versions reject it as a positional argument.
rocauc_score = roc_auc_score(aspect_vectors, val_preds, average='weighted')
# The gold label tuples do not depend on the thresholds either.
true_labels = mlb_aspect.inverse_transform(aspect_vectors)

for threshold_set in thresholds_set:
    # Column i of the scores is compared with threshold_set[i] (broadcast).
    predict_softmax = (np.asarray(val_preds) >= np.asarray(threshold_set)).astype(int)

    hamming_score = 1 - hamming_loss(aspect_vectors, predict_softmax)
    num_fuzzy_match = 0
    num_fuzzy_match_pos = 0
    num_exact_match_pos = 0
    num_pos = 0
    for true, pre in zip(true_labels, mlb_aspect.inverse_transform(predict_softmax)):
        has_true = len(true) > 0
        # A non-empty intersection implies both label sets are non-empty.
        overlaps = len(set(pre).intersection(set(true))) > 0
        if has_true:
            num_pos += 1
        # Fuzzy match: at least one shared aspect, or both sides empty.
        if overlaps or (not has_true and len(pre) == 0):
            num_fuzzy_match += 1
        if overlaps:
            num_fuzzy_match_pos += 1
        if has_true and pre == true:
            num_exact_match_pos += 1
    fuzzy_accuracy = num_fuzzy_match*1.0/len(predict_softmax)
    exact_accuracy = accuracy_score(aspect_vectors, predict_softmax)
    # The "exclude negative" variants are normalised by the number of samples
    # that carry at least one gold aspect; guard against a split with none.
    fuzzy_accuracy_pos = num_fuzzy_match_pos*1.0/num_pos if num_pos else 0.0
    exact_accuracy_pos = num_exact_match_pos*1.0/num_pos if num_pos else 0.0

    # One 2x2 matrix per aspect, computed once and reused below.
    confusion_matrices = multilabel_confusion_matrix(aspect_vectors, predict_softmax)
    class_f1 = [f1(aspect_matrix) for aspect_matrix in confusion_matrices]
    avg_f1 = np.mean(class_f1)

    if avg_f1 > max_avg_f1:
        max_threshold_set = threshold_set
        max_avg_f1 = avg_f1
        max_hamming_score = hamming_score
        max_exact_accuracy = exact_accuracy
        max_fuzzy_accuracy = fuzzy_accuracy
        max_exact_accuracy_pos = exact_accuracy_pos
        max_fuzzy_accuracy_pos = fuzzy_accuracy_pos
        max_avg_rocauc = rocauc_score
        max_confusion_matrix = confusion_matrices

print("threshold set:", max_threshold_set)
print("Confusion Matrix for Each Aspect:\n" + "="*60)
print(max_confusion_matrix)
print("Result of Metrics for Evaluation:\n" + "="*60)
print("Hamming score:", max_hamming_score)
print("Exact accuracy:", max_exact_accuracy)
print("Fuzzy accuracy:", max_fuzzy_accuracy)
print("Exact accuracy (exclude negative):", max_exact_accuracy_pos )
print("Fuzzy accuracy (exclude negative):", max_fuzzy_accuracy_pos)
print("Average F1 Score: ", max_avg_f1)
print("ROC AUC Score: ", max_avg_rocauc)

threshold set: [0.4, 0.2, 0.25, 0.4, 0.3]
Confusion Matrix for Each Aspect:
[[[2201  132]
  [  35  132]]

 [[2332   47]
  [  37   84]]

 [[2378   27]
  [  40   55]]

 [[2425   21]
  [  20   34]]

 [[2389   41]
  [  40   30]]]
Result of Metrics for Evaluation:
Hamming score: 0.9648
Exact accuracy: 0.8284
Fuzzy accuracy: 0.8284
Exact accuracy (exclude negative): 0.6607495069033531
Fuzzy accuracy (exclude negative): 0.6607495069033531
Average F1 Score:  0.5900099442886633
ROC AUC Score:  0.9660783638293943


In [116]:
# Score the training split (despite the `val_` name) to compare against the
# dev/test numbers; the evaluation cell below reads these two variables.
val_preds = model.predict(train_pre)
aspect_vectors = train_true

In [118]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score

def f1(matrix):
    """F1 score of the positive class from one 2x2 confusion matrix.

    Args:
        matrix: ``[[tn, fp], [fn, tp]]``, the per-label layout produced by
            ``multilabel_confusion_matrix``.

    Returns:
        The harmonic mean of precision and recall, or 0.0 when there are no
        true positives (precision and/or recall would otherwise be 0/0).
    """
    false_pos = matrix[0][1]
    false_neg = matrix[1][0]
    true_pos = matrix[1][1]
    # Without true positives every path below ends in a division by zero
    # (or NaN with numpy integers), so score the label as 0.
    if true_pos == 0:
        return 0.0
    precision = true_pos * 1.0 / (false_pos + true_pos)
    recall = true_pos * 1.0 / (false_neg + true_pos)
    return 2 * ((precision * recall) / (precision + recall))

# Evaluate the scores in `val_preds` against the gold labels in
# `aspect_vectors` for every candidate threshold set, and keep the metrics of
# the set with the highest mean per-aspect F1.
max_avg_f1 = 0
max_hamming_score = 0
max_exact_accuracy = 0
max_fuzzy_accuracy = 0
max_fuzzy_accuracy_pos = 0
max_exact_accuracy_pos = 0
max_avg_rocauc = 0
max_confusion_matrix = None
max_threshold_set = []
# A single fixed set of per-aspect cut-offs; this overrides any previously
# built `thresholds_set`.
thresholds_set = [[0.4, 0.2, 0.25, 0.4, 0.3]]

# ROC AUC is computed from the raw scores, so it is the same for every
# threshold set. `average` is passed by keyword because recent scikit-learn
# versions reject it as a positional argument.
rocauc_score = roc_auc_score(aspect_vectors, val_preds, average='weighted')
# The gold label tuples do not depend on the thresholds either.
true_labels = mlb_aspect.inverse_transform(aspect_vectors)

for threshold_set in thresholds_set:
    # Column i of the scores is compared with threshold_set[i] (broadcast).
    predict_softmax = (np.asarray(val_preds) >= np.asarray(threshold_set)).astype(int)

    hamming_score = 1 - hamming_loss(aspect_vectors, predict_softmax)
    num_fuzzy_match = 0
    num_fuzzy_match_pos = 0
    num_exact_match_pos = 0
    num_pos = 0
    for true, pre in zip(true_labels, mlb_aspect.inverse_transform(predict_softmax)):
        has_true = len(true) > 0
        # A non-empty intersection implies both label sets are non-empty.
        overlaps = len(set(pre).intersection(set(true))) > 0
        if has_true:
            num_pos += 1
        # Fuzzy match: at least one shared aspect, or both sides empty.
        if overlaps or (not has_true and len(pre) == 0):
            num_fuzzy_match += 1
        if overlaps:
            num_fuzzy_match_pos += 1
        if has_true and pre == true:
            num_exact_match_pos += 1
    fuzzy_accuracy = num_fuzzy_match*1.0/len(predict_softmax)
    exact_accuracy = accuracy_score(aspect_vectors, predict_softmax)
    # The "exclude negative" variants are normalised by the number of samples
    # that carry at least one gold aspect; guard against a split with none.
    fuzzy_accuracy_pos = num_fuzzy_match_pos*1.0/num_pos if num_pos else 0.0
    exact_accuracy_pos = num_exact_match_pos*1.0/num_pos if num_pos else 0.0

    # One 2x2 matrix per aspect, computed once and reused below.
    confusion_matrices = multilabel_confusion_matrix(aspect_vectors, predict_softmax)
    class_f1 = [f1(aspect_matrix) for aspect_matrix in confusion_matrices]
    avg_f1 = np.mean(class_f1)

    if avg_f1 > max_avg_f1:
        max_threshold_set = threshold_set
        max_avg_f1 = avg_f1
        max_hamming_score = hamming_score
        max_exact_accuracy = exact_accuracy
        max_fuzzy_accuracy = fuzzy_accuracy
        max_exact_accuracy_pos = exact_accuracy_pos
        max_fuzzy_accuracy_pos = fuzzy_accuracy_pos
        max_avg_rocauc = rocauc_score
        max_confusion_matrix = confusion_matrices

print("threshold set:", max_threshold_set)
print("Confusion Matrix for Each Aspect:\n" + "="*60)
print(max_confusion_matrix)
print("Result of Metrics for Evaluation:\n" + "="*60)
print("Hamming score:", max_hamming_score)
print("Exact accuracy:", max_exact_accuracy)
print("Fuzzy accuracy:", max_fuzzy_accuracy)
print("Exact accuracy (exclude negative):", max_exact_accuracy_pos )
print("Fuzzy accuracy (exclude negative):", max_fuzzy_accuracy_pos)
print("Average F1 Score: ", max_avg_f1)
print("ROC AUC Score: ", max_avg_rocauc)

threshold set: [0.4, 0.2, 0.25, 0.4, 0.3]
Confusion Matrix for Each Aspect:
[[[40458   588]
  [  577  2240]]

 [[40321   864]
  [  492  2186]]

 [[41230   666]
  [  615  1352]]

 [[41758   447]
  [  811   847]]

 [[40875   886]
  [ 1027  1075]]]
Result of Metrics for Evaluation:
Hamming score: 0.9682055490960491
Exact accuracy: 0.8466361170006611
Fuzzy accuracy: 0.8480496090098717
Exact accuracy (exclude negative): 0.6837392550143266
Fuzzy accuracy (exclude negative): 0.6892908309455588
Average F1 Score:  0.6676898506050032
ROC AUC Score:  0.9706521611491519


## Prediction on the full corpus

In [127]:
# BATCH_SIZE is only used here to rebuild the checkpoint file name.
BATCH_SIZE = 256
corpus_data = read_corpus_prediction()
# Rebuild the same architecture and restore the checkpointed weights, so this
# section does not depend on the in-memory model from the training cell.
model = build_model(input_dim=INPUT_DIM, output_dim=OUTPUT_DIM)
model.load_weights("Ensemble_Model_"+str(BATCH_SIZE)+"_Final.hdf5")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 15)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 15)                240       
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 80        
Total params: 320
Trainable params: 320
Non-trainable params: 0
_________________________________________________________________


In [128]:
# Per-aspect scores of the ensemble for every corpus row.
corpus_val_preds = model.predict(corpus_data)

In [134]:
# np.savetxt('Ensemble_Aspect_corpus_prediction_value.txt', corpus_val_preds, delimiter=' ')
# Per-aspect decision cut-offs, one per output column.
threshold_set = [0.4, 0.2, 0.25, 0.4, 0.3]
# Start from an all-zero indicator matrix and switch on every cell whose
# score reaches the cut-off of its column.
predict_softmax = np.zeros((corpus_data.shape[0], corpus_val_preds.shape[1]), dtype=int)
reaches_cutoff = np.asarray(corpus_val_preds) >= np.asarray(threshold_set)
predict_softmax[reaches_cutoff] = 1
# NOTE(review): np.savetxt's default format writes the 0/1 values in
# scientific float notation; pass fmt='%d' if plain integers are wanted.
np.savetxt('Ensemble_Aspect_corpus_prediction.txt', predict_softmax.astype(int), delimiter=' ')