In [17]:
import time

import numpy as np
import pandas as pd
from keras import backend as K
from keras import callbacks
from keras import layers, models
from keras import regularizers, optimizers
from keras.layers import *
from keras.models import *
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

# ---- Experiment configuration ----
model_name = 'pretrained-mlp'

# Network shape and regularisation
dense_layer_sizes = [37]  # hidden dense layer width(s)
dropout_rate = 0.6
lambd = 0.1  # L2 regularization strength

# Training schedule (presumably consumed by a training loop outside this section)
learning_rate = 0.001
batch_size = 32
epochs = 1000
patience = 100
n_fold = 5
# n_test = 100

In [None]:
def build_mlp_model(input_shape):
    """Build the MLP classifier that ends in a 3-way softmax.

    One ``Dense -> BatchNormalization -> ReLU -> Dropout`` stage is stacked for
    every entry of the module-level ``dense_layer_sizes``; the stack is followed
    by a 3-unit dense layer with an L2 kernel penalty (``lambd``) and a softmax.

    Args:
        input_shape: shape of a single sample, passed unchanged to ``layers.Input``.

    Returns:
        An uncompiled Keras model named ``'classif_model'``.
    """
    X_input = layers.Input(input_shape)

    # Hidden stages: layer names are dense0/bn0, dense1/bn1, ...
    X = X_input
    for i, units in enumerate(dense_layer_sizes):
        X = layers.Dense(units, name='dense%d' % i)(X)
        X = layers.BatchNormalization(name='bn%d' % i)(X)
        X = layers.Activation('relu')(X)
        # Fixed, per-stage seed (7, 9, ...) so each dropout layer is seeded explicitly.
        X = layers.Dropout(dropout_rate, seed=7 + 2 * i)(X)

    # Output layer
    X = layers.Dense(3, name='output', kernel_regularizer=regularizers.l2(lambd))(X)
    X = layers.Activation('softmax')(X)

    # The functional-API constructor takes `inputs` / `outputs`; the singular
    # keyword names are the legacy spelling.
    model = models.Model(inputs=X_input, outputs=X, name='classif_model')
    return model


In [116]:
def parse_json(embeddings, dims=1024):
    """
    Parses the embeddings given by BERT, and returns them as three separate arrays plus one-hot labels

    Input: embeddings (DataFrame) containing contextual embeddings from BERT, also labels for the classification problem
    columns: "emb_A": contextual embedding for the word A
             "emb_B": contextual embedding for the word B
             "emb_P": contextual embedding for the pronoun
             "label": the answer to the coreference problem: "A", "B" or "NEITHER"
           dims (int) length of each embedding vector (default 1024)

    Output: X (list of 3 numpy arrays) [all_A, all_B, all_P], each of shape (N, dims), rows in sorted-index order
            Y (numpy array) of shape (N, 3), one-hot: column 0 for "A", column 1 for "B", column 2 for any other label

    Side effect: `embeddings` is sorted by index in place.
    """
    # Sorting the DataFrame, because reading from the json file messed with the order
    embeddings.sort_index(inplace=True)

    N = len(embeddings)
    Y = np.zeros((N, 3))
    all_A = np.zeros((N, dims))
    all_B = np.zeros((N, dims))
    all_P = np.zeros((N, dims))

    # Any label other than "A" / "B" falls through to column 2.
    label_column = {'A': 0, 'B': 1}

    # Walk the rows by position rather than by `.loc[i]`, so the result does not
    # depend on the index labels being exactly 0..N-1.
    rows = zip(embeddings['emb_A'], embeddings['emb_B'], embeddings['emb_P'], embeddings['label'])
    for i, (emb_a, emb_b, emb_p, label) in enumerate(rows):
        all_A[i] = np.array(emb_a)
        all_B[i] = np.array(emb_b)
        all_P[i] = np.array(emb_p)
        Y[i, label_column.get(label, 2)] = 1

    return [all_A, all_B, all_P], Y

OOF_NAME = 'nn_base_emb_gap'
DATA = 'large-emb-gap'  # prefix shared by the three embedding files below

development = pd.read_json('data/%s-development.json' % DATA)
validation = pd.read_json('data/%s-validation.json' % DATA)
test = pd.read_json('data/%s-test.json' % DATA)

# Training data is validation + test stacked together, with rows renumbered 0..N-1.
new_train = pd.concat([validation, test]).reset_index(drop=True)

# The development file is used as the held-out set here.
X_train, Y_train = parse_json(new_train)
X_test, Y_test = parse_json(development)


In [122]:
# Count the NaN entries in element 208 of X_test[0].
np.isnan(X_test[0][208]).sum()

0

In [71]:
# Sanity check: print the row label of every example whose emb_A vector is not 768 long.
# NOTE(review): parse_json sizes its arrays for 1024-wide vectors — confirm 768 is the intended width here.
for i in range(len(development)):
    # Look up row i; indexing with a constant 0 would re-test the first row on every pass.
    if len(development.loc[i, 'emb_A']) != 768:
        print(i)

In [75]:
# Inspect the element dtype of X_test[0].
X_test[0].dtype

dtype('float32')

In [99]:
def parse_json(embeddings, dims=1024):
    """
    Parses the embeddings given by BERT, and suitably formats them to be passed to the MLP model

    NOTE: this definition replaces any `parse_json` defined earlier in the notebook; this
    variant returns one concatenated feature matrix rather than separate arrays.

    Input: embeddings (DataFrame) containing contextual embeddings from BERT, also labels for the classification problem
    columns: "emb_A": contextual embedding for the word A
             "emb_B": contextual embedding for the word B
             "emb_P": contextual embedding for the pronoun
             "label": the answer to the coreference problem: "A", "B" or "NEITHER"
           dims (int) length of each individual embedding vector (default 1024)

    Output: X (numpy array) of shape (N, 3 * dims): per row, emb_A, emb_B and emb_P concatenated in that order
            Y (numpy array) of shape (N, 3), one-hot: column 0 for "A", column 1 for "B", column 2 for any other label

    Side effect: `embeddings` is sorted by index in place.
    """
    # Sorting the DataFrame, because reading from the json file messed with the order
    embeddings.sort_index(inplace=True)

    X = np.zeros((len(embeddings), 3 * dims))
    Y = np.zeros((len(embeddings), 3))

    # Any label other than "A" / "B" falls through to column 2.
    label_column = {'A': 0, 'B': 1}

    # Walk the rows by position rather than by `.loc[i]`, so the result does not
    # depend on the index labels being exactly 0..N-1.
    rows = zip(embeddings['emb_A'], embeddings['emb_B'], embeddings['emb_P'], embeddings['label'])
    for i, (emb_a, emb_b, emb_p, label) in enumerate(rows):
        X[i] = np.concatenate((np.array(emb_a), np.array(emb_b), np.array(emb_p)))
        Y[i, label_column.get(label, 2)] = 1

    return X, Y


# Read the development, validation and test embeddings from their json files (the output of BERT)
# and pass each frame through parse_json to obtain its features and one-hot labels.
# NOTE: this rebinds X_test / Y_test, which an earlier cell derived from the development file.
development = pd.read_json('data/large-emb-gap-development.json')
X_development, Y_development = parse_json(development)

validation = pd.read_json('data/large-emb-gap-validation.json')
X_validation, Y_validation = parse_json(validation)

test = pd.read_json('data/large-emb-gap-test.json')
X_test, Y_test = parse_json(test)


In [103]:
# Positions of the X_development rows that contain at least one NaN.
remove_test = [idx for idx, vec in enumerate(X_development) if np.isnan(vec).any()]
remove_test

[209, 1988]

In [123]:
# Positions of the entries of X_test[0] that contain at least one NaN.
remove_test = [idx for idx, vec in enumerate(X_test[0]) if np.isnan(vec).any()]
print(remove_test)

[209, 1988]


In [94]:
# Shape of the first element of X_train.
X_train[0].shape

(2454, 1024)

In [27]:
# Python type of the first entry of X_test[0].
type(X_test[0][0])

numpy.ndarray

In [13]:
# Display the first entry of X_test[0].
X_test[0][0]

array([0.738879, -0.22716499999999998, 0.1093045, -0.5271255, -0.1011425,
       -0.3374955, 1.443006, -0.11599899999999999, -0.1925945, 0.099926,
       0.0966345, -0.335821, 0.5851005, -0.24083549999999998, -0.342422,
       0.47573099999999996, -0.1167145, 0.399569, -0.20975449999999998,
       0.033386, 0.5201835, 0.143432, -0.6489345, 0.31892149999999997,
       0.09772499999999999, 0.48163599999999995, -0.1387115, 0.4075855,
       -0.0728805, 0.99753, 0.747265, -0.019926, 0.5298205,
       0.23479049999999999, 0.35161149999999997, 0.41965149999999996,
       -0.265684, -0.27146349999999997, 0.057056499999999996, -0.6913,
       0.1341615, -0.484091, 0.13802899999999999, -0.1446525, -0.147395,
       -0.107264, 0.9926904999999999, -0.493407, 0.5483445,
       -0.40435099999999996, -1.058451, 0.2930125, -0.215253, -0.502663,
       0.312512, -0.275433, -0.009658, -0.8013205, -0.568321, -0.4131915,
       0.3421835, 0.0686625, -0.7094765, -0.5369245,
       -0.042256999999999996, 0

In [14]:
class End2End_NCR():
    """Three-input model that scores the (P, A) and (P, B) pairs with one shared network.

    Each pair is represented as the concatenation ``[P, X, P * X]`` and passed
    through the same feed-forward scorer. The two scores plus a constant zero
    score for the third class are combined with a softmax.
    """

    def __init__(self, word_input_shape, hidden_dim=150, dropout_rate=0.2):
        """
        Args:
            word_input_shape: length of each of the three input vectors.
            hidden_dim: units in the scorer's hidden layer (default 150).
            dropout_rate: dropout rate inside the scorer (default 0.2).
        """
        self.word_input_shape = word_input_shape
        self.hidden_dim = hidden_dim
        self.dropout_rate = dropout_rate

    def build(self):
        """Assemble and return the (uncompiled) Keras model.

        The model takes the inputs in the order ``[A, B, P]`` and outputs three
        softmax probabilities ordered (A score, B score, fixed-zero score).
        """
        A, B, P = Input((self.word_input_shape,)), Input((self.word_input_shape,)), Input((self.word_input_shape,))
        inputs = [A, B, P]

        # A single Sequential instance is applied to both pairs, so both
        # scores come from the same weights.
        self.ffnn = Sequential([Dense(self.hidden_dim, use_bias=True),
                                Activation('relu'),
                                Dropout(rate=self.dropout_rate, seed=7),
                                Dense(1, activation='linear')])

        # Element-wise products of the pronoun vector with each candidate.
        PA = Multiply()([A, P])
        PB = Multiply()([B, P])

        PA = Concatenate(axis=-1)([P, A, PA])
        PB = Concatenate(axis=-1)([P, B, PB])
        PA_score = self.ffnn(PA)
        PB_score = self.ffnn(PB)
        # Fix the Neither to score 0.
        score_e = Lambda(lambda x: K.zeros_like(x))(PB_score)

        # Final output
        output = Concatenate(axis=-1)([PA_score, PB_score, score_e])
        output = Activation('softmax')(output)
        model = Model(inputs, output)

        return model

In [18]:
# Build the model for 1024-long input vectors and print its layer summary.
End2End_NCR(1024).build().summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 1024)         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 1024)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1024)         0                                            
__________________________________________________________________________________________________
multiply_1 (Multiply)           (None, 1024)         0           input_1[0][0]                    
                                                                 input_3[0][0]                    
__________

In [3]:
# Read the development, validation and test embeddings from their json files (the output of BERT)
# and pass each frame through parse_json. These statements repeat an earlier loading cell verbatim.
development = pd.read_json('data/large-emb-gap-development.json')
X_development, Y_development = parse_json(development)

validation = pd.read_json('data/large-emb-gap-validation.json')
X_validation, Y_validation = parse_json(validation)

test = pd.read_json('data/large-emb-gap-test.json')
X_test, Y_test = parse_json(test)

In [5]:
# Preview the first rows of the development frame.
development.head()

Unnamed: 0,emb_A,emb_B,emb_P,label
0,"[-0.40596499999999996, -0.4736605, -0.12929649...","[-0.453505, -0.7266819999999999, -0.597712, 0....","[-0.61323, -0.160519, 0.481929, -0.55641299999...",A
1,"[-0.514829, -0.6953269999999999, 0.278206, 0.0...","[-0.704917, 0.042984999999999995, 0.1404515, -...","[-0.593152, -0.178061, 0.194238, -0.5945929999...",A
2,"[-0.6475200000000001, -0.018141, -0.00232, 0.2...","[-0.65946025, -0.52119425, 0.16222825, 0.00214...","[0.264017, -0.798705, -0.11953899999999999, -0...",B
3,"[0.5062059999999999, -0.06980599999999999, 0.1...","[-0.1498205, -0.140093, 0.5434215, -0.108469, ...","[-0.057193999999999995, -0.37878599999999996, ...",B
4,"[0.018103499999999998, -0.08592325, 0.39331325...","[-0.578073, -0.044946, 0.35519, 0.028821, -0.2...","[-0.6689729999999999, -0.380459, -0.269386, 0....",B


In [4]:
# Shape of the development features.
X_development.shape

(2000, 3072)

In [6]:
# Shape of the development one-hot labels.
Y_development.shape

(2000, 3)

In [7]:
# Shape of the validation features.
X_validation.shape

(454, 3072)

In [8]:
import pandas as pd
import numpy as np

# Load the two `lgbm_*` prediction CSVs, using their 'ID' column as the index.
lgbm_large_mgap = pd.read_csv('output/lgbm_large_mgap.csv', index_col='ID')
lgbm_base_mgap = pd.read_csv('output/lgbm_base_mgap.csv', index_col='ID')

# Keep the IDs of the first file; they are written back out with the blended result.
fname = lgbm_large_mgap.index

In [11]:
# Weighted average of the prediction frames: each (frame, weight) pair
# contributes frame.values * weight, and the sum is divided by the total weight.
# Rows are combined by position, so the frames are assumed to share the same
# row order — TODO confirm both CSVs list the IDs identically.
weights = [(lgbm_large_mgap, 3), (lgbm_base_mgap, 1)]
# Float accumulator so the in-place division below is valid for any input dtype.
res_prob = np.zeros(lgbm_large_mgap.shape, dtype=float)
total_weight = 0
for dfx, w in weights:
    # Accumulate the frame of the current pair (not always the first frame).
    res_prob += dfx.values * w
    total_weight += w
res_prob /= total_weight

In [12]:
# Assemble the blended probabilities with their IDs (ID is the last column)
# and write them out without the frame's own index.
res = pd.DataFrame(res_prob, columns=['A', 'B', 'NEITHER']).assign(ID=fname)
res.to_csv("output/blend.csv", index=False)