https://github.com/google-research-datasets/gap-coreference           
https://www.kaggle.com/c/gendered-pronoun-resolution/data           
https://www.kaggle.com/mateiionita/taming-the-bert-a-baseline

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [0]:
import re, string, nltk
import zipfile
import sys, os, time
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, roc_auc_score,log_loss

In [0]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input,Dense,Dropout,BatchNormalization
from keras.callbacks import ModelCheckpoint
from keras import backend, models, layers, initializers, regularizers, constraints, optimizers
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [0]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english')) 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Load Data

In [0]:
train = pd.read_csv('https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv', sep='\t')
test = pd.read_csv('https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv', sep='\t')
val = pd.read_csv('https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv', sep='\t')
df = pd.concat([train,val,test]).reset_index(drop=True)
df.shape[0], train.shape[0], val.shape[0], test.shape[0]

NameError: ignored

In [0]:
df.head(1)

## Baseline on preceding and succeeding 5 words

In [0]:
def get_features(text, offset, l):
    f = []

    if offset > 5:    
        f = text[offset-5:offset]
    else:
        x = ['<unk>']*5
        x[5-offset:5] = text[:offset]
        f = x        
        
    if (len(text)-offset+l) > 5:
        f += text[offset:offset+5]
    else:
        x = ['<unk>']*5
        x[-5:-5+offset] = text[offset:]
        f += x
        
    return f

#### Extract features

In [0]:
features = []

for i,row in enumerate(df.values):
    text = row[1]    
    a_offset = len(text[:row[5]].split())
    a_len = len(row[4].split())
    
    b_offset = len(text[:row[8]].split())
    b_len = len(row[7].split())
    
    text = text.split()
    f = get_features(text, a_offset, a_len)
    f += get_features(text, b_offset, b_len)
    
    features += [f]

features = pd.DataFrame(features).values

In [0]:
y = ['<unk>']
for row in df['Text']:
    y += row.split()
le = preprocessing.LabelEncoder().fit(y)

In [0]:
features = le.transform(features.flatten().astype(str))
features = features.reshape((df.shape[0],20))

In [0]:
labels = []
for i,row in enumerate(df[['A-coref','B-coref']].values):
    if row[0] == True:
        labels += [0]
    elif row[1] == True:
        labels += [1]
    else:
        labels += [2]

In [0]:
X_train = features[:3000].copy()
X_test = features[3000:].copy()
Y_train = labels[:3000].copy()
Y_test = labels[3000:].copy()

#### Classification

In [0]:
clf = LogisticRegression(random_state=0).fit(X_train, Y_train)
pred = clf.predict(X_test)



In [0]:
precision = precision_score(pred,Y_test,average='weighted')
recall = recall_score(pred,Y_test,average='weighted')
f1 = f1_score(pred,Y_test,average='weighted')
accuracy = accuracy_score(pred, Y_test)

print('precision:',precision,
  '\nrecall:',recall,'\nf1:',f1,'\nacc:',accuracy)

precision: 0.6872397451128324 
recall: 0.6031636863823934 
f1: 0.6412982346833844 
acc: 0.6031636863823934


## Baseline on preceding and succeeding 5 POS 

In [0]:
def get_features(text, offset, l):
    f = []

    if offset > 5:    
        f = text[offset-5:offset]
    else:
        x = ['<unk>']*5
        x[5-offset:5] = text[:offset]
        f = x        
        
    if (len(text)-offset+l) > 5:
        f += text[offset:offset+5]
    else:
        x = ['<unk>']*5
        x[-5:-5+offset] = text[offset:]
        f += x
        
    return f

In [0]:
features = []

for i,row in enumerate(df.values):
    text = row[1]    
    a_offset = len(text[:row[5]].split())
    a_len = len(row[4].split())
    
    b_offset = len(text[:row[8]].split())
    b_len = len(row[7].split())
    
    tokens = np.array(nltk.pos_tag(text.split()))[:,1]
    tokens = [token for token in tokens]
    f = get_features(tokens, a_offset, a_len)
    f += get_features(tokens, b_offset, b_len)
    
    features += [f]
    
features = pd.DataFrame(features).fillna('<unk>').values

In [0]:
y = [('<unk>','<unk>')]
for row in df['Text']:
    y += nltk.pos_tag(row.split())
y = np.array(y)[:,1]
le = preprocessing.LabelEncoder().fit(y)

In [0]:
features = le.transform(features.flatten().astype(str))
features = features.reshape((df.shape[0],20))

In [0]:
X_train = features[:3000].copy()
X_test = features[3000:].copy()
Y_train = labels[:3000].copy()
Y_test = labels[3000:].copy()

#### Classification

In [0]:
clf = LogisticRegression(random_state=0).fit(X_train, Y_train)
pred = clf.predict(X_test)



In [0]:
precision = precision_score(pred,Y_test,average='weighted')
recall = recall_score(pred,Y_test,average='weighted')
f1 = f1_score(pred,Y_test,average='weighted')
accuracy = accuracy_score(pred, Y_test)

print('precision:',precision,
  '\nrecall:',recall,'\nf1:',f1,'\nacc:',accuracy)

precision: 0.675903821932107 
recall: 0.6052269601100413 
f1: 0.6371361934578361 
acc: 0.6052269601100413


# BERT Features 

In [0]:
!unzip '/content/uncased_L-12_H-768_A-12.zip'

In [0]:
def compute_offset_no_spaces(text, offset):
    count = 0
    for i in range(offset):
        if text[i] != ' ': count += 1
    return count

In [0]:
def count_length(text):
    count = 0
    for i in range(len(text)):
        if text[i] not in ['#',' ']: count += 1
    return count

In [0]:
def run_bert(data):
    text = data["Text"]
    text.to_csv("input.txt", index = False, header = False)
    print('running bert')
    os.system("python3 extract_features.py \
      --input_file=input.txt \
      --output_file=output.jsonl \
      --vocab_file=uncased_L-12_H-768_A-12/vocab.txt \
      --bert_config_file=uncased_L-12_H-768_A-12/bert_config.json \
      --init_checkpoint=uncased_L-12_H-768_A-12/bert_model.ckpt \
      --layers=-1 \
      --max_seq_length=256 \
      --batch_size=8")
    
    bert_output = pd.read_json("output.jsonl", lines = True)

    os.system("rm output.jsonl")
    os.system("rm input.txt")
    
    print('post bert')

    index = data.index
    columns = ["emb_A", "emb_B", "emb_P", "label"]
    emb = pd.DataFrame(index = index, columns = columns)
    emb.index.name = "ID"
    
    for i in range(len(data)): 
        if i % 100 == 0: print(i)
        P = data.loc[i,"Pronoun"].lower()
        A = data.loc[i,"A"].lower()
        B = data.loc[i,"B"].lower()

        P_offset = compute_offset_no_spaces(data.loc[i,"Text"], data.loc[i,"Pronoun-offset"])
        A_offset = compute_offset_no_spaces(data.loc[i,"Text"], data.loc[i,"A-offset"])
        B_offset = compute_offset_no_spaces(data.loc[i,"Text"], data.loc[i,"B-offset"])
        A_length = count_length(A)
        B_length = count_length(B)

        emb_A = np.zeros(768)
        emb_B = np.zeros(768)
        emb_P = np.zeros(768)

        count_chars = 0
        cnt_A, cnt_B, cnt_P = 0, 0, 0

        features = pd.DataFrame(bert_output.loc[i,"features"]) # Get the BERT embeddings for the current line in the data file
        for j in range(2,len(features)):  # Iterate over the BERT tokens for the current line; we skip over the first 2 tokens, which don't correspond to words
            token = features.loc[j,"token"]

            if count_chars  == P_offset: 
                emb_P += np.array(features.loc[j,"layers"][0]['values'])
                cnt_P += 1
            if count_chars in range(A_offset, A_offset + A_length): 
                emb_A += np.array(features.loc[j,"layers"][0]['values'])
                cnt_A +=1
            if count_chars in range(B_offset, B_offset + B_length): 
                emb_B += np.array(features.loc[j,"layers"][0]['values'])
                cnt_B +=1
            count_chars += count_length(token)
        emb_A /= cnt_A
        emb_B /= cnt_B

        label = "Neither"
        if (data.loc[i,"A-coref"] == True):
            label = "A"
        if (data.loc[i,"B-coref"] == True):
            label = "B"

        emb.iloc[i] = [emb_A, emb_B, emb_P, label]

    return emb

In [0]:
def parse_json(embeddings):
    embeddings.sort_index(inplace = True) # Sorting the DataFrame, because reading from the json file messed with the order
    X = np.zeros((len(embeddings),3*768))
    Y = np.zeros((len(embeddings), 3))

    # Concatenate features
    for i in range(len(embeddings)):
        A = np.array(embeddings.loc[i,"emb_A"])
        B = np.array(embeddings.loc[i,"emb_B"])
        P = np.array(embeddings.loc[i,"emb_P"])
        X[i] = np.concatenate((A,B,P))

    # One-hot encoding for labels
    for i in range(len(embeddings)):
        label = embeddings.loc[i,"label"]
        if label == "A":
            Y[i,0] = 1
        elif label == "B":
            Y[i,1] = 1
        else:
            Y[i,2] = 1

    return X, Y

In [0]:
emb = pd.read_json('contextual_embeddings.json')
emb = parse_json(emb)

## Train model

In [0]:
train_data = emb[0][:2000]
test_data = emb[0][2454:]
val_data = emb[0][2000:2454]
train_label = emb[1][:2000]
test_label = emb[1][2454:]
val_label = emb[1][2000:2454]

In [0]:
train_data.shape, test_data.shape, val_data.shape

((1998, 2304), (1999, 2304), (1999, 2304))

In [0]:
# Image features
inputs = Input(shape=(2304,))
dense = Dense(256, activation='relu')(inputs)
norm = BatchNormalization()(dense)
dropout = Dropout(0.5)(norm)
dense = Dense(3, activation='softmax')(dropout)

model = Model(inputs=inputs, outputs=dense)
# model.compile(loss='categorical_crossentropy', optimizer='adam', kernel_regularizer = regularizers.l2(0.1))
model.compile(loss='categorical_crossentropy', optimizer='adam')

print(model.summary())

In [0]:
keras.utils.plot_model(model, to_file='model.png', show_shapes=True)

In [0]:
model.fit(x = train_data, y = train_label, epochs = 1000, batch_size = 32, validation_data = (val_data, val_label))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 2000 samples, validate on 454 samples
Epoch 1/1000





Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 

<keras.callbacks.History at 0x7f0f58e8c048>

In [0]:
pred_val = model.predict(x = val_data, verbose = 0)
pred_test = model.predict(x = test_data, verbose = 0)

log_loss(test_label, pred_test), log_loss(val_label, pred_val)

(0.9703576371073723, 1.0027130029537605)

In [0]:
pred = model.predict(test_data)
pred_test = [np.argmax(row) for row in pred]
y = [np.argmax(row) for row in test_label]

precision = precision_score(pred_test,y,average='weighted')
recall = recall_score(pred_test,y,average='weighted')
f1 = f1_score(pred_test,y,average='weighted')
accuracy = accuracy_score(pred_test, y)

print('precision:',precision,'\nrecall:',recall,'\nf1:',f1,'\nacc:',accuracy)

precision: 1.0 
recall: 0.4275 
f1: 0.5989492119089317 
acc: 0.4275


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


### Crossval

In [0]:
train_data = emb[0][:2454]
test_data = emb[0][2454:]
train_label = emb[1][:2454]
test_label = emb[1][2454:]

In [0]:
def build_mlp_model(input_shape):
	X_input = layers.Input(input_shape)

	# First dense layer
	X = layers.Dense(300, name = 'dense0')(X_input)
	X = layers.BatchNormalization(name = 'bn0')(X)
	X = layers.Activation('relu')(X)
	X = layers.Dropout(0.5, seed = 7)(X)

	# Output layer
	X = layers.Dense(3, name = 'output', kernel_regularizer = regularizers.l2(0.1))(X)
	X = layers.Activation('softmax')(X)

	# Create model
	model = models.Model(input = X_input, output = X, name = "classif_model")
	return model

In [0]:
folds = KFold(n_splits=5, shuffle=True, random_state=3)
scores = []
prediction = np.zeros((len(test_data),3))
for fold_n, (train_index, valid_index) in enumerate(folds.split(train_data)):
	print('Fold', fold_n, 'started at', time.ctime())
	X_tr, X_val = train_data[train_index], train_data[valid_index]
	Y_tr, Y_val = train_label[train_index], train_label[valid_index]

	# Define the model, re-initializing for each fold
	classif_model = build_mlp_model([train_data.shape[1]])
	classif_model.compile(optimizer = optimizers.Adam(lr = 0.001), loss = "categorical_crossentropy")

	classif_model.fit(x = X_tr, y = Y_tr, epochs = 1000, batch_size = 32, validation_data = (X_val, Y_val), verbose = 0)

	# make predictions on validation and test data
	pred_valid = classif_model.predict(x = X_val, verbose = 0)
	pred = classif_model.predict(x = test_data, verbose = 0)

	# oof[valid_index] = pred_valid.reshape(-1,)
	scores.append(log_loss(Y_val, pred_valid))
	prediction += pred
prediction /= 5

# Print CV scores, as well as score on the test data
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
print(scores)
print("Test score:", log_loss(test_label,prediction))

In [0]:
prediction /= 5

# Print CV scores, as well as score on the test data
print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
print(scores)
print("Test score:", log_loss(test_label,prediction))

CV mean score: 0.9597, std: 0.0182.
[0.9594847553859175, 0.9349472906341864, 0.9501213746983748, 0.9902845710940856, 0.9634564618675077]
Test score: 0.9698580836856097


In [0]:
pred = prediction.copy()
for i,row in enumerate(pred):
  max_ = np.argmax(row)
  print(row)
  pred[i] = [0,0,0]
  pred[i,max_] = 1

In [0]:
precision = precision_score(test_label,pred,average='weighted')
recall = recall_score(test_label,pred,average='weighted')
f1 = f1_score(test_label,pred,average='weighted')
accuracy = accuracy_score(test_label,pred)

print('precision:',precision,'\nrecall:',recall,'\nf1:',f1,'\nacc:',accuracy)

precision: 0.18275624999999998 
recall: 0.4275 
f1: 0.2560507880910683 
acc: 0.4275


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [0]:
train_data[0][:10]

array([ 0.738879 , -0.2271645,  0.109304 , -0.527125 , -0.101143 ,
       -0.3374955,  1.4430055, -0.1159995, -0.192594 ,  0.099925 ])

In [0]:
train_data = emb[0][:2454]
test_data = emb[0][2454:]
train_label = emb[1][:2454]
test_label = emb[1][2454:]

In [0]:
idx = [row for row in range(len(train_data)) if np.sum(np.isnan(train_data[row]))]
train_data = np.delete(train_data, idx, 0)
train_label = np.delete(train_label, idx, 0)

In [0]:
idx = [row for row in range(len(test_data)) if np.sum(np.isnan(test_data[row]))]
test_data = np.delete(test_data, idx, 0)
test_label = np.delete(test_label, idx, 0)

In [0]:
y_train = [np.argmax(row) for row in train_label]
y_test = [np.argmax(row) for row in test_label]

In [0]:
clf = LogisticRegression(random_state=0).fit(train_data, y_train)
pred = clf.predict(test_data)



In [0]:
precision = precision_score(y_test,pred,average='weighted')
recall = recall_score(y_test,pred,average='weighted')
f1 = f1_score(y_test,pred,average='weighted')
accuracy = accuracy_score(y_test,pred)

print('precision:',precision,'\nrecall:',recall,'\nf1:',f1,'\nacc:',accuracy)

precision: 0.7477162193510789 
recall: 0.753376688344172 
f1: 0.7468932304493048 
acc: 0.753376688344172
