In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import *
import time

import spacy 
nlp = spacy.load('en_core_web_sm')

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.

In [None]:
# Download the three BERT reference scripts that a later cell imports as local modules
# (modeling, extract_features, tokenization). They are fetched from the `master` branch,
# so the downloaded contents are not pinned to a specific revision.
!wget https://raw.githubusercontent.com/google-research/bert/master/modeling.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/extract_features.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py

In [None]:
# Download and unpack the pretrained `uncased_L-12_H-768_A-12` checkpoint archive.
# bert_embeddings() reads vocab.txt, bert_config.json and bert_model.ckpt from the extracted directory.
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
! 7z x uncased_L-12_H-768_A-12.zip

In [None]:
def name_replace(s, r1, r2):
    """Replace a name, and every space-separated word of that name, with a placeholder.

    The full name ``r1`` is substituted first, then each individual word of
    ``r1`` is substituted as well, so that partial mentions (e.g. a surname
    on its own) are also mapped to ``r2``. Substitution is plain substring
    replacement, so a name word that occurs inside a longer word is replaced too.

    Args:
        s: Text to process; converted with ``str()`` first.
        r1: Name to look for (a string, possibly several words).
        r2: Placeholder that replaces every occurrence.

    Returns:
        The text with all occurrences replaced.
    """
    s = str(s).replace(r1, r2)
    for part in r1.split(' '):
        # Doubled, leading or trailing spaces in r1 yield empty parts.
        # str.replace('', r2) would insert r2 between every character of the
        # text, so empty parts must be skipped.
        if part:
            s = s.replace(part, r2)
    return s

def fill_nlp_tag_empty_cols(df, tag):
    """Add the placeholder columns ``A-<tag>`` and ``B-<tag>`` to ``df`` in place, filled with None."""
    for mention in ("A", "B"):
        df[f"{mention}-{tag}"] = None
    
def fill_nlp_empty_cols(df, tags):
    """Add empty ``A-<tag>`` / ``B-<tag>`` columns to ``df`` for every tag in ``tags``.

    Columns are created in the order the tags are given; ``df`` is modified in place.
    """
    for dep_label in tags:
        fill_nlp_tag_empty_cols(df, dep_label)

def fill_word_offset_empty_cols(df):
    """Add the word-offset and word-distance columns to ``df`` in place, filled with None."""
    placeholder_columns = (
        'Pronoun-word-offset',
        'A-word-offset',
        'B-word-offset',
        'A-word-dist',
        'B-word-dist',
    )
    for column in placeholder_columns:
        df[column] = None
    
def word_offset(doc, w):
    """Count the word tokens that precede the first token whose text equals ``w``.

    Punctuation tokens (``is_punct``) and backtick tokens are not counted.
    If no token matches ``w``, the count of all word tokens in ``doc`` is returned.
    """
    preceding = 0
    for tok in doc:
        if tok.text == w:
            return preceding
        if tok.is_punct or tok.text == '`':
            continue
        preceding += 1
    return preceding
    
def fill_similarity(df):
    """Add the ``sim_A_P`` and ``sim_B_P`` columns to ``df`` in place, initialised to 0.0."""
    for column in ('sim_A_P', 'sim_B_P'):
        df[column] = 0.0
    
def get_nlp_tag_feature(doc, tag):
    """Count how often each placeholder token carries the dependency label ``tag``.

    Returns:
        A tuple ``(A_count, B_count)``: the number of ``subjectone`` tokens and
        the number of ``subjecttwo`` tokens whose ``dep_`` equals ``tag``.
    """
    a_hits = 0
    b_hits = 0
    for tok in doc:
        if tok.dep_ != tag:
            continue
        if tok.text == 'subjectone':
            a_hits += 1
        elif tok.text == 'subjecttwo':
            b_hits += 1
    return a_hits, b_hits

def get_similarity(doc, pronoun):
    """Similarity between each placeholder token and the pronoun token.

    Scans ``doc`` for tokens whose text is ``subjectone`` (A), ``subjecttwo``
    (B) or ``pronoun`` (P). When a text occurs several times, the last
    occurrence is used.

    Args:
        doc: Iterable of tokens exposing ``.text`` and ``.similarity(other)``.
        pronoun: Exact token text of the pronoun.

    Returns:
        ``(sim_A_P, sim_B_P)``. A similarity is 0 when the corresponding
        placeholder token, or the pronoun token itself, is not present.
    """
    A_token, B_token, P_token = None, None, None
    for token in doc:
        if token.text == "subjectone":
            A_token = token
        if token.text == "subjecttwo":
            B_token = token
        if token.text == pronoun:
            P_token = token

    # Without a pronoun token there is nothing to compare against; calling
    # similarity(None) would raise, so report "no similarity" instead.
    if P_token is None:
        return 0, 0

    sim_A_P = 0 if A_token is None else A_token.similarity(P_token)
    sim_B_P = 0 if B_token is None else B_token.similarity(P_token)

    return sim_A_P, sim_B_P

def sentences_same(s, w1, w2):
    """Tell whether ``w2`` appears at or after ``w1`` within a single sentence of ``s``.

    The text is parsed with the notebook-level ``nlp`` pipeline and each
    sentence is scanned independently.

    Args:
        s: Text to analyse; converted with ``str()`` first.
        w1: Token text that must be seen first.
        w2: Token text that must follow (or coincide with) ``w1`` in the same sentence.

    Returns:
        True if some sentence contains ``w1`` followed by ``w2``, otherwise False.
    """
    doc = nlp(str(s))
    for sent in doc.sents:
        seen_w1 = False  # reset per sentence: both words must share one sentence
        for token in sent:
            if token.text == w1:
                seen_w1 = True
            if seen_w1 and token.text == w2:
                return True
            # Keep scanning the rest of the sentence. Bailing out here would
            # return on the very token that matched w1, before w2 can be seen.
    return False
            
def sentences(s, w1, w2):
    """Tell whether ``w1`` and ``w2`` occur in different sentences of ``s``.

    The text is parsed with the notebook-level ``nlp`` pipeline. When a word
    occurs several times, its last occurrence is used.

    Args:
        s: Text to analyse; converted with ``str()`` first.
        w1: Token text of the first word.
        w2: Token text of the second word.

    Returns:
        True if both words are found and their sentences differ; False if they
        share a sentence or if either word is not found.
    """
    doc = nlp(str(s))
    # `t1, t2 = None` cannot be unpacked and raised TypeError on every call.
    t1 = t2 = None
    for token in doc:
        if token.text == w1:
            t1 = token
        if token.text == w2:
            t2 = token
    if t1 is None or t2 is None:
        # A missing word cannot be placed in any sentence.
        return False
    return not (t1.sent == t2.sent)

def add_nlp_features_with_similarirty(df, tags):
    """Add dependency-tag, word-offset and similarity features to ``df`` in place.

    Each row's ``Text`` is parsed with the notebook-level ``nlp`` pipeline.
    The text is expected to already contain the ``subjectone`` / ``subjecttwo``
    placeholders, because the helpers look for those exact token texts.

    Columns written per row:
        * ``A-<tag>`` / ``B-<tag>`` for every tag in ``tags``
        * ``Pronoun-word-offset``, ``A-word-offset``, ``B-word-offset``
        * ``A-word-dist``, ``B-word-dist`` (absolute word-offset differences)
        * ``sim_A_P``, ``sim_B_P``

    Args:
        df: DataFrame with at least the columns ``Text`` and ``Pronoun``.
        tags: Iterable of dependency labels to count.
    """
    fill_nlp_empty_cols(df, tags)
    fill_word_offset_empty_cols(df)
    fill_similarity(df)

    # Iterate over the real index labels. A positional counter only matches
    # `.loc` labels when the frame still has its default 0..n-1 index.
    for i in df.index:
        doc = nlp(str(df.loc[i, 'Text']))
        pronoun = df.loc[i, 'Pronoun']

        # add tag features
        for tag in tags:
            df.loc[i, f"A-{tag}"], df.loc[i, f"B-{tag}"] = get_nlp_tag_feature(doc, tag)

        # add word offset features
        pronoun_pos = word_offset(doc, pronoun)
        a_pos = word_offset(doc, 'subjectone')
        b_pos = word_offset(doc, 'subjecttwo')
        df.loc[i, 'Pronoun-word-offset'] = pronoun_pos
        df.loc[i, 'A-word-offset'] = a_pos
        df.loc[i, 'B-word-offset'] = b_pos

        df.loc[i, 'A-word-dist'] = np.abs(pronoun_pos - a_pos)
        df.loc[i, 'B-word-dist'] = np.abs(pronoun_pos - b_pos)

        # add similarity
        df.loc[i, 'sim_A_P'], df.loc[i, 'sim_B_P'] = get_similarity(doc, pronoun)


    
def add_distance_features(df):
    """Add character-offset based features to ``df`` in place.

    Adds the end offset of each mention (``<mention>-offset2``), the absolute
    character distance between the pronoun and each candidate (``A-dist``,
    ``B-dist``), and the smallest start / largest end offset over the three
    mentions (``section_min`` / ``section_max``).
    """
    mentions = ('Pronoun', 'A', 'B')

    for mention in mentions:
        df[f'{mention}-offset2'] = df[f'{mention}-offset'] + df[mention].map(len)

    for candidate in ('A', 'B'):
        df[f'{candidate}-dist'] = (df['Pronoun-offset'] - df[f'{candidate}-offset']).abs()

    start_columns = [f'{mention}-offset' for mention in mentions]
    end_columns = [f'{mention}-offset2' for mention in mentions]
    df['section_min'] = df[start_columns].min(axis=1)
    df['section_max'] = df[end_columns].max(axis=1)

In [None]:
import modeling
import extract_features
import tokenization

In [None]:
def compute_offset_no_spaces(text, offset):
    """Return how many of the first ``offset`` characters of ``text`` are not a space."""
    return sum(1 for pos in range(offset) if text[pos] != " ")

def count_chars_no_special(text):
    """Return the number of characters in ``text`` that are not ``#``."""
    ignored = ("#",)
    return sum(1 for pos in range(len(text)) if text[pos] not in ignored)

def count_length_no_special(text):
    """Return the number of characters in ``text`` that are neither ``#`` nor a space."""
    ignored = ("#", " ")
    return sum(1 for pos in range(len(text)) if text[pos] not in ignored)

In [None]:
def bert_embeddings(df):
    """Extract BERT embeddings for the A, B and Pronoun mentions of every row of ``df``.

    Steps:
      1. Write ``df["Text"]`` to ``input.txt`` and run ``extract_features.py`` on it
         through ``os.system``; the script's output is read from ``output.jsonl``.
         Both temporary files are removed afterwards.
      2. For each row, walk over the BERT tokens while tracking how many characters
         (ignoring ``#`` and spaces) have been consumed, and accumulate the
         embedding of every token that starts at the pronoun offset or inside the
         character span of A / B.

    Reads the columns ``Text``, ``Pronoun``, ``A``, ``B``, ``Pronoun-offset``,
    ``A-offset``, ``B-offset``, ``A-coref`` and ``B-coref``.

    Returns:
        DataFrame indexed like ``df`` (index named ``ID``) with the columns
        ``emb_A``, ``emb_B``, ``emb_P`` (arrays of length 768) and ``label``
        (``"A"``, ``"B"`` or ``"Neither"``).

    Notes:
        * Rows are addressed with ``df.loc[i, ...]`` for ``i in range(len(df))`` --
          assumes ``df`` has a default 0..n-1 index; TODO confirm at call sites.
        * The exit status of the ``os.system`` calls is not checked.
        * ``emb_A`` / ``emb_B`` are averaged over the matched tokens; if no token
          matched, the division by a zero count leaves NaN values in the vector.
        * ``emb_P`` is a plain sum: ``cnt_P`` is counted but never used to divide.
    """
    text = df["Text"]
    # One text per line, no index and no header, as input for extract_features.py
    text.to_csv("input.txt", index = False, header = False)
    
    # Run the BERT feature-extraction script as a subprocess; it writes output.jsonl
    os.system("python3 extract_features.py \
      --input_file=input.txt \
      --output_file=output.jsonl \
      --vocab_file=uncased_L-12_H-768_A-12/vocab.txt \
      --bert_config_file=uncased_L-12_H-768_A-12/bert_config.json \
      --init_checkpoint=uncased_L-12_H-768_A-12/bert_model.ckpt \
      --layers=-1 \
      --max_seq_length=256 \
      --batch_size=8")
    
    # One JSON record per input line
    bert_output = pd.read_json("output.jsonl", lines = True)

    # Clean up the temporary files
    os.system("rm output.jsonl")
    os.system("rm input.txt")

    # Result frame: same index as df, one object cell per embedding plus the label
    index = df.index
    columns = ["emb_A", "emb_B", "emb_P", "label"]
    emb = pd.DataFrame(index = index, columns = columns)
    emb.index.name = "ID"
    
    for i in range(len(df)): # For each line in the data file
        # Get the words A, B, Pronoun. Convert them to lower case, since we're using the uncased version of BERT
        P = df.loc[i,"Pronoun"].lower()
        A = df.loc[i,"A"].lower()
        B = df.loc[i,"B"].lower()

        # For each word, find the offset not counting spaces. This is necessary for comparison with the output of BERT
        P_offset = compute_offset_no_spaces(df.loc[i,"Text"], df.loc[i,"Pronoun-offset"])
        A_offset = compute_offset_no_spaces(df.loc[i,"Text"], df.loc[i,"A-offset"])
        B_offset = compute_offset_no_spaces(df.loc[i,"Text"], df.loc[i,"B-offset"])
        # Figure out the length of A, B, not counting spaces or special characters
        A_length = count_length_no_special(A)
        B_length = count_length_no_special(B)

        # Initialize embeddings with zeros
        emb_A = np.zeros(768)
        emb_B = np.zeros(768)
        emb_P = np.zeros(768)

        # Initialize counts: characters consumed so far and number of tokens matched per mention
        count_chars = 0
        cnt_A, cnt_B, cnt_P = 0, 0, 0

        # Per-token records (token text + layer outputs) for the current line
        features = pd.DataFrame(bert_output.loc[i,"features"])
        
        # Iterate over the BERT tokens for the current line, starting at index 2.
        # Per the original author's note the first 2 tokens do not correspond to words -- TODO confirm.
        for j in range(2,len(features)):
            token = features.loc[j,"token"]

            # See if the character count until the current token matches the offset of any of the 3 target words
            if count_chars  == P_offset: 
                # Token starts exactly at the pronoun offset
                emb_P += np.array(features.loc[j,"layers"][0]['values'])
                cnt_P += 1
            if count_chars in range(A_offset, A_offset + A_length): 
                # Token starts inside the character span of A
                emb_A += np.array(features.loc[j,"layers"][0]['values'])
                cnt_A +=1
            if count_chars in range(B_offset, B_offset + B_length): 
                # Token starts inside the character span of B
                emb_B += np.array(features.loc[j,"layers"][0]['values'])
                cnt_B +=1
            # Update the character count (ignoring '#' and spaces in the token text)
            count_chars += count_length_no_special(token)
        # Taking the average between tokens in the span of A or B, so divide the current value by the count.
        # A zero count yields NaN here; later cells drop rows containing NaN.
        emb_A /= cnt_A
        emb_B /= cnt_B
        
        # Derive the class label; B takes precedence if both flags are set
        label = "Neither"
        if (df.loc[i,"A-coref"] == 1):
            label = "A"
        if (df.loc[i,"B-coref"] == 1):
            label = "B"

        # Put everything together in emb
        emb.iloc[i] = [emb_A, emb_B, emb_P, label]
        
    return emb


def parse_embeddings(embedding_df):
    """Turn the per-row embedding frame into a feature matrix and one-hot labels.

    ``embedding_df`` is sorted by index in place and then addressed with the
    labels ``0 .. len-1``.

    Returns:
        ``(X, Y)`` where ``X`` has shape ``(n, 768 * 3)`` (A, B and P embeddings
        concatenated in that order) and ``Y`` has shape ``(n, 3)`` with a 1 in
        column 0 for label ``'A'``, column 1 for ``'B'`` and column 2 otherwise.
    """
    embedding_df.sort_index(inplace=True)

    n_rows = len(embedding_df)
    X = np.zeros((n_rows, 768 * 3))
    Y = np.zeros((n_rows, 3))

    for row in range(n_rows):
        pieces = [
            np.array(embedding_df.loc[row, name])
            for name in ("emb_A", "emb_B", "emb_P")
        ]
        X[row] = np.concatenate(pieces)

        label = embedding_df.loc[row, "label"]
        target_column = 0 if label == 'A' else (1 if label == 'B' else 2)
        Y[row, target_column] = 1

    return X, Y


def get_embedding_similarities(embedding_df):
    """Cosine similarity between each candidate embedding and the pronoun embedding.

    ``embedding_df`` is sorted by index in place. For every row the cosine
    similarity of ``emb_A`` vs ``emb_P`` and of ``emb_B`` vs ``emb_P`` is
    computed as a single scalar.

    Args:
        embedding_df: DataFrame with the columns ``emb_A``, ``emb_B`` and
            ``emb_P``, each cell holding a 1-D sequence of numbers.

    Returns:
        DataFrame with the same index and the columns ``emb_sim_A_P`` and
        ``emb_sim_B_P``. Rows whose embeddings contain NaN are left empty (NaN).
        A zero-length vector yields a similarity of 0.0.
    """
    embedding_df.sort_index(inplace=True)

    def _cosine(u, v):
        # One scalar per pair of 1-D vectors. Reshaping the vectors to (-1, 1)
        # and calling a pairwise routine treats every component as its own
        # sample and produces a full n x n matrix instead of one value.
        denom = np.linalg.norm(u) * np.linalg.norm(v)
        if denom == 0:
            return 0.0
        return float(np.dot(u, v) / denom)

    X = pd.DataFrame(index = embedding_df.index, columns = ['emb_sim_A_P', 'emb_sim_B_P'])
    for i in embedding_df.index:
        A = np.asarray(embedding_df.loc[i, "emb_A"], dtype=float)
        B = np.asarray(embedding_df.loc[i, "emb_B"], dtype=float)
        P = np.asarray(embedding_df.loc[i, "emb_P"], dtype=float)

        # Skip rows with an incomplete embedding; their similarity stays NaN
        if np.isnan(A).any() or np.isnan(B).any() or np.isnan(P).any():
            continue

        X.loc[i, 'emb_sim_A_P'] = _cosine(A, P)
        X.loc[i, 'emb_sim_B_P'] = _cosine(B, P)

    return X

In [None]:
# Load the three GAP dataset splits (tab-separated files) into DataFrames.
gap_development = pd.read_csv('../input/gapdatasetmaksym/gapdataset/gap-development.tsv', delimiter='\t')
gap_test = pd.read_csv('../input/gapdatasetmaksym/gapdataset/gap-test.tsv', delimiter='\t')
gap_validation = pd.read_csv('../input/gapdatasetmaksym/gapdataset/gap-validation.tsv', delimiter='\t')

In [None]:
gap_development.head()

In [None]:
gap_test.head()

In [None]:
gap_validation.head()

In [None]:
def rename_labels(df):
    """Convert the coreference flags to integers and add a ``Neither`` column, in place.

    ``Neither`` is ``1.0 - (A-coref + B-coref)``, stored as a float.
    """
    flag_columns = ['A-coref', 'B-coref']
    for column in flag_columns:
        df[column] = df[column].astype(int)
    df['Neither'] = 1.0 - df[flag_columns].sum(axis=1)
    
    
# Training data = GAP test + GAP validation splits, re-indexed 0..n-1.
# The development split is left out here; later cells use it as the evaluation set.
# train = pd.concat((gap_test, gap_validation, gap_development)).reset_index(drop=True)
train = pd.concat((gap_test, gap_validation)).reset_index(drop=True)
# Casts the coref flags to int and adds the 'Neither' column (modifies train in place)
rename_labels(train)
train.head()

In [None]:
# Compute the BERT mention embeddings for the training and development data and
# cache them as JSON files; the timestamps show how long the extraction took.
print("Started at ", time.ctime())
# test_emb = bert_embeddings(gap_test)
# test_emb.to_json("contextual_embeddings_gap_test.json", orient = 'columns')

# validation_emb = bert_embeddings(gap_validation)
# validation_emb.to_json("contextual_embeddings_gap_validation.json", orient = 'columns')

train_emb = bert_embeddings(train)
train_emb.to_json("contextual_embeddings_train.json", orient = 'columns')

development_emb = bert_embeddings(gap_development)
development_emb.to_json("contextual_embeddings_gap_development.json", orient = 'columns')
print("Finished at ", time.ctime())

In [None]:
# Reload the cached embeddings and convert them to feature / one-hot label matrices
train_bert = pd.read_json("contextual_embeddings_train.json")
development_bert = pd.read_json("contextual_embeddings_gap_development.json")

X_train, Y_train = parse_embeddings(train_bert)
X_test, Y_test = parse_embeddings(development_bert)

# Positions of the rows that contain at least one NaN; the lists are kept
# because a later cell drops the same rows from the source DataFrames.
remove_test = [pos for pos, values in enumerate(X_test) if np.isnan(values).any()]
remove_train = [pos for pos, values in enumerate(X_train) if np.isnan(values).any()]

# Drop NaN
X_test, Y_test = (np.delete(matrix, remove_test, 0) for matrix in (X_test, Y_test))
X_train, Y_train = (np.delete(matrix, remove_train, 0) for matrix in (X_train, Y_train))

In [None]:
# Replace the candidate names with the placeholder tokens that the NLP feature
# helpers search for ('subjectone' for A, 'subjecttwo' for B).
# The development frame needs the same substitution: its features are computed by
# add_nlp_features_with_similarirty as well, and without the placeholders no
# candidate token would ever be found in its texts.
for frame in (train, gap_development):
    for name_column, placeholder in (('A', 'subjectone'), ('B', 'subjecttwo')):
        frame['Text'] = frame.apply(
            lambda r: name_replace(r['Text'], r[name_column], placeholder), axis=1
        )

Find the top 5 most frequently occurring tags

In [None]:
# Count, over all training texts, how often each dependency label is attached
# to one of the placeholder tokens.
tags = {}
placeholders = ('subjectone', 'subjecttwo')
for text in train['Text']:
    doc = nlp(str(text))
    for token in doc:
        if token.text not in placeholders:
            continue
        tags[token.dep_] = tags.get(token.dep_, 0) + 1

In [None]:
# Sort the (tag, count) pairs by ascending count and show the five most frequent ones
sorted_tags = sorted(tags.items(), key=lambda kv: kv[1])
sorted_tags[-5:]

Extend feature matrix

In [None]:
import warnings
# Silences every warning for the rest of the session, not only for this cell
warnings.filterwarnings('ignore')

# Add the character-distance and NLP (dependency tag / word offset / similarity)
# features to both frames in place, using the same five dependency tags for each.
add_distance_features(train)
add_nlp_features_with_similarirty(train, ['poss', 'nsubj', 'pobj', 'dobj', 'conj'])

add_distance_features(gap_development)
add_nlp_features_with_similarirty(gap_development, ['poss', 'nsubj', 'pobj', 'dobj', 'conj'])

In [None]:
# Cosine similarities between the candidate and pronoun embeddings.
# NOTE(review): the 'emb_sim_*' entries are commented out in feature_col, so these
# frames do not appear to be used by the later cells shown here -- confirm before removing.
sim_train = get_embedding_similarities(train_bert)
sim_test = get_embedding_similarities(development_bert)

In [None]:
# Hand-crafted feature columns that are concatenated with the BERT embeddings.
# Commented-out entries are candidate features that are currently disabled.
feature_col = [
               'Pronoun-offset', 
#                'Pronoun-offset2', 
               'A-offset', 
#                'A-offset2', 
               'A-dist', 
               'B-offset', 
#                'B-offset2',
               'B-dist', 
#                'section_min',  
#                'section_max',
               'A-poss', 
               'B-poss', 
               'A-nsubj',
               'B-nsubj',
               'A-pobj',
               'B-pobj',
               'A-dobj',
               'B-dobj',
               'A-conj',
               'B-conj',
               'A-word-offset',
               'B-word-offset',
               'Pronoun-word-offset',
               'A-word-dist',
               'B-word-dist',
#                'emb_sim_A_P',
#                'emb_sim_B_P',
#                'A-with-P',
#                'B-with-P'
              ]
# Target columns, in the order A / B / Neither
pred_col = ['A-coref', 'B-coref', 'Neither']

In [None]:
# Drop the rows that correspond to NaN rows in the embeddings, so that the
# DataFrames line up row-for-row with the already filtered X_train / X_test.
# NOTE: this cell mutates train / gap_development and X_train / X_test, so it
# must be run exactly once after the embedding cell.
train.drop(remove_train, axis=0, inplace=True)
gap_development.drop(remove_test, axis=0, inplace=True)

# Fail early if the frames and the embedding matrices got out of sync
assert len(train) == len(X_train), "train rows do not match embedding rows"
assert len(gap_development) == len(X_test), "development rows do not match embedding rows"

# The NLP feature columns are created with None and are therefore object dtype.
# Cast to float explicitly so the combined matrix is numeric rather than object.
X_train = np.concatenate((train[feature_col].astype(float).values, X_train), axis=1)
X_test = np.concatenate((gap_development[feature_col].astype(float).values, X_test), axis=1)

print(X_test.shape)
print(gap_development.shape)
# x_train, x_test, y_train, y_test = model_selection.train_test_split(train[feature_col].fillna(-1), train[pred_col], test_size=0.2, random_state=1)

In [None]:
from keras import applications, layers, models, regularizers
import keras.backend as K
from keras import callbacks as kc
import tensorflow as tf
from sklearn.model_selection import cross_val_score, KFold

batch_size = 32
dropout = 0.6

def create_conv1d(train_data):
    """Build and compile a small 1-D convolutional classifier with 3 softmax outputs.

    Architecture: Conv1D(128, 3) -> BatchNorm -> ReLU -> MaxPool1D -> Dropout
    -> Dense(100) -> BatchNorm -> ReLU -> Dropout -> Flatten -> Dense(3) -> softmax.
    The dropout rate comes from the notebook-level ``dropout`` setting.

    Args:
        train_data: Array whose ``shape[1]`` is the number of features; the
            network input has shape ``(shape[1], 1)``.

    Returns:
        A Keras model compiled with Adam and categorical cross-entropy.
    """
    X_input = layers.Input(shape=(train_data.shape[1], 1))

    X = layers.Conv1D(128, 3)(X_input)
    X = layers.BatchNormalization()(X)
    X = layers.Activation('relu')(X)
    X = layers.MaxPool1D()(X)
    X = layers.Dropout(dropout, seed = 2)(X)

    # Dense is applied position-wise here because the tensor is still 3-D
    X = layers.Dense(100)(X)
    X = layers.BatchNormalization()(X)
    X = layers.Activation('relu')(X)
    X = layers.Dropout(dropout)(X)

    X = layers.Flatten()(X)
    X = layers.Dense(3, name = 'output', kernel_regularizer = regularizers.l2(0.1))(X)
    X = layers.Activation('softmax')(X)

    # `inputs` / `outputs` are the documented keyword names; the singular
    # `input` / `output` spellings are legacy aliases.
    model = models.Model(inputs = X_input, outputs = X)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy', metrics=['accuracy'])

    return model


def create_mlp(train_data):
    """Build and compile a one-hidden-layer perceptron with 3 softmax outputs.

    Architecture: Dense(150) -> BatchNorm -> ReLU -> Dropout -> Dense(3, L1) -> softmax.
    The dropout rate comes from the notebook-level ``dropout`` setting.

    Args:
        train_data: 2-D array whose ``shape[1]`` is the number of input features.

    Returns:
        A Keras model compiled with Adam and categorical cross-entropy.
    """
    input_X = layers.Input([train_data.shape[1]])

    X = layers.Dense(150)(input_X)
    X = layers.BatchNormalization()(X)
    X = layers.Activation('relu')(X)
    X = layers.Dropout(dropout)(X)

    X = layers.Dense(3, kernel_regularizer = regularizers.l1(0.3))(X)
    X = layers.Activation('softmax')(X)

    # `inputs` / `outputs` are the documented keyword names; the singular
    # `input` / `output` spellings are legacy aliases.
    model = models.Model(inputs = input_X, outputs = X)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy', metrics=['accuracy'])

    return model


In [None]:
def train_kfold(model, X_train, Y_train, name):
    """Train ``model`` with 5-fold cross-validation and report the best fold.

    For every fold the model is fitted with early stopping on the validation
    loss, and the best weights of that fold are checkpointed to
    ``<name>_best_model_<fold>.hdf5`` (folds are numbered from 1).

    Args:
        model: Compiled Keras model; it is trained in place.
        X_train: Feature array, indexable by integer index arrays.
        Y_train: Target array aligned with ``X_train``.
        name: Prefix for the checkpoint files.

    Returns:
        The 1-based number of the fold with the lowest validation loss
        (0 only if no fold produced a comparable loss).
    """
    import glob

    # Remove checkpoints from previous runs so stale files cannot be loaded
    for stale_checkpoint in glob.glob("{}_best_model_*".format(name)):
        os.remove(stale_checkpoint)

    # Start from infinity: with a fixed threshold such as 1.0 no fold would be
    # selected when every validation loss is above it, and the caller would
    # then try to load the non-existent "..._best_model_0.hdf5".
    min_loss = float('inf')
    best_model = 0

    # Remember the untrained weights so each fold starts from the same state.
    # Otherwise later folds validate on samples the model has already been
    # trained on in earlier folds, which makes their val_loss look better.
    # (The optimizer state is not reset by this.)
    initial_weights = model.get_weights()

    folds = KFold(n_splits=5, shuffle=True, random_state=3)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)):
        model.set_weights(initial_weights)

        X_tr = X_train[train_index]
        X_val = X_train[valid_index]
        Y_tr = Y_train[train_index]
        Y_val = Y_train[valid_index]

        file_path = "{}_best_model_{}.hdf5".format(name, fold_n+1)
        check_point = kc.ModelCheckpoint(file_path, monitor = "val_loss", save_best_only = True, mode = "min")
        early_stopping = kc.EarlyStopping(monitor='val_loss', patience=50, mode='min', restore_best_weights=True)
        callbacks = [check_point, early_stopping]

        history = model.fit(X_tr, Y_tr,
                epochs=1000,
                batch_size=batch_size,
                callbacks=callbacks,
                validation_data=(X_val, Y_val),
                verbose=1)

        fold_loss = min(history.history['val_loss'])
        if fold_loss < min_loss:
            min_loss = fold_loss
            best_model = fold_n + 1

    return best_model

In [None]:
# conv1d.fit(np.expand_dims(X_train,axis=2), Y_train,
#           epochs=epochs,
#           batch_size=batch_size)
# print('Conv log_loss: ', metrics.log_loss(Y_test, conv1d.predict(np.expand_dims(X_test,axis=2))))

In [None]:
# Conv1D model: the feature matrix gets a trailing channel axis, the model is trained
# with k-fold CV, the checkpoint of the best fold is reloaded, and the log-loss is
# reported on the development-set matrices.
conv1d = create_conv1d(np.expand_dims(X_train,axis=2))
best_conv1d = train_kfold(conv1d, np.expand_dims(X_train,axis=2), Y_train, "conv1d")
conv1d.load_weights("./conv1d_best_model_{}.hdf5".format(best_conv1d))
print('Conv log_loss: ', metrics.log_loss(Y_test, conv1d.predict(np.expand_dims(X_test,axis=2))))

In [None]:
# MLP model: trained with k-fold CV, then the checkpoint of the best fold is reloaded
# and the log-loss is reported on the development-set matrices.
mlp = create_mlp(X_train)
best_mlp = train_kfold(mlp, X_train, Y_train, "mlp")
mlp.load_weights("./mlp_best_model_{}.hdf5".format(best_mlp))
print('MLP log_loss: ', metrics.log_loss(Y_test, mlp.predict(X_test)))

In [None]:
# Baseline for comparison: the same MLP trained on all training rows for a fixed
# 20 epochs, without validation data, checkpoints or early stopping.
mlp2 = create_mlp(X_train)
mlp2.fit(X_train, Y_train, epochs=20, batch_size=batch_size) 
print('MLP log_loss: ', metrics.log_loss(Y_test, mlp2.predict(X_test)))

In [None]:
# model = multiclass.OneVsRestClassifier(ensemble.RandomForestClassifier(max_depth = 7, n_estimators=2000, random_state=33))
# Multinomial logistic regression needs a solver that supports it, so 'lbfgs' is
# requested explicitly instead of relying on the library default.
model = linear_model.LogisticRegression(penalty='l2', multi_class='multinomial', solver='lbfgs')
# LogisticRegression.fit expects a 1-D vector of class labels, not a one-hot matrix.
# argmax maps the one-hot rows to 0 (A), 1 (B), 2 (Neither), which keeps the
# predict_proba columns in the same order as the columns of Y_test.
model.fit(X_train, Y_train.argmax(axis=1))
print('log_loss: ', metrics.log_loss(Y_test, model.predict_proba(X_test)))

In [None]:
# gap_development['A-coref'] = train['A-coref'].astype(int)
# gap_development['B-coref'] = train['B-coref'].astype(int)
# gap_development['NEITHER'] = 1.0 - (train['A-coref'] + train['B-coref'])
# gap_development.head()

In [None]:
# add_additional_features(gap_development)
# gap_development_x = gap_development[feature_col]
# gap_development_y = gap_development[pred_col]
# gap_development_pred = model.predict_proba(gap_development_x)
# print('log_loss: ', metrics.log_loss(gap_development_y, gap_development_pred))

In [None]:
# gap_development_y.head()

In [None]:
# gap_development_pred

Submission test

In [None]:
test_sub1 = pd.read_csv('../input/gendered-pronoun-resolution/test_stage_1.tsv', delimiter='\t')
# NOTE(review): `model` was fitted on feature_col concatenated with the BERT embeddings,
# and the engineered columns in feature_col are never computed for this frame in the
# cells shown here. The same feature pipeline has to be applied to test_sub1 before
# this prediction can work -- confirm and complete.
results = model.predict_proba(test_sub1[feature_col])
# rename() returns a new frame; keep it so the candidate names are preserved
# instead of being overwritten by the probability columns below.
test_sub1 = test_sub1.rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})
test_sub1['A'] = results[:,0]
test_sub1['B'] = results[:,1]
test_sub1['NEITHER'] = results[:,2]
# Select the column that was actually created above ('NEITHER', not 'Neither')
test_sub1[['ID', 'A', 'B', 'NEITHER']].to_csv('submission1.csv', index=False)
test_sub1.head()

In [None]:
test_sub2 = pd.read_csv('../input/gendered-pronoun-resolution/test_stage_2.tsv', delimiter='\t')
# NOTE(review): `model` was fitted on feature_col concatenated with the BERT embeddings,
# and the engineered columns in feature_col are never computed for this frame in the
# cells shown here. The same feature pipeline has to be applied to test_sub2 before
# this prediction can work -- confirm and complete.
results = model.predict_proba(test_sub2[feature_col])
# rename() returns a new frame; keep it so the candidate names are preserved
# instead of being overwritten by the probability columns below.
test_sub2 = test_sub2.rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})
# The `np.float` alias has been removed from NumPy; the builtin `float` is equivalent.
test_sub2['A'] = results[:,0].astype(float)
test_sub2['B'] = results[:,1].astype(float)
test_sub2['NEITHER'] = results[:,2].astype(float)
test_sub2[['ID', 'A', 'B', 'NEITHER']].to_csv('submission2.csv', index=False)
test_sub2.head()

In [None]:
# Re-reads the stage-2 test file into test_sub2, replacing the frame (and the
# probability columns) produced by the previous cell, and previews the raw data.
test_sub2 = pd.read_csv('../input/gendered-pronoun-resolution/test_stage_2.tsv', delimiter='\t')
test_sub2.head()
# results = model.predict(test_sub2[feature_col])
# test_sub2.rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})
# test_sub2['A'] = results[:,0].astype(np.float)
# test_sub2['B'] = results[:,1].astype(np.float)
# test_sub2['NEITHER'] = results[:,2].astype(np.float)
# test_sub2[['ID', 'A', 'B', 'NEITHER']].to_csv('submission2.csv', index=False)
# test_sub2.head()