<a href="https://colab.research.google.com/github/nsandadi/Coreference-Resolution/blob/main/Pooling_GAP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# POOLING
### BERT, RoBERTa, CorefRoBERTa

## Install Transformers library from Huggingface

In [1]:
# !pip install transformers
!pip install git+https://github.com/huggingface/transformers
!pip install pytorch-pretrained-bert

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-w3a4hpem
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-w3a4hpem
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 13.2MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 33.1MB/s 
Building wheels for collected packages: transformers
  Building wh

## Import Statements

In [2]:
import pandas as pd
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, Sampler, BatchSampler
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import gc
import torch
from torch import nn
from torch import optim
from torch.nn.utils import clip_grad_norm_
import re
import spacy
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from copy import deepcopy
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import os
import timeit

## Load data and packages

In [3]:
# Build NVIDIA apex from a fresh checkout, then remove the checkout again.
# Return codes of os.system are not inspected (same as before).
print('installing apex')
for _cmd in (
    'git clone -q https://github.com/NVIDIA/apex.git',
    'pip install -q --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex/',
    'rm -rf apex',
):
    os.system(_cmd)


# Fetch the three GAP splits into the working directory.
print('downloading data')
for _cmd in (
    'pip install pytorch-pretrained-bert -q',
    'wget https://github.com/google-research-datasets/gap-coreference/raw/master/gap-development.tsv -q',
    'wget https://github.com/google-research-datasets/gap-coreference/raw/master/gap-test.tsv -q',
    'wget https://github.com/google-research-datasets/gap-coreference/raw/master/gap-validation.tsv -q',
):
    os.system(_cmd)


print('loading data')
gap_dev = pd.read_csv('gap-development.tsv', sep='\t')
gap_val = pd.read_csv('gap-validation.tsv', sep='\t')
gap_test = pd.read_csv('gap-test.tsv', sep='\t')

# One frame holding dev + val + test rows, re-indexed 0..n-1 so that later
# code can address rows by their position.
all_data = pd.concat([gap_dev, gap_val, gap_test], ignore_index=True)

installing apex
downloading data
loading data


## Download pre-trained models

In [4]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, WordpieceTokenizer
from transformers import RobertaTokenizer, RobertaModel
from transformers import AutoModel, AutoTokenizer

In [11]:
######## hyper-parameters tuning ######
# Checkpoint to load. Switching it also requires switching the loader
# section below (BERT / RoBERTa / CorefRoBERTa).
# BERT_NAME = 'bert-large-uncased'
BERT_NAME = 'roberta-large'
# BERT_NAME = "nielsr/coref-roberta-large"
BERT_SIZE = 1024  # 768 for base, 1024 for large
SEED = 23
L = 8        # number of final hidden layers kept per token (also the number of SimilarityLayers)
S_DIM = 16   # output width of each SimilarityLayer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # If model is BERT
# tokenizer = BertTokenizer.from_pretrained(BERT_NAME)
# bert = BertModel.from_pretrained(BERT_NAME)
# bert = bert.to(device)

# If model is RoBERTa
# Move the encoder with .to(device) so the `device` chosen above is actually
# honoured instead of unconditionally requiring CUDA.
bert = RobertaModel.from_pretrained(BERT_NAME, output_hidden_states=True).to(device)
tokenizer = RobertaTokenizer.from_pretrained(BERT_NAME)

# # If model is CorefBERT or CorefRoBERTa
# bert = AutoModel.from_pretrained(BERT_NAME, output_hidden_states= True)
# tokenizer = AutoTokenizer.from_pretrained(BERT_NAME)
# bert = bert.to(device)

## Model Training and Prediction

In [12]:
## Tokenize
def bert_tokenize(text, p, a, b, p_offset, a_offset, b_offset):
    """Tokenize `text` piecewise and record which tokens belong to each mention.

    The text is cut at the character spans of the pronoun `p` and the two
    candidates `a` / `b` (each span is `offset .. offset + len(mention)`);
    every piece is passed to the module-level `tokenizer` on its own, in
    left-to-right order of the span start offsets.

    Returns:
        (tokens, idxs): `tokens` is the flat token list for the whole text;
        `idxs` maps 'a', 'b' and 'p' to the list of positions in `tokens`
        that were produced from that mention's span.
    """
    tokens = []
    idxs = {}

    def consume(chunk):
        # Append the tokens of `chunk` and report the positions they occupy.
        first = len(tokens)
        tokens.extend(tokenizer.tokenize(chunk))
        return list(range(first, len(tokens)))

    mentions = sorted(
        [
            [a_offset, a_offset + len(a), 'a'],
            [b_offset, b_offset + len(b), 'b'],
            [p_offset, p_offset + len(p), 'p'],
        ],
        key=lambda m: m[0],
    )

    cursor = 0
    for begin, end, tag in mentions:
        consume(text[cursor:begin])            # text between the previous span and this one
        idxs[tag] = consume(text[begin:end])   # the mention itself
        cursor = end
    consume(text[cursor:])                     # trailing text after the last span
    return tokens, idxs

print('tokenize...')
# One (tokens, idxs) pair per row of all_data.
_tokenized = all_data.apply(
    lambda row: bert_tokenize(row['Text'], row['Pronoun'], row['A'], row['B'],
                              row['Pronoun-offset'], row['A-offset'], row['B-offset']),
    axis=1)
all_data['encode'] = [tokenizer.convert_tokens_to_ids(pair[0]) for pair in _tokenized]
all_data['p_idx'] = [pair[1]['p'] for pair in _tokenized]
all_data['a_idx'] = [pair[1]['a'] for pair in _tokenized]
all_data['b_idx'] = [pair[1]['b'] for pair in _tokenized]

## Data pre-processing
print('clean..')
# Rows 2602 and 3674 are too long and their targets are in the head:
# keep only the first 280 token ids.
for _row in (2602, 3674):
    all_data.at[_row, 'encode'] = all_data.at[_row, 'encode'][:280]
# Row 209 is too long and its targets are in the tail: drop the first 60
# token ids and shift the mention positions by the same amount.
all_data.at[209, 'encode'] = all_data.at[209, 'encode'][60:]
for _col in ('a_idx', 'b_idx', 'p_idx'):
    all_data.at[209, _col] = [pos - 60 for pos in all_data.at[209, _col]]


class GPTData(Dataset):
    """Dataset over the tokenized GAP frame; one sample per row.

    Args:
        dataframe: frame with columns 'ID', 'encode', 'p_idx', 'a_idx',
            'b_idx', 'A-coref' and 'B-coref'.
        cls_id: token id prepended to every 'encode' sequence.
        sep_id: token id appended to every 'encode' sequence.

    The defaults 101 / 102 keep the previous hard-coded behaviour.
    NOTE(review): 101 / 102 look like the [CLS] / [SEP] ids of the BERT
    uncased vocabularies; for RoBERTa-style tokenizers the special ids differ,
    so pass `tokenizer.cls_token_id` / `tokenizer.sep_token_id` explicitly.
    If one of the ids equals 0, remember that the collate function pads
    'encode' with 0 and the extraction loop masks on `x != 0` - confirm the
    padding value before switching.
    """

    def __init__(self, dataframe, cls_id=101, sep_id=102):
        self.data = dataframe
        self.cls_id = cls_id
        self.sep_id = sep_id

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the sample at *position* `idx` as a dict of tensors."""
        # Samplers hand out positions (0..len-1), so index positionally; this
        # matches label lookup on a 0..n-1 index and also works on any other.
        row = self.data.iloc[idx]
        # Class id: 0 -> A is the antecedent, 1 -> B, 2 -> neither.
        label = 0 if row['A-coref'] else 1 if row['B-coref'] else 2
        return {
            'id': row['ID'],
            'encode': torch.LongTensor([self.cls_id] + row['encode'] + [self.sep_id]),
            # +1 because one special token was prepended to 'encode'.
            'p_idx': torch.LongTensor(row['p_idx']) + 1,
            'a_idx': torch.LongTensor(row['a_idx']) + 1,
            'b_idx': torch.LongTensor(row['b_idx']) + 1,
            'coref': torch.LongTensor([label]),
        }
        
class SortLenSampler(Sampler):
    """Sampler that visits dataset positions from shortest to longest sample.

    Length is measured as `len(data_source[pos][key])`; the ordering is
    computed once at construction time and positions of equal length keep
    their original relative order.
    """

    def __init__(self, data_source, key):
        positions = list(range(len(data_source)))
        positions.sort(key=lambda pos: len(data_source[pos][key]))
        self.sorted_idx = positions

    def __iter__(self):
        return iter(self.sorted_idx)

    def __len__(self):
        return len(self.sorted_idx)
        

def gpt_collate_func(x):
    """Collate a list of sample dicts into padded batch tensors.

    Returns a 6-tuple:
        (encode padded with 0, p_idx / a_idx / b_idx each padded with -1,
         the 'coref' labels concatenated along dim 0, the list of sample ids).
    All padded tensors are batch-first.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    encodes = [sample['encode'] for sample in x]
    p_spans = [sample['p_idx'] for sample in x]
    a_spans = [sample['a_idx'] for sample in x]
    b_spans = [sample['b_idx'] for sample in x]
    labels = [sample['coref'] for sample in x]
    ids = [sample['id'] for sample in x]

    return (
        pad(encodes, batch_first=True, padding_value=0),
        pad(p_spans, batch_first=True, padding_value=-1),
        pad(a_spans, batch_first=True, padding_value=-1),
        pad(b_spans, batch_first=True, padding_value=-1),
        torch.cat(labels, dim=0),
        ids,
    )

## Pooling
def meanpooling(x, idx, pad=-1):
    """Average the token vectors of one span, separately for every layer.

    Args:
        x: tensor of shape Layer X Seq X Feat.
        idx: 1-D sequence of token positions along the Seq axis. Reading
            stops at the first entry equal to `pad`; later entries are ignored.
        pad: padding marker used in `idx`.

    Returns:
        Layer X Feat tensor (same dtype and device as `x`).

    Raises:
        ValueError: if there is no position before the first pad.
    """
    idx = torch.as_tensor(idx, dtype=torch.long)
    pad_at = (idx == pad).nonzero().flatten()
    if pad_at.numel() > 0:
        # Keep only the prefix in front of the first padding entry.
        idx = idx[:int(pad_at[0])]
    if idx.numel() == 0:
        raise ValueError('meanpooling: empty span (division by zero)')
    # Gather the selected positions for all layers at once, then reduce over Seq.
    return x[:, idx.to(x.device), :].mean(dim=1)

def sumpooling(x, idx, pad=-1):
    """Sum the token vectors of one span, separately for every layer.

    Args:
        x: tensor of shape Layer X Seq X Feat.
        idx: 1-D sequence of token positions along the Seq axis. Reading
            stops at the first entry equal to `pad`; later entries are ignored.
        pad: padding marker used in `idx`.

    Returns:
        Layer X Feat tensor (same dtype and device as `x`); all zeros when
        there is no position before the first pad.
    """
    idx = torch.as_tensor(idx, dtype=torch.long)
    pad_at = (idx == pad).nonzero().flatten()
    if pad_at.numel() > 0:
        # Keep only the prefix in front of the first padding entry.
        idx = idx[:int(pad_at[0])]
    # Summing over an empty selection yields zeros, matching the old behaviour.
    return x[:, idx.to(x.device), :].sum(dim=1)

def maxpooling(x, idx, pad=-1):
    """Element-wise maximum over the token vectors of one span, per layer.

    Args:
        x: tensor of shape Layer X Seq X Feat.
        idx: 1-D sequence of token positions along the Seq axis. Reading
            stops at the first entry equal to `pad`; later entries are ignored.
        pad: padding marker used in `idx`.

    Returns:
        Layer X Feat tensor (same dtype and device as `x`); filled with -inf
        when there is no position before the first pad.
    """
    idx = torch.as_tensor(idx, dtype=torch.long)
    pad_at = (idx == pad).nonzero().flatten()
    if pad_at.numel() > 0:
        # Keep only the prefix in front of the first padding entry.
        idx = idx[:int(pad_at[0])]
    if idx.numel() == 0:
        # A max over nothing is undefined; keep the old "-inf everywhere" result.
        return x.new_full((x.shape[0], x.shape[2]), -float('inf'))
    # One vectorized reduction over Seq instead of a Python loop per element.
    return x[:, idx.to(x.device), :].max(dim=1)[0]


def minpooling(x, idx, pad=-1):
    """Element-wise minimum over the token vectors of one span, per layer.

    Args:
        x: tensor of shape Layer X Seq X Feat.
        idx: 1-D sequence of token positions along the Seq axis. Reading
            stops at the first entry equal to `pad`; later entries are ignored.
        pad: padding marker used in `idx`.

    Returns:
        Layer X Feat tensor (same dtype and device as `x`); filled with +inf
        when there is no position before the first pad.
    """
    idx = torch.as_tensor(idx, dtype=torch.long)
    pad_at = (idx == pad).nonzero().flatten()
    if pad_at.numel() > 0:
        # Keep only the prefix in front of the first padding entry.
        idx = idx[:int(pad_at[0])]
    if idx.numel() == 0:
        # A min over nothing is undefined; keep the old "+inf everywhere" result.
        return x.new_full((x.shape[0], x.shape[2]), float('inf'))
    # One vectorized reduction over Seq instead of a Python loop per element.
    return x[:, idx.to(x.device), :].min(dim=1)[0]


def get_span_tensor(bert_t, index, last_layer=L, pad_id=-1):
    """Collect the top `last_layer` layer vectors for every token of a span.

    `bert_t` is indexed as Layer X Seq X Feat. Positions are read from
    `index` until the first entry equal to `pad_id`.

    Returns a Seq X Layer X Feat tensor (one Layer X Feat slice per position).
    """
    top_layers = bert_t[-last_layer:]
    per_token = []
    for position in index:
        if position == pad_id:
            break
        per_token.append(top_layers[:, position, :])
    return torch.stack(per_token)
    

gap_dataset = GPTData(all_data)
# Batches are drawn in order of increasing sequence length to limit padding.
gpt_iter = DataLoader(gap_dataset, batch_size=5, sampler=SortLenSampler(gap_dataset, 'encode'), collate_fn=gpt_collate_func)

## Extract BERT features
bert_feats = []
print('extract bert features..')
start = timeit.default_timer()
bert.eval()
# Pure inference: without no_grad every forward pass would record an autograd
# graph that is never used and only costs GPU memory.
with torch.no_grad():
    for (x, p, a, b, y, id_) in gpt_iter:
        # Positions holding the padding value 0 are masked out of attention.
        r = bert(x.to(device), attention_mask=(x != 0).to(device))
        # layer_feats = torch.stack(r[0][-L:]).cpu().data.clone()  ## For BERT
        layer_feats = torch.stack(r[2][-L:]).cpu().data.clone()  ## For RoBERTa
        del r
        for i, v in enumerate(id_):
            sample_feats = layer_feats[:, i, :]  # Layer X Seq X Feat for sample i
            bert_feats.append({'a': meanpooling(sample_feats, a[i]),
                               'b': meanpooling(sample_feats, b[i]),
                               'p': meanpooling(sample_feats, p[i]),
                               # signed token distance from the pronoun to each candidate
                               'ap': (a[i][0] - p[i][0]).type(torch.FloatTensor),
                               'bp': (b[i][0] - p[i][0]).type(torch.FloatTensor),
                               'y': y[i],
                               'id': v})

print('extract bert features finished.')
stop = timeit.default_timer()
print('Runtime: ', stop - start)

# Seed after feature extraction so that classifier init and shuffling are reproducible.
torch.manual_seed(SEED)
np.random.seed(SEED)

############

class BERTfeature(Dataset):
    """Thin Dataset wrapper around an indexable collection of feature records."""

    def __init__(self, data):
        # Kept by reference; no copy is made.
        self.data = data

    def __len__(self):
        """Number of stored records."""
        records = self.data
        return len(records)

    def __getitem__(self, idx):
        """Return the record stored at `idx`, unchanged."""
        records = self.data
        return records[idx]
    
def bert_collate_func(x):
    """Stack a list of feature records into batch tensors.

    Returns a list of six tensors in the order a, b, p, y, ap, bp; each one
    is `torch.stack` over the corresponding field of every record in `x`.
    """
    field_order = ('a', 'b', 'p', 'y', 'ap', 'bp')
    return [torch.stack([record[field] for record in x]) for field in field_order]


## Split into train & test
# Records whose id contains 'dev' form the held-out test set; everything else
# is used for training / cross-validation. Original order is preserved.
test, train = [], []
for _feat in bert_feats:
    (test if 'dev' in _feat['id'] else train).append(_feat)

############

class SimilarityLayer(nn.Module):
    """Projects one layer's (pronoun, A, B) vectors to an S_DIM-wide feature.

    The linear layer sees the concatenation [p, a, b, p*a, p*b], i.e.
    5 * hidden_dim inputs, with dropout applied to that concatenation first.
    """

    def __init__(self, hidden_dim, dropout=0.3):
        super().__init__()
        self.ffnn = nn.Linear(5 * hidden_dim, S_DIM)
        # Re-initialise the projection weights (the bias keeps its default init).
        nn.init.kaiming_normal_(self.ffnn.weight)
        self.dropout = nn.Dropout(dropout)

    def forward(self, a, b, p):
        """a, b, p are concatenated along dim 1; returns the projected features."""
        features = torch.cat((p, a, b, p * a, p * b), dim=1)
        dropped = self.dropout(features)
        return self.ffnn(dropped)
    

class MSnet(nn.Module):
    """Mention scorer: one SimilarityLayer per pooled layer plus distance features.

    `hidden_layer` SimilarityLayers are applied to the matching layer slice of
    a / b / p; their outputs are concatenated, batch-normalised, passed
    through ReLU and dropout, joined with two encoded distances and mapped to
    3 output scores.
    """

    def __init__(self, hidden_dim, dropout=0.5, hidden_layer=4):
        super().__init__()
        sim_width = S_DIM * hidden_layer
        self.sim_layers = nn.ModuleList(
            [SimilarityLayer(hidden_dim, dropout=dropout) for _ in range(hidden_layer)]
        )
        self.bn = nn.BatchNorm1d(sim_width)
        self.dropout = nn.Dropout(dropout)
        # +2 for the encoded A-pronoun and B-pronoun distances.
        self.mention_score = nn.Linear(sim_width + 2, 3)
        self.dist_ecoding = nn.Linear(1, 1)

    def forward(self, a, b, p, ap, bp):
        """a, b, p are sliced as [:, layer, :]; ap, bp are 1-D distance tensors."""
        per_layer = [
            layer(a[:, k, :], b[:, k, :], p[:, k, :])
            for k, layer in enumerate(self.sim_layers)
        ]
        hidden = torch.cat(per_layer, dim=1)  # B X S_DIM*Layer
        hidden = self.dropout(torch.relu(self.bn(hidden)))
        # The same 1 -> 1 linear map encodes both distances.
        ap_enc = torch.tanh(self.dist_ecoding(ap[:, None]))
        bp_enc = torch.tanh(self.dist_ecoding(bp[:, None]))
        return self.mention_score(torch.cat([hidden, ap_enc, bp_enc], dim=1))


def training_cuda(epoch, model, lossfunc, optimizer, train_iter, val_iter, test_iter, start=5, ckpt_path='tmp.m'):
    """Train `model`, keep the best validation checkpoint and predict the test set.

    Args:
        epoch: number of training epochs.
        model: network called as model(a, b, p, ap, bp).
        lossfunc: loss applied to (logits, labels).
        optimizer: optimizer over the model's parameters.
        train_iter, val_iter, test_iter: iterables yielding
            (a, b, p, y, ap, bp) batches.
        start: checkpoints are only taken for epochs with index > start.
        ckpt_path: file the best state dict is also written to.

    Returns:
        (best_score, test_pred): the lowest mean validation score seen over
        all epochs, and the flattened softmax probabilities for `test_iter`
        (1-D float64 array, batch-major).

    Batches are moved to the device the model's parameters live on, so a
    CUDA model behaves exactly as before. The best weights are held in
    memory and restored from there; previously the function always loaded
    `tmp.m`, which could be missing or left over from an earlier call when no
    epoch after `start` improved on the best score. In that case the weights
    of the final epoch are now used for the test predictions.
    """
    device = next(model.parameters()).device
    best_score = 10  # sentinel assumed to exceed any realistic validation score
    best_state = None

    for i in range(epoch):
        model.train()
        epoch_score = []
        for (a, b, p, y, ap, bp) in train_iter:
            model.zero_grad()
            y = y.to(device)
            pred = model(a.to(device), b.to(device), p.to(device), ap.to(device), bp.to(device))
            loss = lossfunc(pred, y)
            epoch_score.extend(score(pred.softmax(1), y).detach().cpu().tolist())
            loss.backward()
            optimizer.step()

        model.eval()
        val_score = []
        with torch.no_grad():
            for (va, vb, vp, vy, vap, vbp) in val_iter:
                vpred = model(va.to(device), vb.to(device), vp.to(device), vap.to(device), vbp.to(device))
                val_score.extend(score(vpred.softmax(1), vy.to(device)).cpu().tolist())

        mean_val = np.mean(val_score)
        print('epcoh {:02} - train_score {:.4f} - val_score {:.4f} '.format(
                            i, np.mean(epoch_score), mean_val))
        if mean_val < best_score:
            best_score = mean_val
            if i > start:
                # state_dict() returns live references, so take a real copy.
                best_state = deepcopy(model.state_dict())
                torch.save(best_state, ckpt_path)

    if best_state is not None:
        model.load_state_dict(best_state)

    model.eval()
    chunks = []
    with torch.no_grad():  # prediction only; no graph needed
        for (ta, tb, tp, ty, tap, tbp) in test_iter:
            tpred = model(ta.to(device), tb.to(device), tp.to(device), tap.to(device), tbp.to(device))
            chunks.append(tpred.softmax(1).cpu().numpy().astype(np.float64).ravel())
    test_pred = np.concatenate(chunks) if chunks else np.array([])
    return best_score, test_pred


def score(pred, y):
    """Per-sample negative log-likelihood of the true class.

    Args:
        pred: B X C tensor of class probabilities (any float dtype / device).
        y: tensor of B integer class ids, each in [0, C).

    Returns:
        Tensor of B values, -log(pred[i, y[i]]), in `pred`'s dtype and device.

    Works for any number of classes and does not depend on the legacy
    tensor-type classes (the previous version was fixed to 3 classes).
    """
    target = y.reshape(-1, 1).to(device=pred.device, dtype=torch.long)
    # Pick the probability assigned to the true class of every row.
    return -pred.gather(1, target).squeeze(1).log()

## Training
print('training')
m = MSnet(BERT_SIZE, dropout=0.4, hidden_layer=L).cuda()
optimizer = optim.Adam(m.parameters(), lr=3e-4, weight_decay=1e-5)
loss_fuc = nn.CrossEntropyLoss()
batch_size = 32

kfold = KFold(n_splits=5, random_state=SEED, shuffle=True)
scores = []
# Snapshots of the freshly initialised model / optimizer; every fold is
# reset to these so that folds do not inherit each other's training.
m_s = deepcopy(m.state_dict())
opt_s = deepcopy(optimizer.state_dict())

k_th = 0
test_iter = DataLoader(BERTfeature(test), batch_size=batch_size, shuffle=False, collate_fn=bert_collate_func)
test_preds = []

for train_idx, val_idx in kfold.split(list(range(len(train)))):

    # Index directly with the fold's positions; testing `i in train_idx` for
    # every record scanned the whole index array each time (quadratic).
    _train = [train[i] for i in train_idx]
    _val = [train[i] for i in val_idx]
    train_iter = DataLoader(BERTfeature(_train), batch_size=batch_size, shuffle=True, collate_fn=bert_collate_func)
    val_iter = DataLoader(BERTfeature(_val), batch_size=batch_size, shuffle=False, collate_fn=bert_collate_func)

    m.load_state_dict(m_s)
    optimizer.load_state_dict(opt_s)
    s, y = training_cuda(30, m, loss_fuc, optimizer, train_iter, val_iter, test_iter)
    scores.append(s)
    test_preds.append(y)

    k_th += 1
    print('------------'*3)

print('Score: {:.4f} {:.4f}'.format(np.mean(scores), np.std(scores)))
# Average the per-fold test probabilities, then restore the N X 3 layout.
probs = np.mean(test_preds, axis=0).reshape((-1, 3))
true = torch.cat([ty for (ta, tb, tp, ty, tap, tbp) in test_iter], dim=0).data.numpy()
t_ids = [i['id'] for i in test]
print(log_loss(true, probs))

## Accuracy and F1-score
# Predicted class = column with the highest averaged probability
# (the first such column when several are tied).
acc_pred = np.argmax(probs, axis=1)
print("Accuracy:",accuracy_score(true, acc_pred))
print("F1 score:", f1_score(true, acc_pred, average=None))


tokenize...
clean..
extract bert features..
extract bert features finished.
Runtime:  43.43034713999987
training
epcoh 00 - train_score 1.0570 - val_score 0.9331 
epcoh 01 - train_score 0.8668 - val_score 0.7827 
epcoh 02 - train_score 0.7168 - val_score 0.6807 
epcoh 03 - train_score 0.6226 - val_score 0.5869 
epcoh 04 - train_score 0.5416 - val_score 0.5381 
epcoh 05 - train_score 0.4802 - val_score 0.4776 
epcoh 06 - train_score 0.4056 - val_score 0.4315 
epcoh 07 - train_score 0.3701 - val_score 0.4107 
epcoh 08 - train_score 0.3132 - val_score 0.3741 
epcoh 09 - train_score 0.2939 - val_score 0.3595 
epcoh 10 - train_score 0.2573 - val_score 0.3641 
epcoh 11 - train_score 0.2246 - val_score 0.3509 
epcoh 12 - train_score 0.2016 - val_score 0.3507 
epcoh 13 - train_score 0.1895 - val_score 0.3422 
epcoh 14 - train_score 0.1626 - val_score 0.3360 
epcoh 15 - train_score 0.1544 - val_score 0.3390 
epcoh 16 - train_score 0.1426 - val_score 0.3560 
epcoh 17 - train_score 0.1218 - val_s