In [36]:
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset,DataLoader
import random

In [37]:
read_path1='entailment_trees_emnlp2021_data_v3/dataset/task_1/train.jsonl'
read_path2='fullresult.jsonlines'

In [38]:
def read_data(read_path1,read_path2):
    import jsonlines
    import re
    dict_hypo_theoryset={}
    
    pattern = r'sent\d+: '
    with jsonlines.open(read_path1, "r") as rfd:
        for data in rfd:
            result = re.split(pattern, data['context'])
            result_n = set([x.strip() for x in result if x.strip()!=''])
            dict_hypo_theoryset[data['hypothesis']]=result_n
    rfd.close()
    
    theory=[]
    pos_hypo=[]
    for hypo in dict_hypo_theoryset.keys():
        para=""
        for sent in dict_hypo_theoryset[hypo]:
            para+=sent+'. '
        theory.append(para.strip())
        pos_hypo.append([hypo])
        
    neg_hypo=[]
    
    with jsonlines.open(read_path2, "r") as rfd:
        for data in rfd:
            for key in data.keys():
                neg_hypo.append(data[key].split('.'))
               
    rfd.close()        
    
    
    return theory,pos_hypo,neg_hypo

In [39]:
def fill_padding(data,max_len):
    if len(data)<max_len:
        pad_len=max_len-len(data)
        padding=[0 for _ in range(pad_len)]
        data=torch.tensor(data+padding)
    else:
        data=torch.tensor(data[:,max_len])
    return data

In [40]:
def get_data(pos_hypo,neg_hypo,theory):
    data={'theory':[],'neg':[],'pos':[]}
    for i,t in enumerate(theory):
        data['theory'].append(t)
        data['neg'].append(neg_hypo[i])
        data['pos'].append(pos_hypo[i])
    return data

In [41]:
theory,pos_hypo,neg_hypo=read_data(read_path1,read_path2)

In [42]:
data=get_data(pos_hypo,neg_hypo,theory)

In [43]:
print(pos_hypo[1][0])

the earth rotating on its axis causes stars to move relative to the horizon during the night


In [44]:
np.random.seed(0)

In [50]:
class InputDataset(Dataset):
    def __init__(self, data,tokenizer,sent_len,data_size,split=0.8,mode='train'):
        self.data=data
        self.sent_len=sent_len
        self.data_size=data_size
        self.tokenizer=tokenizer
        self.split=split
        self.mode=mode
        
    def __len__(self,):
        return self.data_size
    
    def __getitem__(self,item):
        x=np.random.rand(1)*100
        assert self.mode in ['train','test'],"mode must be train or test"
        if self.mode=='train':
            item=item%(int(len(data['theory'])*self.split))
        elif self.mode=='test':
            item=item%(int(len(data['theory'])*self.split))+int((1-split)*len(data['theory']))
        
            
        if x[0]<=25:
            label=1
            hypo=self.data['pos'][item][0]
        else:
            label=0
            hypo=random.choice(self.data['neg'][item])
        label=torch.tensor(label,dtype=torch.long)
        theory=self.data['theory'][item]
        
        encoding=self.tokenizer.encode_plus(
            theory,
            hypo,
            add_special_tokens=True, #add [CLS] and [SEP]
            max_length=self.sent_len,#max input length
            return_token_type_ids=True,#theory 11111 and hypo 00000
            pad_to_max_length=True,# fill or cut up to max input length 
            return_attention_mask=True,# attention encoding
            return_tensors='pt'# pytorch model
        )
        
        return {
            "theory":theory,
            "hypo":hypo,
            "input_ids":encoding['input_ids'].flatten(),
            "attention_mask":encoding['attention_mask'],
            "token_type_ids":encoding['token_type_ids'],
            "label":label
        }
    
    
    

In [51]:
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset=InputDataset(data,tokenizer,500,100000)

In [52]:
data_loader=DataLoader(train_dataset,batch_size=2)

In [53]:
batch=next(iter(data_loader))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [49]:
print(batch)

{'theory': ['the earth revolving around the sun causes stars to appear in different areas in the sky at different times of year. leo is a kind of constellation. a constellation contains stars.', "apparent motion is when an object appears to move relative to another object 's position. the earth rotating on its axis causes stars to appear to move across the sky at night. a star is a kind of celestial object / celestial body. stars appear to move relative to the horizon during the night. earth is a kind of celestial object."], 'hypo': [' cold leo will cause rain / year as they pass by', " earth has a positive stars on a rabbit 's night in a winter axis"], 'input_ids': tensor([[  101,  1996,  3011, 24135,  2105,  1996,  3103,  5320,  3340,  2000,
          3711,  1999,  2367,  2752,  1999,  1996,  3712,  2012,  2367,  2335,
          1997,  2095,  1012,  6688,  2003,  1037,  2785,  1997, 15300,  1012,
          1037, 15300,  3397,  3340,  1012,   102,  3147,  6688,  2097,  3426,
         

In [57]:
from torch import nn
from transformers import BertModel,BertPreTrainedModel,BertConfig
class BertForSeq(BertPreTrainedModel):
    
    def __init__(self,config):
        super(BertForSeq,self).__init__(config)
        self.config=BertConfig(config)
        self.num_labels=config.num_labels #set to 1, it's a logit
        self.bert=BertModel(config)   
        self.dropout=nn.Dropout(config.hidden_derpout_prob)
        self.classifier=nn.Linear(config._hidden_size,self.num_labels)
        
        self.init_weights()
        
    def forward(self,input_ids,attention_mask=None,token_type_ids=None,return_dict=None ):
        return_dict=return_dict if return_dict is not None else self.config.use_return_dict

        outputs=self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=return_dict
        ) #prediction

        pooled_output=outputs[1]
        pooled_output=self.dropout(pooled_output)
        logits=self.classifier(pooled_output)
        return logits
    
    
    

In [137]:
def loss_fn(output, targets):
    return nn.BCEWithLogitsLoss()(output, targets.view(-1,1))

def train_func(data_loader, model, optimizer, device, scheduler):
    model.to(device)
    model.train()
    
    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["input_ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["attention_mask"]
        targets = d["label"]
        
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)
        
        optimizer.zero_grad()
        output = model(
            input_ids=ids,
            attention_mask = mask,
            token_type_ids = token_type_ids
        )
        
        
        loss = loss_fn(output, targets)
        loss.backward()
        
        optimizer.step()
        scheduler.step()
        
def eval_func(data_loader, model, device):
    model.eval()
    
    fin_targets = []
    fin_output = []
    
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["input_ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["attention_mask"]
            targets = d["label"]

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.long)


            output = model(
                input_ids=ids,
                attention_mask = mask,
                token_type_ids = token_type_ids
            )
        
            fin_targets.extend(targets.cpu().detach().numpy().to_list())
            fin_targets.extend(torch.sigmoid(output).cpu().detach().numpy().to_list())
            
        return fin_output, fin_targets
    

In [None]:
def run():

    df_train, df_valid = train_test_split(data, test_size = 0.1, random_state=23, stratify=data.label.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = DATALoader(
        data=df_train.text.values,
        target=df_train.label.values,
        max_length=512
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, 
        batch_size=8,
        num_workers=4,
    )

    val_dataset = DATALoader(
        data=df_valid.text.values,
        target=df_valid.label.values,
        max_length=512
    )

    val_data_loader = torch.utils.data.DataLoader(
        val_dataset, 
        batch_size=4,
        num_workers=1,
    )

    device = torch.device("cuda")
    model = BERTClassification()

    param_optimizer = list(model.named_parameters())
    no_decay = [
        "bias", 
        "LayerNorm,bias",
        "LayerNorm.weight",
               ]
    optimizer_parameters = [
        {'params': [p for n,p in param_optimizer if not any(nd in n for nd in no_decay)],
                   'weight_decay':0.001},
        {'params': [p for n,p in param_optimizer if any(nd in n for nd in no_decay)],
                   'weight_decay':0.0}
    ]

    num_train_steps = int(len(df_train)/ 8*10)

    optimizers = AdamW(optimizer_parameters, lr=3e-5)

    scheduler = get_linear_schedule_with_warmup(
        optimizers,
        num_warmup_steps=0,
        num_training_steps=num_train_steps

    )

    best_accuracy = 0
    for epoch in range(5):
        train_func(data_loader=train_data_loader, model=model, optimizer=optimizers, device=device, scheduler=scheduler)
        outputs, targets = eval_func(data_loader=train_data_loader, model=model, device=device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score()
        print(f"Accuracy Score: {accuracy}")

        if accuracy>best_accuracy:
            torch.save(model.state_dict(), "model.bin")
            best_accuracy = accuracy