# **COMP90042 Project 1**

Certain parts of the code is from the pytorch tutorial, the link is: https://mccormickml.com/2019/07/22/BERT-fine-tuning/

In [None]:
!pip install pytorch-pretrained-bert pytorch-nlp

In [85]:
from collections import defaultdict
import random
import json
import pandas as pd
import re
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import nltk
from sklearn.metrics import precision_recall_fscore_support

In [81]:
train_json_path = 'train.json'
external_json_path = 'external.json'
dev_json_path = 'dev.json'
test_json_path = 'test-unlabelled.json'

# **1. Generate Training/Development/Test Data**
*     by segmenting the text into first two paragraphs and last eight paragraphs

In [8]:
First = 2
Last = 8

# transform data
train_tsv_path = 'train28.tsv'
dev_tsv_path = 'dev28.tsv'
test_tsv_path = 'test28.tsv'
dev_map_path = 'dev28_map.json'
test_map_path = 'test28_map.json'

In [53]:
def JsonToTsv(target_file, *files):
    texts = []
    labels = []
    for file in files:

        df = pd.read_json(file)
        text = list(df.loc['text'].values)
        try:
            label = list(df.loc['label'].values)
        except:
            label = [0 for _ in range(len(text))]
        texts.extend(text)
        labels.extend(label)

    df = pd.DataFrame({
        'sentence':texts,
        'label':labels
    })

    df.to_csv(target_file, sep='\t', index=False, header=True)

import re
def tsvArticleToTsvPara(source, target):
    df = pd.read_csv(source, sep='\t')
    all_paras = []
    all_labels = []

    d = defaultdict(int)
    para_id = 0

    for i, (texts, label) in enumerate(df.values):

        paras = re.split('\n', texts)
        paras = [p for p in paras if len(p)>10]
        para_index = [i for i in range(len(paras))]


        former = para_index[:2]
        later = para_index[-8:]
        index = former + later
        
        for _ in range(len(index)):

            d[para_id] = i
            para_id+=1
        all_paras.extend([paras[i] for i in index])
        all_labels.extend([label for _ in index])


    df = pd.DataFrame({
        'sentence':all_paras,
        'label':all_labels
    })
    df.to_csv(target, sep='\t', index=False, header=True)
    return d


def generate_all_para(name):
    for n in ['train', 'dev', 'test']:
        if n == 'train':
            ofs = [train_json_path, external_json_path]
        elif n == 'test':
            ofs = [test_json_path,]
        else:
            ofs = [dev_json_path,]
        JsonToTsv(n+'.tsv', *ofs)
        d = tsvArticleToTsvPara(n+'.tsv', n+name+'.tsv')
        with open(n+name+'_map.json', 'w') as f:
            json.dump(d, f)

In [54]:
generate_all_para(str(First)+str(Last))

# 2.Fine-Tuning with BERT

In [73]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU: {}'.format(device_name))

Found GPU: /device:GPU:0


In [75]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla P100-PCIE-16GB'

2.1 Train the model

In [55]:
trn_df = pd.read_csv(train_tsv_path, delimiter='\t', header=0, names=['sentence', 'label'])
trn_df.shape

(22767, 2)

In [59]:
nltk.download('punkt')
def get_segmented_data(texts):
    segmenter = nltk.data.load('tokenizers/punkt/english.pickle')
    segmented_texts = []
    for text in texts:
        text = text.split('\n')
        segmented_text = '[CLS] '
        for para in text:
            for sentence in segmenter.tokenize(para):
                segmented_text += sentence + ' [SEP] '
        segmented_text = segmented_text[:-1]
        segmented_texts.append(segmented_text)
    return segmented_texts

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [60]:
# read & preprocess instances and labels
sentences = get_segmented_data(trn_df.sentence.values)
labels = [int(i) for i in trn_df.label.values]


In [61]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

100%|██████████| 231508/231508 [00:00<00:00, 1907627.70B/s]


Tokenize the first sentence:
['[CLS]', 'why', 'houston', 'flooding', 'isn', '‘', 't', 'a', 'sign', 'of', 'climate', 'change', '[SEP]']


In [64]:
MAX_LEN = 128
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [65]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels,random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=2018, test_size=0.1)

In [66]:
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [67]:
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [68]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.cuda()

100%|██████████| 407873900/407873900 [00:25<00:00, 15962880.42B/s]


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [69]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [70]:
optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1)

In [71]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [77]:
t = [] 
train_loss_set = []
epochs = 4

for _ in trange(epochs, desc="Epoch"):  
  
    # Training

    model.train()
  
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
  
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        train_loss_set.append(loss.item())    
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
    
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
    # Validation

    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))


Epoch:   0%|          | 0/4 [00:00<?, ?it/s][A

Train loss: 0.010229544432498893



Epoch:  25%|██▌       | 1/4 [05:19<15:58, 319.44s/it][A

Validation Accuracy: 0.9544270833333334
Train loss: 0.007508927741463553



Epoch:  50%|█████     | 2/4 [10:38<10:38, 319.46s/it][A

Validation Accuracy: 0.9453125
Train loss: 0.008330305108390806



Epoch:  75%|███████▌  | 3/4 [15:58<05:19, 319.53s/it][A

Validation Accuracy: 0.9479166666666666
Train loss: 0.0035635288403740527



Epoch: 100%|██████████| 4/4 [21:18<00:00, 319.54s/it][A

Validation Accuracy: 0.953125





2.2 Evaluate on Development Data

In [78]:
def predict(tsv_file_path, model):
    df = pd.read_csv(tsv_file_path, delimiter='\t', header=0, names=['sentence', 'label'])

    sentences = df.sentence.values
    sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
    labels = [int(i) for i in df.label.values]

    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    MAX_LEN = 128
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    attention_masks = []

    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask) 

    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)

    batch_size = 32  

    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
    
    
    model.eval()
    predictions , true_labels = [], []

    # Predict 
    for batch in prediction_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions.append(logits)
        true_labels.append(label_ids)

    flat_predictions = [item for sublist in predictions for item in sublist]
    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
    return flat_predictions

In [79]:
dev_flat_predictions = predict(dev_tsv_path, model)

In [82]:
def read_data_from_json(file_path):
    df = pd.read_json(file_path)
    labels = [int(i) for i in df.loc['label'].values]
    return labels
dev_labels = read_data_from_json(dev_json_path)

with open(dev_map_path,'r') as f:
    map = json.load(f)

In [83]:
new_predictions = [0] * len(dev_labels)
for i, result in enumerate(dev_flat_predictions):
    if result == 1:
        new_predictions[map[str(i)]] += 1

for i in range(len(new_predictions)):
    if new_predictions[i] >4:
        new_predictions[i] = 1
    else: 
        new_predictions[i]=0

print(len([i for i in new_predictions if i==1]))

57


In [1]:
p, r, f, _ = precision_recall_fscore_support(dev_labels, new_predictions, pos_label=1, average="binary")
print('precision:',p)
print('recall:',r)
print('f_score:',f)

2.3 Predict on Test Dataset

In [None]:
test_flat_predictions = predict(test_tsv_path, model)

In [None]:
with open(test_map_path,'r') as f:
    test_map = json.load(f)

In [None]:
new_predictions = [0] * (max(test_map.values())+1)
for i, result in enumerate(test_flat_predictions):
    if result == 1:
        new_predictions[test_map[str(i)]] += 1

for i in range(len(new_predictions)):
    if new_predictions[i] >4:
        new_predictions[i] = 1
    else: new_predictions[i]=0
        
print(len([i for i in new_predictions if i>0]))

In [None]:
test_dict = {}
for i in range(len(new_predictions)):
    label = 1 if new_predictions[i]>= 0.5 else 0
    test_dict['test-'+str(i)] = {'label':label}

In [None]:
import zipfile
import datetime
def save_to_zip(pred_dict, zip_name):
    with open('test-output.json', 'w') as f:
        json.dump(pred_dict, f)
    z = zipfile.ZipFile(zip_name+'.zip', 'w', zipfile.ZIP_STORED)
    z.write('test-output.json', 'test-output.json')
    z.close()
save_to_zip(test_dict, str(datetime.datetime.now())[:13])
str(datetime.datetime.now())[:13]