In [3]:
import glob
import pandas as pd
import math
import numpy as np
import seaborn as sns
from tqdm import tqdm,trange
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

import spacy
from spacy.tokenizer import Tokenizer

import torch
import os
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

from util.dataloader import CreateDataloader
from util.xlnetTokenizer import XlnetTokenize
from util.dataset import CreateDataset
from util.trainer import ModelTrainer

In [3]:
from transformers import (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)

In [4]:
nlp = spacy.load("en_core_web_sm")
pd.set_option('display.max_colwidth', None)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu

1

In [6]:
# Reading all dataset
path = "/content/drive/MyDrive/CS4248/scicite_data/*.jsonl"

Scicite_data_getter = CreateDataset(filepath=path)

In [7]:
train_data = Scicite_data_getter.df_train

In [8]:
Scicite_data_getter.classes

array(['background', 'method', 'result'], dtype=object)

In [None]:
train_data[train_data.label == "result"]

In [10]:
# Getting appropriate inputs and outputs

train_label = Scicite_data_getter._label_to_id("train")
train_sentence = Scicite_data_getter._get_sentence_data("train")

valid_label = Scicite_data_getter._label_to_id("dev")
valid_sentence = Scicite_data_getter._get_sentence_data("dev")

test_label = Scicite_data_getter._label_to_id("test")
test_sentence = Scicite_data_getter._get_sentence_data("test")

In [19]:
# Initiating Tokenizer class
# Manual define vocabulary address, if you download the model in local
# The vocabulary can download from "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model"
# vocabulary = 'models/xlnet-base-cased/xlnet-base-cased-spiece.model'
vocabulary = "/content/drive/MyDrive/CS4248/model xlnet-base-cased/xlnet-base-cased-spiece.model"
TOKEN_LENGTH = 64
# With cased model, set do_lower_case = False
basetokenizer = XLNetTokenizer(vocab_file=vocabulary,do_lower_case=False)

# Max Token Length can be changed
train_input , train_masks, train_segment, train_trimmed_idx = XlnetTokenize.tokenize(
    base_tokenizer=basetokenizer, max_token_length=TOKEN_LENGTH, sentences=train_sentence)

No.:0
sentence: However, how frataxin interacts with the Fe-S cluster biosynthesis components remains unclear as direct one-to-one interactions with each component were reported (IscS [12,22], IscU/Isu1 [6,11,16] or ISD11/Isd11 [14,15]).
input_ids:[476, 19, 160, 17, 5706, 10693, 153, 9647, 23, 33, 18, 5063, 13, 83, 9774, 4749, 12681, 305, 6142, 4314, 1484, 7035, 34, 1568, 65, 13, 261, 13, 1112, 11401, 33, 231, 5528, 55, 550, 17, 10, 96, 23, 369, 83, 4145, 1396, 19, 2896, 3158, 19, 35, 23, 369, 1580, 167, 96, 3192, 174, 4145, 342, 19, 1545, 19, 1608, 3158, 4, 3]
attention_masks:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]


No.:1000
sentenc

In [20]:
# DEV
val_input , val_masks, val_segment, val_trimmed_idx = XlnetTokenize.tokenize(
    base_tokenizer=basetokenizer, max_token_length=TOKEN_LENGTH, sentences=valid_sentence)

No.:0
sentence: These results are in contrast with the findings of Santos et al.(16), who reported a significant association between low sedentary time and healthy CVF among Portuguese
input_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 592, 825, 41, 25, 3377, 33, 18, 5373, 20, 13793, 17, 993, 1296, 9, 10, 1608, 11, 19, 61, 550, 24, 1376, 3461, 161, 599, 17, 23, 68, 1908, 1449, 92, 21, 2895, 330, 26839, 447, 6652, 4, 3, 4, 3]
attention_masks:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids:[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]




In [21]:
# TEST
test_input , test_masks, test_segment, test_trimmed_idx = XlnetTokenize.tokenize(
    base_tokenizer=basetokenizer, max_token_length=TOKEN_LENGTH, sentences=test_sentence)

No.:0
sentence: Chapel, as well as X10 [2], UPC [3] , CoArray Fortran [6], and Titanium [5], rely on the Partitioned Global Address Space (PGAS) memory model, which enables
The work has been performed under the HPC-EUROPA2 project (project number: 228398) with the support of the European Commission - Capacities Area - Research Infrastructures.
input_ids:[13321, 19, 34, 143, 34, 1404, 963, 4145, 184, 3158, 19, 128, 6598, 4145, 233, 3158, 17, 19, 659, 10208, 2563, 3316, 2156, 4145, 342, 3158, 19, 21, 14523, 3075, 4145, 217, 3158, 19, 6871, 31, 18, 3752, 6895, 68, 4648, 20277, 4992, 17, 10, 10384, 5713, 11, 2429, 1342, 19, 59, 8350, 32, 154, 51, 72, 2062, 168, 18, 6398, 323, 4, 3]
attention_masks:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Getting DataLoaders

In [22]:
BATCH_NUM = 32
# Train Loader
train_loader = CreateDataloader(input=train_input,seg=train_segment,mask=train_masks,tags=train_label,loader_type= "train")
train_data_loader = train_loader._data_loader(batch_num=BATCH_NUM)
# Valid Loader
valid_loader = CreateDataloader(input=val_input,seg=val_segment,mask=val_masks,tags=valid_label,loader_type= "valid")
valid_data_loader = valid_loader._data_loader(batch_num=BATCH_NUM)
# Test Loader
test_loader = CreateDataloader(input=test_input,seg=test_segment,mask=test_masks,tags=test_label,loader_type= "train")
test_data_loader = test_loader._data_loader(batch_num=BATCH_NUM)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], [4, 4, 4, 4, 4, 4, 4, 4, 4, 4

In [None]:
# Training the model
# Load Model
model_file_address = "/content/drive/MyDrive/CS4248/model xlnet-base-cased"

## Saving Model
model_name = "test_model"
savepath = f"/content/drive/MyDrive/CS4248/project_models/{model_name}"

pretrained_xlnet_model = XLNetForSequenceClassification.from_pretrained(
    model_file_address,num_labels=len(Scicite_data_getter.classes))

xlnet_model_trainer = ModelTrainer(
      model=pretrained_xlnet_model,
      savepath=savepath,
      batch_num=BATCH_NUM,
      input_length=len(train_input))

# Activate trainer 0.00003
train_history = xlnet_model_trainer.train_model(
      train_loader=train_data_loader,
      valid_loader=valid_data_loader,
      learning_rate=0.01,
      epoch_num=1
)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/CS4248/model xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



    ***** Running training ***** 

    Num Examples 8243 

    Batch Size 32 and Num steps 258
    


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

current train loss 3.336108982563019 and acc 12.3
current train loss 2.3886044532060624 and acc 13.65
current train loss 2.1001332183678945 and acc 13.2
current train loss 1.8944399923086166 and acc 13.625


In [None]:
model = xlnet_model_trainer.model
# Setting model into eval state
model.eval()

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

y_true = []
y_predict = []
print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_segs,b_labels = batch

    with torch.no_grad():
        outputs = model(input_ids =b_input_ids,token_type_ids=b_segs, input_mask = b_input_mask,labels=b_labels)
        tmp_eval_loss, logits = outputs[:2]

    # Get textclassification predict result
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = accuracy(logits, label_ids)
#     print(tmp_eval_accuracy)
#     print(np.argmax(logits, axis=1))
#     print(label_ids)

    # Save predict and real label reuslt for analyze
    for predict in np.argmax(logits, axis=1):
        y_predict.append(predict)

    for real_result in label_ids.tolist():
        y_true.append(real_result)


    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_steps += 1


eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / len(val_inputs)
loss = tr_loss/nb_tr_steps
result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'loss': loss}
report = classification_report(y_pred=np.array(y_predict),y_true=np.array(y_true))

# Save the report into file
output_eval_file = os.path.join(savepath, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(result.keys()):
        print("  %s = %s"%(key, str(result[key])))
        writer.write("%s = %s\n" % (key, str(result[key])))

    print(report)
    writer.write("\n\n")
    writer.write(report)

***** Running evaluation *****
  Num examples =2473
  Batch size = 32
***** Eval results *****
  eval_accuracy = 0.841083704003235
  eval_loss = 0.7912155960280544
  loss = 0.1175869092060667
              precision    recall  f1-score   support

           0       0.86      0.91      0.89      1486
           1       0.87      0.70      0.77       656
           2       0.72      0.81      0.76       331

    accuracy                           0.84      2473
   macro avg       0.82      0.81      0.81      2473
weighted avg       0.84      0.84      0.84      2473

