In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive so the dataset and model folders on the
# shared drive are reachable; force_remount=True requests a fresh mount even if
# one already exists.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Locations on the shared drive, anchored to the absolute mount point used by
# drive.mount('/content/gdrive', ...) so they resolve regardless of the
# notebook's current working directory.
# Folder holding the train/dev/test parquet splits of the PubMed RCT dataset.
DATASET_PUBMED_RCT_DIR = '/content/gdrive/Shareddrives/DATASETS/PUBMED_RCT/'
# Folder under which checkpoints and the final model are written.
OUTPUT_MODEL_DIR = '/content/gdrive/Shareddrives/MODELS/'

In [None]:
!pip install transformers -q
!pip install sentence_transformers -q
!pip install datasets -q

[K     |████████████████████████████████| 4.7 MB 4.3 MB/s 
[K     |████████████████████████████████| 101 kB 10.4 MB/s 
[K     |████████████████████████████████| 596 kB 49.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 35.9 MB/s 
[K     |████████████████████████████████| 85 kB 2.5 MB/s 
[K     |████████████████████████████████| 1.2 MB 23.5 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 365 kB 4.3 MB/s 
[K     |████████████████████████████████| 141 kB 49.6 MB/s 
[K     |████████████████████████████████| 115 kB 48.9 MB/s 
[K     |████████████████████████████████| 212 kB 49.3 MB/s 
[K     |████████████████████████████████| 127 kB 48.9 MB/s 
[?25h

In [None]:
import pandas as pd

In [None]:
# Read the three dataset splits (train / dev / test) from the dataset folder,
# one parquet file per split.
df_train, df_dev, df_test = (
    pd.read_parquet(DATASET_PUBMED_RCT_DIR + split_name + '.parquet')
    for split_name in ('train', 'dev', 'test')
)

In [None]:
# Inspect which columns the training split provides.
df_train.columns

Index(['pmid', 'label', 'sentence', 'label_id'], dtype='object')

In [None]:
# List the distinct numeric class ids present in the training split.
df_train['label_id'].unique()

array([0, 1, 2, 3, 4])

In [None]:
# Show each distinct (label, label_id) pair, i.e. how class names map to ids.
df_train[['label', 'label_id']].drop_duplicates()

Unnamed: 0,label,label_id
0,objective,0
1,methods,1
6,results,2
11,conclusions,3
12,background,4


In [None]:
# Training split: discard every sentence whose identical text appears under
# more than one label, printing the row count before and after the filter.
print(f'{df_train.shape[0]}')
mask = df_train.groupby('sentence')['label_id'].transform('nunique').gt(1)
df_train = df_train.loc[~mask].copy()
print(f'{df_train.shape[0]}')

180040
179892


In [None]:
# Dev split: discard every sentence whose identical text appears under more
# than one label, printing the row count before and after the filter.
print(f'{df_dev.shape[0]}')
mask = df_dev.groupby('sentence')['label_id'].transform('nunique').gt(1)
df_dev = df_dev.loc[~mask].copy()
print(f'{df_dev.shape[0]}')

30212
30212


In [None]:
# Test split: discard every sentence whose identical text appears under more
# than one label, printing the row count before and after the filter.
print(f'{df_test.shape[0]}')
mask = df_test.groupby('sentence')['label_id'].transform('nunique').gt(1)
df_test = df_test.loc[~mask].copy()
print(f'{df_test.shape[0]}')

30135
30122


In [None]:
from transformers import AutoTokenizer

# Tokenizer of the uncased SciBERT checkpoint, with its maximum model length
# set to 512.
tokenizer = AutoTokenizer.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    model_max_length=512,
)

Downloading config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/223k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
  """Tokenize the ``sentence`` field and attach the class id as ``label``.

  Args:
    examples: Mapping with a ``sentence`` entry (passed to the module-level
      ``tokenizer`` with truncation enabled) and a ``label_id`` entry.

  Returns:
    The tokenizer's output with one extra ``label`` key holding
    ``examples['label_id']``.
  """
  encoded = tokenizer(examples["sentence"], truncation=True)
  # The class id is copied over under the key `label`.
  encoded['label'] = examples['label_id']
  return encoded

In [None]:
from datasets import Dataset

# Wrap each pandas split in a Hugging Face Dataset. preserve_index=False keeps
# the frames' pandas index out of the dataset; otherwise the index left behind
# by the row filtering is stored as a stray `__index_level_0__` column.
dataset_train = Dataset.from_pandas(df_train, preserve_index=False)
dataset_val = Dataset.from_pandas(df_dev, preserve_index=False)
dataset_test = Dataset.from_pandas(df_test, preserve_index=False)

In [None]:
# Tokenize every split. batched=True hands preprocess_function lists of
# examples, so the tokenizer encodes many sentences per call instead of one;
# the resulting columns are the same as with per-example mapping.
dataset_train = dataset_train.map(preprocess_function, batched=True)
dataset_val = dataset_val.map(preprocess_function, batched=True)
dataset_test = dataset_test.map(preprocess_function, batched=True)

  0%|          | 0/179892 [00:00<?, ?ex/s]

  0%|          | 0/30212 [00:00<?, ?ex/s]

  0%|          | 0/30122 [00:00<?, ?ex/s]

In [None]:
# Class names listed in label-id order (position == id); the reverse lookup is
# derived from the forward one so the two can never drift apart.
label2id = dict(zip(['objective', 'methods', 'results', 'conclusions', 'background'], range(5)))
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Start from the SciBERT checkpoint's configuration and add the classification
# head settings (number of classes and the label <-> id mappings).
config = AutoConfig.from_pretrained("allenai/scibert_scivocab_uncased", num_labels=len(id2label), id2label=id2label, label2id=label2id)
# Load the *pretrained* SciBERT weights for fine-tuning. `from_config` would only
# build the architecture with randomly initialised parameters (no weights are
# loaded), which silently turns fine-tuning into training from scratch.
model = AutoModelForSequenceClassification.from_pretrained("allenai/scibert_scivocab_uncased", config=config)

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads tokenized examples when batches are assembled; the
# tokenizer call in preprocess_function truncates but does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters and checkpointing policy for the training run.
training_args = TrainingArguments(
    # Checkpoints are written below the shared model directory.
    output_dir=OUTPUT_MODEL_DIR + 'pubmed_rct_classification/checkpoint',
    # Optimisation settings.
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and restore the best one when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Train on the training split; the dev split serves as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)

In [None]:
# Launch training with the arguments configured on `trainer`. The recorded run
# reports a train_runtime of roughly 40,600 seconds, so expect this cell to be slow.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence, pmid, label_id. If __index_level_0__, sentence, pmid, label_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 179892
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 224880


Epoch,Training Loss,Validation Loss
1,0.5429,0.506363
2,0.4764,0.479497
3,0.4408,0.482357
4,0.3933,0.512598
5,0.3365,0.516175
6,0.3141,0.568344
7,0.2656,0.671993
8,0.2421,0.738909
9,0.2267,0.886328
10,0.1994,0.888216


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, sentence, pmid, label_id. If __index_level_0__, sentence, pmid, label_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 30212
  Batch size = 16
Saving model checkpoint to ./gdrive/Shareddrives/MODELS/pubmed_rct_classification/checkpoint/checkpoint-11244
Configuration saved in ./gdrive/Shareddrives/MODELS/pubmed_rct_classification/checkpoint/checkpoint-11244/config.json
Model weights saved in ./gdrive/Shareddrives/MODELS/pubmed_rct_classification/checkpoint/checkpoint-11244/pytorch_model.bin
tokenizer config file saved in ./gdrive/Shareddrives/MODELS/pubmed_rct_classification/checkpoint/checkpoint-11244/tokenizer_config.json
Special tokens file saved in ./gdrive/Shareddrives/MODELS/pubmed_rct_classification/checkpoint/checkpoin

TrainOutput(global_step=224880, training_loss=0.220772381029872, metrics={'train_runtime': 40620.9058, 'train_samples_per_second': 88.571, 'train_steps_per_second': 5.536, 'total_flos': 1.507812945775578e+17, 'train_loss': 0.220772381029872, 'epoch': 20.0})

In [None]:
# Run the trained model over the test split; the result is scored with
# compute_metrics in a later cell.
predictions = trainer.predict(dataset_test)

In [None]:
from datasets import load_metric
import numpy as np

def compute_metrics(eval_pred):
  """Compute accuracy and micro-averaged F1 for a set of model predictions.

  Args:
    eval_pred: Object exposing ``predictions`` (per-class scores; the arg-max
      over the last axis is taken as the predicted class id) and ``label_ids``
      (the reference class ids), e.g. the value returned by ``trainer.predict``.

  Returns:
    A 2-tuple ``(accuracy, f1)`` holding the results of the ``accuracy`` and
    ``f1`` metrics' ``compute`` calls, in that order.
  """
  # Only the metrics that are actually reported are loaded; the previously
  # loaded "precision" and "recall" metrics were never used.
  metric_acc = load_metric("accuracy")
  metric_f1 = load_metric("f1")

  predicted_ids = np.argmax(eval_pred.predictions, axis=-1)
  labels = eval_pred.label_ids

  accuracy = metric_acc.compute(predictions=predicted_ids, references=labels)
  # NOTE(review): with exactly one label per example, micro-averaged F1 is
  # numerically equal to accuracy; consider average="macro" if a per-class
  # balanced view is wanted.
  f1 = metric_f1.compute(predictions=predicted_ids, references=labels, average="micro")

  return accuracy, f1

In [None]:
# Score the test-set predictions; the returned (accuracy, f1) pair is shown as
# the cell output.
compute_metrics(predictions)

In [None]:
# Persist the trained model together with its tokenizer so the export directory
# can be reloaded on its own. OUTPUT_MODEL_DIR is the constant defined in the
# configuration cell near the top of the notebook; it is not redefined here.
final_model_dir = OUTPUT_MODEL_DIR + 'pubmed_rct_classification/model'
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)