In [None]:
# Mount Google Drive inside the Colab VM so the dataset and model folders are reachable.
# NOTE(review): the path constants used by this notebook start with './gdrive/...', which
# only resolves to this mount point when the working directory is /content — confirm.
from google.colab import drive
# force_remount=True is passed so the mount is redone even if the drive is already mounted.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Shared-drive locations, expressed relative to the notebook's working directory.
DATASET_PMC_FULL: str = "./gdrive/Shareddrives/DATASETS/PMC-Sents-FULL/"  # parquet splits are read from here
OUTPUT_MODEL_DIR: str = "./gdrive/Shareddrives/MODELS/"  # base directory for saved models

In [None]:
!pip install transformers -q
!pip install sentence_transformers -q
!pip install datasets -q

[K     |████████████████████████████████| 4.7 MB 28.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 55.2 MB/s 
[K     |████████████████████████████████| 101 kB 13.1 MB/s 
[K     |████████████████████████████████| 596 kB 49.8 MB/s 
[K     |████████████████████████████████| 85 kB 5.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 45.9 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 365 kB 29.2 MB/s 
[K     |████████████████████████████████| 212 kB 51.5 MB/s 
[K     |████████████████████████████████| 141 kB 66.7 MB/s 
[K     |████████████████████████████████| 115 kB 75.4 MB/s 
[K     |████████████████████████████████| 127 kB 70.1 MB/s 
[?25h

In [None]:
import pandas as pd

In [None]:
# Read the train / validation / test splits (in that order) from the dataset directory.
_split_files = (
    'train_full_text_span1.parquet',
    'val_full_text_span1.parquet',
    'test_full_text_span1.parquet',
)
df_train, df_dev, df_test = (
    pd.read_parquet(DATASET_PMC_FULL + split_file) for split_file in _split_files
)

In [None]:
# Inspect which columns the training frame provides.
df_train.columns

Index(['id', 'text', 'label_id'], dtype='object')

In [None]:
# Distinct label ids present in the training split, in order of first appearance.
pd.unique(df_train['label_id'])

array([0, 3, 2, 4, 1])

In [None]:
# Discard training sentences whose text appears under more than one label_id;
# the row count is printed before and after the filter.
print(f'{df_train.shape[0]}')
labels_per_text = df_train.groupby('text')['label_id'].transform('nunique')
is_ambiguous = labels_per_text > 1
df_train = df_train.loc[~is_ambiguous].copy()
print(f'{df_train.shape[0]}')

138473
138473


In [None]:
# Discard validation sentences whose text appears under more than one label_id;
# the row count is printed before and after the filter.
print(f'{df_dev.shape[0]}')
labels_per_text = df_dev.groupby('text')['label_id'].transform('nunique')
is_ambiguous = labels_per_text > 1
df_dev = df_dev.loc[~is_ambiguous].copy()
print(f'{df_dev.shape[0]}')

17309
17309


In [None]:
# Discard test sentences whose text appears under more than one label_id;
# the row count is printed before and after the filter.
print(f'{df_test.shape[0]}')
labels_per_text = df_test.groupby('text')['label_id'].transform('nunique')
is_ambiguous = labels_per_text > 1
df_test = df_test.loc[~is_ambiguous].copy()
print(f'{df_test.shape[0]}')

17310
17310


In [None]:
from transformers import AutoTokenizer

# Tokenizer of the SciBERT (scivocab, uncased) checkpoint, with the maximum
# sequence length set to 512.
scibert_checkpoint = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(
    scibert_checkpoint,
    model_max_length=512,
)

Downloading config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/223k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
  """Tokenize the ``text`` entry and attach the target under ``label``.

  Args:
    examples: mapping providing ``"text"`` and ``"label_id"`` entries.

  Returns:
    The tokenizer output (truncation enabled) with an added ``'label'``
    entry copied from ``examples['label_id']``.
  """
  encoded = tokenizer(examples["text"], truncation=True)
  # The model's forward pass does not take `label_id`, so the target is stored as `label`.
  encoded['label'] = examples['label_id']
  return encoded

In [None]:
from datasets import Dataset

# Wrap each pandas split in a Hugging Face Dataset (train, validation, test order).
dataset_train, dataset_val, dataset_test = (
    Dataset.from_pandas(frame) for frame in (df_train, df_dev, df_test)
)

In [None]:
# Apply the tokenization step to every split, one after another (train, val, test).
dataset_train, dataset_val, dataset_test = (
    split.map(preprocess_function)
    for split in (dataset_train, dataset_val, dataset_test)
)

  0%|          | 0/138473 [00:00<?, ?ex/s]

  0%|          | 0/17309 [00:00<?, ?ex/s]

  0%|          | 0/17310 [00:00<?, ?ex/s]

In [None]:
# Class name -> integer id; the reverse lookup is derived from it so the two
# mappings cannot drift apart.
label2id = {
    'background': 0,
    'objective': 1,
    'method': 2,
    'result': 3,
    'other': 4,
}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Start from the published SciBERT configuration and add the classification-head
# settings (number of classes and the id <-> name mappings).
config = AutoConfig.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=5,
    id2label=id2label,
    label2id=label2id,
)
# Load the *pretrained* encoder weights. `from_config` only builds the architecture
# with randomly initialised parameters, which would train the whole network from
# scratch; `from_pretrained` restores the checkpoint's weights and leaves only the
# new classification head to be initialised.
model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    config=config,
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator built around the notebook's tokenizer. The tokenization step only truncates,
# so padding is left to this collator when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters and checkpointing policy for the fine-tuning run.
training_args = TrainingArguments(
    # Checkpoint location (relative to the working directory).
    output_dir="./results",
    # Optimiser settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    # Run length and batch sizes.
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, limit the number of checkpoints kept,
    # and reload the best checkpoint when training finishes.
    # NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Bundle model, data, tokenizer and collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)

In [None]:
# Run the fine-tuning loop; the returned training summary is displayed as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__, label_id, id. If text, __index_level_0__, label_id, id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 138473
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 173100


Epoch,Training Loss,Validation Loss
1,0.8314,0.80921
2,0.7722,0.804643
3,0.6837,0.776872
4,0.6243,0.823665
5,0.5721,0.814441
6,0.5135,0.967918
7,0.4544,0.964414
8,0.3822,1.059438
9,0.3436,1.246366
10,0.2826,1.285402


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__, label_id, id. If text, __index_level_0__, label_id, id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17309
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-8655
Configuration saved in ./results/checkpoint-8655/config.json
Model weights saved in ./results/checkpoint-8655/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-8655/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-8655/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__, label_id, id. If text, __index_level_0__, label_id, id are not expected by `BertForSequenceCla

TrainOutput(global_step=173100, training_loss=0.34728773200257124, metrics={'train_runtime': 48663.377, 'train_samples_per_second': 56.911, 'train_steps_per_second': 3.557, 'total_flos': 1.2801632522523541e+17, 'train_loss': 0.34728773200257124, 'epoch': 20.0})

In [None]:
# Run the trained model over the held-out test split and keep the prediction output for scoring.
predictions = trainer.predict(dataset_test)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__, label_id, id. If text, __index_level_0__, label_id, id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 17310
  Batch size = 16


In [None]:
import numpy as np

def compute_metrics(eval_pred):
  """Score a prediction output with accuracy and micro-averaged F1.

  The scores are computed directly with NumPy. The earlier implementation
  fetched four metric scripts through ``datasets.load_metric`` on every call
  (two of them were never used), and ``load_metric`` is deprecated in favour of
  the separate ``evaluate`` library.

  Args:
    eval_pred: object exposing ``predictions`` (per-class scores with the
      classes on the last axis) and ``label_ids`` (integer gold labels).

  Returns:
    A 2-tuple ``({'accuracy': float}, {'f1': float})``.

  Raises:
    ValueError: if the arg-maxed predictions and the labels differ in shape.
  """
  scores = np.asarray(eval_pred.predictions)
  labels = np.asarray(eval_pred.label_ids)
  predictions = np.argmax(scores, axis=-1)

  if predictions.shape != labels.shape:
    raise ValueError(
        f"predictions {predictions.shape} and labels {labels.shape} have different shapes"
    )

  n_total = int(labels.size)
  n_correct = int(np.sum(predictions == labels))
  n_wrong = n_total - n_correct

  accuracy = n_correct / n_total if n_total else 0.0

  # Micro-averaged F1 pools the counts over all classes. With exactly one label
  # per example every mistake is one false positive (for the predicted class) and
  # one false negative (for the true class), so FP == FN == n_wrong.
  # NOTE: under these conditions micro-F1 is numerically equal to accuracy; a
  # macro average would be needed for a class-balanced view.
  denominator = 2 * n_correct + n_wrong + n_wrong
  f1 = 2 * n_correct / denominator if denominator else 0.0

  return {'accuracy': accuracy}, {'f1': f1}

In [None]:
# Score the test-set predictions; the returned metrics are displayed as the cell output.
compute_metrics(predictions)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

({'accuracy': 0.6997111496244945}, {'f1': 0.6997111496244945})

In [None]:
# Persist the fine-tuned model together with its tokenizer, so the directory is
# self-contained and can be reloaded with `from_pretrained` for both objects.
# OUTPUT_MODEL_DIR is taken from the configuration cell and is deliberately not
# re-assigned here, keeping a single source of truth for the output location.
model_save_path = OUTPUT_MODEL_DIR + 'pmc_full_scibert_classification'

# The tokenizer is saved first: its call returns a value, and leaving it as the
# last statement would print that value as cell output.
tokenizer.save_pretrained(model_save_path)
model.save_pretrained(model_save_path)

Configuration saved in ./gdrive/Shareddrives/MODELS/pmc_full_scibert_classification/config.json
Model weights saved in ./gdrive/Shareddrives/MODELS/pmc_full_scibert_classification/pytorch_model.bin


In [None]:
# Display the full path of the saved-model directory.
OUTPUT_MODEL_DIR + 'pmc_full_scibert_classification'

'./gdrive/Shareddrives/MODELS/pmc_full_scibert_classification'