In [None]:
# Colab-only API: mount Google Drive at /content/gdrive, requesting a forced
# remount. The dataset and model paths used in this notebook live under it.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Locations of the dataset and of the trained models on the shared drive.
# The paths are absolute so that they always match the Drive mount point
# ('/content/gdrive') regardless of the notebook's current working directory.
DATASET_PMC_FULL = '/content/gdrive/Shareddrives/UNICAMP/DOUTORADO/SEGUNDO_ARTIGO/DATASETS/PMC-Sents-FULL/'
OUTPUT_MODEL_DIR = '/content/gdrive/Shareddrives/UNICAMP/DOUTORADO/SEGUNDO_ARTIGO/MODELS/'

In [None]:
# Install the Hugging Face stack into the environment of the running kernel.
# `%pip` (instead of `!pip`) guarantees the packages land in the interpreter
# that executes this notebook.
# NOTE(review): versions are not pinned; pin them once the working set is known.
%pip install transformers -q
%pip install sentence_transformers -q
%pip install datasets -q

[K     |████████████████████████████████| 4.7 MB 9.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 35.7 MB/s 
[K     |████████████████████████████████| 596 kB 59.3 MB/s 
[K     |████████████████████████████████| 101 kB 12.3 MB/s 
[K     |████████████████████████████████| 85 kB 3.5 MB/s 
[K     |████████████████████████████████| 1.3 MB 18.9 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 365 kB 10.2 MB/s 
[K     |████████████████████████████████| 212 kB 44.9 MB/s 
[K     |████████████████████████████████| 115 kB 62.8 MB/s 
[K     |████████████████████████████████| 141 kB 63.4 MB/s 
[K     |████████████████████████████████| 127 kB 62.8 MB/s 
[?25h

In [None]:
import pandas as pd

In [None]:
# Read the train / validation / test splits from the dataset directory.
df_train, df_dev, df_test = [
    pd.read_parquet(DATASET_PMC_FULL + parquet_name)
    for parquet_name in (
        'train_full_text_span1.parquet',
        'val_full_text_span1.parquet',
        'test_full_text_span1.parquet',
    )
]

In [None]:
# Inspect which columns the training split provides.
df_train.columns

Index(['id', 'text', 'label_id'], dtype='object')

In [None]:
# List the distinct class ids that occur in the training split.
df_train['label_id'].unique()

array([0, 3, 2, 4, 1])

In [None]:
# Drop training sentences whose exact text occurs with more than one label.
# The row count is printed before and after so the effect is visible.
print(f'{len(df_train.index)}')
mask = df_train.groupby('text')['label_id'].transform('nunique').gt(1)
df_train = df_train.loc[~mask].copy()
print(f'{len(df_train.index)}')

138473
138473


In [None]:
# Drop validation sentences whose exact text occurs with more than one label.
# The row count is printed before and after so the effect is visible.
print(f'{len(df_dev.index)}')
mask = df_dev.groupby('text')['label_id'].transform('nunique').gt(1)
df_dev = df_dev.loc[~mask].copy()
print(f'{len(df_dev.index)}')

17309
17309


In [None]:
# Drop test sentences whose exact text occurs with more than one label.
# The row count is printed before and after so the effect is visible.
print(f'{len(df_test.index)}')
mask = df_test.groupby('text')['label_id'].transform('nunique').gt(1)
df_test = df_test.loc[~mask].copy()
print(f'{len(df_test.index)}')

17310
17310


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the "nreimers/MiniLM-L6-H384-uncased"
# checkpoint, with model_max_length set to 512.
# NOTE(review): presumably this is the limit `truncation=True` cuts to in
# preprocess_function — confirm.
tokenizer = AutoTokenizer.from_pretrained("nreimers/MiniLM-L6-H384-uncased", model_max_length=512)

Downloading tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
  """Tokenize the `text` field and attach the class id under `label`.

  Args:
    examples: mapping with a 'text' entry (passed to the module-level
      `tokenizer` with truncation enabled) and a 'label_id' entry.

  Returns:
    The tokenizer output, extended with a 'label' entry holding the value of
    `examples['label_id']`.
  """
  encoding = tokenizer(examples["text"], truncation=True)
  # The label is stored under the additional 'label' key of the encoding.
  encoding['label'] = examples['label_id']
  return encoding

In [None]:
from datasets import Dataset

# Wrap each pandas split in a `datasets.Dataset`.
dataset_train, dataset_val, dataset_test = map(
    Dataset.from_pandas, (df_train, df_dev, df_test)
)

In [None]:
# Tokenize every split example by example with preprocess_function.
dataset_train, dataset_val, dataset_test = (
    split.map(preprocess_function)
    for split in (dataset_train, dataset_val, dataset_test)
)

  0%|          | 0/138473 [00:00<?, ?ex/s]

  0%|          | 0/17309 [00:00<?, ?ex/s]

  0%|          | 0/17310 [00:00<?, ?ex/s]

In [None]:
# Class name -> class id, and the inverse mapping derived from it.
label2id = {
    'background': 0,
    'objective': 1,
    'method': 2,
    'result': 3,
    'other': 4,
}
id2label = {class_id: name for name, class_id in label2id.items()}

In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Download the checkpoint configuration, customised with the 5-class head and
# the label maps, then load the PRETRAINED encoder weights with it.
# `from_config` would only instantiate the architecture with randomly
# initialised weights (no checkpoint is loaded), i.e. the classifier would be
# trained from scratch instead of being fine-tuned.
config = AutoConfig.from_pretrained("nreimers/MiniLM-L6-H384-uncased", num_labels=5, id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained("nreimers/MiniLM-L6-H384-uncased", config=config)

In [None]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer; it is built from the same tokenizer that
# preprocess_function uses.
# NOTE(review): presumably pads each batch to its longest sequence — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments

# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch, and the checkpoint with the lowest validation loss is restored when
# training ends.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,  # upper bound; early stopping normally ends sooner
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_total_limit=2,
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Made explicit: "best" means lowest validation loss.
    metric_for_best_model="loss",
    greater_is_better=False,
)

# Validation loss starts rising after a few epochs while training loss keeps
# falling, so stop once it has not improved for 3 consecutive evaluations
# instead of spending the remaining epochs over-fitting.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# Run training with the arguments configured for the Trainer (evaluation and
# checkpointing once per epoch).
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, id, __index_level_0__, label_id. If text, id, __index_level_0__, label_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 138473
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 173100


Epoch,Training Loss,Validation Loss
1,0.8313,0.805388
2,0.7753,0.781679
3,0.701,0.769816
4,0.6626,0.784116
5,0.64,0.77709
6,0.5925,0.822657
7,0.5635,0.827323
8,0.5022,0.891086
9,0.4696,0.945512
10,0.3972,1.026195


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, id, __index_level_0__, label_id. If text, id, __index_level_0__, label_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17309
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-8655
Configuration saved in ./results/checkpoint-8655/config.json
Model weights saved in ./results/checkpoint-8655/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-8655/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-8655/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, id, __index_level_0__, label_id. If text, id, __index_level_0__, label_id are not expected by `BertForSequenceCla

TrainOutput(global_step=173100, training_loss=0.4356618184453831, metrics={'train_runtime': 8876.9989, 'train_samples_per_second': 311.982, 'train_steps_per_second': 19.5, 'total_flos': 1.7420390859096588e+16, 'train_loss': 0.4356618184453831, 'epoch': 20.0})

In [None]:
# Run the trained model on the test split; the result is later passed to
# compute_metrics.
predictions = trainer.predict(dataset_test)

In [None]:
from datasets import load_metric
import numpy as np

def compute_metrics(eval_pred):
  """Compute accuracy and micro/macro F1 from raw model outputs.

  Args:
    eval_pred: object exposing `predictions` (per-class scores; the predicted
      class is the argmax over the last axis) and `label_ids` (reference
      class ids), e.g. the value returned by `trainer.predict`.

  Returns:
    dict with the keys 'accuracy', 'f1_macro' and 'f1_micro'. Each value is
    the result dict returned by the corresponding metric's `compute` call.
  """
  # Only the metrics that are actually reported are loaded, each exactly once.
  metric_acc = load_metric("accuracy")
  metric_f1 = load_metric("f1")

  labels = eval_pred.label_ids
  predictions = np.argmax(eval_pred.predictions, axis=-1)

  accuracy = metric_acc.compute(predictions=predictions, references=labels)
  # The same F1 metric object serves both averaging modes.
  f1_micro = metric_f1.compute(predictions=predictions, references=labels, average="micro")
  f1_macro = metric_f1.compute(predictions=predictions, references=labels, average="macro")

  return {'accuracy': accuracy, 'f1_macro': f1_macro, 'f1_micro': f1_micro}

In [None]:
# Report accuracy and micro/macro F1 for the test-set predictions.
compute_metrics(predictions)

In [None]:
# Persist the trained classifier together with its tokenizer so the saved
# directory can be reloaded on its own. OUTPUT_MODEL_DIR comes from the
# configuration cell at the top of the notebook and is not redefined here.
model.save_pretrained(OUTPUT_MODEL_DIR + 'pmc_full_minilm_classification')
tokenizer.save_pretrained(OUTPUT_MODEL_DIR + 'pmc_full_minilm_classification')

In [None]:
# Display the directory the model was saved to.
OUTPUT_MODEL_DIR + 'pmc_full_minilm_classification'