In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM so the dataset and model folders on the
# shared drive can be read and written; force_remount=True requests a fresh mount
# even when one is already present.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Directory holding the CSAbstruct parquet splits (train/dev/test are read from here).
# NOTE(review): both paths are relative while the drive is mounted at the absolute
# '/content/gdrive' - this presumably relies on the working directory being '/content'; confirm.
DATASET_CSABSTRUCT_DIR = './gdrive/Shareddrives/DATASETS/CSAbstruct/'
# Directory the fine-tuned model is written to.
OUTPUT_MODEL_DIR = './gdrive/Shareddrives/MODELS/'

In [None]:
# Install the Hugging Face libraries used below (-q keeps pip output short).
# NOTE(review): versions are unpinned, so a re-run may pull different releases;
# sentence_transformers is not referenced in the visible cells - confirm it is still needed.
!pip install transformers -q
!pip install sentence_transformers -q
!pip install datasets -q

[K     |████████████████████████████████| 4.7 MB 7.6 MB/s 
[K     |████████████████████████████████| 101 kB 12.6 MB/s 
[K     |████████████████████████████████| 596 kB 68.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 89.0 MB/s 
[K     |████████████████████████████████| 85 kB 4.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 61.1 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 365 kB 7.9 MB/s 
[K     |████████████████████████████████| 115 kB 74.2 MB/s 
[K     |████████████████████████████████| 141 kB 78.4 MB/s 
[K     |████████████████████████████████| 212 kB 71.7 MB/s 
[K     |████████████████████████████████| 127 kB 77.5 MB/s 
[?25h

In [None]:
import pandas as pd

In [None]:
# Read the three CSAbstruct splits (train / dev / test) from the dataset directory.
df_train, df_dev, df_test = (
    pd.read_parquet(DATASET_CSABSTRUCT_DIR + file_name)
    for file_name in ('train.parquet', 'dev.parquet', 'test.parquet')
)

In [None]:
# Inspect which columns the training split provides.
df_train.columns

Index(['id', 'sentence', 'subject_label', 'label_id'], dtype='object')

In [None]:
# List the distinct label ids that occur in the training split.
df_train['label_id'].unique()

array([0, 1, 2, 3, 4])

In [None]:
# Show each distinct (label name, label id) pair to see how names map to ids.
df_train[['subject_label', 'label_id']].drop_duplicates()

Unnamed: 0,subject_label,label_id
0,background,0
1,objective,1
2,method,2
4,result,3
67,other,4


In [None]:
# Drop training sentences whose identical text appears with more than one label;
# the row count is printed before and after the filter.
print(len(df_train.index))
mask = df_train.groupby('sentence')['label_id'].transform('nunique').gt(1)
df_train = df_train.loc[~mask].copy()
print(len(df_train.index))

11333
11250


In [None]:
# Drop dev sentences whose identical text appears with more than one label;
# the row count is printed before and after the filter.
print(len(df_dev.index))
mask = df_dev.groupby('sentence')['label_id'].transform('nunique').gt(1)
df_dev = df_dev.loc[~mask].copy()
print(len(df_dev.index))

2026
2024


In [None]:
# Drop test sentences whose identical text appears with more than one label;
# the row count is printed before and after the filter.
print(len(df_test.index))
mask = df_test.groupby('sentence')['label_id'].transform('nunique').gt(1)
df_test = df_test.loc[~mask].copy()
print(len(df_test.index))

1349
1349


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer of the uncased SciBERT checkpoint; model_max_length=512 sets the
# length that `truncation=True` cuts sequences down to when sentences are tokenized.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", model_max_length=512)

Downloading config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/223k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
  """Tokenize the `sentence` field and attach the class id under the `label` key.

  The module-level `tokenizer` is called with truncation enabled; the value of
  `examples['label_id']` is copied into the returned encoding as `label`.
  """
  encoded = tokenizer(examples["sentence"], truncation=True)
  encoded['label'] = examples['label_id']
  return encoded

In [None]:
from datasets import Dataset

# Convert each pandas split into a Hugging Face Dataset.
# preserve_index=False keeps the (non-contiguous, post-filtering) pandas index from
# being stored as an extra `__index_level_0__` column that the model never consumes.
dataset_train = Dataset.from_pandas(df_train, preserve_index=False)
dataset_val = Dataset.from_pandas(df_dev, preserve_index=False)
dataset_test = Dataset.from_pandas(df_test, preserve_index=False)

In [None]:
# Tokenize every split (train, then validation, then test) with preprocess_function.
dataset_train, dataset_val, dataset_test = (
    split.map(preprocess_function)
    for split in (dataset_train, dataset_val, dataset_test)
)

  0%|          | 0/11250 [00:00<?, ?ex/s]

  0%|          | 0/2024 [00:00<?, ?ex/s]

  0%|          | 0/1349 [00:00<?, ?ex/s]

In [None]:
# Label names listed in id order; both lookup tables are derived from this one list
# so they cannot drift apart.
label_names = ['background', 'objective', 'method', 'result', 'other']
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = dict(enumerate(label_names))

In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Download configuration from huggingface.co and cache; the classification head is
# sized from the label mapping and carries the human-readable label names.
config = AutoConfig.from_pretrained("allenai/scibert_scivocab_uncased", num_labels=len(label2id), id2label=id2label, label2id=label2id)
# Load the pretrained SciBERT weights. `from_config` would only build the architecture
# with randomly initialised parameters, i.e. training would start from scratch instead
# of fine-tuning; only the new classification head is freshly initialised here.
model = AutoModelForSequenceClassification.from_pretrained("allenai/scibert_scivocab_uncased", config=config)

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch with the tokenizer's padding settings,
# so sentences can stay unpadded in the tokenized datasets.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: evaluation and checkpointing both run once per epoch so
# that the best checkpoint can be restored when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    # optimisation
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Trainer wiring: model, arguments, tokenized train/validation splits and the padding collator.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)

In [None]:
# Run fine-tuning; the validation split is evaluated and a checkpoint is saved after every epoch.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence, subject_label, id, label_id, __index_level_0__. If sentence, subject_label, id, label_id, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11250
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 14080


Epoch,Training Loss,Validation Loss
1,1.3914,1.243852
2,1.2312,1.145303
3,1.0057,1.196041
4,0.9235,1.202617
5,0.8045,1.315722
6,0.692,1.422711
7,0.6325,1.435719
8,0.5253,1.647757
9,0.4458,1.846971
10,0.3604,1.912609


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence, subject_label, id, label_id, __index_level_0__. If sentence, subject_label, id, label_id, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2024
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-704
Configuration saved in ./results/checkpoint-704/config.json
Model weights saved in ./results/checkpoint-704/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-704/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-704/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence, subject_label, id, label_id, __index_level_0__. If sentence, subject_label, i

TrainOutput(global_step=14080, training_loss=0.4865468857301907, metrics={'train_runtime': 1880.0824, 'train_samples_per_second': 119.676, 'train_steps_per_second': 7.489, 'total_flos': 6623378442163824.0, 'train_loss': 0.4865468857301907, 'epoch': 20.0})

In [None]:
# Run the trained model over the held-out test split; the result is scored by compute_metrics below.
predictions = trainer.predict(dataset_test)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence, subject_label, id, label_id, __index_level_0__. If sentence, subject_label, id, label_id, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1349
  Batch size = 16


In [None]:
from datasets import load_metric
import numpy as np

def compute_metrics(eval_pred, average="micro"):
  """Compute accuracy and F1 for a set of predictions.

  Args:
    eval_pred: object exposing `predictions` (per-class scores; the predicted class
      is the argmax over the last axis) and `label_ids` (reference class ids), such
      as the value returned by `trainer.predict`.
    average: averaging mode forwarded to the F1 metric. Defaults to "micro", which
      for single-label multi-class data yields the same number as accuracy; pass
      "macro" or "weighted" for a score that reflects per-class performance.

  Returns:
    Tuple `(accuracy, f1)` holding the result dicts of the two metrics.
  """
  # Only the metrics that are actually reported are loaded (each load may trigger a download).
  metric_acc = load_metric("accuracy")
  metric_f1 = load_metric("f1")

  predictions = np.argmax(eval_pred.predictions, axis=-1)
  labels = eval_pred.label_ids

  accuracy = metric_acc.compute(predictions=predictions, references=labels)
  f1 = metric_f1.compute(predictions=predictions, references=labels, average=average)

  return accuracy, f1

In [None]:
# Score the test-set predictions; displays the (accuracy, f1) result dicts.
compute_metrics(predictions)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

({'accuracy': 0.6641957005189029}, {'f1': 0.6641957005189029})

In [None]:
# Persist the fine-tuned model together with its tokenizer so the output directory is
# self-contained and can be reloaded with `from_pretrained` on its own.
# OUTPUT_MODEL_DIR comes from the configuration cell near the top of the notebook.
model_dir = OUTPUT_MODEL_DIR + 'csabstruct_classification'

model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

Configuration saved in ./gdrive/Shareddrives/MODELS/csabstruct_classification/config.json
Model weights saved in ./gdrive/Shareddrives/MODELS/csabstruct_classification/pytorch_model.bin
