In [None]:
## import simi

import evaluate
import pandas as pd
from datasets import Dataset
from sentence_transformers import evaluation, losses, models, InputExample, SentenceTransformer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    enable_full_determinism,
    set_seed,
)


# Notebook-wide seed, reused for the train/test split further down.
RANDOM_STATE = 1

# Seed every RNG *before* any model object is created so that nothing that
# draws random numbers during setup escapes the seed.
set_seed(RANDOM_STATE)
enable_full_determinism(RANDOM_STATE)

# Hub checkpoint shared by the tokenizer and the masked-LM model.
BASE_CHECKPOINT = "bert-base-uncased"

TOKENIZER = AutoTokenizer.from_pretrained(BASE_CHECKPOINT, use_fast=True)
MODEL = AutoModelForMaskedLM.from_pretrained(BASE_CHECKPOINT)
# Lowercase alias kept so any cell that refers to `model` keeps working.
model = MODEL

In [2]:
def _to_category_tuple(raw):
    """Split a whitespace-separated category string into a tuple of labels."""
    return tuple(raw.split())


# Load the abstracts; the first CSV column is used as the index.
df = pd.read_csv("abstracts-arxiv-dataset.csv", index_col=0)
df["categories"] = df["categories"].map(_to_category_tuple)

In [3]:
def _is_math_or_stat(categories):
    """Return True when at least one label starts with "math" or "stat"."""
    return any(label.startswith(("math", "stat")) for label in categories)


# Keep only the rows that carry at least one math/stat category label.
X = df[df["categories"].map(_is_math_or_stat)]
X.sample(5)

Unnamed: 0,categories,doi,text
811596,"(cs.IT, cs.NI, math.IT)",,Online Edge Caching in Fog-Aided Wireless Netw...
2106902,"(math.GR,)",,Virtual Endomorphisms of Nilpotent Groups. A v...
402000,"(math.AG,)",,Cancellativization of dimer models. We show th...
1171108,"(math.CO,)",,Independence number and connectivity for fract...
631241,"(math.CO, cs.DM, cs.FL, math.NT)",,Abelian Powers and Repetitions in Sturmian Wor...


In [4]:
# 80/20 split of the filtered abstracts, seeded with the notebook-wide seed.
X_train, X_test = train_test_split(
    X,
    train_size=0.8,
    random_state=RANDOM_STATE,
)
n_train, n_test = len(X_train), len(X_test)
print("train:", n_train, "test:", n_test)

train: 520388 test: 130097


In [5]:
import os

# Only use one GPU for now, see https://github.com/huggingface/transformers/issues/14128
# NOTE(review): this only takes effect if it runs before CUDA is initialised
# in this process - consider moving it to the very first cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
#os.environ["TOKENIZERS_PARALLELISM"]="false"
# NCCL reads its log level from NCCL_DEBUG; the previous key "NNCL_DEBUG" was
# a typo, so the setting was never seen by NCCL.
os.environ["NCCL_DEBUG"] = "INFO"

In [6]:
def tokenize_batch(batch):
    """Tokenize a batch of abstracts for masked-LM training.

    Args:
        batch: mapping with a "text" entry holding the abstracts of the batch
            (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and *not* padded.

    Padding is deliberately left to `DataCollatorForLanguageModeling`, which
    pads each batch to its longest sequence. Padding every abstract to the
    maximum length here would inflate the stored dataset and make every
    training step process full-length sequences.
    """
    return TOKENIZER(batch["text"], add_special_tokens=True, truncation=True)


# Only the "text" column is needed; the index is reset so that it is not
# carried into the datasets as an extra column.
train_ds = Dataset.from_pandas(X_train[["text"]].reset_index(drop=True)).map(tokenize_batch, batched=True)
test_ds = Dataset.from_pandas(X_test[["text"]].reset_index(drop=True)).map(tokenize_batch, batched=True)

  0%|          | 0/521 [00:00<?, ?ba/s]

  0%|          | 0/131 [00:00<?, ?ba/s]

In [7]:
# Training configuration: evaluate and checkpoint once per epoch so that the
# best checkpoint can be restored at the end of training.
targs = TrainingArguments(
    output_dir="[train]bert+re-train_mlm_abstracts_arxiv",
    overwrite_output_dir=True,
    num_train_epochs=16,
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    load_best_model_at_end=True,
    # TrainingArguments has its own `seed` (default 42) which the Trainer
    # applies itself; pass the notebook-wide seed so RANDOM_STATE actually
    # governs training.
    seed=RANDOM_STATE,
)

# NOTE(review): `eval_dataset` is the held-out test split, so early stopping
# and best-checkpoint selection are driven by test data - consider carving a
# separate validation split out of the training set.
trainer = Trainer(
    model=MODEL,
    args=targs,
    # Masks 15% of the tokens of each batch for the masked-LM objective.
    data_collator=DataCollatorForLanguageModeling(tokenizer=TOKENIZER, mlm=True, mlm_probability=0.15),
    train_dataset=train_ds,
    eval_dataset=test_ds,
    # Stop once the monitored metric has not improved for 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    tokenizer=TOKENIZER,
)

In [None]:
%%time
# Fine-tune the masked-LM on the training abstracts. The trainer is set up
# for at most 16 epochs with an early-stopping callback (patience 2), so the
# run may end before the last epoch.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: text. If text are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 520388
  Num Epochs = 16
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 520400
  Number of trainable parameters = 109514298
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.2876,1.213094
2,1.1898,1.135788
3,1.1453,1.086769
4,1.1231,1.052933
5,1.0825,1.031378
6,1.0476,1.007961
7,1.0234,0.986121
8,1.0048,0.971506
9,0.9912,0.954906
10,0.971,0.941756


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

The following columns in the evaluation set do

In [None]:
# Save the trainer's model to its own directory, separate from the
# checkpoint directory used during training.
FINAL_MODEL_DIR = "bert+re-train_mlm_abstracts_arxiv"
trainer.save_model(FINAL_MODEL_DIR)