# Train Transformer for ARQMath 

NOTE: If you use Google Colab you need to enable GPU/TPU support first!


In [1]:
import pandas as pd
import numpy as np
import os
import torch
import logging
from transformers import (
    HfArgumentParser,
    TrainingArguments)
from transformers.data.data_collator import DataCollator
from transformers.trainer import Trainer, set_seed
from transformers.modeling_auto import AutoModelForSequenceClassification, AutoConfig
from transformers.tokenization_auto import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from dataset import ARQMathDataset
from sklearn.metrics import classification_report

seed = 0
set_seed(seed)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)



In [2]:
!nvidia-smi

Thu Jul  2 19:02:16 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 108...  On   | 00000000:04:00.0 Off |                  N/A |
| 23%   24C    P8     7W / 250W |   3679MiB / 11178MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce GTX 108...  On   | 00000000:06:00.0 Off |                  N/A |
| 28%   40C    P2    55W / 250W |      1MiB / 11178MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce GTX 108...  On   | 00000000:07:00.0 Off |                  N/A |
| 22%   

In [3]:
cuda_visible_devices = '1,2,3,4' # check with nvidia-smi
model_name = 'bert-base-cased'
model_cache_dir = '/home/mostendorff/datasets/BERT_pre_trained_models/pytorch/'

if os.path.exists(model_cache_dir + model_name):
    logger.info('Using model from cache')
    model_name_or_path = model_cache_dir + model_name
else:
    model_name_or_path = model_name

2020-07-02 19:02:16 - INFO - __main__ -   Using model from cache


In [4]:
os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices

In [5]:
# Load CSV from disk
df = pd.read_csv('./data/qa-pair.csv', index_col=False)
df['rel'] = df['rel'].astype(int)
df.head()

Unnamed: 0,qID,aID,q,a,rel
0,112,115,What are gradients and how would I use them?\n...,"The ∇ (pronounced ""del"") is an operator, more ...",1
1,118,125,How would you describe calculus in simple term...,There came a time in mathematics when people e...,1
2,118,148,How would you describe calculus in simple term...,One of the greatest achievements of human civi...,0
3,118,517,How would you describe calculus in simple term...,Calculus is basically a way of calculating rat...,0
4,118,127,How would you describe calculus in simple term...,Calculus is a field which deals with two seemi...,0


In [6]:
print(f'Samples: {len(df):,}')

Samples: 154,744


In [7]:
df['rel'].value_counts()

0    103957
1     50787
Name: rel, dtype: int64

In [8]:
# downsample for debugging
#df = df.sample(n=10000)

In [9]:
config = AutoConfig.from_pretrained(model_name_or_path, num_labels=2) 
model = AutoModelForSequenceClassification.from_config(config)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

2020-07-02 19:02:17 - INFO - transformers.configuration_utils -   loading configuration file /home/mostendorff/datasets/BERT_pre_trained_models/pytorch/bert-base-cased/config.json
2020-07-02 19:02:17 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 28996
}

2020-07-02 19:02:19 - INFO - transformers.configuration_utils -   loading configuration file /home/mostendorff/datasets/BERT_pre_trained_models/pytorch/bert-base-cased/config.json
2020-07-02 19:02:19 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "attention_probs_dropout_prob": 0.1

In [10]:
train_size = 0.8
train_df = df.sample(frac=train_size, random_state=seed)
test_df = df.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

In [11]:
train_ds = ARQMathDataset(train_df, tokenizer, 512)
test_ds = ARQMathDataset(test_df, tokenizer, 512)

In [12]:
train_ds[0]

{'input_ids': tensor([  101,  2495, 14867, 12377,  9455, 23043,  1891, 19068,  1937,  2463,
          6777,  1116,   178,  1169,   112,   189,  1256,  2373,  1142,   117,
           178,  1138,  1185,  1911,  1184,   178,  1821,  1217,  1455,   119,
           178,  1198,  1444,  1199,  1494,  1107,  7877, 12924,  2875,  2118,
          1142,  6477,  1104,   170,  2304,   119,  6699,  1195,  1132,  1549,
          1210,  1827,   113,   193,   117,   194,   114,   113,   193,  1475,
           117,   194,  1475,   114,   113,   193,  1477,   117,   194,  1477,
           114,  1105,  1195,  1328,  1106,  1525,   170,  3527, 18311, 15792,
          1161,   185,   113,   193,   114,  1115,  4488,  1194,  1292,  1210,
          1827,   119,  1142,  1110,  1549,  1103,  2495, 14867, 12377,  9455,
         23043,  1891, 19068,   185,   113,   193,   114,   134,   194,   168,
           121,   165,   175, 19366,   196,   113,   193,   118,   193,   168,
           122,   114,   113,   193,   

In [13]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
    
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [14]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=10,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    #warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    evaluate_during_training=True,
    learning_rate=2e-5,
    logging_steps=2500,
    #eval_steps=4000,
    save_steps=-1,
    save_total_limit=0,
    seed=seed,
)

# Initialize our Trainer
trainer = Trainer(
    model=model,                         
    args=training_args,                 
    train_dataset=train_ds,       
    eval_dataset=test_ds, 
    compute_metrics=compute_metrics,
)

2020-07-02 19:02:20 - INFO - transformers.training_args -   PyTorch: setting up devices
2020-07-02 19:02:22 - INFO - transformers.trainer -   You are instantiating a Trainer but W&B is not installed. To use wandb logging, run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface.


In [15]:
trainer.train()

2020-07-02 19:02:23 - INFO - transformers.trainer -   ***** Running training *****
2020-07-02 19:02:23 - INFO - transformers.trainer -     Num examples = 123795
2020-07-02 19:02:23 - INFO - transformers.trainer -     Num Epochs = 3
2020-07-02 19:02:23 - INFO - transformers.trainer -     Instantaneous batch size per device = 10
2020-07-02 19:02:23 - INFO - transformers.trainer -     Total train batch size (w. parallel, distributed & accumulation) = 40
2020-07-02 19:02:23 - INFO - transformers.trainer -     Gradient Accumulation steps = 1
2020-07-02 19:02:23 - INFO - transformers.trainer -     Total optimization steps = 9285


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3095.0, style=ProgressStyle(description_w…

2020-07-02 19:37:12 - INFO - transformers.trainer -   {'loss': 0.6318841715335846, 'learning_rate': 1.4614970382337104e-05, 'epoch': 0.8077544426494345, 'step': 2500}
2020-07-02 19:37:12 - INFO - transformers.trainer -   ***** Running Evaluation *****
2020-07-02 19:37:12 - INFO - transformers.trainer -     Num examples = 30949
2020-07-02 19:37:12 - INFO - transformers.trainer -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=484.0, style=ProgressStyle(description_w…

2020-07-02 20:19:54 - INFO - transformers.trainer -   {'eval_loss': 0.6167520474919603, 'eval_accuracy': 0.671685676435426, 'eval_f1': 0.049396575919169246, 'eval_precision': 0.5751633986928104, 'eval_recall': 0.025806451612903226, 'epoch': 1.615508885298869, 'step': 5000}






HBox(children=(FloatProgress(value=0.0, description='Iteration', max=3095.0, style=ProgressStyle(description_w…

2020-07-02 20:54:51 - INFO - transformers.trainer -   {'loss': 0.6143435514211655, 'learning_rate': 3.844911147011309e-06, 'epoch': 2.4232633279483036, 'step': 7500}
2020-07-02 20:54:51 - INFO - transformers.trainer -   ***** Running Evaluation *****
2020-07-02 20:54:51 - INFO - transformers.trainer -     Num examples = 30949
2020-07-02 20:54:51 - INFO - transformers.trainer -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=484.0, style=ProgressStyle(description_w…

2020-07-02 20:58:46 - INFO - transformers.trainer -   {'eval_loss': 0.6144362179454693, 'eval_accuracy': 0.6735274160716017, 'eval_f1': 0.18673535093367677, 'eval_precision': 0.528714676390155, 'eval_recall': 0.11339198435972629, 'epoch': 2.4232633279483036, 'step': 7500}





2020-07-02 21:23:45 - INFO - transformers.trainer -   

Training completed. Do not forget to share your model on huggingface.co/models =)








TrainOutput(global_step=9285, training_loss=0.6199444479498095)

In [16]:
out = trainer.evaluate()

2020-07-02 21:23:45 - INFO - transformers.trainer -   ***** Running Evaluation *****
2020-07-02 21:23:45 - INFO - transformers.trainer -     Num examples = 30949
2020-07-02 21:23:45 - INFO - transformers.trainer -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=484.0, style=ProgressStyle(description_w…

2020-07-02 21:27:39 - INFO - transformers.trainer -   {'eval_loss': 0.6141976345299689, 'eval_accuracy': 0.6741736405053476, 'eval_f1': 0.18795297149299406, 'eval_precision': 0.533363802559415, 'eval_recall': 0.11407624633431085, 'epoch': 3.0, 'step': 9285}





In [17]:
logger.info(out)

2020-07-02 21:27:39 - INFO - __main__ -   {'eval_loss': 0.6141976345299689, 'eval_accuracy': 0.6741736405053476, 'eval_f1': 0.18795297149299406, 'eval_precision': 0.533363802559415, 'eval_recall': 0.11407624633431085, 'epoch': 3.0}


In [18]:
prediction_output = trainer.predict(test_ds)

2020-07-02 21:27:39 - INFO - transformers.trainer -   ***** Running Prediction *****
2020-07-02 21:27:39 - INFO - transformers.trainer -     Num examples = 30949
2020-07-02 21:27:39 - INFO - transformers.trainer -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Prediction', max=484.0, style=ProgressStyle(description_w…




In [19]:
pred_df = test_df
pred_df['predicted_label_ids'] = prediction_output.predictions.argmax(-1)
pred_df['predicted_0'] = prediction_output.predictions[:,0]
pred_df['predicted_1'] = prediction_output.predictions[:,1]       
pred_df.head()

Unnamed: 0,qID,aID,q,a,rel,predicted_label_ids,predicted_0,predicted_1
0,118,148,How would you describe calculus in simple term...,One of the greatest achievements of human civi...,0,0,0.805599,-0.734638
1,155,2329,How can you prove that a function has no close...,Have you ever heard of Galois theory? It is a ...,0,0,0.730035,-0.681573
2,255,548134,Why does the series \sum_{n=1}^\infty\frac1n n...,Let's group the terms as follows:A=\frac11+\fr...,0,0,0.505219,-0.511177
3,255,261,Why does the series \sum_{n=1}^\infty\frac1n n...,"This is not as good an answer as AgCl's, nonet...",0,0,0.697238,-0.65814
4,255,1139726,Why does the series \sum_{n=1}^\infty\frac1n n...,Let's assume that \sum_{n=1}^{\infty}\frac1n=:...,0,0,0.378869,-0.412837


In [20]:
pred_df.to_csv('./results/test_predictions.csv')

In [21]:
pred_df['rel'].value_counts()

0    20719
1    10230
Name: rel, dtype: int64

In [22]:
pred_df['predicted_label_ids'].value_counts()

0    28761
1     2188
Name: predicted_label_ids, dtype: int64

In [23]:
print(classification_report(pred_df['rel'].values, pred_df['predicted_label_ids'].values))

              precision    recall  f1-score   support

           0       0.68      0.95      0.80     20719
           1       0.53      0.11      0.19     10230

   micro avg       0.67      0.67      0.67     30949
   macro avg       0.61      0.53      0.49     30949
weighted avg       0.63      0.67      0.60     30949



In [24]:
# Save trained model
# model.save_pretrained("path/to/awesome-name-you-picked")
model_save_dir = 'models/arqmath-' + model_name

os.makedirs(model_save_dir)
trainer.save_model(model_save_dir)
tokenizer.save_pretrained(model_save_dir)

2020-07-02 21:31:35 - INFO - transformers.trainer -   Saving model checkpoint to models/arqmath-bert-base-cased
2020-07-02 21:31:35 - INFO - transformers.configuration_utils -   Configuration saved in models/arqmath-bert-base-cased/config.json
2020-07-02 21:31:36 - INFO - transformers.modeling_utils -   Model weights saved in models/arqmath-bert-base-cased/pytorch_model.bin


('models/arqmath-bert-base-cased/vocab.txt',
 'models/arqmath-bert-base-cased/special_tokens_map.json',
 'models/arqmath-bert-base-cased/added_tokens.json')

### Upload model to Huggingface after training

See https://huggingface.co/welcome

```bash
transformers-cli login
transformers-cli upload models/arqmath-bert-base-cased/config.json
```
