In [None]:
!pip install transformers



In [None]:
import pandas as pd
import numpy as np
import os
import random

import torch
from transformers import (AutoModel,AutoModelForMaskedLM, 
                          AutoTokenizer, LineByLineTextDataset,
                          DataCollatorForLanguageModeling,
                          Trainer, TrainingArguments)


In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device) and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    random.seed(seed)
    # The variable Python honours is PYTHONHASHSEED. It is read at interpreter
    # start-up, so setting it here only influences processes spawned later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one; a no-op when CUDA is absent.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels per run and can pick different
    # algorithms each time, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=99)

In [None]:
# Build the masked-LM corpus: every excerpt from train and test, one per line.
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')

data = pd.concat([train_data, test_data])
# Each excerpt must fit on a single line of text.txt. Embedded newlines are
# replaced with a space (rather than deleted) so the words on either side of a
# line break are not fused into one token.
data['excerpt'] = data['excerpt'].str.replace('\n', ' ', regex=False)

text = '\n'.join(data['excerpt'].tolist())

# Write with an explicit encoding so the corpus file does not depend on the
# platform's default text encoding.
with open('text.txt', 'w', encoding='utf-8') as f:
    f.write(text)

In [None]:
# Hub identifier of the checkpoint that MLM training starts from.
model_name = 'microsoft/deberta-base'

# Tokenizer and masked-LM model are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Store the tokenizer files in their own directory; the trailing ';' keeps the
# call's return value from being echoed as cell output.
tokenizer.save_pretrained('./clrp_mlm_deberta_base');

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForMaskedLM: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'config', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['cls.predictions.decoder.we

In [None]:
def _line_dataset(path):
    """Return a LineByLineTextDataset over `path` using the notebook tokenizer and block_size=256."""
    return LineByLineTextDataset(tokenizer=tokenizer, file_path=path, block_size=256)


# Both splits are built from the same corpus file, so the reported validation
# loss is computed on text the model also trains on.
train_dataset = _line_dataset("text.txt")
valid_dataset = _line_dataset("text.txt")

# Collator configured for masked-language modelling with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)



In [None]:
# Hyper-parameters for the MLM fine-tuning run.
training_args = TrainingArguments(
    # --- output and checkpointing ---
    output_dir='./results',          # output directory
    overwrite_output_dir=True,
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # --- which phases run ---
    do_train=True,
    do_eval=True,                    # consistent with evaluation_strategy="epoch" below
    do_predict=False,                # no prediction dataset is passed to the Trainer in this notebook
    evaluation_strategy="epoch",
    # --- optimisation ---
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=250,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    lr_scheduler_type="cosine",
    # learning_rate is not set, so the library default applies
    # (a 5e-6 override is intentionally left disabled):
    #learning_rate=5e-6,
    # --- logging ---
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    report_to="none",
    # --- reproducibility ---
    seed=99,                         # same value passed to seed_everything() earlier
)

In [None]:
# Bundle the model, its hyper-parameters, the MLM collator and both datasets
# into a single Trainer instance.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
)

In [None]:
# Run the training loop configured in training_args.
trainer.train()

# Save the resulting model. The directory name says "roberta" although
# model_name is a DeBERTa checkpoint; it is kept unchanged because the zip step
# further down reads /content/clrp_roberta_base.
trainer.save_model('./clrp_roberta_base')

***** Running training *****
  Num examples = 2841
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1780


Epoch,Training Loss,Validation Loss
1,3.6221,3.247999
2,2.6921,2.478216
3,2.4351,2.171416
4,2.1143,2.032165
5,2.1394,2.015572


***** Running Evaluation *****
  Num examples = 2841
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-356
Configuration saved in ./results/checkpoint-356/config.json
Model weights saved in ./results/checkpoint-356/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2841
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-712
Configuration saved in ./results/checkpoint-712/config.json
Model weights saved in ./results/checkpoint-712/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2841
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-1068
Configuration saved in ./results/checkpoint-1068/config.json
Model weights saved in ./results/checkpoint-1068/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-356] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2841
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-1424
Configuration saved in ./results/checkpoint-1424/c

In [None]:
# Recursively zip the saved model directory into /content/model_deberta_mlm_final.zip.
# NOTE(review): assumes the notebook's working directory is /content, so that the
# './clrp_roberta_base' directory written by trainer.save_model() is this path — confirm.
!zip -r /content/model_deberta_mlm_final.zip /content/clrp_roberta_base

  adding: content/clrp_roberta_base/ (stored 0%)
  adding: content/clrp_roberta_base/config.json (deflated 52%)
  adding: content/clrp_roberta_base/training_args.bin (deflated 48%)
  adding: content/clrp_roberta_base/pytorch_model.bin (deflated 7%)


In [None]:
# Recursively zip the tokenizer directory written by tokenizer.save_pretrained()
# into its own archive, separate from the model archive.
!zip -r /content/model_deberta_tokenizer_mlm_final.zip /content/clrp_mlm_deberta_base

  adding: content/clrp_mlm_deberta_base/ (stored 0%)
  adding: content/clrp_mlm_deberta_base/special_tokens_map.json (deflated 81%)
  adding: content/clrp_mlm_deberta_base/tokenizer.json (deflated 59%)
  adding: content/clrp_mlm_deberta_base/vocab.json (deflated 59%)
  adding: content/clrp_mlm_deberta_base/tokenizer_config.json (deflated 76%)
  adding: content/clrp_mlm_deberta_base/merges.txt (deflated 53%)


In [None]:
import os
import shutil

# Copy the model archive into Google Drive.
# Drive is mounted at /content/drive (see the mount cell below); the previous
# target, /content/gdrive, does not exist, so the copy raised FileNotFoundError.
archive_path = "/content/model_deberta_mlm_final.zip"
drive_dir = "/content/drive/MyDrive"

if os.path.isdir(drive_dir):
    shutil.copy(archive_path, os.path.join(drive_dir, os.path.basename(archive_path)))
else:
    # Drive is not mounted yet: report it instead of failing the whole run.
    print(f"Skipping copy: {drive_dir} not found. Mount Google Drive first.")

FileNotFoundError: ignored

In [None]:
# Mount Google Drive at /content/drive so that files can be copied into it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the zipped model into the MyDrive folder of the Drive mounted above.
!cp /content/model_deberta_mlm_final.zip /content/drive/MyDrive