In [1]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import transformers

from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoConfig
from transformers import BertForMaskedLM, DistilBertForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from tokenizers import BertWordPieceTokenizer

In [2]:
# HYPERPARAMS
# Sequence length and batch sizes.
MAX_SEQ_LEN = 128        # tokens per example after truncation/padding
TRAIN_BATCH_SIZE = 16    # per-device batch size while training
EVAL_BATCH_SIZE = 16     # per-device batch size while evaluating

# Optimizer schedule.
LEARNING_RATE = 2e-5
LR_WARMUP_STEPS = 100
WEIGHT_DECAY = 0.01

# Reproducibility: one seed for the train/valid split, one for training.
SEED_SPLIT = 0
SEED_TRAIN = 0

In [3]:
# Load the pickled review DataFrame.
# NOTE(review): pandas.read_pickle can execute arbitrary code embedded in the
# file, so only load pickles that come from a trusted source.
dtf_mlm = pd.read_pickle('data/yelp_dataset_reviews.pkl')

# Train/validation split: 15% of the rows are held out, reproducibly via SEED_SPLIT.
df_train, df_valid = train_test_split(
    dtf_mlm, test_size=0.15, random_state=SEED_SPLIT
)

# A bare expression in the middle of a cell is never displayed, so report the
# split sizes explicitly.
print(f'train rows: {len(df_train)}, valid rows: {len(df_valid)}')

# Convert to Dataset objects, keeping only the non-null review text.
# preserve_index=False keeps the shuffled pandas index out of the Dataset;
# by default it would be stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(
    df_train[['text']].dropna(), preserve_index=False
)
valid_dataset = Dataset.from_pandas(
    df_valid[['text']].dropna(), preserve_index=False
)

In [4]:
MODEL = 'bert'
bert_type = 'bert-base-cased'

# AutoTokenizer honours use_fast=True and returns the Rust-backed fast
# tokenizer; the BertTokenizer class is the slow pure-Python implementation
# regardless of that flag. `model_max_length` is the current name of the
# legacy `max_len` keyword.
tokenizer = AutoTokenizer.from_pretrained(
    bert_type,
    use_fast=True,
    do_lower_case=False,  # cased checkpoint: keep the original casing
    model_max_length=MAX_SEQ_LEN,
)

# Pretrained BERT weights with the masked-language-modelling head.
model = BertForMaskedLM.from_pretrained(bert_type)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def tokenize_function(row):
    """Tokenize the ``text`` field of a row (or of a batch of rows).

    Sequences are truncated and padded to exactly ``MAX_SEQ_LEN`` tokens and
    the tokenizer is asked to return the special-tokens mask as well.

    Args:
        row: mapping with a ``'text'`` entry, as handed over by ``Dataset.map``.

    Returns:
        The encoding produced by the module-level ``tokenizer``.
    """
    encoding_options = dict(
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_special_tokens_mask=True,
    )
    return tokenizer(row['text'], **encoding_options)
  
# Columns that exist before tokenization; they are dropped from both splits so
# only the tokenizer outputs remain.
# Gotcha: this cell rebinds train_dataset/valid_dataset, so running it a second
# time fails because the 'text' column is already gone.
column_names = train_dataset.column_names

train_dataset = train_dataset.map(
    tokenize_function, batched=True, remove_columns=column_names
)
valid_dataset = valid_dataset.map(
    tokenize_function, batched=True, remove_columns=column_names
)

  0%|          | 0/5942 [00:00<?, ?ba/s]

  0%|          | 0/1049 [00:00<?, ?ba/s]

In [8]:
# Collator for masked-language-modelling batches; tokens are selected for
# masking with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)


# Optimizer steps in one pass over the training set. Ceiling division: the
# final, partially filled batch still counts as a step (flooring undercounts
# by one whenever the dataset size is not a multiple of the batch size).
# Assumes a single device and no gradient accumulation -- TODO confirm if the
# setup changes.
steps_per_epoch = (len(train_dataset) + TRAIN_BATCH_SIZE - 1) // TRAIN_BATCH_SIZE

training_args = TrainingArguments(
    output_dir='args/bert-restaurants',
    logging_dir='logs/LMlogs',
    num_train_epochs=2,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=LR_WARMUP_STEPS,
    # save_steps is only consulted when save_strategy='steps'; with the
    # 'epoch' strategy below, checkpoints are written at epoch boundaries.
    save_steps=steps_per_epoch,
    save_total_limit=3,
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE,
    # Evaluate and checkpoint on the same schedule so the best checkpoint
    # (lowest loss) can be restored when training finishes.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,
    seed=SEED_TRAIN
)

In [10]:
# Wire the model, the training configuration, the MLM collator and both data
# splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Run the fine-tuning loop, then write the resulting model to disk.
trainer.train()
trainer.save_model("models/restaurants_domain_adapted")

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5941738
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 742718


  0%|          | 0/742718 [00:00<?, ?it/s]

{'loss': 2.3546, 'learning_rate': 1.998922730125044e-05, 'epoch': 0.0}
{'loss': 2.2162, 'learning_rate': 1.9975761427813495e-05, 'epoch': 0.0}
{'loss': 2.1548, 'learning_rate': 1.9962295554376545e-05, 'epoch': 0.0}
{'loss': 2.1012, 'learning_rate': 1.9948829680939595e-05, 'epoch': 0.01}
{'loss': 2.0666, 'learning_rate': 1.993536380750265e-05, 'epoch': 0.01}
{'loss': 2.0747, 'learning_rate': 1.99218979340657e-05, 'epoch': 0.01}
{'loss': 2.025, 'learning_rate': 1.990843206062875e-05, 'epoch': 0.01}
{'loss': 2.0242, 'learning_rate': 1.9894966187191803e-05, 'epoch': 0.01}
{'loss': 1.9964, 'learning_rate': 1.9881500313754854e-05, 'epoch': 0.01}
{'loss': 1.9874, 'learning_rate': 1.9868034440317904e-05, 'epoch': 0.01}
{'loss': 1.996, 'learning_rate': 1.9854568566880958e-05, 'epoch': 0.01}
{'loss': 1.9793, 'learning_rate': 1.9841102693444005e-05, 'epoch': 0.02}
{'loss': 1.9656, 'learning_rate': 1.9827636820007058e-05, 'epoch': 0.02}
{'loss': 1.9495, 'learning_rate': 1.981417094657011e-05, 'epo

In [None]:
import spacy
# Load spaCy's "en_core_web_lg" English pipeline into the module-level name `nlp`.
nlp = spacy.load("en_core_web_lg")

In [None]:
def _split_long_sentence(sentence, max_size):
    '''Greedily pack the whitespace-separated words of `sentence` into chunks of at most `max_size` characters.'''
    chunks = []
    current = ""
    for word in sentence.split():
        # A single "word" longer than the limit cannot be packed; flush what
        # has been collected so far and hard-slice the word.
        while len(word) > max_size:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_size])
            word = word[max_size:]
        candidate = word if not current else current + " " + word
        if len(candidate) <= max_size:
            current = candidate
        else:
            chunks.append(current)
            current = word
    if current:
        chunks.append(current)
    return chunks


def normlize_bert_data_size(input_texts, max_size = 512):
    '''
    Split texts into pieces that are at most `max_size` characters long.

    Each text is segmented into sentences with spaCy's "en_core_web_sm"
    pipeline. A sentence that fits within `max_size` characters is kept as
    is; a longer one is broken on whitespace into chunks that fit.

    input_texts: a list of input texts.
    max_size: the max size (in characters) to truncate data to.

    Returns a tuple `(new_input_texts, original_index)`:
      new_input_texts: the resulting pieces, in input order.
      original_index: for each piece, the position in `input_texts` of the
                      text it was taken from.

    Raises ValueError if `max_size` is smaller than 1.
    '''
    if max_size < 1:
        raise ValueError("max_size must be a positive integer")

    input_texts = list(input_texts)
    if not input_texts:
        # Nothing to segment; skip loading the spaCy pipeline altogether.
        return [], []

    nlp = spacy.load("en_core_web_sm")
    new_input_texts = []
    original_index = []

    for i, doc in enumerate(nlp.pipe(input_texts, disable=["tagger", "attribute_ruler", "lemmatizer"])):
        for sent in doc.sents:
            sentence = sent.text
            if len(sentence) > max_size:
                pieces = _split_long_sentence(sentence, max_size)
            else:
                pieces = [sentence]
            new_input_texts.extend(pieces)
            # One index entry per piece so every piece maps back to its source text.
            original_index.extend([i] * len(pieces))

    return new_input_texts, original_index

In [None]:
# text = input_texts[i].split()
# any(len(x) > 512 for x in text)