In [None]:
import multiprocessing
import os

import import_ipynb
import numpy as np
import pandas as pd
import torch
import transformers
import wandb
import yaml

from datasets import Dataset
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoConfig
from transformers import BertForMaskedLM, DistilBertForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import DataCollatorForLanguageModeling

## Setup W&B, Reproducibility, Torch Device

In [None]:
# Authenticate with Weights & Biases without embedding a secret in the notebook.
# The previous cell assigned a literal key to WANDB_API_KEY, which both clobbered
# any real key already exported in the environment and made it easy to commit a
# credential. Export WANDB_API_KEY before starting the kernel (or log in once via
# the wandb CLI); wandb.login() then resolves the credentials itself.
wandb.login()

In [None]:
# Destination of the W&B run: project name first, owning account (entity) second.
WANDB_PROJECT, WANDB_ENTITY = "my project name", "my account name"

In [None]:
# Start the W&B run under the configured account and project.
wandb.init(
    entity=WANDB_ENTITY,
    project=WANDB_PROJECT,
)

In [None]:
# Load the notebook configuration from config.yaml.
# The context manager closes the file handle as soon as parsing is done instead
# of leaving it open for the lifetime of the kernel.
with open('config.yaml') as config_file:
    config = yaml.safe_load(config_file)

In [None]:
# Read the reproducibility seed from the loaded configuration and hand it to the
# project helper. The configuration object is named `config`; the previous
# reference to `conf` raised a NameError on a fresh kernel.
# NOTE(review): `helpers` is not imported in the visible import cell - presumably
# a project notebook/module made importable through import_ipynb; confirm it is
# imported before this cell runs.
REPRO_SEED = config['seeds']['repro_seed']
helpers.enable_reproducability(REPRO_SEED)

In [None]:
# Restrict this process to the first GPU (device index "0").
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [None]:
# Report whether CUDA is usable and select the matching torch device.
cuda_available = torch.cuda.is_available()
print("GPU is available: ", cuda_available)
device = torch.device('cuda') if cuda_available else torch.device('cpu')

## Hyperparameters

In [None]:
# Seed for sampling, taken from the shared configuration file.
RANDOM_SEED = config['seeds']['sampling_seed']

# Maximum sequence length (in tokens) used for padding/truncation during tokenization.
MAX_SEQ_LEN = 128

# Per-device batch sizes for training and evaluation (currently identical).
TRAIN_BATCH_SIZE = EVAL_BATCH_SIZE = 64

# Learning rate passed to the Trainer.
LEARNING_RATE = 2e-5

# Weight decay is currently disabled:
#WEIGHT_DECAY = 0.1

## Load and Prepare Data

In [None]:
# Load the pickled train/validation frames (training split first).
# Caution: unpickling can execute arbitrary code - only load files you trust.
train_df, val_df = map(
    pd.read_pickle,
    (
        'domain_adaption/data/train_domain.pkl',
        'domain_adaption/data/val_domain.pkl',
    ),
)
# Show the number of rows in each split as the cell output.
len(train_df), len(val_df)

In [None]:
# Keep only the 'text' column of each split (result is still a DataFrame).
train = train_df.loc[:, ['text']]
val = val_df.loc[:, ['text']]

In [None]:
# Convert to Dataset objects after dropping rows with missing text.
# preserve_index=False stops the pandas index (no longer a plain RangeIndex once
# rows have been dropped) from being stored as an extra `__index_level_0__`
# column, so both splits end up with exactly the same columns.
train_dataset = Dataset.from_pandas(train[['text']].dropna(), preserve_index=False)
valid_dataset = Dataset.from_pandas(val[['text']].dropna(), preserve_index=False)

In [None]:
# Tokenizer and masked-LM model are loaded from the same pretrained checkpoint;
# the model is then moved to the selected device.
PRETRAINED_CHECKPOINT = 'deepset/gbert-base'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(PRETRAINED_CHECKPOINT)
model = model.to(device)

## Tokenization

In [None]:
def tokenize_function(row):
    """Tokenize the ``'text'`` entry of ``row`` with the notebook-level tokenizer.

    Every sequence is padded to and truncated at ``MAX_SEQ_LEN`` tokens, and the
    special-tokens mask is requested alongside the usual tokenizer outputs.

    Args:
        row: Mapping with a ``'text'`` key. This notebook calls the function
            through ``Dataset.map(..., batched=True)``, so ``row['text']`` is
            presumably a list of strings rather than a single string.

    Returns:
        Whatever ``tokenizer(...)`` returns for the given text.
    """
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_special_tokens_mask=True,
    )
    return tokenizer(row['text'], **encode_options)
  
# Raw column names of the training split (kept under its original name for any
# later cell that still reads it).
column_names = train_dataset.column_names


def _tokenize_split(split):
    """Tokenize one Dataset split and drop that split's own raw columns."""
    return split.map(
        tokenize_function,
        batched=True,
        num_proc=multiprocessing.cpu_count(),
        # Use the columns of *this* split: reusing the training split's column
        # list for validation fails when a listed column is missing there, and
        # leaves stray columns behind when validation has extra ones.
        remove_columns=split.column_names,
    )


train_dataset = _tokenize_split(train_dataset)
valid_dataset = _tokenize_split(valid_dataset)

In [None]:
# Collator for masked language modelling: masking is enabled and applied with
# probability 0.15.
collator_options = {
    'tokenizer': tokenizer,
    'mlm': True,
    'mlm_probability': 0.15,
}
data_collator = DataCollatorForLanguageModeling(**collator_options)

In [None]:
# Today's date, bound to the name `date`.
import datetime
date = datetime.date.today()

In [None]:
# Early stopping with a patience of 2 evaluation rounds.
EARLY_STOPPING_PATIENCE = 2
callback = EarlyStoppingCallback(
    early_stopping_patience=EARLY_STOPPING_PATIENCE,
)

In [None]:
# Training configuration for the masked-LM fine-tuning run.
training_args = TrainingArguments(
    # Output location; existing contents may be overwritten.
    output_dir='output_path',
    overwrite_output_dir=True,
    # Optimisation schedule.
    num_train_epochs=20,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    # TrainingArguments.seed defaults to 42 and is set at the beginning of
    # training; pass the configured seed explicitly so the value from
    # config.yaml is the one actually used.
    # NOTE(review): assumes REPRO_SEED is an int - confirm against config.yaml.
    seed=REPRO_SEED,
    # Evaluate, log and checkpoint once per epoch; evaluation and save
    # strategies must match for load_best_model_at_end.
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    # Restore the checkpoint with the best eval_loss when training finishes;
    # this metric is also what the early-stopping callback monitors.
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    # Reporting / verbosity.
    log_level='error',
    report_to='wandb',
    disable_tqdm=False,
)


# Assemble the Trainer from the pieces prepared above and start training.
trainer_options = dict(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    callbacks=[callback],
)
trainer = Trainer(**trainer_options)

trainer.train()
# Optional: persist the resulting model afterwards, e.g.
#trainer.save_model("domain_adaption/model") #save your custom model