## MODEL

In [1]:
import pandas as pd
from os.path import dirname, join
import numpy as np
from transformers import AutoModelForMaskedLM,  AutoTokenizer

In [2]:
model_name= 'google/bert_uncased_L-2_H-128_A-2'

In [3]:
bert_tiny = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
bert_tiny.config

BertConfig {
  "_name_or_path": "google/bert_uncased_L-2_H-128_A-2",
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [5]:
bert_tiny.num_parameters()
model = bert_tiny

## DATASET

In [None]:
# !pip install datasets

In [6]:
input_dir = 'gs://profile-notes/geekfest_files/unlabeled_data/'

train_filename = 'train_text_imaging_only.txt'
valid_filename = 'valid_text_imaging_only.txt'

In [7]:
filename= join(input_dir,valid_filename)

In [8]:
train_data =  pd.read_csv(filename)

In [9]:
train_data.shape

(79504, 1)

In [10]:
train_data.head()

Unnamed: 0,text
0,exam number: a09481858 report status: final ty...
1,exam number: a07029758 report status: final ty...
2,exam number: a08201175 report status: final ty...
3,exam number: a09146288 report status: final ty...
4,exam number: a10099242 report status: final ty...


In [11]:
train_data= train_data[train_data.text.str.len() > 50]

## TOKENIZER

In [12]:
train_data.shape

(79354, 1)

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=True, padding=True, max_len=512)

In [14]:
tokenizer

PreTrainedTokenizerFast(name_or_path='google/bert_uncased_L-2_H-128_A-2', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [15]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [16]:
import torch
class TorchDataset(torch.utils.data.Dataset):
    def __init__(self, text, tokenizer):
        self.text = text
        self.tokenizer = tokenizer
        
    def __getitem__(self, idx):
        item = self.text[idx]
        item_tokenized = self.tokenizer(item, truncation=True, padding=True, return_special_tokens_mask=True)
        ret = {key: torch.tensor(val) for key, val in item_tokenized.items()}
        return ret

    def __len__(self):
        return len(self.text)

In [31]:
type(train_data['text'].values)

numpy.ndarray

In [17]:
input_dataset = TorchDataset(train_data['text'].values, tokenizer)

## TRAINER

In [18]:
from transformers import Trainer, TrainingArguments

model = bert_tiny
training_args = TrainingArguments(
    output_dir="./notes",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    max_steps=1000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=input_dataset,
#     prediction_loss_only=True,
)

In [19]:
%%time
trainer.train()

Step,Training Loss


KeyError: 53551

In [20]:
bert_tiny.save_pretrained('./wikituned')

In [None]:
from transformers import AutoModelForSequenceClassification

bert_tiny_loaded = AutoModelForSequenceClassification.from_pretrained('./wikituned')