In [1]:
import os

import datasets
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import notebook_login
from transformers import (
    DataCollatorForLanguageModeling,
    DistilBertConfig,
    DistilBertForMaskedLM,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments,
)

In [2]:
# Uncomment to authenticate with the Hugging Face Hub (not needed while push_to_hub=False):
# notebook_login()

In [19]:
# Pull DATA_DIR (and any other settings) from a .env file into the environment.
load_dotenv()

# Fail early with a readable message: os.path.join(None, ...) would otherwise
# raise an opaque TypeError when DATA_DIR is not defined.
data_root = os.getenv("DATA_DIR")
if data_root is None:
    raise EnvironmentError("DATA_DIR is not set; define it in the environment or in a .env file")
data_dir = os.path.join(data_root, 'processed')

# Build the per-split paths with os.path.join instead of hard-coding '/'.
datafiles = {
    'train': os.path.join(data_dir, 'text_train_data.json'),
    'test': os.path.join(data_dir, 'text_test_data.json'),
}

# Load both JSON files as one dataset dict with 'train' and 'test' splits.
dataset = datasets.load_dataset('json', data_files=datafiles)

In [4]:
# Load the tokenizer that belongs to the 'distilbert/distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

In [5]:
def tokenize_function(examples):
    """Tokenize the "synopsis" field of a batch with the notebook-level tokenizer.

    No truncation or padding is requested here, so each synopsis keeps its
    full token sequence.
    """
    synopses = examples["synopsis"]
    return tokenizer(synopses)


# Tokenize every split in batches across 4 worker processes and drop the
# listed raw columns from the result.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["synopsis", "related", "id"],
)

Map (num_proc=4):   0%|          | 0/8068 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (551 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (623 > 512). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/2017 [00:00<?, ? examples/s]

In [6]:
# Number of tokens per training example: group_texts cuts the concatenated
# token stream into chunks of exactly this length.
block_size = 128

In [7]:
def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized rows and re-split it into equal-size blocks.

    Args:
        examples: Mapping from column name to a list of per-row lists
            (e.g. "input_ids" -> list of token-id lists). Must contain an
            "input_ids" column.
        chunk_size: Length of every output block. Defaults to the
            notebook-level ``block_size`` when omitted, which is what
            ``Dataset.map`` uses since it only passes the batch.

    Returns:
        A dict with the same columns as ``examples``, each holding a list of
        ``chunk_size``-long lists, plus a "labels" column that is a copy of
        the "input_ids" list. Trailing tokens that do not fill a whole block
        are dropped.
    """
    # Local import keeps the function self-contained when it is shipped to
    # worker processes by Dataset.map(num_proc=...).
    from itertools import chain

    if chunk_size is None:
        chunk_size = block_size

    # Flatten each column in linear time; sum(list_of_lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated = {
        column: list(chain.from_iterable(examples[column])) for column in examples.keys()
    }

    # Every column is measured by the first one; an empty batch yields length 0
    # instead of an IndexError.
    first_column = next(iter(concatenated.values()), [])
    # Drop the small remainder that does not fill a whole block (padding could
    # be used instead if keeping those tokens matters).
    usable_length = (len(first_column) // chunk_size) * chunk_size

    # Split every column into consecutive blocks of chunk_size tokens.
    result = {
        column: [
            tokens[start : start + chunk_size]
            for start in range(0, usable_length, chunk_size)
        ]
        for column, tokens in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [8]:
# Regroup the tokenized rows of every split into fixed-size blocks, handing
# group_texts 1000 rows per call across 4 worker processes.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4)

Map (num_proc=4):   0%|          | 0/8068 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2017 [00:00<?, ? examples/s]

In [9]:
# Sanity check: decode one grouped training block back to text. Blocks are cut
# from concatenated synopses, so a block can straddle a "[SEP] [CLS]" boundary.
tokenizer.decode(lm_datasets["train"][5]["input_ids"])

'and other streaming websites between february and april 2019. the blu - ray and dvd for the extra episodes will also be released on may 24, 2019. [SEP] [CLS] metropolis of music, midicity. a kitty girl wearing gothic lolita clothing named cyan is scouted by maple arisugawa, the president of a music agency. from there, she meets chuchu ( a pun off of the sound that rabbits make ) the honor student rabbit girl, a net geek dog girl named retoree ( from " retriever " ), and an alien sheep (? ) girl named moa. together, they form the band named " plasma'

In [10]:
# Fine-tuning configuration. Trainer and TrainingArguments are imported in the
# notebook's top import cell rather than mid-notebook.
training_args = TrainingArguments(
    output_dir="animeBERT",  # relative to the current working directory
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version before upgrading.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)

In [11]:
# Collator for masked-language modelling, configured with a 15% masking
# probability. DataCollatorForLanguageModeling is imported in the notebook's
# top import cell rather than mid-notebook.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [12]:
# Start from the pretrained masked-LM weights of the same
# "distilbert/distilbert-base-uncased" checkpoint the tokenizer was loaded from.
model = DistilBertForMaskedLM.from_pretrained("distilbert/distilbert-base-uncased")

In [13]:
# Wire the model, training configuration, grouped train/test splits and the
# MLM collator into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
)

In [14]:
# Run fine-tuning; evaluation runs once per epoch as configured in training_args.
trainer.train()

  0%|          | 0/2649 [00:00<?, ?it/s]

{'loss': 2.642, 'grad_norm': 9.662248611450195, 'learning_rate': 1.6224990562476407e-05, 'epoch': 0.57}


  0%|          | 0/224 [00:00<?, ?it/s]

{'eval_loss': 2.3702986240386963, 'eval_runtime': 26.7438, 'eval_samples_per_second': 66.744, 'eval_steps_per_second': 8.376, 'epoch': 1.0}
{'loss': 2.5154, 'grad_norm': 10.335651397705078, 'learning_rate': 1.2449981124952812e-05, 'epoch': 1.13}
{'loss': 2.4595, 'grad_norm': 11.136015892028809, 'learning_rate': 8.674971687429219e-06, 'epoch': 1.7}


  0%|          | 0/224 [00:00<?, ?it/s]

{'eval_loss': 2.3110227584838867, 'eval_runtime': 24.9865, 'eval_samples_per_second': 71.439, 'eval_steps_per_second': 8.965, 'epoch': 2.0}
{'loss': 2.4114, 'grad_norm': 10.293266296386719, 'learning_rate': 4.899962249905625e-06, 'epoch': 2.27}
{'loss': 2.4049, 'grad_norm': 10.772994995117188, 'learning_rate': 1.124952812382031e-06, 'epoch': 2.83}


  0%|          | 0/224 [00:00<?, ?it/s]

{'eval_loss': 2.287592649459839, 'eval_runtime': 25.3349, 'eval_samples_per_second': 70.456, 'eval_steps_per_second': 8.842, 'epoch': 3.0}
{'train_runtime': 1201.5314, 'train_samples_per_second': 17.628, 'train_steps_per_second': 2.205, 'train_loss': 2.4826617761124483, 'epoch': 3.0}


TrainOutput(global_step=2649, training_loss=2.4826617761124483, metrics={'train_runtime': 1201.5314, 'train_samples_per_second': 17.628, 'train_steps_per_second': 2.205, 'total_flos': 701911321528320.0, 'train_loss': 2.4826617761124483, 'epoch': 3.0})

In [20]:
# Resolve the project root explicitly: os.path.join(None, ...) would raise an
# opaque TypeError if ROOT_DIR is missing from the environment.
root_dir = os.getenv('ROOT_DIR')
if root_dir is None:
    raise EnvironmentError("ROOT_DIR is not set; define it in the environment or in a .env file")

# Write the fine-tuned model to <ROOT_DIR>/src/models/animeBERT.
# NOTE(review): only the model is saved here, not the tokenizer - confirm that
# downstream loaders fetch the tokenizer from the base checkpoint.
model.save_pretrained(save_directory=os.path.join(root_dir, 'src', 'models', 'animeBERT'))