# Installs, Imports, Drive Connection, WandB Connection

##### Installs

In [1]:
!pip install transformers
!pip install datasets
!pip install -U PyYAML
!pip install wandb

Collecting transformers
  Downloading transformers-4.8.2-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 38.0 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 58.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.5 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.12 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.8.2
Collecting datasets
  Downloading datasets-1.9.0-py3-none-any.whl (262 kB)
[K     |████████████████████████████████| 262 kB 32.3 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manyl

##### Imports

In [9]:
from pathlib import Path
from datasets import load_dataset

##### Drive Connection

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##### Get Configuration

In [13]:
import configparser

config = configparser.ConfigParser()
config.read('/content/drive/MyDrive/Thesis/BookSuccessPredictor/config.ini')

['/content/drive/MyDrive/Thesis/BookSuccessPredictor/config.ini']

##### WandB Connection

In [4]:
# saves our models to artifacts in WandB
import wandb
%env WANDB_LOG_MODEL=true
%env WANDB_PROJECT=goodreads_pretrained_models

env: WANDB_LOG_MODEL=true
env: WANDB_PROJECT=goodreads_pretrained_models


In [5]:
wandb.login(key = config['WandB']['api_key'])

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Dataset Generator

### Load Text Data

#### goodreads_maharjan

For Within-task pre-training

In [24]:
base_path = Path(config['Datasets']['goodreads_maharjan_path'])
dataset = load_dataset(str(base_path / 'goodreadsnered.py'))

Downloading and preparing dataset good_reads_practice_dataset/main_domain (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/good_reads_practice_dataset/main_domain/1.1.0/2d3e0c5e46b10ced55eb2fed138958ef5f14854c7f2edf2bdfde1d44485d9c39...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset good_reads_practice_dataset downloaded and prepared to /root/.cache/huggingface/datasets/good_reads_practice_dataset/main_domain/1.1.0/2d3e0c5e46b10ced55eb2fed138958ef5f14854c7f2edf2bdfde1d44485d9c39. Subsequent calls will reuse this data.


#### goodreads_guarro

For In-domain pre-training

In [None]:
base_path = Path(config['Datasets']['goodreads_guarro_path'])
dataset = load_dataset(str(base_path / 'goodreads_guarro_loading_script.py'))

### Tokenization

# Pre Training

## BERT 512

In [None]:
# Name our project from WandB
%env WANDB_PROJECT=goodreads_success_predictor

env: WANDB_PROJECT=goodreads_success_predictor


In [None]:
chunked_encoded_dataset = chunked_encoded_dataset.remove_columns(['book_title', 'genre', 'success_label'])

In [None]:
chunked_encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 14074
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 9872
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 9712
    })
})

In [None]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', additional_special_tokens = ['[CHARACTER]'])

from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained('bert-base-cased')
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(28997, 768)

In [None]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
from transformers import Trainer, TrainingArguments

num_epoch = 5

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    "bert-mlm",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    report_to = "wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_encoded_dataset['train'],
    eval_dataset=chunked_encoded_dataset['validation']
)

trainer.train()

wandb.finish()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,2.822,2.406996,209.8435,48.374
2,2.3157,1.998168,209.8536,48.372
3,2.1699,1.95517,209.779,48.389


VBox(children=(Label(value=' 413.40MB of 413.40MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=…

0,1
train/loss,2.1699
train/learning_rate,0.0
train/epoch,3.0
train/global_step,5448.0
_runtime,4256.0
_timestamp,1620231928.0
_step,13.0
eval/loss,1.95517
eval/runtime,209.779
eval/samples_per_second,48.389


0,1
train/loss,█▆▆▄▃▃▂▁▁▁
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/epoch,▁▂▂▃▃▄▅▅▅▆▇▇██
train/global_step,▁▂▂▃▃▄▅▅▅▆▇▇██
_runtime,▁▂▂▃▃▄▄▅▅▆▇▇██
_timestamp,▁▂▂▃▃▄▄▅▅▆▇▇██
_step,▁▂▂▃▃▄▄▅▅▆▆▇▇█
eval/loss,█▂▁
eval/runtime,▇█▁
eval/samples_per_second,▂▁█


## DistilBert 512

In [None]:
# Name our project from WandB
%env WANDB_PROJECT=goodreads_success_predictor

env: WANDB_PROJECT=goodreads_success_predictor


In [None]:
%env WANDB_LOG_MODEL=true

env: WANDB_LOG_MODEL=true


In [None]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased', additional_special_tokens = ['[CHARACTER]'])

from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained('distilbert-base-uncased')
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(30523, 768)

In [None]:
chunked_encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 386820
    })
})

In [None]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
from transformers import Trainer, TrainingArguments

num_epoch = 1

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    "distilbert-mlm-guarro_goodreads",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size = 16,
    num_train_epochs = 1,
    # per_device_eval_batch_size=32,
    weight_decay=0.01,
    report_to = "wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_encoded_dataset['train']
)

trainer.train()

wandb.finish()

The following columns in the training set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: token_type_ids.
***** Running training *****
  Num examples = 386820
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 24177
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mlucaguarro[0m (use `wandb login --relogin` to force relogin)


Epoch,Training Loss,Validation Loss


Saving model checkpoint to distilbert-mlm-guarro_goodreads/checkpoint-500
Configuration saved in distilbert-mlm-guarro_goodreads/checkpoint-500/config.json
Model weights saved in distilbert-mlm-guarro_goodreads/checkpoint-500/pytorch_model.bin
Saving model checkpoint to distilbert-mlm-guarro_goodreads/checkpoint-1000
Configuration saved in distilbert-mlm-guarro_goodreads/checkpoint-1000/config.json
Model weights saved in distilbert-mlm-guarro_goodreads/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to distilbert-mlm-guarro_goodreads/checkpoint-1500
Configuration saved in distilbert-mlm-guarro_goodreads/checkpoint-1500/config.json
Model weights saved in distilbert-mlm-guarro_goodreads/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to distilbert-mlm-guarro_goodreads/checkpoint-2000
Configuration saved in distilbert-mlm-guarro_goodreads/checkpoint-2000/config.json
Model weights saved in distilbert-mlm-guarro_goodreads/checkpoint-2000/pytorch_model.bin
Saving model ch

## ROBERTA 512

In [None]:
chunked_encoded_dataset = chunked_encoded_dataset.remove_columns(['book_title', 'genre', 'success_label'])

In [None]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', additional_special_tokens = ['[CHARACTER]'])

from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained('roberta-base')
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


Embedding(50266, 768)

In [None]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
from transformers import Trainer, TrainingArguments

num_epoch = 5

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    "roberta-mlm",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    num_train_epochs = 5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    report_to = "wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_encoded_dataset['train'],
    eval_dataset=chunked_encoded_dataset['validation']
)

trainer.train()

wandb.finish()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,1.9398,1.737092,238.0227,42.731
2,1.8586,1.690681,237.6009,42.807
3,1.8277,1.659989,237.4209,42.84
4,1.7728,1.647067,237.4105,42.841
5,1.7553,1.646039,237.4291,42.838


VBox(children=(Label(value=' 475.80MB of 475.80MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=…

0,1
train/loss,1.7553
train/learning_rate,0.0
train/epoch,5.0
train/global_step,9115.0
_runtime,7951.0
_timestamp,1620353649.0
_step,23.0
eval/loss,1.64604
eval/runtime,237.4291
eval/samples_per_second,42.838


0,1
train/loss,█▅▅▄▄▄▃▃▂▃▂▂▂▁▁▁▁▁
train/learning_rate,██▇▇▆▆▆▅▅▄▄▃▃▃▂▂▁▁
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▇▇▇████
_runtime,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▇▇▇▇███
_timestamp,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▇▇▇▇███
_step,▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇██
eval/loss,█▄▂▁▁
eval/runtime,█▃▁▁▁
eval/samples_per_second,▁▆███


## BIGBIRD 2048

In [None]:
chunked_encoded_dataset = chunked_encoded_dataset.remove_columns(['book_title', 'genre', 'success_label'])

ValueError: ignored

In [None]:
chunked_encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 3697
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 2590
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 2547
    })
})

In [None]:
from transformers import BigBirdTokenizer
tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base', additional_special_tokens = ['[CHARACTER]'])

from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained('google/bigbird-roberta-base')
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(50359, 768)

In [None]:
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained('google/bigbird-roberta-base')
model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(50359, 768)

In [None]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
from transformers import Trainer, TrainingArguments


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    "bigbird-mlm",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size = 1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    report_to = "wandb",
    gradient_accumulation_steps = 4
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_encoded_dataset['train'],
    eval_dataset=chunked_encoded_dataset['validation']
)
# gradient_accumulation_steps 
trainer.train()

wandb.finish()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
0,2.5395,1.980498,1312.7739,1.973
1,2.1408,1.918812,1367.9684,1.893
2,2.0715,1.892684,1342.9984,1.929


VBox(children=(Label(value=' 488.85MB of 488.85MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=…

0,1
train/loss,2.0715
train/learning_rate,0.0
train/epoch,3.0
train/global_step,2772.0
_runtime,13055.0
_timestamp,1620279957.0
_step,8.0
eval/loss,1.89268
eval/runtime,1342.9984
eval/samples_per_second,1.929


0,1
train/loss,█▃▂▂▁
train/learning_rate,█▆▅▃▁
train/epoch,▁▂▃▄▅▆▇██
train/global_step,▁▂▃▄▅▆▇██
_runtime,▁▃▃▄▅▆▇██
_timestamp,▁▃▃▄▅▆▇██
_step,▁▂▃▄▅▅▆▇█
eval/loss,█▃▁
eval/runtime,▁█▅
eval/samples_per_second,█▁▄


In [None]:
wandb.finish()

## ELECTRA 512

https://towardsdatascience.com/understanding-electra-and-training-an-electra-language-model-3d33e3a9660d

In [None]:
# Name our project fro WandB
%env WANDB_PROJECT=goodreads_success_predictor

env: WANDB_PROJECT=goodreads_success_predictor


In [None]:
chunked_encoded_dataset = chunked_encoded_dataset.remove_columns(['book_title', 'genre', 'success_label'])