# Installs, Imports, Drive Connection, WandB Connection

##### Installs

In [2]:
!pip install transformers
!pip install datasets
!pip install -U PyYAML
!pip install wandb



##### Imports

In [3]:
from pathlib import Path
from datasets import load_dataset

##### Drive Connection

In [4]:
# Mount Google Drive at /content/drive; the configuration file read in the next
# section lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##### Get Configuration

In [5]:
import configparser

# Location of the project's .ini file on the mounted Drive.
CONFIG_PATH = '/content/drive/MyDrive/Thesis/BookSuccessPredictor/config_dupe.ini'

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed. Fail here, with a clear message, instead of hitting a
# confusing KeyError in a later cell when Drive is not mounted or the path is wrong.
loaded_config_files = config.read(CONFIG_PATH)
if not loaded_config_files:
  raise FileNotFoundError(f'Could not read configuration file: {CONFIG_PATH}')
loaded_config_files

['/content/drive/MyDrive/Thesis/BookSuccessPredictor/config_dupe.ini']

In [6]:
import sys
# Base folder on Drive, read from the [Drive] section of the configuration file.
drive_base_path = Path(config['Drive']['drive_base_path'])

# Put BookSuccessPredictor/_utils on the import path.
# NOTE: every re-run of this cell appends the same entry to sys.path again.
sys.path.append(str(drive_base_path / 'BookSuccessPredictor' / '_utils'))

##### WandB Connection

In [7]:
# saves our models to artifacts in WandB
import wandb
# Comments must stay on their own lines: anything after `%env NAME=` becomes part
# of the variable's value.
%env WANDB_LOG_MODEL=true
# NOTE: WANDB_PROJECT is assigned again in the "Pre Training" section, so the value
# set here is replaced before the training run starts.
%env WANDB_PROJECT=goodreads_pretrained_models

env: WANDB_LOG_MODEL=true
env: WANDB_PROJECT=goodreads_pretrained_models


In [8]:
# Log in with the API key stored in the [WandB] section of the configuration file,
# so the key itself never appears in the notebook.
wandb.login(key = config['WandB']['api_key'])

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Dataset Generator

### Load Text Data

#### goodreads_maharjan

For Within-task pre-training

In [None]:
# Load the dataset through the goodreadsnered.py loading script located in the
# folder given by the 'nered_goodreads_guarro_path' config entry.
# NOTE(review): the heading above says goodreads_maharjan while the config key
# mentions goodreads_guarro — confirm this is the intended dataset.
# NOTE: `base_path` and `dataset` are also assigned by the goodreads_guarro cell
# below; whichever of the two cells runs last determines what gets tokenized.
base_path = Path(config['Datasets']['nered_goodreads_guarro_path'])
dataset = load_dataset(str(base_path / 'goodreadsnered.py'))

#### goodreads_guarro

For In-domain pre-training

In [9]:
# Load the dataset through goodreads_guarro_loading_script.py located in the folder
# given by the 'goodreads_guarro_path' config entry.
# NOTE: this overwrites `base_path` and `dataset` if the goodreads_maharjan cell
# above was run first.
base_path = Path(config['Datasets']['goodreads_guarro_path'])
dataset = load_dataset(str(base_path / 'goodreads_guarro_loading_script.py'))

Using preprocess dir: nered
Downloading and preparing dataset good_reads_practice_dataset/main_domain (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/good_reads_practice_dataset/main_domain/1.1.0/3995ae49b4a729aa5fa9a169020340174192029ed1cac6c2470fd0009e0508cb...


0 examples [00:00, ? examples/s]

Dataset good_reads_practice_dataset downloaded and prepared to /root/.cache/huggingface/datasets/good_reads_practice_dataset/main_domain/1.1.0/3995ae49b4a729aa5fa9a169020340174192029ed1cac6c2470fd0009e0508cb. Subsequent calls will reuse this data.


### Tokenization

In [10]:
from transformers import AutoTokenizer

In [11]:
# Display the NER flag. getboolean() parses the config string without executing it,
# unlike eval(), which would run arbitrary code placed in the config file.
config.getboolean('Model', 'use_ner')

True

In [12]:
# Parse the flag with configparser instead of eval(): the config file is plain
# data and must not be able to execute code in the notebook.
use_ner = config.getboolean('Model', 'use_ner')
if use_ner:
  # Register the placeholder token as a special token so the tokenizer keeps it
  # as a single token.
  print('adding additional token')
  tokenizer = AutoTokenizer.from_pretrained(config['Model']['name'], additional_special_tokens = ['[CHARACTER]'])
else:
  print('no additional tokens added')
  tokenizer = AutoTokenizer.from_pretrained(config['Model']['name'])

adding additional token


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Index the row first and the column second: `dataset['train']['text']` would load
# the text of every book into memory just to display the first one.
dataset['train'][0]['text']

'If--and the thing is wildly possible--the charge of writing nonsense were ever brought against the author of this brief but instructive poem, it would be based, I feel convinced, on the line (in p.4)            \x93Then the bowsprit got mixed with the rudder sometimes. \x94  In view of this painful possibility, I will not (as I might) appeal indignantly to my other writings as a proof that I am incapable of such a deed: I will not (as I might) point to the strong moral purpose of this poem itself, to the arithmetical principles so cautiously inculcated in it, or to its noble teachings in Natural History--I will take the more prosaic course of simply explaining how it happened. The [CHARACTER], who was almost morbidly sensitive about appearances, used to have the bowsprit unshipped once or twice a week to be revarnished, and it more than once happened, when the time came for replacing it, that no one on board could remember which end of the ship it belonged to. They knew it was not of 

In [None]:
# # sample_text = "This is your boy coming live from 203 69 street. Its yo boiii boy, Whatcha gon' do about it dawg?"
# data_tokenize = tokenizer(dataset['train']['text'][100], 
#                 max_length = 512,
#                 stride=0,
#                 return_attention_mask=True,
#                 return_token_type_ids=True,
#                 return_overflowing_tokens = True)

In [None]:
# from tokenization_algos import chunk_and_encode_examples_w_complete_sentences, chunk_and_encode_examples_w_overlap

In [14]:
# -*- coding: utf-8 -*-
import re
# Regex fragments used by split_into_sentences. Each name is defined exactly once
# (the earlier, shorter `prefixes` / `websites` lists were dead assignments that
# were immediately overwritten) and as a raw string, so sequences such as "\s"
# reach the regex engine without invalid-escape warnings.
alphabets = r"([A-Za-z])"
digits = r"([0-9])"
prefixes = r"(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|me|edu)"

def split_into_sentences(text):
    """Split `text` into a list of stripped sentence strings.

    Periods that do not end a sentence (decimal numbers, titles such as "Dr.",
    website suffixes, "Ph.D.", initials, acronyms, company suffixes, ellipses)
    are temporarily replaced by the marker "<prd>". Every remaining ".", "?"
    and "!" is then followed by "<stop>", the text is cut at those markers and
    the protected periods are restored.

    Args:
        text: the raw text; newlines are treated as spaces.

    Returns:
        A list of sentences with surrounding whitespace removed. Text after the
        last terminator is returned as a final sentence instead of being
        dropped; an empty or whitespace-only input yields [].
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that are not sentence boundaries.
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move terminators outside closing quotes so the quote stays with its sentence.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")
    text = text.replace("...", "<prd><prd><prd>")

    # Mark the real sentence boundaries, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [piece.strip() for piece in text.split("<stop>")]
    # Only discard the last piece when it is empty (the padding that follows a
    # final terminator). Dropping it unconditionally lost any trailing text that
    # did not end with ".", "?" or "!".
    if sentences and not sentences[-1]:
        sentences.pop()
    return sentences

import sys
def tokenize_book_and_make_labels(book, cut_off, success, tokenizer, chunk_limit = sys.maxsize, use_spacy = False):
  """Chunk a book and build one integer label per chunk.

  The chunking is delegated to `split_book_into_nice_chunks`, which is not
  defined in the visible part of this notebook and has to be provided elsewhere
  before this function can be called.

  Returns:
    A tuple of (the dict returned by the chunking helper, a list that holds
    `int(success)` once for every entry of that dict's 'input_ids').
  """
  chunks = split_book_into_nice_chunks(book, cut_off, tokenizer, chunk_limit, use_spacy)
  label = int(success)
  per_chunk_labels = [label for _ in range(len(chunks['input_ids']))]
  return chunks, per_chunk_labels

def seal_off_chunk(dictOfTokenizedChunks, currListOfInputIDs, cut_off):
  """Close the current chunk, pad it to `cut_off` and store it.

  `currListOfInputIDs` is modified in place: id 102 is appended (presumably the
  [SEP] id of the BERT-style vocabulary used in this notebook — confirm for other
  models), followed by as many 0s as needed to reach `cut_off` entries. The very
  same list object is then appended to dictOfTokenizedChunks['input_ids'],
  together with an all-zero 'token_type_ids' row and an 'attention_mask' row that
  is 1 up to and including the 102 and 0 over the padding.
  """
  currListOfInputIDs.append(102)
  real_length = len(currListOfInputIDs)
  pad_length = cut_off - real_length
  currListOfInputIDs.extend([0] * pad_length)

  finished_rows = (
      ('input_ids', currListOfInputIDs),
      ('token_type_ids', [0] * cut_off),
      ('attention_mask', [1] * real_length + [0] * pad_length),
  )
  for column, row in finished_rows:
    dictOfTokenizedChunks[column].append(row)

# by default, we do not set a limit on the number of chunks.
def tokenize_complete_sentences(example, tokenizer, cut_off = 512, chunk_limit = sys.maxsize, goodreads_guarro = True):
  """Pack the sentences of one book into fixed-width chunks of token ids.

  The text is split with `split_into_sentences`; each sentence is tokenized
  without special tokens and appended to the current chunk while it still fits.
  When a sentence does not fit, the current chunk is closed with
  `seal_off_chunk` and that sentence becomes the first sentence of the next
  chunk (previously it was silently discarded). A sentence that is too long for
  a chunk on its own is truncated to the chunk capacity instead of producing an
  empty chunk.

  Args:
    example: dict with a 'text' entry; when `goodreads_guarro` is False it must
      also provide 'success_label', 'genre' and 'book_title'.
    tokenizer: callable used as `tokenizer(sentence, add_special_tokens=False)`
      whose result exposes the token ids under 'input_ids'.
    cut_off: width of every produced chunk.
    chunk_limit: stop as soon as this many chunks have been produced.
    goodreads_guarro: True for the unlabeled corpus (only model inputs are
      returned); False to also repeat the book's metadata for every chunk.

  Returns:
    Dict of lists with one entry per chunk: 'input_ids', 'token_type_ids',
    'attention_mask', plus 'success_label', 'genre' and 'book_title' when
    `goodreads_guarro` is False (also when `chunk_limit` cut the book short).
  """
  dictOfTokenizedChunks = {'input_ids': [], 'token_type_ids': [], 'attention_mask': []}

  # Most sentence tokens a chunk can take: it mirrors the fit test below for a
  # chunk that only holds the leading 101 (1 + n < cut_off - 1).
  max_sentence_tokens = max(cut_off - 3, 0)

  # Every chunk starts with id 101 (presumably the [CLS] id of the BERT-style
  # vocabulary used in this notebook — confirm for other models).
  currListOfInputIDs = [101]
  num_chunks = 0
  limit_reached = False
  for sent in split_into_sentences(example['text']):
    next_tokenized_sent = tokenizer(sent, add_special_tokens = False)['input_ids']
    if len(currListOfInputIDs) + len(next_tokenized_sent) >= cut_off - 1:
      # The sentence does not fit. Close the chunk only if it holds text, so an
      # over-long first sentence never yields a chunk without content.
      if len(currListOfInputIDs) > 1:
        seal_off_chunk(dictOfTokenizedChunks, currListOfInputIDs, cut_off)
        num_chunks += 1
        if num_chunks == chunk_limit:
          limit_reached = True
          break
        currListOfInputIDs = [101]
      # Carry the sentence over to the fresh chunk; this is a no-op unless the
      # sentence alone exceeds the capacity of a chunk.
      next_tokenized_sent = next_tokenized_sent[:max_sentence_tokens]
    currListOfInputIDs.extend(next_tokenized_sent)

  # Flush the partially filled last chunk, unless the limit was already hit.
  if not limit_reached and len(currListOfInputIDs) > 1:
    seal_off_chunk(dictOfTokenizedChunks, currListOfInputIDs, cut_off)

  if not goodreads_guarro:
    # Filled in on every exit path so callers never receive None placeholders.
    total_chunks = len(dictOfTokenizedChunks['input_ids'])
    dictOfTokenizedChunks['success_label'] = [example['success_label']] * total_chunks
    dictOfTokenizedChunks['genre'] = [example['genre']] * total_chunks
    dictOfTokenizedChunks['book_title'] = [example['book_title']] * total_chunks
  return dictOfTokenizedChunks

def tokenize_w_overlap(example, tokenizer, max_length = 512, stride = 0):
  """Tokenize one book into consecutive windows of at most `max_length` tokens.

  Args:
    example: dict with the book's text under 'text'.
    tokenizer: callable tokenizer; it is asked to truncate to `max_length` and
      to return the overflowing tokens as additional windows.
    max_length: maximum number of tokens per window.
    stride: value forwarded to the tokenizer's `stride` argument (the overlap
      between consecutive windows). With the default of 0 the windows do not
      overlap, despite the function's name.

  Returns:
    Dict with 'input_ids', 'token_type_ids' and 'attention_mask', each holding
    every window except the last one. The last window is dropped presumably
    because it is the only one that can be shorter than `max_length` (no padding
    is requested) — as a consequence a text that fits in one window yields no
    rows at all.
  """
  data_tokenize = tokenizer(example['text'],
                  max_length = max_length,
                  stride = stride,
                  # Request truncation explicitly; relying on the implicit
                  # fallback triggers a warning on every call.
                  truncation = True,
                  return_attention_mask=True,
                  return_token_type_ids=True,
                  return_overflowing_tokens = True)
  num_kept = max(len(data_tokenize['input_ids']) - 1, 0)
  return {
      'input_ids': data_tokenize['input_ids'][:num_kept],
      'token_type_ids': data_tokenize['token_type_ids'][:num_kept],
      'attention_mask': data_tokenize['attention_mask'][:num_kept],
  }

# When batched = True, we take in multiple examples
def chunk_and_encode_examples_w_complete_sentences(examples, tokenizer, goodreads_guarro=True):
  """Chunk every book of a batch with `tokenize_complete_sentences`.

  Args:
    examples: batch as a dict of lists; 'text' is always required, and 'genre',
      'success_label' and 'book_title' are required when `goodreads_guarro` is
      False.
    tokenizer: tokenizer forwarded to `tokenize_complete_sentences`.
    goodreads_guarro: True for the unlabeled corpus (only model inputs are
      produced); False to also carry the per-book metadata over to each chunk.

  Returns:
    Dict of lists in which the chunks of all books are concatenated.
  """
  if goodreads_guarro:
    mega_dict = {'attention_mask': [], 'input_ids': [], 'token_type_ids': []}
  else:
    mega_dict = {'attention_mask': [], 'genre': [], 'input_ids': [], 'success_label': [], 'token_type_ids': [], 'book_title': []}
  for i in range(len(examples['text'])):
    book_sample = {'text': examples['text'][i]}
    if not goodreads_guarro:
      book_sample['genre'] = examples['genre'][i]
      book_sample['success_label'] = examples['success_label'][i]
      book_sample['book_title'] = examples['book_title'][i]
    # The flag has to be passed by keyword: the third positional parameter of
    # tokenize_complete_sentences is `cut_off`, so passing it positionally set
    # the chunk width to True/False and left goodreads_guarro at its default.
    dictOfTokenizedChunks = tokenize_complete_sentences(book_sample, tokenizer, goodreads_guarro = goodreads_guarro)
    for key, value in dictOfTokenizedChunks.items():
      mega_dict[key].extend(value)
  return mega_dict

# Batched variant: `examples` carries several books at once (dict of lists).
def chunk_and_encode_examples_w_overlap(examples, tokenizer):
  """Run `tokenize_w_overlap` on every text of the batch and merge the results.

  Returns a dict with the keys 'attention_mask', 'input_ids' and
  'token_type_ids' whose lists hold the chunks of all books, in batch order.
  """
  merged = {'attention_mask': [], 'input_ids': [], 'token_type_ids': []}
  for book_text in examples['text']:
    book_chunks = tokenize_w_overlap({'text': book_text}, tokenizer)
    for column, rows in book_chunks.items():
      merged[column].extend(rows)
  return merged

In [15]:
from functools import partial
# Bind the tokenizer so the function can be handed to dataset.map, which is given
# no tokenizer argument of its own in the call below.
# NOTE: this rebinds the name of the function defined above to the partial object,
# so re-running this cell wraps the previous partial once more.
chunk_and_encode_examples_w_overlap = partial(chunk_and_encode_examples_w_overlap, tokenizer=tokenizer)

In [16]:
# from datasets import Dataset
# dataset_test = Dataset.from_dict(dataset['train'][0:10])

In [17]:
# Chunk and encode the books batch by batch. The original columns of the 'train'
# split are removed, so only the features returned by the mapping function remain.
chunked_encoded_dataset = dataset.map(chunk_and_encode_examples_w_overlap, remove_columns=dataset.column_names['train'], batched = True)

  0%|          | 0/3 [00:00<?, ?ba/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [18]:
chunked_encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 399250
    })
})

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, which the "Drive Connection" cell imported as the
# google.colab module; after this cell `drive` is the PyDrive client instead.
drive = GoogleDrive(gauth)  



# Pre Training

In [19]:
# Name our project from WandB
# NOTE: this replaces the WANDB_PROJECT value set in the "WandB Connection" section.
%env WANDB_PROJECT=goodreads_success_predictor
# chunked_encoded_dataset = chunked_encoded_dataset.remove_columns(['book_title', 'genre', 'success_label'])

# Same checkpoint name that was used to build the tokenizer.
model_name = config['Model']['name']

from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained(model_name)
# Resize the embedding matrix to the tokenizer's vocabulary size; the tokenizer may
# have been given the extra '[CHARACTER]' special token in the Tokenization section.
model.resize_token_embeddings(len(tokenizer))

env: WANDB_PROJECT=goodreads_success_predictor


Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Embedding(30523, 768)

In [20]:
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Number of passes over the chunked dataset; used by TrainingArguments below.
num_epoch = 1

# Used as the Trainer's output directory (first TrainingArguments argument).
wandb_run_name = 'distilbert-mlm-guarro_goodreads_NER_stridetokenizer'

# Masked-language-modeling collator with mlm_probability=0.15. It is built once;
# an identical earlier construction was immediately overwritten.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    wandb_run_name,
    evaluation_strategy = "no",
    learning_rate=2e-5,
    per_device_train_batch_size = 16,
    # Previously hard-coded to 1 while `num_epoch` above was never read.
    num_train_epochs = num_epoch,
    # per_device_eval_batch_size=32,
    weight_decay=0.01,
    report_to = "wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_encoded_dataset['train']
)

trainer.train()

# Close the W&B run so a later run in the same kernel starts a new one.
wandb.finish()

The following columns in the training set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: token_type_ids.
***** Running training *****
  Num examples = 399250
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 24954
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mlucaguarro[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss
500,2.664
1000,2.5624
1500,2.5091
2000,2.4845
2500,2.469
3000,2.4386
3500,2.4288
4000,2.4148
4500,2.4109
5000,2.3761


Saving model checkpoint to distilbert-mlm-guarro_goodreads_NER_stridetokenizer/checkpoint-500
Configuration saved in distilbert-mlm-guarro_goodreads_NER_stridetokenizer/checkpoint-500/config.json
Model weights saved in distilbert-mlm-guarro_goodreads_NER_stridetokenizer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to distilbert-mlm-guarro_goodreads_NER_stridetokenizer/checkpoint-1000
Configuration saved in distilbert-mlm-guarro_goodreads_NER_stridetokenizer/checkpoint-1000/config.json
Model weights saved in distilbert-mlm-guarro_goodreads_NER_stridetokenizer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to distilbert-mlm-guarro_goodreads_NER_stridetokenizer/checkpoint-1500
Configuration saved in distilbert-mlm-guarro_goodreads_NER_stridetokenizer/checkpoint-1500/config.json
Model weights saved in distilbert-mlm-guarro_goodreads_NER_stridetokenizer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to distilbert-mlm-guarro_goodreads_NER_stridetokenizer/che

VBox(children=(Label(value=' 255.57MB of 255.57MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=…

0,1
train/loss,2.2346
train/learning_rate,0.0
train/epoch,1.0
train/global_step,24954.0
_runtime,30292.0
_timestamp,1631580530.0
_step,49.0
train/train_runtime,30292.7827
train/train_samples_per_second,13.18
train/train_steps_per_second,0.824


0,1
train/loss,█▆▅▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁
