# Installs, Imports, Drive Connection, WandB Connection

##### Installs

In [1]:
!pip install transformers
!pip install datasets
!pip install -U PyYAML
!pip install wandb

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 8.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 56.4 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.3 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 63.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3

##### Imports

In [2]:
from pathlib import Path
from datasets import load_dataset

##### Drive Connection

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##### Get Configuration

In [4]:
import configparser

config = configparser.ConfigParser()
config.read('/content/drive/MyDrive/Thesis/BookSuccessPredictor/config.ini')

['/content/drive/MyDrive/Thesis/BookSuccessPredictor/config.ini']

In [5]:
import sys
drive_base_path = Path(config['Drive']['drive_base_path'])

sys.path.append(str(drive_base_path / 'BookSuccessPredictor' / '_utils'))

##### WandB Connection

In [6]:
# saves our models to artifacts in WandB
import wandb
%env WANDB_LOG_MODEL=true
%env WANDB_PROJECT=goodreads_pretrained_models

env: WANDB_LOG_MODEL=true
env: WANDB_PROJECT=goodreads_pretrained_models


In [7]:
wandb.login(key = config['WandB']['api_key'])

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Dataset Generator

### Load Text Data

#### goodreads_maharjan

For Within-task pre-training

In [None]:
base_path = Path(config['Datasets']['nered_goodreads_maharjan_path'])
dataset = load_dataset(str(base_path / 'goodreadsnered.py'))

Downloading and preparing dataset good_reads_practice_dataset/main_domain to /root/.cache/huggingface/datasets/good_reads_practice_dataset/main_domain/1.1.0/fee2eb60ac7713af6f776b7c4dab63145144f749e5689ea3dc2299235f6f560e...


0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset good_reads_practice_dataset downloaded and prepared to /root/.cache/huggingface/datasets/good_reads_practice_dataset/main_domain/1.1.0/fee2eb60ac7713af6f776b7c4dab63145144f749e5689ea3dc2299235f6f560e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

#### goodreads_guarro

For In-domain pre-training

In [None]:
base_path = Path(config['Datasets']['goodreads_guarro_path'])
dataset = load_dataset(str(base_path / 'goodreads_guarro_loading_script.py'))

Using preprocess dir: nered
Downloading and preparing dataset good_reads_practice_dataset/main_domain to /root/.cache/huggingface/datasets/good_reads_practice_dataset/main_domain/1.1.0/e30f74f04f5a1d1a80603f66ab1c4edf115a3b542a22dfd44df51cc98859a39c...


0 examples [00:00, ? examples/s]

Dataset good_reads_practice_dataset downloaded and prepared to /root/.cache/huggingface/datasets/good_reads_practice_dataset/main_domain/1.1.0/e30f74f04f5a1d1a80603f66ab1c4edf115a3b542a22dfd44df51cc98859a39c. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

### Tokenization

In [8]:
from transformers import AutoTokenizer

In [9]:
eval(config['Model']['use_ner'])

True

In [22]:
if eval(config['Model']['use_ner']):
  print('adding additional token')
  tokenizer = AutoTokenizer.from_pretrained(config['Model']['name'], additional_special_tokens = ['[CHARACTER]'])
else:
  print('no additional tokens added')
  tokenizer = AutoTokenizer.from_pretrained(config['Model']['name'])

adding additional token


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
config['Model']['name']

'distilbert-base-uncased'

In [None]:
# # sample_text = "This is your boy coming live from 203 69 street. Its yo boiii boy, Whatcha gon' do about it dawg?"
# data_tokenize = tokenizer(dataset['train']['text'][100], 
#                 max_length = 512,
#                 stride=0,
#                 return_attention_mask=True,
#                 return_token_type_ids=True,
#                 return_overflowing_tokens = True)

In [None]:
# from tokenization_algos import chunk_and_encode_examples_w_complete_sentences, chunk_and_encode_examples_w_overlap

In [None]:
import sys
def tokenize_book_and_make_labels(book, cut_off, success, tokenizer, chunk_limit = sys.maxsize, use_spacy = False):
  dictOfTokenizedChunks = split_book_into_nice_chunks(book, cut_off, tokenizer, chunk_limit, use_spacy)
  labels = [int(success)] * len(dictOfTokenizedChunks['input_ids'])
  return dictOfTokenizedChunks, labels

def seal_off_chunk(dictOfTokenizedChunks, currListOfInputIDs, cut_off):
  currListOfInputIDs.extend([102])
  attend_up_to = len(currListOfInputIDs)
  currListOfInputIDs.extend([0] * (cut_off - attend_up_to))
  
  dictOfTokenizedChunks['input_ids'].append(currListOfInputIDs)
  dictOfTokenizedChunks['token_type_ids'].append([0] * cut_off)
  dictOfTokenizedChunks['attention_mask'].append([1] * attend_up_to + [0] * (cut_off - attend_up_to))

def tokenize_w_overlap(example, tokenizer):
  data_tokenize = tokenizer(example['text'], 
                  max_length = 512,
                  stride=0,
                  return_attention_mask=True,
                  return_token_type_ids=True,
                  return_overflowing_tokens = True)
  num_chunks = len(data_tokenize['input_ids'])
  return {
      'input_ids': data_tokenize['input_ids'][:num_chunks-1], 
      'token_type_ids': data_tokenize['token_type_ids'][:num_chunks-1], 
      'attention_mask': data_tokenize['attention_mask'][:num_chunks-1],
  }


# When batched = True, we take in multiple examples
def chunk_and_encode_examples_w_overlap(examples, tokenizer):
  mega_dict = {'attention_mask': [], 'input_ids': [], 'token_type_ids': []}
  for i in range(len(examples['text'])):
    book_sample = {'text': examples['text'][i]}
    dictOfTokenizedChunks = tokenize_w_overlap(book_sample, tokenizer)
    for key, value in dictOfTokenizedChunks.items():
      mega_dict[key].extend(value)
  return mega_dict

In [None]:
from functools import partial
chunk_and_encode_examples_w_overlap = partial(chunk_and_encode_examples_w_overlap, tokenizer=tokenizer)

In [None]:
# from datasets import Dataset
# dataset_test = Dataset.from_dict(dataset['train'][0:10])

In [None]:
chunked_encoded_dataset = dataset.map(chunk_and_encode_examples_w_overlap, remove_columns=dataset.column_names['train'], batched = True)

  0%|          | 0/3 [00:00<?, ?ba/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
chunked_encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 399250
    })
})

In [None]:
num_parts = 15

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pickle

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)  

num_chunks = chunked_encoded_dataset['train'].num_rows
step = num_chunks // num_parts
folder_id = '1sSx_f_1sogMKqwRAWE31FXc0a8rJ0tzQ'
for i in range(num_parts):
  print(i)
  filename = 'train_dataset' + str(i) + '.pkl'
  if i == num_parts - 1:
    dumpme = chunked_encoded_dataset['train'][step*i:]
  else:
    dumpme = chunked_encoded_dataset['train'][step*i:step*(i+1)]
  
  with open(filename, 'wb') as output_file:
    pickle.dump(dumpme, output_file)

  file = drive.CreateFile({'parents':[{u'id': folder_id}]})
  file.SetContentFile(filename)
  file.Upload()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [12]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [15]:
# project_gutenberg, goodreads_maharjan_super
load_path = Path(config['Drive']['drive_base_path']) / 'BookSuccessPredictor' / 'datasets' / 'goodreads_maharjan_super' / 'already_tokenized' / '80_20'

preprocess = 'NERed' if eval(config['Model']['use_ner']) else 'Standard'
load_path = load_path / preprocess

tokenizer_folder = 'OverlapTokenizer' + '/' + config['Tokenizer']['overlap_amt'] if eval(config['Tokenizer']['overlap']) else 'SentenceTokenizer'
load_path = load_path / tokenizer_folder

import re
model_folder = config['Model']['name']
model_folder = model_folder.replace('-base', '')
model_folder = re.sub(r'.*/*(?=\\)\\','', model_folder)
load_path = load_path / model_folder

del preprocess
del tokenizer_folder
del model_folder

In [16]:
load_path

PosixPath('/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/already_tokenized/80_20/NERed/OverlapTokenizer/0/distilbert-uncased')

In [18]:
from datasets import DatasetDict, Dataset, concatenate_datasets
import os, pickle
train_paths = [f for f in os.listdir(load_path) if f.startswith('train') or f.startswith('val') or f.startswith('test')]

train_datasets = []

for trainp in train_paths:
  print(trainp)
  with open(load_path / trainp, "rb") as input_file:
    train_datasets.append(Dataset.from_dict(pickle.load(input_file)))

train_dataset = concatenate_datasets(train_datasets)
del train_datasets

chunked_encoded_dataset = DatasetDict({'train': train_dataset})

train_dataset1.pkl
train_dataset2.pkl
val_dataset1.pkl
val_dataset2.pkl
test_dataset1.pkl
test_dataset2.pkl


In [19]:
chunked_encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'book_title', 'genre', 'input_ids', 'success_label', 'token_type_ids'],
        num_rows: 33940
    })
})

In [20]:
updated_dataset = chunked_encoded_dataset.map(lambda example: {}, remove_columns=['book_title', 'genre', 'success_label'])

  0%|          | 0/33940 [00:00<?, ?ex/s]

In [21]:
updated_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'token_type_ids'],
        num_rows: 33940
    })
})

In [25]:
from transformers import AutoModelForMaskedLM
run = wandb.init()
artifact = run.use_artifact('lucaguarro/goodreads_success_predictor_mlm/model-5v9y1o6j:v0', type='model')
pretrained_model_name_or_path = artifact.download()
model = AutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)
model.resize_token_embeddings(len(tokenizer))

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: Downloading large artifact model-5v9y1o6j:v0, 255.57MB. 3 files... Done. 0:0:0


Embedding(30523, 768, padding_idx=0)

# Pre Training

In [None]:
# Name our project from WandB
%env WANDB_PROJECT=goodreads_success_predictor_mlm
# chunked_encoded_dataset = chunked_encoded_dataset.remove_columns(['book_title', 'genre', 'success_label'])

model_name = config['Model']['name']

from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

env: WANDB_PROJECT=goodreads_success_predictor_mlm


Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Embedding(30523, 768)

In [26]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
from transformers import Trainer, TrainingArguments

num_epoch = 1

wandb_run_name = 'distilbert-mlm-guarro&maharjan_goodreads_NERed_overlap0'

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    wandb_run_name,
    evaluation_strategy = "no",
    learning_rate=2e-5,
    per_device_train_batch_size = 16,
    num_train_epochs = 1,
    # per_device_eval_batch_size=32,
    weight_decay=0.01,
    report_to = "wandb",
    save_strategy = "epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_encoded_dataset['train']
)

trainer.train()

wandb.finish()

The following columns in the training set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: token_type_ids, book_title, genre, success_label.
***** Running training *****
  Num examples = 33940
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2122
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
500,2.3312
1000,2.3187
1500,2.3114
2000,2.3183


Saving model checkpoint to distilbert-mlm-guarro&maharjan_goodreads_NERed_overlap0/checkpoint-2122
Configuration saved in distilbert-mlm-guarro&maharjan_goodreads_NERed_overlap0/checkpoint-2122/config.json
Model weights saved in distilbert-mlm-guarro&maharjan_goodreads_NERed_overlap0/checkpoint-2122/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /tmp/tmpphua2yey
Configuration saved in /tmp/tmpphua2yey/config.json
Model weights saved in /tmp/tmpphua2yey/pytorch_model.bin


VBox(children=(Label(value=' 255.58MB of 255.58MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=…

0,1
train/epoch,▁▃▅▇█
train/global_step,▁▃▅▇█
train/learning_rate,█▆▃▁
train/loss,█▄▁▃
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,1.0
train/global_step,2122.0
train/learning_rate,0.0
train/loss,2.3183
train/total_flos,4499125742161920.0
train/train_loss,2.31798
train/train_runtime,1312.1483
train/train_samples_per_second,25.866
train/train_steps_per_second,1.617
