# Installs, Imports, Drive Connection, Configuration, WandB Connection

##### Installs

In [1]:
!pip install transformers
!pip install datasets
!pip install -U PyYAML
!pip install wandb

Collecting transformers
  Downloading transformers-4.8.2-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 10.0 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.8 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 47.7 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.12 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.8.2
Collecting datasets
  Downloading datasets-1.9.0-py3-none-any.whl (262 kB)
[K     |████████████████████████████████| 262 kB 8.9 MB/s 
Collecting fsspec>=2021.05.0
  Downloading fsspec-

##### Imports

In [3]:
from pathlib import Path
from datasets import load_dataset

##### Drive Connection

In [4]:
# Mount Google Drive inside the Colab VM; the project config file read later
# lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


##### Get Configuration

In [5]:
import configparser

# Location of the project configuration file on the mounted Drive.
CONFIG_PATH = '/content/drive/MyDrive/Thesis/BookSuccessPredictor/config.ini'

config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail fast here so a missing or unmounted config
# does not surface later as a confusing KeyError on the first lookup.
loaded_files = config.read(CONFIG_PATH)
if not loaded_files:
    raise FileNotFoundError(f'Could not read configuration file: {CONFIG_PATH}')

# Keep the list of parsed files as the cell output.
loaded_files

['/content/drive/MyDrive/Thesis/BookSuccessPredictor/config.ini']

##### WandB Connection

In [7]:
# WANDB_LOG_MODEL=true saves our models to artifacts in WandB;
# WANDB_PROJECT names the WandB project used for the runs.
import wandb
%env WANDB_LOG_MODEL=true
%env WANDB_PROJECT=goodreads_pretrained_models

env: WANDB_LOG_MODEL=true
env: WANDB_PROJECT=goodreads_pretrained_models


In [8]:
# Log in to WandB with the API key kept in the [WandB] section of the config
# file, so the key never appears in the notebook itself.
wandb_api_key = config['WandB']['api_key']
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Dataset Generator

### Load Text Data

#### goodreads_maharjan

Used for within-task pre-training.

In [10]:
# Resolve the dataset directory from the config, then load the dataset through
# the loading script stored in that directory.
base_path = Path(config['Datasets']['nered_goodreads_maharjan_path'])
loading_script = base_path / 'goodreadsnered.py'
dataset = load_dataset(str(loading_script))

KeyError: ignored

#### goodreads_guarro

Used for in-domain pre-training.

In [None]:
# Resolve the dataset directory from the config, then load the dataset through
# the loading script stored in that directory.
# NOTE: this rebinds `dataset`, replacing whatever an earlier cell loaded.
base_path = Path(config['Datasets']['goodreads_guarro_path'])
loading_script = base_path / 'goodreads_guarro_loading_script.py'
dataset = load_dataset(str(loading_script))

### Tokenization

In [2]:
from transformers import AutoTokenizer

In [None]:
# Parse the flag with configparser's own boolean converter instead of eval():
# eval() would execute arbitrary text from the config file, while getboolean()
# only accepts true/false, yes/no, on/off and 1/0 (case-insensitive) and raises
# on anything else or on a missing option.
use_ner = config.getboolean('Model', 'use_ner')

# When NER preprocessing is enabled, '[CHARACTER]' is registered as an
# additional special token of the tokenizer.
tokenizer_kwargs = {'additional_special_tokens': ['[CHARACTER]']} if use_ner else {}
tokenizer = AutoTokenizer.from_pretrained(config['Model']['name'], **tokenizer_kwargs)

In [None]:
from tokenization_algos import chunk_and_encode_examples_w_complete_sentences, chunk_and_encode_examples_w_overlap

In [None]:
# Chunk and encode the examples in batches. Every column of the original
# 'train' split is removed, so the result keeps only what the chunking
# function returns.
source_columns = dataset.column_names['train']
chunked_encoded_dataset = dataset.map(
    chunk_and_encode_examples_w_overlap,
    batched=True,
    remove_columns=source_columns,
)

In [None]:
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

num_segments = 2

# Authenticate the Colab user first, then build a PyDrive client from the
# resulting application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, which an earlier cell imported as the
# `google.colab.drive` module — confirm nothing later relies on the module.
drive = GoogleDrive(gauth)



# Pre Training

In [None]:
# Name our project from WandB
%env WANDB_PROJECT=goodreads_success_predictor
chunked_encoded_dataset = chunked_encoded_dataset.remove_columns(['book_title', 'genre', 'success_label'])

model_name = config['Model']['name']

from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

In [None]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Number of passes over the training split.
num_epoch = 1

# Used both as the Trainer output directory (first positional argument of
# TrainingArguments) and as the identifier of this pre-training run.
wandb_run_name = 'distilbert-mlm-guarro_goodreads'

# Masks 15% of the tokens in every batch for the masked-language-model objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Per-epoch evaluation needs an evaluation dataset; without one the Trainer
# raises when it tries to evaluate. Evaluate only when a 'validation' split
# exists, otherwise disable evaluation.
eval_dataset = chunked_encoded_dataset.get('validation')

training_args = TrainingArguments(
    wandb_run_name,
    evaluation_strategy="epoch" if eval_dataset is not None else "no",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=num_epoch,
    # per_device_eval_batch_size=32,
    weight_decay=0.01,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=chunked_encoded_dataset['train'],
    eval_dataset=eval_dataset,
)

# Always close the WandB run, even if training is interrupted or fails.
try:
    trainer.train()
finally:
    wandb.finish()