In [24]:
# %pip install transformers torch scikit-learn tensorflow --user

In [25]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle

# Based on data from pytorch and hugging face tutorials
# https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python

import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import random
from sklearn.model_selection import train_test_split

In [26]:
# Function copied from online tutorial to help reproducibility.
# https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python

def set_seed(seed: int):
    """
    Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the CPU generator and the generators of all CUDA devices).

    Args:
        seed (int): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    # torch is imported unconditionally at the top of the notebook, so an
    # availability check before seeding it could never be False here.
    torch.manual_seed(seed)
    # Safe to call even if CUDA is not available.
    torch.cuda.manual_seed_all(seed)


set_seed(1)

In [27]:
# Locations of the WikiLarge CSV files, relative to the working directory.
wiki_train, wiki_test = (
    "datasets/WikiLarge_Train.csv",
    "datasets/WikiLarge_Test.csv",
)

In [28]:
# Read the training CSV into a DataFrame, print its (rows, columns) shape,
# and show the first rows as the cell's output.
train_df = pd.read_csv(wiki_train)
print(train_df.shape)
train_df.head()

(416768, 2)


Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1


In [29]:
# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
# The held-out part (X_test / y_test) is what later cells tokenize and use as
# the evaluation dataset.
X_train, X_test, y_train, y_test = train_test_split(
    train_df["original_text"],
    train_df["label"],
    test_size=0.2,
    random_state=1,
)

# Display names for the classes; the length of this list is passed to the
# model as its number of labels.
# NOTE(review): presumably index 0 = "Simple" and 1 = "Complex" — confirm against the data.
label_names = ["Simple", "Complex"]

In [30]:
# Turn each split from a pandas Series into a plain Python list in one pass.
X_train, X_test, y_train, y_test = [
    split.tolist() for split in (X_train, X_test, y_train, y_test)
]

In [31]:
# Pretrained checkpoint to fine-tune, and the maximum token length handed to
# the tokenizer.
model_name, max_length = "bert-base-uncased", 128

In [32]:
# Load the fast tokenizer that belongs to the chosen checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(
    model_name,
    do_lower_case=True,
)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=466062, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=28, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Downloading', max=570, style=ProgressStyle(description_width=…




In [33]:
# Both splits are tokenized with the same settings: truncate to max_length and
# pad within each tokenizer call.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=max_length)

train_encodings = tokenizer(X_train, **tokenize_kwargs)
valid_encodings = tokenizer(X_test, **tokenize_kwargs)

In [34]:
class NewsGroupsDataset(torch.utils.data.Dataset):
    """Torch dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example sequence of values
            (anything exposing ``.items()`` whose values can be indexed).
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is wrapped in a list, so the resulting tensor has shape (1,).
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

# Wrap each tokenized split, together with its labels, in a torch Dataset.
train_dataset = NewsGroupsDataset(encodings=train_encodings, labels=y_train)
valid_dataset = NewsGroupsDataset(encodings=valid_encodings, labels=y_test)

In [38]:
# Use the GPU when one is present and fall back to the CPU otherwise, so this
# cell no longer fails on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained checkpoint with a classification head sized to the
# number of class names, then move it to the selected device.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
).to(device)

In [15]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (scores whose arg-max over the last axis is taken
            as the predicted class).

    Returns:
        dict: A single-entry mapping with the key ``"accuracy"``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    # Accuracy is delegated to sklearn's accuracy_score.
    return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}

In [16]:
# Logging and checkpoint saving share one step interval; with the "steps"
# evaluation strategy, evaluation runs every `logging_steps` as well.
STEP_INTERVAL = 5000

# NOTE(review): no `seed` is passed here, so the Trainer presumably applies its
# own default seed rather than the value given to set_seed — confirm intended.
training_args = TrainingArguments(
    # --- where artifacts go ---
    output_dir="results",
    logging_dir="logs",
    # --- optimisation schedule ---
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=20,
    warmup_steps=600,    # warmup steps for the learning-rate scheduler
    weight_decay=0.01,
    # --- logging / evaluation / checkpointing cadence ---
    evaluation_strategy="steps",
    logging_steps=STEP_INTERVAL,
    save_steps=STEP_INTERVAL,
    # Reload the best checkpoint when training finishes (default metric is
    # loss; `metric_for_best_model` can switch it to accuracy or another metric).
    load_best_model_at_end=True,
)

In [17]:
# Bundle the model, its training configuration, both datasets and the metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the Trainer configured above; progress, periodic
# evaluation results and checkpoint messages appear in the cell output.
trainer.train()

***** Running training *****
  Num examples = 333414
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 62517


Step,Training Loss,Validation Loss,Accuracy
5000,0.5575,0.525862,0.732706


***** Running Evaluation *****
  Num examples = 83354
  Batch size = 20
Saving model checkpoint to /content/drive/Shareddrives/Milestone 2 - James Mete - Matt Dannheisser/notebooks/results/checkpoint-5000
Configuration saved in /content/drive/Shareddrives/Milestone 2 - James Mete - Matt Dannheisser/notebooks/results/checkpoint-5000/config.json
Model weights saved in /content/drive/Shareddrives/Milestone 2 - James Mete - Matt Dannheisser/notebooks/results/checkpoint-5000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 83354
  Batch size = 20


In [None]:
# Run an evaluation pass with the Trainer; the returned value is shown as the
# cell's output.
trainer.evaluate()

In [36]:
# Save the model and the tokenizer into the same directory.
# NOTE(review): the recorded output of this cell is a NameError for `model`, and
# its execution count (36) is lower than that of the cell that creates `model`
# (38), so it appears to have been run before `model` existed in the kernel.
# Re-run it after the model has been created and trained.
model_path = "models/text-difficulty-bert-base-uncased"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

NameError: name 'model' is not defined