## Video Tutorial

This notebook is accompanied by a video tutorial; check it out [here](https://www.youtube.com/watch?v=iiwEW-sg9KE&list=PL_49VD9KwQ_OJCqZOeOlSUQKcr1MyifOc&index=2).

In [1]:
# Index-based equivalent of the installs below, kept for reference:
# !pip install datasets
# !pip uninstall fsspec -y
# !pip install fsspec==2021.5.0

# Install `datasets` and its dependencies from the wheel files shipped in the
# attached /kaggle/input/huggingfaces dataset (presumably because the notebook
# runs without internet access -- TODO confirm). The trailing `*` lets the
# shell match whichever wheel version is present in that directory.
!pip install \
    /kaggle/input/huggingfaces/datasets/datasets* \
    /kaggle/input/huggingfaces/datasets/huggingface_hub* \
    /kaggle/input/huggingfaces/datasets/tqdm* \
    /kaggle/input/huggingfaces/datasets/xxhash*
# Replace the preinstalled fsspec with the wheel bundled next to `datasets`.
!pip uninstall fsspec -y
!pip install /kaggle/input/huggingfaces/datasets/fsspec*

Processing /kaggle/input/huggingfaces/datasets/datasets-1.6.2-py3-none-any.whl
Processing /kaggle/input/huggingfaces/datasets/huggingface_hub-0.0.9-py3-none-any.whl
Processing /kaggle/input/huggingfaces/datasets/tqdm-4.49.0-py2.py3-none-any.whl
Processing /kaggle/input/huggingfaces/datasets/xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl
Installing collected packages: tqdm, xxhash, huggingface-hub, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.59.0
    Uninstalling tqdm-4.59.0:
      Successfully uninstalled tqdm-4.59.0
Successfully installed datasets-1.6.2 huggingface-hub-0.0.9 tqdm-4.49.0 xxhash-2.0.2
Found existing installation: fsspec 0.8.7
Uninstalling fsspec-0.8.7:
  Successfully uninstalled fsspec-0.8.7
Processing /kaggle/input/huggingfaces/datasets/fsspec-2021.5.0-py3-none-any.whl
Installing collected packages: fsspec
Successfully installed fsspec-2021.5.0


In [2]:
# Turn off the Weights & Biases integration for the Trainer.
# NOTE(review): transformers reports this environment variable as deprecated
# and recommends the `report_to` training argument instead.
%env WANDB_DISABLED=true

env: WANDB_DISABLED=true


In [3]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import torch
from torch import nn

# Apply seaborn's default plot theme and render matplotlib figures inline.
sns.set()
%matplotlib inline

In [4]:
# Resolve the competition CSV paths once; later cells reuse these paths.
data_dir = '/kaggle/input/commonlitreadabilityprize'
train_data_path, test_data_path = (
    os.path.join(data_dir, csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Load both splits into pandas DataFrames.
train_df, test_df = pd.read_csv(train_data_path), pd.read_csv(test_data_path)

# Show the row count of each split, one per line.
print(len(train_df), len(test_df), sep='\n')

2834
7


In [5]:
# Local copy of the pretrained `bert-base-cased` checkpoint.
huggingface_dir = '/kaggle/input/huggingface-bert'
model_dir = os.path.join(huggingface_dir, 'bert-base-cased')

# Load the tokenizer first, then the sequence-classification model, both from
# the same checkpoint directory.
tokenizer, model = (
    BertTokenizer.from_pretrained(model_dir),
    BertForSequenceClassification.from_pretrained(model_dir),
)

Some weights of the model checkpoint at /kaggle/input/huggingface-bert/bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [6]:
# Replace the classification head with a single-output regression head.
# The input width is read from the model config rather than hard-coded, so the
# cell keeps working if a checkpoint with a different hidden size is loaded.
model.classifier = nn.Linear(model.config.hidden_size, 1)

# With one label the model computes an MSE (regression) loss. Update both the
# module attribute and the config so they agree with the new head; otherwise
# the config would still describe the original label count.
model.num_labels = 1
model.config.num_labels = 1

In [7]:
# Smoke test: run one sentence through the model with a dummy target to check
# that the regression head produces a loss and a single logit.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# Regression targets must be floating point; an integer tensor here makes the
# MSE loss mix int64 with float32. Shape stays (1, 1): batch size 1, one target.
labels = torch.tensor([[0.0]])
inputs['labels'] = labels
outputs = model(**inputs)
loss = outputs.loss
logits = outputs.logits

print(outputs)
print(loss)
print(logits)

SequenceClassifierOutput(loss=tensor(1.5078, grad_fn=<MseLossBackward>), logits=tensor([[-1.2279]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
tensor(1.5078, grad_fn=<MseLossBackward>)
tensor([[-1.2279]], grad_fn=<AddmmBackward>)


In [8]:
# Read each CSV through the `datasets` CSV loader. Each call returns a
# dict-like object whose rows are accessed under the 'train' key later on.
train_datasets = load_dataset(
    'csv',
    data_files=[train_data_path],
)
test_datasets = load_dataset(
    'csv',
    data_files=[test_data_path],
)

Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-f73cb8e70052aef8/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-f73cb8e70052aef8/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0. Subsequent calls will reuse this data.
Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-26900ac0626744ab/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-26900ac0626744ab/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0. Subsequent calls will reuse this data.


In [9]:
def tokenize_function(examples):
    """Tokenize the 'excerpt' field of `examples` with the notebook tokenizer.

    Every sequence is padded to, and truncated at, 512 tokens so that all
    rows end up with the same length.
    """
    tokenizer_kwargs = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(examples['excerpt'], **tokenizer_kwargs)

# Training data: tokenize, drop the columns the model does not consume,
# expose the regression target under the name 'labels', then shuffle with a
# fixed seed.
f_train_datasets = (
    train_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['id', 'url_legal', 'license', 'excerpt', 'standard_error'])
    .rename_column('target', 'labels')
    .shuffle(seed=42)
)

# Test data: tokenize and drop unused columns. 'id' is kept because it is
# needed to build the submission file.
f_test_datasets = (
    test_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['url_legal', 'license', 'excerpt'])
)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [10]:
# The training rows were shuffled when the dataset was prepared, so a plain
# positional cut gives a random 90% / 10% train / eval split.
shuffled_train = f_train_datasets['train']
n_samples = len(shuffled_train)
n_train = int(0.9 * n_samples)

f_train_dataset = shuffled_train.select(range(0, n_train))
f_eval_dataset = shuffled_train.select(range(n_train, n_samples))

# The test CSV rows are also stored under the 'train' key.
f_test_dataset = f_test_datasets['train']

In [11]:
def compute_metrics(eval_pred):
    """Return the root-mean-squared error between predictions and labels.

    Args:
        eval_pred: A pair ``(logits, labels)`` of array-like objects that
            support ``.squeeze()``.

    Returns:
        A dict with the single key ``'RMSE'``.
    """
    predictions, targets = eval_pred
    # Squeeze both sides first so that an (n, 1) prediction array lines up
    # with an (n,) label array instead of broadcasting to (n, n).
    residuals = targets.squeeze() - predictions.squeeze()
    return {'RMSE': np.sqrt(np.mean(residuals ** 2))}

In [12]:
# os.environ['WANDB_API_KEY'] = '<REDACTED>'  # never commit a real API key; read it from a secret store and rotate any key that was published

In [13]:
# Training configuration.
# - The first positional argument is the output directory.
# - With evaluation_strategy='steps' and no explicit eval_steps, evaluation is
#   reported at the same 200-step cadence as logging.
# - report_to='none' switches off all logging integrations (e.g. W&B)
#   explicitly, instead of relying on the deprecated WANDB_DISABLED variable.
training_args = TrainingArguments(
    'training_args',
    num_train_epochs = 5,
    logging_steps = 200,
    learning_rate = 1e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    evaluation_strategy = 'steps',
    report_to = 'none'
)

# Bundle the model, the train / eval splits and the RMSE metric into a Trainer.
trainer = Trainer(
    model = model,
    train_dataset = f_train_dataset,
    eval_dataset = f_eval_dataset,
    compute_metrics = compute_metrics,
    args = training_args
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [14]:
# for x in f_train_dataset:
#     a = len(x['input_ids'])
#     print(a)

# # [len(v) for v in f_train_dataset[0].values()]

In [15]:
# Fine-tune the model. Training loss and validation metrics (including RMSE)
# are reported every `logging_steps` steps.
trainer.train()

Step,Training Loss,Validation Loss,Rmse,Runtime,Samples Per Second
200,0.6778,0.556667,0.746101,5.2927,53.659
400,0.5321,0.675846,0.822099,5.2902,53.684
600,0.373,0.526141,0.725356,5.2639,53.952
800,0.2972,0.530737,0.728517,5.2739,53.85
1000,0.253,0.614836,0.784114,5.2628,53.964
1200,0.192,0.397765,0.630686,5.3062,53.522
1400,0.1581,0.504232,0.710093,5.2795,53.793


TrainOutput(global_step=1595, training_loss=0.32731007662686434, metrics={'train_runtime': 1151.9769, 'train_samples_per_second': 1.385, 'total_flos': 4242326853888000.0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 1805897728, 'init_mem_gpu_alloc_delta': 433773056, 'init_mem_cpu_peaked_delta': 88604672, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 244830208, 'train_mem_gpu_alloc_delta': 1361674752, 'train_mem_cpu_peaked_delta': 267206656, 'train_mem_gpu_peaked_delta': 6504080384})

In [16]:
# Score the fine-tuned model on the held-out eval split; the returned dict
# includes 'eval_loss' and 'eval_RMSE'.
trainer.evaluate()

{'eval_loss': 0.3960549235343933,
 'eval_RMSE': 0.6293289661407471,
 'eval_runtime': 5.3445,
 'eval_samples_per_second': 53.139,
 'epoch': 5.0,
 'eval_mem_cpu_alloc_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_peaked_delta': 289528832}

In [17]:
# model.save_pretrained('model_v1')

In [18]:
# Predict a readability score for every test row.
pred_output = trainer.predict(f_test_dataset)

# The regression head emits one value per row. Flatten with reshape(-1)
# rather than squeeze(): squeeze() would collapse a single-row test set to a
# 0-d array that no longer lines up with the list of ids.
pred_targets = pred_output.predictions.reshape(-1)
pred_ids = f_test_dataset['id']
assert len(pred_ids) == len(pred_targets), 'expected one prediction per test id'

# Write the submission file with one (id, target) row per excerpt.
submission = pd.DataFrame({
    'id': pred_ids,
    'target': pred_targets
})

submission.to_csv('submission.csv', index=False)