In [2]:
#| hide
# This notebook is an outline for 10 fold cross validation neural network
# Environment Setup
#! pdm add transformers
#! pdm add datasets
#! pdm add keras==2.6.*
#! pdm add torch==1.8.0 torchtext==0.9.0
#! pdm add torchtext

In [21]:
#| hide
import sys
sys.path.append('../__pypackages__/3.9/lib/')
print(sys.path)

['/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs', '/opt/anaconda3/lib/python39.zip', '/opt/anaconda3/lib/python3.9', '/opt/anaconda3/lib/python3.9/lib-dynload', '', '/afs/crc.nd.edu/user/p/painswor/.local/lib/python3.9/site-packages', '/opt/anaconda3/lib/python3.9/site-packages', '/opt/anaconda3/lib/python3.9/site-packages/IPython/extensions', '/afs/crc.nd.edu/user/p/painswor/.ipython', '../__pypackages__/3.9/lib/', '/afs/crc.nd.edu/user/p/painswor/.cache/huggingface/modules', '../__pypackages__/3.9/lib/']


# Training Model

In [22]:
cleaned_data = '../data/processed-data/nn'

## Preprocessing

In [23]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from datasets import Dataset,DatasetDict,load_dataset
from transformers import AutoModelForSequenceClassification,AutoTokenizer

Set kfold to train model

In [6]:
kfold = 1

Read kfold data into dataset

In [7]:
raw_datasets = load_dataset("csv",data_files={'train': [f'{cleaned_data}/train/FAA-{kfold}.csv'], 'test': [f'{cleaned_data}/test/FAA-{kfold}.csv'],
                                                'val': [f'{cleaned_data}/val/FAA-{kfold}.csv']})

Using custom data configuration default-353e60438da98b3f


Downloading and preparing dataset csv/default to /afs/crc.nd.edu/user/p/painswor/.cache/huggingface/datasets/csv/default-353e60438da98b3f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /afs/crc.nd.edu/user/p/painswor/.cache/huggingface/datasets/csv/default-353e60438da98b3f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1757
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 550
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 440
    })
})

In [9]:
raw_datasets['train'][0]

{'text': 'WIND RAISED LIGHT SLING LOAD . CABLE HIT TAILROTOR. MADE HARD AUTOROTATE LANDING.                                  ',
 'label': 6}

Tokenize text column

In [10]:
model_nm = "bert-base-cased"

Create tokenizer

In [11]:
tokz = AutoTokenizer.from_pretrained(model_nm)

Tokenize inputs

In [12]:
def tok_func(x):
    return tokz(x["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tok_func, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Define datasets for training

In [13]:
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]
full_val_dataset = tokenized_datasets["val"]

In [14]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokz)

In [15]:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

## Train and Evaluate Model

In [16]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [17]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=7)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [18]:
training_args = TrainingArguments(
    output_dir="../output/",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
    tokenizer=tokz,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [20]:
history = trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1757
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 110
  Number of trainable parameters = 108315655
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [22]:
trainer.save_model("../output/model")

Saving model checkpoint to ../model
Configuration saved in ../model/config.json
Model weights saved in ../model/pytorch_model.bin
tokenizer config file saved in ../model/tokenizer_config.json
Special tokens file saved in ../model/special_tokens_map.json


In [24]:
full_val_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 440
})

In [28]:
history

TrainOutput(global_step=110, training_loss=0.7636618874289772, metrics={'train_runtime': 89.0243, 'train_samples_per_second': 39.472, 'train_steps_per_second': 1.236, 'total_flos': 924613755340800.0, 'train_loss': 0.7636618874289772, 'epoch': 2.0})