In [2]:
#| hide
# This notebook is an outline for training a neural network with 10-fold cross-validation
# Environment Setup
#! pdm add transformers
#! pdm add datasets
#! pdm add keras==2.6.*
#! pdm add torch==1.8.0 torchtext==0.9.0
#! pdm add torchtext

In [3]:
#| hide
import sys

# Make the project-local packages under `__pypackages__` importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to `sys.path` every time.
pypackages_lib = '../__pypackages__/3.9/lib/'
if pypackages_lib not in sys.path:
    sys.path.append(pypackages_lib)
print(sys.path)

['/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs', '/opt/anaconda3/lib/python39.zip', '/opt/anaconda3/lib/python3.9', '/opt/anaconda3/lib/python3.9/lib-dynload', '', '/afs/crc.nd.edu/user/p/painswor/.local/lib/python3.9/site-packages', '/opt/anaconda3/lib/python3.9/site-packages', '/opt/anaconda3/lib/python3.9/site-packages/IPython/extensions', '/afs/crc.nd.edu/user/p/painswor/.ipython', '../__pypackages__/3.9/lib/']


# Training Model

In [4]:
# Root directory of the cleaned CSVs; the train/, test/ and val/ sub-folders are read from here.
cleaned_data: str = "../data/cleaned-data"

## Preprocessing

In [5]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from datasets import Dataset,DatasetDict,load_dataset
from transformers import AutoModelForSequenceClassification,AutoTokenizer

2023-02-27 12:23:37.477282: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Select which fold (`kfold`) to train the model on

In [6]:
# Fold number; it selects the FAA-{kfold}.csv file of every split.
# Presumably ranges over the 10 cross-validation folds -- confirm against the data directory.
kfold: int = 1

Read kfold data into dataset

In [7]:
# Every split lives in its own sub-directory of `cleaned_data`, with one CSV per fold.
data_files = {
    split: [f'{cleaned_data}/{split}/FAA-{kfold}.csv']
    for split in ('train', 'test', 'val')
}
raw_datasets = load_dataset("csv", data_files=data_files)

Using custom data configuration default-40f85013f7ddcf10
Found cached dataset csv (/afs/crc.nd.edu/user/p/painswor/.cache/huggingface/datasets/csv/default-40f85013f7ddcf10/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
# Display the loaded splits with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1757
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 550
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 440
    })
})

Tokenize text column

In [9]:
# Name of the pretrained checkpoint passed to `AutoTokenizer.from_pretrained`.
model_nm: str = 'bert-base-cased'

Create tokenizer

In [10]:
# Load the pretrained tokenizer for the `model_nm` checkpoint.
tokz = AutoTokenizer.from_pretrained(model_nm)

Tokenize inputs

In [11]:
def tok_func(x):
    """Tokenize the ``text`` field of a batch with the notebook-level tokenizer ``tokz``.

    Sequences are truncated and padded with ``padding="max_length"``, so every
    example ends up with the same length.

    Args:
        x: a batch from ``Dataset.map(..., batched=True)``; only ``x["text"]`` is read.

    Returns:
        Whatever ``tokz`` returns for the batch of texts.
    """
    texts = x["text"]
    return tokz(texts, truncation=True, padding="max_length")


# Apply the tokenizer to every split in batches.
tokenized_datasets = raw_datasets.map(tok_func, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Define datasets for training

In [12]:
# NOTE: the "eval" name is bound to the *test* split; "val" is the validation split.
full_train_dataset, full_eval_dataset, full_val_dataset = (
    tokenized_datasets[split] for split in ("train", "test", "val")
)

## Train and Evaluate Model

In [13]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

In [14]:
# Load the checkpoint named by `model_nm` (the same name the tokenizer was loaded
# from) instead of repeating the string literal, so the two cannot drift apart.
# num_labels=7 sets the size of the classification head
# (presumably the number of distinct labels in the FAA data -- confirm).
model = TFAutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=7)

2023-02-27 12:23:51.639036: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-27 12:23:51.641059: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def _as_tf_split(split_dataset):
    """Drop the raw ``text`` column and switch the split's output format to TensorFlow."""
    without_text = split_dataset.remove_columns(["text"])
    return without_text.with_format("tensorflow")


tf_train_dataset = _as_tf_split(full_train_dataset)
tf_eval_dataset = _as_tf_split(full_eval_dataset)
tf_val_dataset = _as_tf_split(full_val_dataset)

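Build batched `tf.data` datasets for each split (inputs were already padded to a fixed length during tokenization, so no data collator or dynamic padding is used here)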

In [16]:
# Load the tokenizer from `model_nm` rather than a repeated string literal so it
# always matches the checkpoint configured earlier in the notebook.
# Below it is only consulted for `model_input_names`.
tokenizer = AutoTokenizer.from_pretrained(model_nm)

In [17]:
# Keep only the columns named in `tokenizer.model_input_names`, pair them with the labels,
# then shuffle with a buffer as large as the whole split and batch by 8.
train_features = {name: tf_train_dataset[name] for name in tokenizer.model_input_names}
train_labels = tf_train_dataset["label"]
train_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((train_features, train_labels))
    .shuffle(len(tf_train_dataset))
    .batch(8)
)

In [18]:
# Same construction for the "eval" data (built from the test split); no shuffling.
eval_features = {name: tf_eval_dataset[name] for name in tokenizer.model_input_names}
eval_labels = tf_eval_dataset["label"]
eval_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((eval_features, eval_labels))
    .batch(8)
)

In [19]:
# Same construction for the validation split; order is preserved (no shuffling).
val_features = {name: tf_val_dataset[name] for name in tokenizer.model_input_names}
val_labels = tf_val_dataset["label"]
val_tf_dataset = (
    tf.data.Dataset.from_tensor_slices((val_features, val_labels))
    .batch(8)
)

In [20]:
# Training configuration: Adam at 5e-5, sparse cross-entropy computed on logits
# (from_logits=True), and sparse categorical accuracy as the tracked metric.
adam = tf.keras.optimizers.Adam(learning_rate=5e-5)
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
sparse_acc = tf.metrics.SparseCategoricalAccuracy()

model.compile(optimizer=adam, loss=sparse_ce, metrics=sparse_acc)

In [21]:
# NOTE(review): `eval_tf_dataset` is built from the "test" CSV, so the test split is what
# gets monitored during training, while the "val" split is kept for the final evaluation.
# Confirm this assignment of roles is intended.
history = model.fit(
    train_tf_dataset,
    validation_data=eval_tf_dataset,
    epochs=2,
)

Epoch 1/2
  1/220 [..............................] - ETA: 2:29:33 - loss: 2.5610 - sparse_categorical_accuracy: 0.0000e+00

KeyboardInterrupt: 

In [None]:
# Accuracy values recorded in the training history (training and validation data).
fit_metrics = history.history
acc_train = fit_metrics['sparse_categorical_accuracy']
acc_val = fit_metrics['val_sparse_categorical_accuracy']

In [None]:
# Loss values recorded in the training history (training and validation data).
loss_train, loss_val = (history.history[key] for key in ('loss', 'val_loss'))

In [None]:
# NOTE(review): `create_and_save_plots` is not defined in the visible part of this notebook;
# it must be defined or imported before this cell runs.
curves_to_plot = (
    (acc_train, acc_val, 'Accuracy'),
    (loss_train, loss_val, 'Loss'),
)
for train_curve, val_curve, metric_name in curves_to_plot:
    create_and_save_plots(train_curve, val_curve, metric_name, kfold)

In [None]:
# Evaluate the trained model on `val_tf_dataset` (the batched "val" split).
results = model.evaluate(val_tf_dataset)

In [None]:
# Run inference on the same validation batches; the per-class scores are read
# from `predictions.logits` afterwards.
predictions = model.predict(val_tf_dataset)

In [None]:
# Turn every row of logits into a predicted class index (argmax).
# - The original loop was indented under the assignment, which is an IndentationError.
# - It also shadowed the builtin `max` and started from a -100 sentinel, so a row whose
#   logits were all <= -100 would have produced the invalid class -1.
# `max` over the indices returns the first position of the largest value, which matches
# the strict ">" scan on ties; `default=-1` keeps the old result for an empty row.
actual_predictions = [
    max(range(len(row)), key=row.__getitem__, default=-1)
    for row in predictions.logits
]

In [103]:
# Reload the validation CSV for this fold (the same file used for the "val" split)
# to get the ground-truth labels in file order.
val_df = pd.read_csv(f'{cleaned_data}/val/FAA-{kfold}.csv')

In [105]:
# Count how many validation rows were classified correctly.
# `correct` is computed from scratch: the original loop incremented a counter that was
# never initialised (NameError on a fresh kernel) and double-counted when re-run.
# Labels and predictions are paired by position, so they must have the same length.
assert len(val_df) == len(actual_predictions), "predictions do not line up with val_df rows"
correct = sum(
    int(label == pred)
    for label, pred in zip(val_df['label'], actual_predictions)
)
    
    

In [106]:
# Fraction of validation examples whose predicted class equals the label.
accuracy = correct / len(actual_predictions)
print("Correct based on my actual predictions: ", accuracy)

Correct based on my actual predictions:  0.2818181818181818


In [107]:
# Show the raw number of correctly classified validation examples.
correct

124