In [1]:
#| hide
# This notebook is an outline for 10 fold cross validation neural network

# Environment Setup
#! pdm add transformers
#! pdm add datasets
#! pdm add keras==2.6.*
#! pdm add torch==1.8.0 torchtext==0.9.0
#! pdm add torchtext

In [2]:
#| hide
import sys
sys.path.append('../__pypackages__/3.9/lib/')
print(sys.path)

['/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs', '/opt/anaconda3/lib/python39.zip', '/opt/anaconda3/lib/python3.9', '/opt/anaconda3/lib/python3.9/lib-dynload', '', '/afs/crc.nd.edu/user/p/painswor/.local/lib/python3.9/site-packages', '/opt/anaconda3/lib/python3.9/site-packages', '/opt/anaconda3/lib/python3.9/site-packages/IPython/extensions', '/afs/crc.nd.edu/user/p/painswor/.ipython', '../__pypackages__/3.9/lib/']


In [94]:
#| hide
import torch
#from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim


# Selecting Data

In [4]:
import pandas as pd

First lets define where our data is located:

In [5]:
cleaned_data = '../data/cleaned-data'

Now we can import our data

In [6]:
raw_data = pd.read_csv(f'{cleaned_data}/Maintenance_Text_data.csv')
raw_data.shape

(2763, 34)

Select the columns of relevance

In [7]:
df = pd.DataFrame()
df['text'] = raw_data['c119']
df['label'] = raw_data['c78']
df

Unnamed: 0,text,label
0,TAILWHEEL COCKED RIGHT PRIOR TO TKOF. ...,AU
1,TOW PLANE BECAME AIRBORNE THEN SETTLED.STUDENT...,ME
2,"2ND ILS APCH,ACFT'S G/S INOP.LOM TUNED TO WRON...",AU
3,PLT NOTED SOFT R BRAKE PEDAL DRG TAXI TO TKOF....,AU
4,TAXI OFF HARD SFC DUE TFC R MAIN GR BROKE THRO...,AF
...,...,...
2758,(-23) A/C RELOCATED TO NEW HANGAR TO CHECK SIZ...,II
2759,(-23) ON 2/23/08 @ APPROXIMATELY 2130 DURING T...,AF
2760,(-23) PILOT TOOK OFF FOR LEESBURG AIRPORT AND ...,II
2761,(-23) OWNER FORGOT TO FASTEN THE LOWER LEFT 4 ...,II


## Cleaning Data

Check which columns contain NaN values

In [8]:
df.isna().sum()

text     15
label     0
dtype: int64

Remove NaN values

In [9]:
df = df.fillna('Null')
df = df[df['text'] != 'Null']

Check there are no missing values left

In [10]:
df.isna().sum()

text     0
label    0
dtype: int64

Get a quick summary of all non-numeric columns in the dataset

In [11]:
df.describe(include=[object])

Unnamed: 0,text,label
count,2748,2748
unique,2742,8
top,FORCED LANDING AFTER POWER LOSS. FOUND WATER I...,II
freq,4,1942


Remove rows with one occurance

In [37]:
counts = df['label'].value_counts()
df = df[df['label'].isin(counts[counts > 1].index)]

## Encoding Data

In [49]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

In [50]:
ss = StratifiedShuffleSplit(n_splits=10, test_size=0.20, random_state=0)

In [51]:
ss.get_n_splits(df)

10

In [52]:
X, y = df["text"], df["label"]

In [54]:
for i, (train_index, test_index) in enumerate(ss.split(X, y)):
    X_train , X_test = X.iloc[train_index],X.iloc[test_index]
    y_train , y_test = y.iloc[train_index] , y.iloc[test_index]
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.20, random_state=0)
    
    Encoder = LabelEncoder()
    y_train = Encoder.fit_transform(y_train)
    y_test = Encoder.fit_transform(y_test)
    y_val_encode = Encoder.fit_transform(y_val)
    
    final_train = pd.DataFrame({'text':X_train,'label':y_train})
    final_test = pd.DataFrame({'text':X_test,'label':y_test})
    final_val = pd.DataFrame({'text':X_val,'label':y_val_encode})
    
    final_train.to_csv(f'{cleaned_data}/train/FAA-{i}.csv', index=False)
    final_test.to_csv(f'{cleaned_data}/test/FAA-{i}.csv', index=False)
    final_val.to_csv(f'{cleaned_data}/val/FAA-{i}.csv', index=False)
    y_val.to_csv(f'{cleaned_data}/actual/FAA-{i}.csv', index=False)

## Tokenization

In [15]:
from datasets import Dataset,DatasetDict

Create a dataset from our dataframe

In [80]:
df = pd.read_csv(f'{cleaned_data}/train/FAA-0.csv')
df

Unnamed: 0,text,label
0,FORCED LANDING AFTER POWER LOSS. HELICOPTER HA...,5
1,NOSE CARGO COMPARTMENT DOOR CAME OPEN ON TAKEO...,5
2,(-23) PRIOR TO TAXI AND TAKEOFF THE PILOT FAIL...,5
3,(-23) THE PILOT OF N17655 BE-55 DECLARED AN EM...,5
4,"(-23) AMERICAN TRANS AIR, INC. FLIGHT 253 WAS ...",3
...,...,...
1752,MAINT NOT PERFORMED DUE PARTS NOT AVAILABLE. T...,2
1753,AIRCRAFT FAILED PRESSURIZE ON CLIMB OUT. RETUR...,5
1754,LANDED DEEP GRASS. FIRE ERUPTED UNDER THE 206L...,6
1755,FORCED LANDING AFTER ENGINE QUIT. FOUND FROZEN...,5


In [81]:
ds = Dataset.from_pandas(df)
ds.shape

(1757, 2)

Tokenize text column

In [82]:
model_nm = "bert-base-cased"

Create tokenizer

In [95]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
tokz = AutoTokenizer.from_pretrained(model_nm)

loading configuration file config.json from cache at /afs/crc.nd.edu/user/p/painswor/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /afs/crc.nd.edu/user/p/painswor/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33

Tokenize inputs

In [84]:
def tok_func(x):
    return tokz(x["text"], padding="max_length", truncation=True)

tok_ds = ds.map(tok_func, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

## Splitting Dataset

In [85]:
dds = tok_ds.train_test_split(test_size=0.20, seed=0)
dds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1405
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 352
    })
})

In [86]:
train_dataset = dds['train']
eval_dataset = dds['test']

In [87]:
train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1405
})

## Training Our Model

In [88]:
from transformers import TrainingArguments,Trainer

Define batch size, number of epochs, and learning rate

In [89]:
bs = 8
epochs = 2
lr = 5e-5

Define our Training Arguments

In [90]:
args = TrainingArguments(
    'outputs', 
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs, 
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, 
    weight_decay=0.01, 
    report_to='none')

PyTorch: setting up devices


In [96]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=7)
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokz)

loading configuration file config.json from cache at /afs/crc.nd.edu/user/p/painswor/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_t

RuntimeError: false INTERNAL ASSERT FAILED at "../c10/cuda/CUDAGraphsC10Utils.h":73, please report a bug to PyTorch. Unknown CUDA graph CaptureStatus32583

Create our model

In [92]:
trainer.train();

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2198
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 138
  Number of trainable parameters = 108315655


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs/../__pypackages__/3.9/lib/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs/../__pypackages__/3.9/lib/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs/../__pypackages__/3.9/lib/transformers/models/bert/modeling_bert.py", line 1563, in forward
    outputs = self.bert(
  File "/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs/../__pypackages__/3.9/lib/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs/../__pypackages__/3.9/lib/transformers/models/bert/modeling_bert.py", line 1012, in forward
    embedding_output = self.embeddings(
  File "/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs/../__pypackages__/3.9/lib/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs/../__pypackages__/3.9/lib/transformers/models/bert/modeling_bert.py", line 238, in forward
    embeddings = self.dropout(embeddings)
  File "/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs/../__pypackages__/3.9/lib/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs/../__pypackages__/3.9/lib/torch/nn/modules/dropout.py", line 59, in forward
    return F.dropout(input, self.p, self.training, self.inplace)
  File "/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs/../__pypackages__/3.9/lib/torch/nn/functional.py", line 1252, in dropout
    return _VF.dropout_(input, p, training) if inplace else _VF.dropout(input, p, training)
RuntimeError: philox_cuda_state for an unexpected CUDA generator used during capture. In regions captured by CUDA graphs, you may only use the default CUDA RNG generator on the device that's current when capture begins. If you need a non-default (user-supplied) generator, or a generator on another device, please file an issue.


In [90]:
def train_evaluate(full_train_dataset, full_eval_dataset, full_val_dataset, full_heat_map, kfold):
    model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=7)

    # removes unessecary columns
    tf_train_dataset = full_train_dataset.remove_columns(["text"]).with_format("tensorflow")
    tf_eval_dataset = full_eval_dataset.remove_columns(["text"]).with_format("tensorflow")
    tf_val_dataset = full_val_dataset.remove_columns(["text"]).with_format("tensorflow")

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    # batches the datasets
    train_features = {x: tf_train_dataset[x] for x in tokenizer.model_input_names}
    train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, tf_train_dataset["label"]))
    train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(8)

    eval_features = {x: tf_eval_dataset[x] for x in tokenizer.model_input_names}
    eval_tf_dataset = tf.data.Dataset.from_tensor_slices((eval_features, tf_eval_dataset["label"]))
    eval_tf_dataset = eval_tf_dataset.batch(8)

    val_features = {x: tf_val_dataset[x] for x in tokenizer.model_input_names}
    val_tf_dataset = tf.data.Dataset.from_tensor_slices((val_features, tf_val_dataset["label"]))
    val_tf_dataset = val_tf_dataset.batch(8)

    # compile and train the model
    model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=tf.metrics.SparseCategoricalAccuracy(),
    )

    print(f'running model {kfold}')
    history = model.fit(train_tf_dataset, validation_data=eval_tf_dataset, epochs=2)
    print(f'finished model {kfold}')

    # plot figures and save them on individual basis
    acc_train = history.history['sparse_categorical_accuracy']
    acc_val = history.history['val_sparse_categorical_accuracy']

    loss_train = history.history['loss']
    loss_val = history.history['val_loss']

    create_and_save_plots(acc_train, acc_val, 'Accuracy',kfold)
    create_and_save_plots(loss_train, loss_val, 'Loss', kfold)

    print(f'Evaulating model {kfold}')
    # Evaluate the model on the test data using `evaluate`
    results = model.evaluate(val_tf_dataset)

    # Generate predictions (probabilities -- the output of the last layer)
    # on new data using `predict`
    predictions = model.predict(val_tf_dataset)

    # get actual predictions from model.predict
    actual_predictions = []
    for prediction in predictions.logits:
        max = -100
        pred = -1
    for index, val in enumerate(prediction):
        if val > max:
            max = val
            pred = index
    actual_predictions.append(pred)


    # generate heat map + and update full heatmap
    heat_map = np.zeros((7,7), dtype=float)
    val_df = pd.read_csv(f'./split_datasets/val/FAA-{kfold}.csv')
    correct = 0
    for index, item in val_df.iterrows():
        print(item['label'])
        print(actual_predictions[index])

    if item['label'] == actual_predictions[index]:
        correct = correct + 1 
    heat_map[6 - actual_predictions[index]][item['label']] = heat_map[ 6 - actual_predictions[index]][item['label']] + 1
    # full
    full_heat_map[6 - actual_predictions[index]][item['label']] = full_heat_map[ 6 - actual_predictions[index]][item['label']] + 1

    print(correct/len(actual_predictions))
    print("Correct based on my actual predictions: ", correct/len(actual_predictions))

    # normalize heat map
    normalize_save_heat_map(heat_map, kfold)

    return history, results, predictions, heat_map, full_heat_map

In [86]:
def create_and_save_plots(train, val, metric, kfold):
    print(f'plotting {kfold}')  
    epochs = range(1,3)

    plt.plot(epochs, train, 'g', label=f'Training {metric}')
    plt.plot(epochs, val, 'b', label=f'Validation {metric}')

    plt.title(f'Training and Validation {metric}')
    plt.xlabel('Epochs')
    plt.ylabel(f'{metric}')
    plt.legend()
    plt.savefig(f'./plots/FAA-test2{kfold}-{metric}.pdf')
    plt.clf()


In [87]:
def normalize_save_heat_map(heat_map, kfold):
    print(f'heat map {kfold}')  
    # normalize heat map
    for index, category in enumerate(heat_map):
    total = 0
    for val in category:
        total = total + val
    for index_2, val in enumerate(category):
        heat_map[index][index_2] = val / total

    fig, ax = plt.subplots(figsize=(11,9))
    fig.set_tight_layout(True)
    # color map
    labels = ['II','ME','AU','AF','DE','EQ','AI']
    y_labels = ['AI','EQ','DE','AF','AU','ME','II']

    sb.heatmap(heat_map,cmap="Blues",xticklabels=labels, yticklabels=y_labels, annot=True)
    plt.savefig(f'./heatmaps-test2{kfold}.pdf')
    plt.clf()


IndentationError: expected an indented block (692751855.py, line 5)

In [91]:
def create_and_write_log_dict(index, log_dict):
    acc_train = log_dict[1]['history'].history['sparse_categorical_accuracy']
    acc_val = log_dict[1]['history'].history['val_sparse_categorical_accuracy']

    loss_train = log_dict[1]['history'].history['loss']
    loss_val = log_dict[1]['history'].history['val_loss']

    heat_map = log_dict[1]['heat_map']
    full_heat_map = log_dict[1]['full_heat_map']

    temp_dict = {
      index: {
          'acc_train': acc_train,
          'acc_val': acc_val,
          'loss_val': loss_val,
          'loss_train': loss_train,
          'heat_map': heat_map,
          'full_heat_map': full_heat_map
      }
    }

    f = open('./log_dict_file_test.txt', 'a')
    f.write(f'\n\n\n{index}')
    f.write(str(temp_dict))
    f.close()



In [84]:
def once_run(index, log_dict, full_heat_map):
    full_train_dataset, full_eval_dataset, full_val_dataset = tokenize_data(index)
    history, results, predictions, heat_map, full_heat_map = train_evaluate(full_train_dataset, full_eval_dataset, full_val_dataset, full_heat_map, index)
    log_dict[index] = {
      'history': history,
      'results': results,
      'predictions': predictions,
      'heat_map': heat_map,
      'full_heat_map': full_heat_map
    }
    create_and_write_log_dict(index, log_dict)

In [None]:
full_heat_map = np.zeros((7,7), dtype=float)
log_dict = {}
#for index in range(1,10):
one_run(1, log_dict, full_heat_map)