In [1]:
# This notebook is an outline for 10 fold cross validation neural network

# Dataset
# dataset used for this project must be generated from Subsets/Maintenance_Text_data.csv then processed into "text" & "label" 
# columns with the NN_df_format.ipynb
# it uses 7 different maintance labels of crash codes
# the directory structure of this directory is needed to be able to save the appropriatte
# heat maps and plots

# Written by Joseph McCombs
# Please contact for support or questions
#Edited by Lindsey Michie: Nov 2022

# Environment Setup
#! pip install transformers
#! pip install datasets
#! pip install keras==2.6.* # downgrade from 2.9
#!pip install -U torch==1.8.0 torchtext==0.9.0 #Needed this verison of PyTorch and torchtext
#!pip install -U torchtext

In [2]:
#| hide
import sys
sys.path.append('../__pypackages__/3.9/lib/')
print(sys.path)

['/afs/crc.nd.edu/group/TAI/Users/painswor/nbdev-framework-example/nbs', '/opt/anaconda3/lib/python39.zip', '/opt/anaconda3/lib/python3.9', '/opt/anaconda3/lib/python3.9/lib-dynload', '', '/afs/crc.nd.edu/user/p/painswor/.local/lib/python3.9/site-packages', '/opt/anaconda3/lib/python3.9/site-packages', '/opt/anaconda3/lib/python3.9/site-packages/IPython/extensions', '/afs/crc.nd.edu/user/p/painswor/.ipython', '../__pypackages__/3.9/lib/']


In [3]:
import transformers; print(transformers.__version__)

2023-02-27 12:11:11.677923: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


4.26.1


Import Libraries needed

In [4]:
import pandas as pd

In [15]:
import torch
#from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns


from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
import numpy as np
import pandas as pd
from datasets import load_dataset

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import seaborn as sb

In [6]:
cleaned_data = '../data/cleaned-data'

In [7]:
raw_data = pd.read_csv(f'{cleaned_data}/Maintenance_Text_data.csv')
raw_data.shape

(2763, 34)

In [8]:
df = pd.DataFrame()
df['text'] = raw_data['c119']
df['label'] = raw_data['c78']
df

Unnamed: 0,text,label
0,TAILWHEEL COCKED RIGHT PRIOR TO TKOF. ...,AU
1,TOW PLANE BECAME AIRBORNE THEN SETTLED.STUDENT...,ME
2,"2ND ILS APCH,ACFT'S G/S INOP.LOM TUNED TO WRON...",AU
3,PLT NOTED SOFT R BRAKE PEDAL DRG TAXI TO TKOF....,AU
4,TAXI OFF HARD SFC DUE TFC R MAIN GR BROKE THRO...,AF
...,...,...
2758,(-23) A/C RELOCATED TO NEW HANGAR TO CHECK SIZ...,II
2759,(-23) ON 2/23/08 @ APPROXIMATELY 2130 DURING T...,AF
2760,(-23) PILOT TOOK OFF FOR LEESBURG AIRPORT AND ...,II
2761,(-23) OWNER FORGOT TO FASTEN THE LOWER LEFT 4 ...,II


In [9]:
df.isna().sum()

text     15
label     0
dtype: int64

In [10]:
df = df.fillna('Null')
df = df[df['text'] != 'Null']

In [11]:
df.isna().sum()

text     0
label    0
dtype: int64

In [12]:
df.describe(include=[object])

Unnamed: 0,text,label
count,2748,2748
unique,2742,8
top,FORCED LANDING AFTER POWER LOSS. FOUND WATER I...,II
freq,4,1942


In [13]:
counts = df['label'].value_counts()
df = df[df['label'].isin(counts[counts > 1].index)]

In [16]:
Corpus = df

# Step - a : Remove blank rows if any.
Corpus = Corpus.fillna('Null')
Corpus = Corpus[Corpus['text'] != 'Null']

# Step - a.1 : Remove any non text fields
Corpus['integer_field'] = Corpus.apply(lambda x: str(x['text']).isnumeric(), axis=1)
Corpus = Corpus[Corpus['integer_field'] == False]
Corpus = Corpus[['label','text']]

X = Corpus['text']
y = Corpus['label']

# creates 10 fold cross validation datasets
ss = StratifiedShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
ss.get_n_splits(X, y)

# Train: 60%
# Validation: 20%
# Test: 20%
temp_index = 0
for train_index, test_index in ss.split(X,y):
    X_train , X_test = X.iloc[train_index],X.iloc[test_index]
    y_train , y_test = y.iloc[train_index] , y.iloc[test_index]
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.20, random_state=0)
    Encoder = LabelEncoder()
    y_train = Encoder.fit_transform(y_train)
    y_test = Encoder.fit_transform(y_test)
    y_val_encode = Encoder.fit_transform(y_val)
    final_train = pd.DataFrame({'text':X_train,'label':y_train})
    final_test = pd.DataFrame({'text':X_test,'label':y_test})
    final_val = pd.DataFrame({'text':X_val,'label':y_val_encode})
    final_train.to_csv(f'{cleaned_data}/train/FAA-{temp_index}.csv', index=False)
    final_test.to_csv(f'{cleaned_data}/test/FAA-{temp_index}.csv', index=False)
    final_val.to_csv(f'{cleaned_data}/val/FAA-{temp_index}.csv', index=False)
    y_val.to_csv(f'{cleaned_data}/actual/FAA-{temp_index}.csv', index=False)
    temp_index = temp_index + 1

In [17]:
def tokenize_data(index):
  print(f"tokenizing index {index}")
  # load the firs train and test datasets as DataDictionary
  raw_datasets = load_dataset("csv",data_files={'train': [f'{cleaned_data}/train/FAA-{index}.csv'], 'test': [f'{cleaned_data}/test/FAA-{index}.csv'],
                                                'val': [f'{cleaned_data}/val/FAA-{index}.csv']})

  tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

  def tokenize_function(examples):
      return tokenizer(examples["text"], padding="max_length", truncation=True)

  tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

  full_train_dataset = tokenized_datasets["train"]
  full_eval_dataset = tokenized_datasets["test"]
  full_val_dataset = tokenized_datasets["val"]

  return full_train_dataset, full_eval_dataset, full_val_dataset

In [20]:
def create_and_save_plots(train, val, metric, kfold):
  print(f'plotting {kfold}')  
  epochs = range(1,3)

  plt.plot(epochs, train, 'g', label=f'Training {metric}')
  plt.plot(epochs, val, 'b', label=f'Validation {metric}')

  plt.title(f'Training and Validation {metric}')
  plt.xlabel('Epochs')
  plt.ylabel(f'{metric}')
  plt.legend()
  plt.savefig(f'./plots/FAA-test2{kfold}-{metric}.pdf')
  plt.clf()


In [7]:
def normalize_save_heat_map(heat_map, kfold):
  print(f'heat map {kfold}')  
  # normalize heat map
  for index, category in enumerate(heat_map):
    total = 0
    for val in category:
      total = total + val
    for index_2, val in enumerate(category):
      heat_map[index][index_2] = val / total

  fig, ax = plt.subplots(figsize=(11,9))
  fig.set_tight_layout(True)
  # color map
  labels = ['II','ME','AU','AF','DE','EQ','AI']
  y_labels = ['AI','EQ','DE','AF','AU','ME','II']
    
  sb.heatmap(heat_map,cmap="Blues",xticklabels=labels, yticklabels=y_labels, annot=True)
  plt.savefig(f'./heatmaps-test2{kfold}.pdf')
  plt.clf()


In [24]:
def train_evaluate(full_train_dataset, full_eval_dataset, full_val_dataset, full_heat_map, kfold):
  model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=7)

  # removes unessecary columns
  tf_train_dataset = full_train_dataset.remove_columns(["text"]).with_format("tensorflow")
  tf_eval_dataset = full_eval_dataset.remove_columns(["text"]).with_format("tensorflow")
  tf_val_dataset = full_val_dataset.remove_columns(["text"]).with_format("tensorflow")

  tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
  # batches the datasets
  train_features = {x: tf_train_dataset[x] for x in tokenizer.model_input_names}
  train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, tf_train_dataset["label"]))
  train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(8)

  eval_features = {x: tf_eval_dataset[x] for x in tokenizer.model_input_names}
  eval_tf_dataset = tf.data.Dataset.from_tensor_slices((eval_features, tf_eval_dataset["label"]))
  eval_tf_dataset = eval_tf_dataset.batch(8)

  val_features = {x: tf_val_dataset[x] for x in tokenizer.model_input_names}
  val_tf_dataset = tf.data.Dataset.from_tensor_slices((val_features, tf_val_dataset["label"]))
  val_tf_dataset = val_tf_dataset.batch(8)

  # compile and train the model
  model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
      metrics=tf.metrics.SparseCategoricalAccuracy(),
  )

  print(f'running model {kfold}')
  history = model.fit(train_tf_dataset, validation_data=eval_tf_dataset, epochs=2)
  print(f'finished model {kfold}')

  # plot figures and save them on individual basis
  acc_train = history.history['sparse_categorical_accuracy']
  acc_val = history.history['val_sparse_categorical_accuracy']

  loss_train = history.history['loss']
  loss_val = history.history['val_loss']

  #create_and_save_plots(acc_train, acc_val, 'Accuracy',kfold)
  #create_and_save_plots(loss_train, loss_val, 'Loss', kfold)

  print(f'Evaulating model {kfold}')
  # Evaluate the model on the test data using `evaluate`
  results = model.evaluate(val_tf_dataset)

  # Generate predictions (probabilities -- the output of the last layer)
  # on new data using `predict`
  predictions = model.predict(val_tf_dataset)

  # get actual predictions from model.predict
  actual_predictions = []
  for prediction in predictions.logits:
    max = -100
    pred = -1
    for index, val in enumerate(prediction):
      if val > max:
        max = val
        pred = index
    actual_predictions.append(pred)
  val_df = pd.read_csv(f'{cleaned_data}/val/FAA-{kfold}.csv')
  correct = 0
  for index, item in val_df.iterrows():
    print(item['label'])
    print(actual_predictions[index])

    if item['label'] == actual_predictions[index]:
      correct = correct + 1 
    
  print(correct/len(actual_predictions))
  print("Correct based on my actual predictions: ", correct/len(actual_predictions))
  print(correct)

In [None]:
# generate heat map + and update full heatmap
  heat_map = np.zeros((7,7), dtype=float)
  val_df = pd.read_csv(f'{cleaned_data}/val/FAA-{kfold}.csv')
  correct = 0
  for index, item in val_df.iterrows():
    print(item['label'])
    print(actual_predictions[index])

    if item['label'] == actual_predictions[index]:
      correct = correct + 1 
    heat_map[6 - actual_predictions[index]][item['label']] = heat_map[ 6 - actual_predictions[index]][item['label']] + 1
    # full
    full_heat_map[6 - actual_predictions[index]][item['label']] = full_heat_map[ 6 - actual_predictions[index]][item['label']] + 1

  print(correct/len(actual_predictions))
  print("Correct based on my actual predictions: ", correct/len(actual_predictions))
  
  # normalize heat map
  normalize_save_heat_map(heat_map, kfold)

  return history, results, predictions, heat_map, full_heat_map

In [22]:
def create_and_write_log_dict(index, log_dict):
  acc_train = log_dict[1]['history'].history['sparse_categorical_accuracy']
  acc_val = log_dict[1]['history'].history['val_sparse_categorical_accuracy']

  loss_train = log_dict[1]['history'].history['loss']
  loss_val = log_dict[1]['history'].history['val_loss']

  heat_map = log_dict[1]['heat_map']
  full_heat_map = log_dict[1]['full_heat_map']

  temp_dict = {
      index: {
          'acc_train': acc_train,
          'acc_val': acc_val,
          'loss_val': loss_val,
          'loss_train': loss_train,
          'heat_map': heat_map,
          'full_heat_map': full_heat_map
      }
  }

  f = open('./log_dict_file_test.txt', 'a')
  f.write(f'\n\n\n{index}')
  f.write(str(temp_dict))
  f.close()



In [20]:
def one_run(index, log_dict, full_heat_map):
  full_train_dataset, full_eval_dataset, full_val_dataset = tokenize_data(index)
  history, results, predictions, heat_map, full_heat_map = train_evaluate(full_train_dataset, full_eval_dataset, full_val_dataset, full_heat_map, index)
  log_dict[index] = {
      'history': history,
      'results': results,
      'predictions': predictions,
      'heat_map': heat_map,
      'full_heat_map': full_heat_map
  }
  #create_and_write_log_dict(index, log_dict)

In [25]:
full_heat_map = np.zeros((7,7), dtype=float)
log_dict = {}
#for index in range(1,10):
one_run(1, log_dict, full_heat_map)

Using custom data configuration default-40f85013f7ddcf10
Found cached dataset csv (/afs/crc.nd.edu/user/p/painswor/.cache/huggingface/datasets/csv/default-40f85013f7ddcf10/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


tokenizing index 1


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /afs/crc.nd.edu/user/p/painswor/.cache/huggingface/datasets/csv/default-40f85013f7ddcf10/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-00d9bb664a62f236.arrow
Loading cached processed dataset at /afs/crc.nd.edu/user/p/painswor/.cache/huggingface/datasets/csv/default-40f85013f7ddcf10/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-3ef6c970bc579717.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


running model 1
Epoch 1/2
Epoch 2/2
finished model 1
Evaulating model 1
4
5
3
5
4
5
4
5
4
5
4
5
4
5
5
5
3
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
3
5
4
5
4
5
2
5
2
5
4
5
0
5
4
5
2
5
4
5
4
5
2
5
4
5
4
5
4
5
4
5
5
5
4
5
4
5
2
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
5
5
4
5
4
5
4
5
4
5
3
5
4
5
4
5
4
5
4
5
4
5
5
5
4
5
4
5
4
5
2
5
5
5
5
5
5
5
4
5
4
5
4
5
4
5
4
5
4
5
5
5
4
5
4
5
4
5
5
5
5
5
4
5
4
5
4
5
5
5
4
5
2
5
5
5
4
5
5
5
4
5
4
5
5
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
5
5
5
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
5
5
4
5
4
5
5
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
2
5
5
5
4
5
5
5
5
5
4
5
4
5
5
5
4
5
4
5
4
5
4
5
4
5
0
5
4
5
4
5
2
5
4
5
3
5
5
5
4
5
4
5
4
5
1
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
2
5
5
5
4
5
4
5
3
5
4
5
4
5
4
5
4
5
2
5
4
5
4
5
4
5
4
5
0
5
2
5
4
5
4
5
2
5
4
5
4
5
4
5
4
5
4
5
4
5
5
5
4
5
2
5
4
5
4
5
4
5
5
5
4
5
4
5
4
5
4
5
5
5
4
5
4
5
4
5
4
5
2
5
4
5
2
5
2
5
4
5
4
5
4
5
2
5
5
5
2
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5
4
5


TypeError: cannot unpack non-iterable NoneType object

In [12]:
import json
f = open('./log_dict_file_test.txt','a')
f.write(str(log_dict))
f.close()