In [0]:
import tensorflow as tf
import torch
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Selecting CUDA device 0 is only meaningful (and only legal) when CUDA is
# available; calling it unconditionally raises on CPU-only runtimes and would
# defeat the fallback chosen on the line above.
if device.type == "cuda":
  torch.cuda.set_device(0)

### Test the loaded models and tokenizers.

In [0]:
from google.colab import drive
# Mount Drive at an absolute location and derive both data folders from that
# same constant, so the paths resolve regardless of the notebook's current
# working directory (the previous relative paths only worked from /content).
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)
# Trailing slashes are kept so the paths still work for code that builds file
# names by plain string concatenation.
train_path = DRIVE_MOUNT_POINT + '/My Drive/convert to Bert/train/'
test_path = DRIVE_MOUNT_POINT + '/My Drive/convert to Bert/test/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#### Load UCF files

In [0]:
# Broad category (Roman numeral) -> the single-letter codes that belong to it.
# The letters are matched against the 'NTEE1' column by ntee2cat.
broad_cat_dict = {
    'I': list('A'),
    'II': list('B'),
    'III': list('CD'),
    'IV': list('EFGH'),
    'V': list('IJKLMNOP'),
    'VI': list('Q'),
    'VII': list('RSTUVW'),
    'VIII': list('X'),
    'IX': list('Y'),
    'X': list('Z'),
}

In [0]:
import pandas as pd
import os

def split_dataset(data_path):
  """Load every gzip-pickled DataFrame in a folder and add the derived columns.

  Args:
    data_path: Directory holding the pickled frame(s). A trailing slash is
      optional.

  Returns:
    One DataFrame with the rows of all files (original row indices are kept,
    as before) plus two added columns:
      * 'mission_prgrm_spellchk' - 'TAXPAYER_NAME', 'mission_spellchk' and
        'prgrm_dsc_spellchk' joined by single spaces.
      * 'broad_cat' - ntee2cat applied to 'NTEE1'.

  Raises:
    ValueError: if the folder contains no files to load.
  """
  # os.listdir returns entries in arbitrary order; sorting makes the row order
  # (and therefore everything seeded downstream) reproducible across runs.
  # Sub-directories (e.g. checkpoint folders) are skipped rather than unpickled.
  file_list = [name for name in sorted(os.listdir(data_path))
               if os.path.isfile(os.path.join(data_path, name))]
  if not file_list:
    raise ValueError('No data files found in {!r}'.format(data_path))

  # Collect the parts and concatenate once: concatenating inside the loop
  # re-copies all previously loaded rows on every iteration.
  frames = []
  for file in file_list:
    file_path = os.path.join(data_path, file)
    print(file_path)
    # NOTE(security): unpickling executes arbitrary code from the file; only
    # point this at files you created or trust.
    frames.append(pd.read_pickle(file_path, compression='gzip'))
  df_data = pd.concat(frames)

  df_data['mission_prgrm_spellchk']=df_data['TAXPAYER_NAME']+' '+df_data['mission_spellchk']+' '+df_data['prgrm_dsc_spellchk'] # Using spell-checked.
  df_data['broad_cat']=df_data['NTEE1'].apply(ntee2cat)
  # Quick sanity summary: number of rows, distinct NTEE1 codes, distinct broad categories.
  print(len(df_data['mission_prgrm_spellchk']), len(df_data['NTEE1'].drop_duplicates()), len(df_data['broad_cat'].drop_duplicates()))
  return df_data

def ntee2cat(string):
  """Return the broad-category key of broad_cat_dict whose list contains `string`.

  Args:
    string: A code as stored in the 'NTEE1' column (e.g. 'B').

  Returns:
    The first key of broad_cat_dict (in dict order) whose code list contains
    `string`.

  Raises:
    IndexError: if no category lists the code. The exception type matches the
      previous "list index out of range" failure, but the message now names
      the offending value (useful for spotting NaN or unexpected codes).
  """
  for cat, codes in broad_cat_dict.items():
    if string in codes:
      # Stop at the first hit instead of scanning every category.
      return cat
  raise IndexError('NTEE1 code {!r} is not listed in any broad category'.format(string))


In [0]:
df_train = split_dataset(train_path)


gdrive/My Drive/convert to Bert/train/df_ucf_train.pkl.gz_1of4
gdrive/My Drive/convert to Bert/train/df_ucf_train.pkl.gz_2of4
gdrive/My Drive/convert to Bert/train/df_ucf_train.pkl.gz_3of4
gdrive/My Drive/convert to Bert/train/df_ucf_train.pkl.gz_4of4
gdrive/My Drive/convert to Bert/train/df_ucf_train.pkl.gz_0of4
154424 25 9


In [0]:
df_test = split_dataset(test_path)

gdrive/My Drive/convert to Bert/test/df_ucf_test.pkl.gz
38607 25 9


In [0]:
pip install transformers



In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer
# Tokenizer for the 'bert-base-uncased' checkpoint; do_lower_case=True is passed
# so input text is lower-cased to match the uncased vocabulary.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [0]:
from sklearn import preprocessing

#encoding function
def _encode_labels(values, encoder=None):
  """Map a column of class names to a tensor of integer ids.

  With no encoder a new LabelEncoder is fitted on `values` (original
  behaviour); otherwise the given, already fitted encoder is applied so the
  ids agree with whatever data it was fitted on.
  """
  if encoder is None:
    encoded = preprocessing.LabelEncoder().fit_transform(values)
  else:
    encoded = encoder.transform(values)
  return torch.tensor(encoded)

def padding_text(dataset, broad_cat_encoder=None, major_group_encoder=None):
  """Tokenize the text column and integer-encode both label columns.

  Args:
    dataset: Frame (or mapping) with the columns 'mission_prgrm_spellchk'
      (text), 'broad_cat' and 'NTEE1' (labels).
    broad_cat_encoder: Optional pre-fitted label encoder for 'broad_cat'.
    major_group_encoder: Optional pre-fitted label encoder for 'NTEE1'.
      When an encoder is omitted, a fresh LabelEncoder is fitted on `dataset`
      alone. Ids produced that way for two different frames (e.g. train and
      test) only agree if both frames contain exactly the same classes; pass
      the same fitted encoders to both calls to guarantee consistent ids.

  Returns:
    (input_ids, attention_masks, lb_broad_cat, lb_major_group): the per-text
    encodings concatenated along dim 0 (presumably one row of 512 token ids
    per text - confirm against the tokenizer version in use), and one integer
    label tensor per label column.
  """
  input_ids = []
  attention_masks = []
  for text in dataset['mission_prgrm_spellchk']:
    # Every text is padded to max_length so the per-text tensors can be
    # concatenated into one batch tensor below.
    encode_plus = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 512,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt')
    input_ids.append(encode_plus['input_ids'])
    attention_masks.append(encode_plus['attention_mask'])
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)

  lb_broad_cat = _encode_labels(dataset['broad_cat'], broad_cat_encoder)
  lb_major_group = _encode_labels(dataset['NTEE1'], major_group_encoder)
  return input_ids, attention_masks, lb_broad_cat, lb_major_group

In [0]:
#encoding train and test dataset
# The label ids for train and test are produced by two separate padding_text
# calls, so they only line up when both frames contain exactly the same set of
# classes. Verify that up front (before the expensive tokenization) instead of
# silently evaluating against shifted label ids.
for label_col in ('broad_cat', 'NTEE1'):
  train_classes = set(df_train[label_col].unique())
  test_classes = set(df_test[label_col].unique())
  assert train_classes == test_classes, (
      "train/test disagree on the classes of '{}': {}".format(
          label_col, sorted(train_classes ^ test_classes, key=str)))

input_ids_train, attention_masks_train, lb_broad_group_train, lb_major_group_train = padding_text(df_train)
input_ids_test, attention_masks_test, lb_broad_cat_test, lb_major_group_test = padding_text(df_test)

In [0]:
from torch.utils.data import TensorDataset
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import *

#combining data and loading Bert
def train_test_dataset(train_label, test_label, num_label, batch_size):
  """Build the train/validation loaders and (re)create the global BERT classifier.

  The token ids and attention masks are read from the notebook-level variables
  input_ids_train / attention_masks_train / input_ids_test /
  attention_masks_test; only the label tensors are passed in, so the same
  encodings can be reused for different label sets.

  Args:
    train_label: Label tensor aligned with the training encodings.
    test_label: Label tensor aligned with the test encodings.
    num_label: Number of output classes for the classification head.
    batch_size: Batch size used by both loaders.

  Returns:
    (train_dataloader, validation_dataloader). Training batches are drawn with
    a RandomSampler, validation batches with a SequentialSampler.

  Side effects:
    Rebinds the global `model` to a freshly loaded "bert-base-uncased"
    BertForSequenceClassification and moves it to the notebook-level `device`.
  """
  global model

  train_dataset = TensorDataset(input_ids_train, attention_masks_train, train_label)
  test_dataset = TensorDataset(input_ids_test, attention_masks_test, test_label)

  train_dataloader = DataLoader(train_dataset,
              sampler = RandomSampler(train_dataset),
              batch_size = batch_size)

  validation_dataloader = DataLoader(test_dataset,
              sampler = SequentialSampler(test_dataset),
              batch_size = batch_size)

  model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                        num_labels=num_label)
  # Follow the notebook-level `device` (which falls back to CPU) instead of
  # hard-coding .cuda(), which raises when no GPU is available and could put
  # the model on a different device than the batches sent via .to(device).
  model.to(device)

  return train_dataloader, validation_dataloader

In [0]:
from transformers import get_linear_schedule_with_warmup

#function for adjusting parameters
def parameters(learning_rate, eps, num_epoch):
  """Create the AdamW optimizer and its learning-rate schedule.

  Reads the globals `model` and `train_dataloader`, and stores `num_epoch` in
  the global `epochs`.

  Args:
    learning_rate: Passed to AdamW as `lr`.
    eps: Passed to AdamW as `eps`.
    num_epoch: Number of training epochs; also determines the schedule length.

  Returns:
    (optimizer, scheduler), where the scheduler comes from
    get_linear_schedule_with_warmup with zero warm-up steps.
  """
  global epochs
  epochs = num_epoch

  adamw = AdamW(model.parameters(), lr=learning_rate, eps=eps)

  # The schedule spans one step per training batch for every epoch.
  steps_in_run = len(train_dataloader) * epochs
  lr_schedule = get_linear_schedule_with_warmup(
      adamw,
      num_warmup_steps=0,
      num_training_steps=steps_in_run,
  )
  return adamw, lr_schedule

In [0]:
import numpy as np
#function that will evaluate the accuracy of the model
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D scores, one row per example; the predicted class is the
            arg-max along axis 1.
        labels: NumPy array of true class ids (any shape; it is flattened).
    """
    predicted = np.argmax(preds, axis=1).reshape(-1)
    expected = labels.reshape(-1)
    hits = (predicted == expected).sum()
    return hits / expected.size

In [0]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as a timedelta string (h:mm:ss).

    The value is rounded to whole seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [0]:
import random

#loading data into the Bert model

def run_bert(train_dataloader, validation_dataloader, optimizer, scheduler):
  """Fine-tune the global `model`, validating after every epoch.

  Reads the notebook-level globals `model`, `epochs`, `tokenizer` and `device`.

  Args:
    train_dataloader: Yields (input_ids, attention_mask, labels) batches.
    validation_dataloader: Yields batches of the same layout for evaluation.
    optimizer: Optimizer stepped once per training batch.
    scheduler: Learning-rate scheduler stepped once per training batch.

  Returns:
    List with the average training loss of each epoch. (Previously the list
    was collected but discarded; callers that ignore the result are unaffected.)
  """
  # Seed every RNG in use so shuffling and dropout are repeatable.
  seed = 42
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  # Does not depend on the epoch, so it is done once up front rather than at
  # the start of every epoch.
  model.resize_token_embeddings(len(tokenizer))

  loss_values = []
  num_train_batches = len(train_dataloader)

  for epoch_i in range(0, epochs):

      print("")
      print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
      print('Training...')

      t0 = time.time()
      total_loss = 0

      model.train()

      for step, batch in enumerate(train_dataloader):

          # Progress report every 2000 batches.
          if step % 2000 == 0 and not step == 0:
              elapsed = format_time(time.time() - t0)
              print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, num_train_batches, elapsed))

          b_input_ids = batch[0].to(device)
          b_input_mask = batch[1].to(device)
          b_labels = batch[2].to(device)

          # Clear gradients left over from the previous step.
          model.zero_grad()

          outputs = model(b_input_ids,
                          token_type_ids=None,
                          attention_mask=b_input_mask,
                          labels=b_labels)

          # With labels supplied, the first output is the loss.
          loss = outputs[0]
          total_loss += loss.item()
          loss.backward()
          # Clip the gradient norm to 1.0 before the update.
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

          optimizer.step()
          scheduler.step()

      # An empty loader has no loss to average; report NaN instead of raising
      # ZeroDivisionError.
      avg_train_loss = total_loss / num_train_batches if num_train_batches else float('nan')
      loss_values.append(avg_train_loss)

      print("")
      print("  Average training loss: {0:.2f}".format(avg_train_loss))
      print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

      #               Validation

      print("\n")
      print("Validation...")

      t0 = time.time()

      model.eval()

      # Count correct predictions and examples over the whole validation set.
      # Averaging per-batch accuracies (the previous approach) gives a smaller
      # final batch the same weight as a full one and skews the result.
      nb_correct, nb_eval_examples = 0, 0

      for batch in validation_dataloader:

          b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

          # No gradients are needed for evaluation.
          with torch.no_grad():
              outputs = model(b_input_ids,
                              token_type_ids=None,
                              attention_mask=b_input_mask)

          # Without labels, the first output holds the logits.
          logits = outputs[0].detach().cpu().numpy()
          label_ids = b_labels.to('cpu').numpy().flatten()

          predictions = np.argmax(logits, axis=1).flatten()
          nb_correct += int(np.sum(predictions == label_ids))
          nb_eval_examples += len(label_ids)

      accuracy = nb_correct / nb_eval_examples if nb_eval_examples else float('nan')

      # Report the final accuracy for this validation run.
      print("  Accuracy: {0:.2f}".format(accuracy))
      print("  Validation took: {:}".format(format_time(time.time() - t0)))

  print("")
  print("Training complete!")

  return loss_values

In [0]:
# Build the loaders and a fresh 25-class model for the major-group (NTEE1) labels.
train_dataloader, validation_dataloader = train_test_dataset(
    train_label=lb_major_group_train,
    test_label=lb_major_group_test,
    num_label=25,
    batch_size=8,
)

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




In [0]:
# Optimizer and schedule for a 4-epoch run, then fine-tune and validate.
optimizer, scheduler = parameters(
    learning_rate=5e-5,
    eps=1e-12,
    num_epoch=4,
)
run_bert(train_dataloader, validation_dataloader, optimizer, scheduler)


Training...
  Batch 2,000  of  19,303.    Elapsed: 0:15:47.
  Batch 4,000  of  19,303.    Elapsed: 0:31:34.
  Batch 6,000  of  19,303.    Elapsed: 0:47:19.
  Batch 8,000  of  19,303.    Elapsed: 1:03:05.
  Batch 10,000  of  19,303.    Elapsed: 1:18:51.
  Batch 12,000  of  19,303.    Elapsed: 1:34:36.
  Batch 14,000  of  19,303.    Elapsed: 1:50:21.
  Batch 16,000  of  19,303.    Elapsed: 2:06:06.
  Batch 18,000  of  19,303.    Elapsed: 2:21:52.

  Average training loss: 0.48
  Training epcoh took: 2:32:09


Validation...
  Accuracy: 0.86
  Validation took: 0:11:05

Training...
  Batch 2,000  of  19,303.    Elapsed: 0:15:46.
  Batch 4,000  of  19,303.    Elapsed: 0:31:34.
  Batch 6,000  of  19,303.    Elapsed: 0:47:22.
  Batch 8,000  of  19,303.    Elapsed: 1:03:09.
  Batch 10,000  of  19,303.    Elapsed: 1:18:55.
  Batch 12,000  of  19,303.    Elapsed: 1:34:41.
  Batch 14,000  of  19,303.    Elapsed: 1:50:26.
  Batch 16,000  of  19,303.    Elapsed: 2:06:11.
  Batch 18,000  of  19,303.

In [0]:
# Build the loaders and a fresh 9-class model for the broad-category labels.
train_dataloader, validation_dataloader = train_test_dataset(
    train_label=lb_broad_group_train,
    test_label=lb_broad_cat_test,
    num_label=9,
    batch_size=8,
)

In [0]:
# Optimizer and schedule for a 4-epoch run, then fine-tune and validate.
optimizer, scheduler = parameters(
    learning_rate=5e-5,
    eps=1e-12,
    num_epoch=4,
)
run_bert(train_dataloader, validation_dataloader, optimizer, scheduler)


Training...
  Batch 2,000  of  19,303.    Elapsed: 0:15:05.
  Batch 4,000  of  19,303.    Elapsed: 0:30:09.
  Batch 6,000  of  19,303.    Elapsed: 0:45:13.
  Batch 8,000  of  19,303.    Elapsed: 1:00:18.
  Batch 10,000  of  19,303.    Elapsed: 1:15:24.
  Batch 12,000  of  19,303.    Elapsed: 1:30:29.
  Batch 14,000  of  19,303.    Elapsed: 1:45:33.
  Batch 16,000  of  19,303.    Elapsed: 2:00:37.
  Batch 18,000  of  19,303.    Elapsed: 2:15:41.

  Average training loss: 0.50
  Training epcoh took: 2:25:31


Validation...
  Accuracy: 0.89
  Validation took: 0:11:20

Training...


KeyboardInterrupt: ignored