In [None]:
import pickle
import json
import os
import math
import unidecode
import tensorflow as tf
import redshift_connector
import pandas as pd
import numpy as np
from datetime import datetime

from collections import Counter
from math import ceil
from sklearn.model_selection import train_test_split

In [None]:
# HuggingFace library to train a tokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.normalizers import NFD, Lowercase, Sequence, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

### Combining the training data from the 001 notebook with the artificial data

In [None]:
# Affiliation strings that carry no affiliation; labelled with the sentinel id -1
empty_affs = pd.read_csv("empty_affs_to_train.csv")[['original_affiliation']] \
    .assign(affiliation_id=-1)

# Artificially created affiliation strings with no affiliation
# (created in the 001_Exploration notebook); also labelled with -1
artificial_empty_affs = pd.read_parquet("artificial_empty_affs.parquet")[['original_affiliation']] \
    .assign(affiliation_id=-1)

# All training samples that have fewer than 50 different versions of the affiliation text
# ---- Created in previous notebook
lower_than = pd.read_parquet("lower_than_50.parquet")

# All training samples that have more than 50 different versions of the affiliation text
# ---- Created in previous notebook
more_than = pd.read_parquet("more_than_50.parquet")

# Quick sanity check of the (rows, columns) of every source before combining them
print(empty_affs.shape)
print(artificial_empty_affs.shape)
print(lower_than.shape)
print(more_than.shape)

In [None]:
# Stack the four sources row-wise and renumber the rows 0..n-1
full_affs_data = pd.concat(
    [artificial_empty_affs, more_than, lower_than, empty_affs],
    axis=0,
    ignore_index=True,
)

In [None]:
# Display the (rows, columns) of the combined table as the cell output
full_affs_data.shape

### Processing and splitting the data

In [None]:
# Run every raw affiliation string through unidecode and keep the result in a new column
full_affs_data['processed_text'] = full_affs_data['original_affiliation'].map(unidecode.unidecode)

In [None]:
# 97.5% / 2.5% split with a fixed seed; each part gets a fresh 0..n-1 index and its own copy
train_data, val_data = (
    part.reset_index(drop=True).copy()
    for part in train_test_split(full_affs_data, train_size=0.975, random_state=1)
)

In [None]:
# Plain Python lists of the processed text for each split (fed to the tokenizer below)
affs_list_train, affs_list_val = (
    frame['processed_text'].tolist() for frame in (train_data, val_data)
)

In [None]:
# Remove a stale tokenizer-training corpus left over from a previous run.
# os.remove raises on failure (os.system only returned an exit code), so a missing file
# is handled explicitly and any other error (e.g. permissions) is surfaced.
try:
    os.remove("aff_text.txt")
except FileNotFoundError:
    # Nothing to clean up on a fresh run
    pass
print("Done")

In [None]:
# Write the training affiliation strings, one per line, to the text file
# that the tokenizer trainer reads
with open("aff_text.txt", "w") as corpus_file:
    corpus_file.writelines(f"{aff}\n" for aff in affs_list_train)

In [None]:
# Remove a previously saved tokenizer file so the one trained below replaces it.
# os.remove raises on failure (os.system only returned an exit code), so a missing file
# is handled explicitly and any other error is surfaced.
try:
    os.remove("basic_model_tokenizer")
except FileNotFoundError:
    # Nothing to clean up on a fresh run
    pass
print("Done")

In [None]:
# Persist the combined table (including the 'processed_text' column) to parquet.
# NOTE(review): despite the "_tokenized" file name, no token columns have been added to
# full_affs_data at this point in the notebook -- confirm the name is intentional.
full_affs_data.to_parquet("full_affs_data_tokenized.parquet")

### Creating the tokenizer for the basic model

In [None]:
# WordPiece model; pieces that are not in the vocabulary map to "[UNK]"
wordpiece_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

# NFD Unicode, lowercase, and getting rid of accents (to make sure text is as readable as possible).
# `Sequence` is imported from tokenizers.normalizers; the previous `normalizers.Sequence`
# referenced a module name that was never imported and raised NameError.
wordpiece_tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

# Splitting on whitespace
wordpiece_tokenizer.pre_tokenizer = Whitespace()

# Training a tokenizer on the training dataset.
# vocab_size must stay in sync with the Embedding input_dim (3816) used by the model.
# NOTE(review): "[UNK]" is the only special token, so it presumably receives id 0 -- the same
# value max_len_and_pad uses for padding and that the embedding masks. Confirm this is intended.
trainer = WordPieceTrainer(vocab_size=3816, special_tokens=["[UNK]"])
files = ["aff_text.txt"]
wordpiece_tokenizer.train(files, trainer)

# Save the trained tokenizer so it can be reloaded without retraining
wordpiece_tokenizer.save("basic_model_tokenizer")

### Further processing of data with tokenizer

In [None]:
def max_len_and_pad(tok_sent, max_len=128, pad_id=0):
    """
    Truncates a token-id sequence to max_len and right-pads it with pad_id so the
    result always has exactly max_len entries.

    Parameters
    ----------
    tok_sent : sequence of int
        Token ids for one affiliation string.
    max_len : int, default 128
        Fixed output length. The default matches the length the rest of the notebook
        expects (TFRecord feature size and model input).
    pad_id : int, default 0
        Value used to fill the positions after the last real token.

    Returns
    -------
    list of int
        A new list of length max_len; the input is not modified.
    """
    clipped = list(tok_sent[:max_len])
    # (max_len - len(clipped)) is never negative after the truncation above
    return clipped + [pad_id] * (max_len - len(clipped))

def create_affiliation_vocab(x):
    """
    Looks up affiliation id x in the module-level affiliation_vocab dict, registering it
    under the next free index (the current dict size) when it has not been seen before.

    Returns the index wrapped in a one-element list.
    """
    # setdefault only stores the new index when x is missing; the size is read before insertion
    return [affiliation_vocab.setdefault(x, len(affiliation_vocab))]

In [None]:
# initializing an empty affiliation vocab (re-running this cell rebuilds it from scratch)
affiliation_vocab = {}

# tokenizing the training and validation datasets
train_data['original_affiliation_tok'] = [wordpiece_tokenizer.encode(text).ids for text in affs_list_train]
val_data['original_affiliation_tok'] = [wordpiece_tokenizer.encode(text).ids for text in affs_list_val]

# applying max length cutoff and padding
train_data['original_affiliation_model_input'] = train_data['original_affiliation_tok'].apply(max_len_and_pad)
val_data['original_affiliation_model_input'] = val_data['original_affiliation_tok'].apply(max_len_and_pad)

# creating the label affiliation vocab from the training split only
train_data['label'] = train_data['affiliation_id'].apply(create_affiliation_vocab)

# A validation row whose affiliation_id never occurs in the training split has no label index.
# The previous `.get(x)` produced [None] for such rows, which cannot be converted to an int64
# tensor when the TFRecords are built. Drop those rows explicitly and report how many.
seen_in_train = val_data['affiliation_id'].isin(list(affiliation_vocab))
unseen_count = int((~seen_in_train).sum())
if unseen_count:
    print(f"Dropping {unseen_count} validation rows whose affiliation_id is absent from the training split")
    val_data = val_data[seen_in_train].reset_index(drop=True).copy()
val_data['label'] = val_data['affiliation_id'].apply(lambda x: [affiliation_vocab[x]])

# saving the affiliation vocab
with open("affiliation_vocab.pkl", "wb") as f:
    pickle.dump(affiliation_vocab, f)

### Creating TFRecords from the training and validation datasets

In [None]:
def create_tfrecords_dataset(data, iter_num, dataset_type='train'):
    """
    Serializes one slice of a dataframe into a single TFRecord shard.

    The 'original_affiliation_model_input' and 'label' columns of `data` are paired up,
    every pair is serialized with tf_serialize_example, and the result is written to
    ./training_data/<dataset_type>/<iter_num zero-padded to 4 digits>.tfrecord.
    Nothing is returned.
    """
    token_ids = tf.data.Dataset.from_tensor_slices(data['original_affiliation_model_input'].to_list())
    labels = tf.data.Dataset.from_tensor_slices(data['label'].to_list())
    serialized_examples = tf.data.Dataset.zip((token_ids, labels)).map(tf_serialize_example)

    shard_name = str(iter_num).zfill(4)
    shard_path = f"./training_data/{dataset_type}/{shard_name}.tfrecord"
    tf.data.experimental.TFRecordWriter(shard_path).write(serialized_examples)

In [None]:
def tf_serialize_example(f0, f1):
    """
    Dataset.map-compatible wrapper around serialize_example.

    Runs serialize_example on (f0, f1) through tf.py_function and returns the
    resulting tf.string reshaped to a scalar.
    """
    serialized = tf.py_function(func=serialize_example, inp=(f0, f1), Tout=tf.string)
    return tf.reshape(serialized, shape=())

In [None]:
def serialize_example(features, label):
    """
    Packs one (features, label) pair into a tf.train.Example and returns it as a
    serialized string ready for the TFRecord writer.

    Both arguments must expose .numpy(); their values are stored as int64 lists under
    the keys 'features' and 'label'.
    """
    def _int64_feature(tensor):
        # Tensor -> Python list -> Int64List -> Feature
        return tf.train.Feature(int64_list=tf.train.Int64List(value=tensor.numpy().tolist()))

    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                'features': _int64_feature(features),
                'label': _int64_feature(label),
            }
        )
    )
    return example.SerializeToString()

In [None]:
# Making sure data is in the correct format before going into TFRecord:
# every padded id list becomes an int64 NumPy array
def _to_int64_array(ids):
    """Converts one sequence of token ids to a NumPy int64 array."""
    return np.asarray(ids, dtype=np.int64)


train_data['original_affiliation_model_input'] = \
    train_data['original_affiliation_model_input'].apply(_to_int64_array)

val_data['original_affiliation_model_input'] = \
    val_data['original_affiliation_model_input'].apply(_to_int64_array)

In [None]:
# Create the output folders for the TFRecord shards.
# os.makedirs is shell-independent and raises if a folder cannot be created, whereas
# os.system only returned an exit code that was ignored. exist_ok=True keeps the
# cell safe to re-run.
for split_dir in ("./training_data/train/", "./training_data/val/"):
    os.makedirs(split_dir, exist_ok=True)
print("Done")

#### Creating the Train Dataset

In [None]:
%%time
# Write the training split in shards of up to 500,000 rows, one TFRecord file per shard
for i, low in enumerate(range(0, train_data.shape[0], 500000)):
    print(i)
    create_tfrecords_dataset(train_data.iloc[low:low + 500000, :], i, 'train')

#### Creating the Validation Dataset

In [None]:
%%time
# Write the validation split in shards of up to 80,000 rows, one TFRecord file per shard
for i, low in enumerate(range(0, val_data.shape[0], 80000)):
    print(i)
    create_tfrecords_dataset(val_data.iloc[low:low + 80000, :], i, 'val')

### Loading the Data

In [None]:
def _parse_function(example_proto):
    """
    Decodes one serialized tf.train.Example read from a TFRecord file.

    Returns a (features, label) pair: `features` is the int64 vector of length 128 and
    `label` is the first (only) entry of the length-1 int64 'label' field.
    """
    schema = {
        'features': tf.io.FixedLenFeature((128,), tf.int64),
        'label': tf.io.FixedLenFeature((1,), tf.int64)
    }
    parsed = tf.io.parse_single_example(example_proto, schema)
    return parsed['features'], parsed['label'][0]

In [None]:
def get_dataset(path, data_type='train', batch_size=512):
    """
    Takes in a path to the TFRecords and returns a batched TF Dataset to be used for training.

    Parameters
    ----------
    path : str
        Folder that contains one sub-folder per split. A trailing separator is optional
        (paths are joined with os.path.join instead of string concatenation).
    data_type : str, default 'train'
        Name of the split sub-folder to read.
    batch_size : int, default 512
        Number of examples per batch.

    Returns
    -------
    tf.data.Dataset
        (features, label) batches parsed with _parse_function. Batching passes
        drop_remainder=True.

    Relies on the module-level AUTO constant, which must be defined before this is called.
    """
    split_dir = os.path.join(path, data_type)
    # Sorted so the shards are always read in the same order
    tfrecords = sorted(
        os.path.join(split_dir, name)
        for name in os.listdir(split_dir)
        if name.endswith('tfrecord')
    )

    raw_dataset = tf.data.TFRecordDataset(tfrecords, num_parallel_reads=AUTO)
    parsed_dataset = raw_dataset.map(_parse_function, num_parallel_calls=AUTO)

    return parsed_dataset.apply(
        tf.data.experimental.dense_to_ragged_batch(batch_size, drop_remainder=True)
    )

In [None]:
# AUTO is read by get_dataset, so it has to exist before the calls below
AUTO = tf.data.experimental.AUTOTUNE
train_data_path = "./training_data/"

# Build the batched datasets for both splits from the TFRecord shards
training_data, validation_data = (
    get_dataset(train_data_path, data_type=split_name) for split_name in ('train', 'val')
)

### Load Vocab

In [None]:
# Loading the affiliation (target) vocab written earlier in this notebook.
# pickle.load executes arbitrary code from the file, so only load a file you produced yourself.
with open("affiliation_vocab.pkl", "rb") as vocab_file:
    affiliation_vocab_id = pickle.load(vocab_file)

In [None]:
# Reverse lookup: label index -> affiliation id
inverse_affiliation_vocab = dict(zip(affiliation_vocab_id.values(), affiliation_vocab_id.keys()))

### Creating Model

In [None]:
# Hyperparameters to tune
emb_size = 128  # output dimension of the token embedding layer
max_len = 128  # token-sequence length; the data pipeline truncates/pads to 128 as well
num_layers = 6  # NOTE(review): not referenced by the model code visible in this notebook
num_heads = 8  # NOTE(review): not referenced by the model code visible in this notebook
dense_1 = 2048  # units of the first dense layer (also part of the checkpoint folder name)
dense_2 = 1024  # units of the second dense layer (also part of the checkpoint folder name)
learn_rate = 0.0001  # Adam learning rate and starting value for the LR scheduler

In [None]:
def scheduler(epoch, curr_lr):
    """
    Learning-rate schedule for the LearningRateScheduler callback.

    For the first 3 epochs (epoch index 0-2) the rate is held at the module-level
    start_lr; from epoch 3 onwards it is start_lr * exp(-0.17 * epoch).
    curr_lr is accepted because the callback passes it, but it is not used.
    """
    rampup_epochs = 3
    exp_decay = 0.17

    if epoch >= rampup_epochs:
        return start_lr * math.exp(-exp_decay * epoch)
    return start_lr

In [None]:
# Allow for use of multiple GPUs
mirrored_strategy = tf.distribute.MirroredStrategy()

# Model construction and compilation happen inside the strategy scope so the variables are
# created under the strategy; callbacks and paths are plain Python objects set up afterwards.
with mirrored_strategy.scope():
    # Model input: one fixed-length vector of token ids per affiliation string.
    # max_len must match the length the TFRecords were written with (128).
    tokenized_aff_string_ids = tf.keras.layers.Input((max_len,), dtype=tf.int64,
                                                     name='tokenized_aff_string_input')

    # Embedding layer. input_dim (3816) must match the tokenizer vocab_size;
    # mask_zero=True treats id 0 (the padding value) as masked.
    tokenized_aff_string_emb_layer = tf.keras.layers.Embedding(input_dim=3816,
                                                               output_dim=int(emb_size),
                                                               mask_zero=True,
                                                               trainable=True,
                                                               name="tokenized_aff_string_embedding")

    tokenized_aff_string_embs = tokenized_aff_string_emb_layer(tokenized_aff_string_ids)

    # First dense layer (applied to the per-token embeddings), then averaged over the sequence
    dense_output = tf.keras.layers.Dense(int(dense_1), activation='relu',
                                         kernel_regularizer='L2', name="dense_1")(tokenized_aff_string_embs)
    dense_output = tf.keras.layers.Dropout(0.20, name="dropout_1")(dense_output)
    dense_output = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layer_norm_1")(dense_output)
    pooled_output = tf.keras.layers.GlobalAveragePooling1D()(dense_output)

    # Second dense layer
    dense_output = tf.keras.layers.Dense(int(dense_2), activation='relu',
                                         kernel_regularizer='L2', name="dense_2")(pooled_output)
    dense_output = tf.keras.layers.Dropout(0.20, name="dropout_2")(dense_output)
    dense_output = tf.keras.layers.LayerNormalization(epsilon=1e-6, name="layer_norm_2")(dense_output)

    # Last dense layer: one softmax output per entry of the affiliation (target) vocab
    final_output = tf.keras.layers.Dense(len(affiliation_vocab_id), activation='softmax', name='cls')(dense_output)

    model = tf.keras.Model(inputs=tokenized_aff_string_ids, outputs=final_output)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learn_rate, beta_1=0.9,
                                                     beta_2=0.99),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

# Checkpoint folder: ./models/<YYYYMMDD>_<dense_1>d1_<dense_2>d2/
curr_date = datetime.now().strftime("%Y%m%d")
filepath_1 = f"./models/{curr_date}_{dense_1}d1_{dense_2}d2/"

# "{epoch:02d}" is left unformatted on purpose; ModelCheckpoint fills it in on every save
filepath = filepath_1 + "model_epoch{epoch:02d}ckpt"

# Adding in checkpointing (full model saved after every epoch)
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss',
                                                      verbose=0, save_best_only=False,
                                                      save_weights_only=False, mode='auto',
                                                      save_freq='epoch')

# Adding in early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=4)

# Read as a global by scheduler()
start_lr = float(learn_rate)

# Adding in a learning rate schedule to decrease learning rate in later epochs
lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

callbacks = [model_checkpoint, early_stopping, lr_schedule]
    

In [None]:
# Print the Keras summary of the model built above
model.summary()

### Training the Model

In [None]:
# Train for up to 20 epochs with the checkpoint, early-stopping and LR-schedule callbacks
history = model.fit(
    training_data,
    validation_data=validation_data,
    epochs=20,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Save the training history next to the checkpoints. The with-block closes the file handle,
# which the previous bare open() call never did.
# The history dict is stored as its str() representation (a single JSON string), unchanged
# from before, so existing readers of this file keep working.
with open(f"{filepath_1}_20EPOCHS_HISTORY.json", "w") as history_file:
    json.dump(str(history.history), history_file)