In [None]:
import pickle
import json
import redshift_connector
import pandas as pd
pd.set_option("display.max_colwidth", None)
import numpy as np

from collections import Counter
from math import ceil
from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf
from datasets import load_dataset
from transformers import create_optimizer, TFAutoModelForSequenceClassification, DistilBertTokenizer
from transformers import DataCollatorWithPadding, TFDistilBertForSequenceClassification
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

## Loading Affiliation Data

In [None]:
# Load the affiliation data written by the 002a notebook (it already holds the processed text).
# Later cells read the `affiliation_id` and `processed_text` columns from this frame.
full_affs_data = pd.read_parquet("full_affs_data_tokenized.parquet")

In [None]:
def create_affiliation_vocab(x, vocab=None):
    """
    Returns the integer label for an affiliation, registering it first if unseen.

    An unseen affiliation is assigned the current size of the vocab as its
    label, so a vocab that starts empty and is only modified through this
    function ends up with labels 0..N-1 in order of first appearance.

    Args:
        x: affiliation identifier (must be hashable).
        vocab: dict mapping affiliation -> label to look up and update in place.
            Defaults to the notebook-level `affiliation_vocab`, so existing
            single-argument calls behave exactly as before.

    Returns:
        The integer label stored for `x`.
    """
    if vocab is None:
        # Resolved at call time, so the global only has to exist before the first call
        vocab = affiliation_vocab
    # setdefault inserts len(vocab) only when x is missing, then returns the stored label
    return vocab.setdefault(x, len(vocab))

In [None]:
# Start from an empty label vocab; it is filled in while the column below is processed
affiliation_vocab = dict()

# Turn every affiliation_id into its integer class label
# (the function is passed directly instead of through a lambda wrapper)
full_affs_data['label'] = full_affs_data['affiliation_id'].apply(create_affiliation_vocab)

### Splitting into Train/Val

In [None]:
# Keep 97.5% of the rows for training and hold out the rest for validation;
# the fixed random_state makes the split repeatable
train_data, val_data = train_test_split(
    full_affs_data[['processed_text', 'label']],
    train_size=0.975,
    random_state=1,
)

In [None]:
# Write both splits to parquet files in the working directory
for _split_frame, _split_path in ((train_data, "train_data.parquet"),
                                  (val_data, "val_data.parquet")):
    _split_frame.to_parquet(_split_path)

### Tokenizing Affiliation String

In [None]:
# Loading the standard DistilBERT tokenizer.
# The tensor format is chosen when the tokenizer is called / by the data collator
# (`return_tensors`), so it is not passed to `from_pretrained`.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Read the two parquet splits back through the HuggingFace `datasets` library.
# Each result is indexed later by the split name given in `data_files`.
train_dataset = load_dataset(
    "parquet",
    data_files={"train": "train_data.parquet"},
)
val_dataset = load_dataset(
    "parquet",
    data_files={"val": "val_data.parquet"},
)

In [None]:
# Maximum number of tokens kept per affiliation string
MAX_LEN = 256

def preprocess_function(examples):
    """
    Tokenizes a batch of affiliation strings.

    Sequences longer than MAX_LEN tokens are truncated. No padding is applied
    here: the DataCollatorWithPadding used when the TF datasets are built pads
    each training/validation batch to its own longest sequence. Padding at this
    stage would instead pad every example to the longest sequence of its map
    chunk, which inflates the stored dataset and every downstream batch.

    Args:
        examples: batch from `Dataset.map(..., batched=True)`; its
            "processed_text" entry holds the strings to tokenize.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["processed_text"], truncation=True, max_length=MAX_LEN)

In [None]:
# Run the tokenizer over the training split, one batch of rows at a time
tokenized_train_data = train_dataset.map(
    preprocess_function,
    batched=True,
)

In [None]:
# Run the tokenizer over the validation split, one batch of rows at a time
tokenized_val_data = val_dataset.map(
    preprocess_function,
    batched=True,
)

### Creating the model

In [None]:
# Tunable training hyperparameters
batch_size = 256
num_epochs = 20

# Whole batches per epoch (floor division: a partial final batch is not counted)
batches_per_epoch = len(tokenized_train_data["train"]) // batch_size

# Total optimizer steps over the run; used to size the learning-rate schedule
total_train_steps = batches_per_epoch * num_epochs

In [None]:
# Distribution strategy that allows training on multiple GPUs
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Collator that pads each batch and returns TensorFlow tensors
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

    # Training split as a shuffled TF dataset
    tf_train_dataset = tokenized_train_data["train"].to_tf_dataset(
        columns=["input_ids", "attention_mask", "label"],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    # Validation split as a TF dataset: original order, one example per batch
    tf_val_dataset = tokenized_val_data["val"].to_tf_dataset(
        columns=["input_ids", "attention_mask", "label"],
        shuffle=False,
        batch_size=1,
        collate_fn=data_collator,
    )

    # Optimizer and learning-rate schedule from the HuggingFace helper:
    # initial LR 1e-4, no warmup, scheduled over total_train_steps
    optimizer, schedule = create_optimizer(
        init_lr=1e-4,
        num_warmup_steps=0,
        num_train_steps=total_train_steps,
    )

    # Pretrained DistilBERT with a classification head, one output per known affiliation
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=len(affiliation_vocab),
    )

    # No loss is passed here; NOTE(review): presumably the model's built-in loss is used - confirm
    model.compile(optimizer=optimizer)

In [None]:
# Train for num_epochs epochs, passing the validation set for evaluation
model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=num_epochs,
)

In [None]:
# Output directory for the trained artifacts (path is relative to the working directory)
tf_save_directory = "./language_model"

In [None]:
# Write the tokenizer and model into the save directory, then pickle the
# affiliation (target) vocab alongside them
tokenizer.save_pretrained(tf_save_directory)
model.save_pretrained(tf_save_directory)

vocab_path = f"{tf_save_directory}/vocab.pkl"
with open(vocab_path, "wb") as vocab_file:
    pickle.dump(affiliation_vocab, vocab_file)