In [1]:
import pickle
import json
import pandas as pd
pd.set_option("display.max_colwidth", None)
import numpy as np
import random
from collections import Counter
from math import ceil
# from sklearn.model_selection import train_test_split

In [2]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

In [3]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras import mixed_precision
from transformers.keras_callbacks import KerasMetricCallback
import evaluate
from datasets import load_dataset, load_metric, list_metrics
from transformers import create_optimizer
from transformers import create_optimizer, TFAutoModelForSequenceClassification, DistilBertTokenizer
from transformers import DataCollatorWithPadding, TFDistilBertForSequenceClassification
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [4]:
# !pip install transformers==4.35.2
# !pip install datasets==2.15.0
# !pip install evaluate
# !pip install accelerate -U

### Processing Training Data

In [5]:
def merge_title_and_abstract(title, abstract):
    """Build the tagged model-input text from a paper's title and abstract.

    A field counts as present only when it is a ``str`` (so ``None``/NaN count
    as missing).

    Returns:
        * both present:   ``"<TITLE> {title}\\n<ABSTRACT> {abstract}"``
        * title only:     ``"<TITLE> {title}"``
        * abstract only:  ``"<TITLE> NONE\\n<ABSTRACT> {abstract}"``
        * neither:        ``""``
    """
    has_title = isinstance(title, str)
    has_abstract = isinstance(abstract, str)

    # Nothing usable at all -> empty input text.
    if not (has_title or has_abstract):
        return ""

    # A missing title is spelled out as the literal token NONE.
    title_part = f"<TITLE> {title}" if has_title else "<TITLE> NONE"
    if not has_abstract:
        return title_part
    return f"{title_part}\n<ABSTRACT> {abstract}"

In [6]:
def create_vocab(df, column):
    """Map the distinct values of ``df[column]`` to integer ids and back.

    Ids are assigned in the order returned by ``Series.unique()``.

    Returns:
        A ``(value -> id, id -> value)`` tuple of dicts.
    """
    distinct_values = df[column].unique()

    # Forward mapping: value -> 0-based position among the distinct values.
    id_by_value = dict(zip(distinct_values, range(len(distinct_values))))

    # Reverse mapping, derived from the forward one so the two always agree.
    value_by_id = {idx: value for value, idx in id_by_value.items()}

    return id_by_value, value_by_id

In [7]:
def save_pickle(dictionary, file_path):
    """Pickle ``dictionary`` to ``file_path``, overwriting any existing file.

    Returns ``None``.
    """
    with open(file_path, mode='wb') as out_file:
        pickle.dump(dictionary, out_file)

In [8]:
def open_pickle(pickle_path):
    """Load and return the object pickled at ``pickle_path``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source (e.g. ones written by ``save_pickle``).
    """
    with open(pickle_path, mode='rb') as in_file:
        return pickle.load(in_file)

In [9]:
# Load the full training set. The path below is a literal placeholder (it is not
# an f-string) and must be replaced with the real parquet location before running.
# Presumably this is the output of the "003" Spark step named in the placeholder — confirm.
all_data = pd.read_parquet("{path_to_all_training_data_from_003_spark_file}")

In [10]:
# Build the model input text for every row from `new_title` and `abstract`.
# Rows where neither field is a string end up with an empty string.
all_data['processed_data'] = all_data.apply(lambda x: merge_title_and_abstract(x.new_title, x.abstract), axis=1)

In [11]:
# Classification target as a single string: "<micro_cluster_id>: <long_label>".
all_data['full_label'] = all_data.apply(lambda x: f"{x.micro_cluster_id}: {x.long_label}", axis=1)

In [12]:
# Number of distinct target classes in the full dataset.
all_data['full_label'].nunique()

4521

In [13]:
# Shuffle every row once with a fixed seed so the train/val split is reproducible.
shuffled_data = all_data.sample(frac=1, random_state=0)

In [14]:
# Sanity check: (rows, columns) of the shuffled frame.
shuffled_data.shape

(4521000, 9)

In [15]:
# Positional split of the shuffled rows: the first 4,300,000 go to training,
# everything after that to validation.
n_train_rows = 4300000
train = shuffled_data.iloc[:n_train_rows].copy()
val = shuffled_data.iloc[n_train_rows:].copy()

In [16]:
# Distinct labels present in the training split. This should match the count for
# the full dataset, because the label vocabulary is built from `train` only.
train['full_label'].nunique()

4521

In [17]:
# label string -> integer id, and the inverse id -> label string, built from the training split only.
target_vocab, inv_target_vocab = create_vocab(train, 'full_label')

In [18]:
# Encode each training label string as its integer class id.
train['label'] = train['full_label'].apply(lambda x: target_vocab[x])

In [19]:
# Encode validation labels with the vocabulary built from `train`.
# The dict lookup raises KeyError if `val` contains a label that never occurs in `train`.
val['label'] = val['full_label'].apply(lambda x: target_vocab[x])

In [20]:
# Write the training split to disk as parquet shards of 44,000 rows each
# (train_0.parquet, train_1.parquet, ...).
train_chunk_size = 44000
# Derive the shard count from the data instead of hard-coding it, so no rows are
# silently dropped if the split size changes. For 4,300,000 rows this gives 98.
# NOTE: the dataset-loading cell reads train_0 .. train_97 explicitly; keep it
# in sync if this count ever changes.
n_train_chunks = ceil(len(train) / train_chunk_size)

# to_parquet does not create missing directories.
os.makedirs("./training_data/train", exist_ok=True)

# Select the exported columns once rather than on every iteration.
train_export = train[['paper_id','processed_data','label']]
for i in range(n_train_chunks):
    train_export \
        .iloc[train_chunk_size*i:train_chunk_size*(i+1)] \
        .to_parquet(f"./training_data/train/train_{i}.parquet")
del train_export

In [21]:
# Write the validation split to disk as parquet shards of 30,000 rows each
# (val_0.parquet, val_1.parquet, ...).
val_chunk_size = 30000
# Derive the shard count from the data instead of hard-coding it, so no rows are
# silently dropped if the split size changes. For 221,000 rows this gives 8.
# NOTE: the dataset-loading cell reads val_0 .. val_7 explicitly; keep it in
# sync if this count ever changes.
n_val_chunks = ceil(len(val) / val_chunk_size)

# to_parquet does not create missing directories.
os.makedirs("./training_data/val", exist_ok=True)

# Select the exported columns once rather than on every iteration.
val_export = val[['paper_id','processed_data','label']]
for i in range(n_val_chunks):
    val_export \
        .iloc[val_chunk_size*i:val_chunk_size*(i+1)] \
        .to_parquet(f"./training_data/val/val_{i}.parquet")
del val_export

In [22]:
# Persist both label mappings so the training section can reload them later.
# save_pickle returns None, so there is nothing to bind to a variable.
save_pickle(target_vocab, './training_data/target_vocab.pkl')
save_pickle(inv_target_vocab, './training_data/inv_target_vocab.pkl')

### Model Training

In [9]:
def preprocess_function(examples):
    """Tokenize the ``processed_data`` texts of a batch of examples.

    Uses the module-level ``tokenizer``. Truncation is enabled and sequences
    are padded to the longest one in the batch passed to this call.
    """
    # NOTE(review): batches are padded again by the data collator at training
    # time, so padding here may be redundant — confirm before changing.
    texts = examples["processed_data"]
    encoded = tokenizer(texts, truncation=True, padding='longest')
    return encoded

In [11]:
# Hyperparameters to tune
# Batch size handed to prepare_tf_dataset for both the train and validation sets.
batch_size = 256
# Upper bound on epochs; training can stop earlier through the EarlyStopping callback.
num_epochs = 20
# Pretrained checkpoint used for both the tokenizer and the model.
model_name = "bert-base-multilingual-cased"
task = "openalex-topic-classification"
# Accuracy metric used by compute_metrics during validation.
metric = evaluate.load("accuracy")

In [12]:
# Reload the label mappings written by the data-processing section
# (label string -> id, and id -> label string).
target_vocab = open_pickle('./training_data/target_vocab.pkl')
inv_target_vocab = open_pickle('./training_data/inv_target_vocab.pkl')

In [13]:
# Loading the tokenizer that matches `model_name` (bert-base-multilingual-cased, not DistilBERT)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Load the parquet shards written by the data-processing section into a
# Hugging Face dataset with 'train' and 'val' splits.
# The shard counts (98 train, 8 val) must match the files on disk.
train_files = [f'./training_data/train/train_{i}.parquet' for i in range(98)]
val_files = [f'./training_data/val/val_{i}.parquet' for i in range(8)]
all_dataset = load_dataset("parquet", data_files={'train': train_files, 'val': val_files})

In [16]:
# Tokenizing every split of the dataset (train and val) in batches, using 8 worker processes
tokenized_data = all_dataset.map(preprocess_function, batched=True, num_proc=8)

In [17]:
# Number of full batches per epoch; floor division leaves a trailing partial batch out of the count.
batches_per_epoch = tokenized_data['train'].num_rows // batch_size
# Total optimizer steps over all epochs; passed to create_optimizer as num_train_steps.
total_train_steps = int(batches_per_epoch * num_epochs)

In [18]:
def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the module-level ``metric``.

    The predicted class for each example is the argmax over the last axis of
    the logits; the result of ``metric.compute`` is returned unchanged.
    """
    class_scores, true_labels = eval_pred
    predicted_labels = np.argmax(class_scores, axis=-1)
    return metric.compute(references=true_labels, predictions=predicted_labels)

In [None]:
# Allow for use of multiple GPUs
strategy = tf.distribute.MirroredStrategy()

# Everything below is created inside the strategy scope: the model, the tf.data
# pipelines, the optimizer and the callbacks.
with strategy.scope():
    # Loading the pretrained model and weights with a classification head sized
    # to the label vocabulary; the id<->label mappings are passed along as well.
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name, 
                                                                 num_labels=len(inv_target_vocab), 
                                                                 id2label=inv_target_vocab, 
                                                                 label2id=target_vocab)
    # Freeze the embedding layer; only the remaining layers are fine-tuned.
    model.bert.embeddings.trainable = False
    
    # Collator that pads each batch and returns TensorFlow tensors.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

    # Save the full model (not just weights) at every checkpoint, not only the best one.
    # The file name embeds the epoch number and the validation loss.
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath='./model_checkpoints/{epoch:02d}-{val_loss:.2f}.keras',
        save_weights_only=False,
        save_best_only=False)

    # Training pipeline: shuffled.
    tf_train_set = model.prepare_tf_dataset(
        tokenized_data["train"],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )
    
    # Validation pipeline: not shuffled.
    tf_validation_set = model.prepare_tf_dataset(
        tokenized_data["val"],
        shuffle=False,
        batch_size=batch_size,
        collate_fn=data_collator,
    )


    # Optimizer with 500 warmup steps and an initial learning rate of 6e-5.
    # `schedule` is returned as well but is not used further in this cell.
    optimizer, schedule = create_optimizer(init_lr=6e-5, num_warmup_steps=500, num_train_steps=total_train_steps)

    # Runs compute_metrics (accuracy) on the validation set.
    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
    # Stop training when val_loss fails to improve by at least 0.0001, with patience=1.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=1)
    # NOTE(review): the metric callback is listed first, presumably so its results
    # are available to the callbacks after it — confirm before reordering.
    callbacks = [metric_callback, model_checkpoint_callback, early_stopping]
    
    
    # No loss is passed here; presumably the model's built-in loss is used — confirm.
    model.compile(optimizer=optimizer)

In [21]:
# Print layer and parameter counts; the non-trainable total reflects the frozen embedding layer.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  177853440 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  3476649   
                                                                 
Total params: 181330089 (691.72 MB)
Trainable params: 89121705 (339.97 MB)
Non-trainable params: 92208384 (351.75 MB)
_________________________________________________________________


In [None]:
# Train for up to `num_epochs` epochs, validating against tf_validation_set and
# running the callbacks (metrics, checkpointing, early stopping).
history = model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=callbacks)

In [24]:
# Persist the metrics recorded during training (history.history) for later inspection.
# save_pickle returns None, so there is nothing to bind to a variable.
save_pickle(history.history, './training_data/training_history.pkl')

In [25]:
# Names used to build the Hub repository id in the push_to_hub cells.
# `model_name` repeats the value from the hyperparameter cell; `task` is changed
# here to a more specific name that mentions title + abstract.
model_name = "bert-base-multilingual-cased"
task = "openalex-topic-classification-title-abstract"

In [26]:
from huggingface_hub import notebook_login

In [None]:
# Log in to the Hugging Face Hub before the push_to_hub calls.
# Presumably this prompts interactively for an access token — confirm.
notebook_login()

In [None]:
# Upload the fine-tuned model to the repo "OpenAlex/<model_name>-finetuned-<task>".
model.push_to_hub(f"OpenAlex/{model_name}-finetuned-{task}")

In [None]:
# Upload the tokenizer to the same repo id as the model.
tokenizer.push_to_hub(f"OpenAlex/{model_name}-finetuned-{task}")