# Aside: Working with TensorFlow Datasets

The goal of this notebook is to show the same NLP setup with one alteration: the Hugging Face (HF) dataset is converted to a TensorFlow dataset (TFDS). This is sometimes preferred for x,y,z, but for the remainder of the research HF datasets will be used. 

This is purely to demonstrate compatibility with TFDS - no significant analytical advances were made here in the context of the greater project goals.

In [None]:
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

# Load the dataset from Hugging Face and preprocess it
def load_and_preprocess_data():
    """Download the Financial PhraseBank corpus and tokenize its sentences.

    Loads the ``sentences_allagree`` configuration of ``financial_phrasebank``
    and maps a ``distilbert-base-uncased`` tokenizer over the ``sentence``
    column in batches.

    Returns:
        The result of ``dataset.map(...)``: the loaded dataset with the
        tokenizer's outputs added.
    """
    raw_data = load_dataset("financial_phrasebank", "sentences_allagree")

    model_name = "distilbert-base-uncased"
    bert_tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Every sentence is padded or truncated to a fixed length of 512 tokens.
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }

    def _encode_batch(batch):
        """Tokenize the ``sentence`` column of one batch."""
        return bert_tokenizer(batch["sentence"], **encoding_options)

    return raw_data.map(_encode_batch, batched=True)

# Create a TensorFlow dataset from the training split (the only split converted here)
def create_tf_datasets(tokenized_datasets, batch_size=8):
    """Convert the tokenized ``train`` split into a TensorFlow dataset.

    Only the ``"train"`` entry of ``tokenized_datasets`` is converted.

    Args:
        tokenized_datasets: Mapping with a ``"train"`` entry that exposes
            ``to_tf_dataset``.
        batch_size: Batch size forwarded to ``to_tf_dataset`` (default 8).

    Returns:
        Whatever ``to_tf_dataset`` returns for the train split, using
        ``input_ids`` and ``attention_mask`` as feature columns and ``label``
        as the label column, with shuffling enabled.
    """
    train_split = tokenized_datasets["train"]
    conversion_kwargs = {
        "columns": ["input_ids", "attention_mask"],
        "label_cols": ["label"],
        "shuffle": True,
        "batch_size": batch_size,
        # Explicit None, exactly as before: no custom collate function.
        "collate_fn": None,
    }
    return train_split.to_tf_dataset(**conversion_kwargs)

# Use the dataset for training a model
def train_model(tf_train_dataset, *, epochs=3, learning_rate=0.001):
    """Fine-tune a DistilBERT sequence classifier on a TensorFlow dataset.

    Loads ``distilbert-base-uncased`` with a 3-label classification head,
    compiles it with Adam and sparse categorical cross-entropy, prints the
    model summary, and fits it on ``tf_train_dataset``.

    Args:
        tf_train_dataset: Dataset passed directly to ``model.fit``.
        epochs: Number of training epochs (default 3, the value that was
            previously hard-coded).
        learning_rate: Learning rate for Adam. The default of 0.001 is the
            Keras default that ``Adam()`` used implicitly before.
            NOTE(review): fine-tuning pretrained transformers usually uses a
            much smaller rate (e.g. 5e-5) - consider passing one explicitly.

    Returns:
        A ``(model, history)`` tuple: the fitted model and the object
        returned by ``model.fit``.
    """
    # Load the pre-trained model
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=3
    )

    # Define the optimizer, loss, and metrics
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    # from_logits=True makes the loss treat the model outputs as
    # unnormalised scores rather than probabilities.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = ["accuracy"]

    # Compile the model
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # Summarise the model that was just compiled. The previous code referenced
    # an undefined name here and raised NameError before training started.
    model.summary()

    # Train the model
    history = model.fit(tf_train_dataset, epochs=epochs)

    return model, history

# Main function to run the steps
def main():
    """Run the whole pipeline: load and tokenize, convert, then train.

    Returns:
        A ``(model, history)`` tuple as produced by ``train_model``.
    """
    # Tokenize the corpus and turn it straight into a TensorFlow dataset.
    train_ds = create_tf_datasets(load_and_preprocess_data())

    # Fit the classifier on that dataset.
    trained_model, fit_history = train_model(train_ds)
    return trained_model, fit_history

# Run the full pipeline when this cell executes and bind the trained model and
# its fit history to module-level names.
model_whole_tfds, history_whole_tfds = main()
