In [None]:
### Init ###

# Packages
from typing import Callable, Any, Dict, List

from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, TrainerCallback

import evaluate

import os, json

from multiprocessing import Pool

# Constants
# Placeholders (upper case) that are substituted into the prompt template, one per dataset column
earthquake_prompt_features = ["DATE", "PLACE", "LATITUDE", "LONGITUDE", "DEPTH"]
earthquake_prompt_template = "on DATE utc, an earthquake struck at PLACE. the epicenter was located at latitude LATITUDE, longitude LONGITUDE, with a depth of DEPTH km beneath the earth's surface."

# Fraction of a dataset kept for training; the remainder is shared between eval and test
train_test_split = 0.6
# Fraction of that remainder kept for eval (the rest is the test split)
eval_test_split = 0.5

# Logging interval, expressed as a fraction of one training epoch
logging_steps_epochs = 1/3

save_strategy = "epoch"
save_steps = 0
save_total_limit = 3

# Step number of the checkpoint to start from; 0 means no checkpoint is used
load_checkpoint = 0

num_train_epochs = 3
per_device_train_batch_size = 8
per_device_eval_batch_size = per_device_train_batch_size

shuffle_seed = 42

num_proc = os.cpu_count()

# Models paths
models_paths = []

models_paths.append("distilbert/distilbert-base-uncased") # Distilbert-base-uncased: 67M params
# models_paths.append("FacebookAI/roberta-base") # Roberta-base: 110M params
# models_paths.append("google-bert/bert-base-uncased") # Bert-base-uncased: 110M params
# models_paths.append("google-bert/bert-large-uncased") # Bert-large-uncased: 340M params

# Datasets paths
datasets_paths = []

# datasets_paths.append("Datasets/Earthquakes-180d-filtered.csv") # Earthquakes-180d Dataset
datasets_paths.append("Datasets/Earthquakes-1990-2023-filtered.csv") # Earthquakes-1990-2023 Dataset

# "<name>-filtered.csv" -> "<name>-prompts.csv"
datasets_prompts_paths = {dataset_path: dataset_path.replace("filtered", "prompts") for dataset_path in datasets_paths}

# "<name>-prompts.csv" -> "<name>-prompts-tokenized/<model path in lower case>.parquet", per dataset and model
datasets_prompts_tokenized_paths = {dataset_path: {model_path: datasets_prompts_paths[dataset_path].replace(".csv", f"-tokenized/{model_path.lower()}.parquet")
                                    for model_path in models_paths} for dataset_path in datasets_paths}

# Subset name -> number of rows, per dataset
datasets_prompts_tokenized_subsets_sizes = {}

# datasets_prompts_tokenized_subsets_sizes[datasets_paths[0]] = {"18K": 18000}
# datasets_prompts_tokenized_subsets_sizes[datasets_paths[1]] = {"1M": int(1e6), "2M": int(2e6), "3M": int(3e6)}
datasets_prompts_tokenized_subsets_sizes[datasets_paths[0]] = {"1M": int(1e6)}

# "<tokenized>.parquet" -> "<tokenized>-<subset name>.parquet", per dataset, model and subset
datasets_prompts_tokenized_subsets_paths = {dataset_path: {model_path: {dataset_prompts_tokenized_subset_name:
                                   datasets_prompts_tokenized_paths[dataset_path][model_path].replace(".parquet", "-" + dataset_prompts_tokenized_subset_name + ".parquet")
                                   for dataset_prompts_tokenized_subset_name in datasets_prompts_tokenized_subsets_sizes[dataset_path].keys()}
                                   for model_path in models_paths} for dataset_path in datasets_paths}

# Number of training steps between two logs, per dataset and subset:
# logging_steps_epochs * (training rows / train batch size).
# - The train batch size is used because the interval counts training steps.
# - round() rather than int(): 1/3 * 0.6 * 1e6 / 8 evaluates to 24999.999999999996, which int() truncates to 24999.
# - max(1, ...) keeps the interval strictly positive for very small subsets.
# NOTE(review): assumes a single device and no gradient accumulation - confirm for other setups.
datasets_subsets_logging_steps = {dataset_path: {dataset_prompts_tokenized_subset_name:
                                   max(1, round(logging_steps_epochs * train_test_split * dataset_prompts_tokenized_subset_size / per_device_train_batch_size))
                                   for (dataset_prompts_tokenized_subset_name, dataset_prompts_tokenized_subset_size) in datasets_prompts_tokenized_subsets_sizes[dataset_path].items()}
                                   for dataset_path in datasets_paths}

In [None]:
### Methods ###

def load_dataset_from_file(dataset_path: str):
    """Load the file at `dataset_path` and return its "train" split.

    The text after the last "." of the path is passed to `load_dataset` as the loader name.
    """
    file_format = dataset_path.rsplit(".", 1)[-1]
    loaded_splits = load_dataset(file_format, data_files = dataset_path)
    return loaded_splits["train"]

def load_model(model_path: str):
    """Load a sequence classification model with a single output label from `model_path`."""
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels = 1)
    return model

def create_dataset(dataset, dataset_path: str, create_dataset: Callable, create_dataset_params: Dict[str, Any],
                   load_dataset: bool = True, save_dataset: bool = True):
    """Build a dataset with `create_dataset` and optionally write it to `dataset_path`.

    Args:
        dataset: The input dataset, or the path of the file to load it from when `load_dataset` is True.
        dataset_path: Output path; also used in the progress messages.
        create_dataset: Callable invoked as `create_dataset(dataset, **create_dataset_params)`.
        create_dataset_params: Keyword arguments forwarded to `create_dataset`.
        load_dataset: When True, `dataset` is a file path that is loaded first.
        save_dataset: When True, the result is written to `dataset_path` (".csv" or ".parquet").

    Returns:
        The dataset returned by `create_dataset`.
    """
    print(f"Start of creation of dataset ({dataset_path})")

    # Load dataset
    if load_dataset: dataset = load_dataset_from_file(dataset)

    # Create dataset
    dataset = create_dataset(dataset, **create_dataset_params)

    # Save dataset
    if save_dataset:
        if dataset_path.endswith(".csv"): save_to_file = dataset.to_csv
        elif dataset_path.endswith(".parquet"): save_to_file = dataset.to_parquet
        else: save_to_file = None

        if save_to_file is None:
            # Only the two formats above are supported: report the skipped save instead of staying silent
            print(f"Dataset not saved, unsupported file extension ({dataset_path})")
        else:
            # The output may live in a folder that does not exist yet (e.g. one folder per model)
            dataset_directory = os.path.dirname(dataset_path)
            if dataset_directory: os.makedirs(dataset_directory, exist_ok = True)

            save_to_file(dataset_path)

    print(f"End of creation of dataset ({dataset_path})")

    return dataset

def create_prompts_dataset(dataset, prompt_template: str, prompt_features: List[str], target_feature: str):
    """Turn every row of `dataset` into a text prompt and a label.

    Args:
        dataset: Dataset whose columns are named like the lower-cased entries of `prompt_features`.
        prompt_template: Text containing each entry of `prompt_features` as a placeholder.
        prompt_features: Placeholders to substitute, in order.
        target_feature: Column copied into the "labels" column.

    Returns:
        A dataset where the columns of the input dataset have been removed after adding "prompt" and "labels".
    """
    original_columns = dataset.column_names

    def create_prompt(instance):
        prompt = prompt_template

        # Each distinct placeholder is substituted once (first occurrence), with the lower-cased column value
        for prompt_feature in dict.fromkeys(prompt_features):
            feature_value = str(instance[prompt_feature.lower()]).lower()
            prompt = prompt.replace(prompt_feature, feature_value, 1)

        return {"prompt": prompt, "labels": instance[target_feature]}

    return dataset.map(create_prompt, num_proc = num_proc).remove_columns(original_columns)

def create_train_eval_test_datasets(dataset):
    """Split `dataset` into a `DatasetDict` with "train", "eval" and "test" entries.

    The first split keeps `train_test_split` of the rows for training; the held-out rows are then
    split again, keeping `eval_test_split` of them for eval and the rest for test.
    Both splits use `shuffle_seed`.
    """
    first_split = dataset.train_test_split(test_size = 1 - train_test_split, seed = shuffle_seed)
    held_out = first_split["test"]
    second_split = held_out.train_test_split(test_size = 1 - eval_test_split, seed = shuffle_seed)

    splits = {
        "train": first_split["train"],
        "eval": second_split["train"],
        "test": second_split["test"],
    }
    return DatasetDict(splits)

def create_tokenized_dataset(dataset, tokenizer):
    """Tokenize the "prompt" column of `dataset` in batches, with max-length padding and truncation enabled."""
    def tokenize_prompts(instances):
        return tokenizer(instances["prompt"], padding = "max_length", truncation = True)

    return dataset.map(tokenize_prompts, batched = True, num_proc = num_proc)

def create_subset(dataset, subset_size: int):
    """Return the first `subset_size` rows of `dataset` after shuffling it with `shuffle_seed`.

    The subset is capped at the size of the dataset.
    """
    shuffled_dataset = dataset.shuffle(seed = shuffle_seed)
    rows_count = min(subset_size, len(dataset))
    return shuffled_dataset.select(range(rows_count))

In [None]:
### Load models ###

# Model and tokenizer of every model path
models = {}
models_tokenizers = {}

for model_path in models_paths:
    models[model_path] = load_model(model_path)
    models_tokenizers[model_path] = AutoTokenizer.from_pretrained(model_path)

    # `model_max_length` is the attribute the tokenizer uses for padding = "max_length" and truncation;
    # a plain `max_length` attribute is not read by the tokenizer.
    # The limit is only ever lowered: some models (e.g. RoBERTa) reserve position embeddings,
    # so their tokenizer limit is already smaller than `max_position_embeddings`.
    models_tokenizers[model_path].model_max_length = min(models_tokenizers[model_path].model_max_length,
                                                         models[model_path].config.max_position_embeddings)

    print(f"### ({model_path}) Model Configuration ###")
    print(models[model_path].config)

In [None]:
### Create prompts datasets ###

# Prompts dataset of every dataset path
datasets_prompts = {}

for dataset_path in datasets_paths:
    # The filtered file is read from disk (load_dataset = True); the prompts are kept in memory only (save_dataset = False)
    datasets_prompts[dataset_path] = create_dataset(
        dataset = dataset_path,
        dataset_path = datasets_prompts_paths[dataset_path],
        create_dataset = create_prompts_dataset,
        create_dataset_params = {
            "prompt_template": earthquake_prompt_template,
            "prompt_features": earthquake_prompt_features,
            "target_feature": "magnitude",
        },
        load_dataset = True,
        save_dataset = False,
    )

In [None]:
### Load prompts datasets ###

# Prompts dataset of every dataset path, read back from its prompts file
datasets_prompts = {}

for dataset_path in datasets_paths:
    datasets_prompts[dataset_path] = load_dataset_from_file(datasets_prompts_paths[dataset_path])

In [None]:
### Create prompts tokenized datasets ###

# Tokenized prompts dataset of every (dataset path, model path) pair
datasets_prompts_tokenized = {}

for dataset_path in datasets_paths:
    datasets_prompts_tokenized[dataset_path] = {}

    for model_path in models_paths:
        # The in-memory prompts are used (load_dataset = False) and nothing is written to disk (save_dataset = False)
        datasets_prompts_tokenized[dataset_path][model_path] = create_dataset(
            dataset = datasets_prompts[dataset_path],
            dataset_path = datasets_prompts_tokenized_paths[dataset_path][model_path],
            create_dataset = create_tokenized_dataset,
            create_dataset_params = {"tokenizer": models_tokenizers[model_path]},
            load_dataset = False,
            save_dataset = False,
        )

In [None]:
### Load prompts tokenized datasets ###

# Tokenized prompts dataset of every (dataset path, model path) pair, read back from its file
datasets_prompts_tokenized = {}

for dataset_path in datasets_paths:
    datasets_prompts_tokenized[dataset_path] = {}

    for model_path in models_paths:
        tokenized_path = datasets_prompts_tokenized_paths[dataset_path][model_path]
        datasets_prompts_tokenized[dataset_path][model_path] = load_dataset_from_file(tokenized_path)

In [None]:
### Create prompts tokenized datasets subsets ###

# Subsets of every tokenized prompts dataset, keyed by dataset path, model path and subset name
datasets_prompts_tokenized_subsets = {}

for dataset_path in datasets_paths:
    datasets_prompts_tokenized_subsets[dataset_path] = {}

    for model_path in models_paths:
        datasets_prompts_tokenized_subsets[dataset_path][model_path] = {}

        for (dataset_prompts_tokenized_subset_name, dataset_prompts_tokenized_subset_size) in datasets_prompts_tokenized_subsets_sizes[dataset_path].items():
            # The in-memory tokenized dataset is sampled (load_dataset = False); the subset is not written to disk (save_dataset = False)
            datasets_prompts_tokenized_subsets[dataset_path][model_path][dataset_prompts_tokenized_subset_name] = create_dataset(
                dataset = datasets_prompts_tokenized[dataset_path][model_path],
                dataset_path = datasets_prompts_tokenized_subsets_paths[dataset_path][model_path][dataset_prompts_tokenized_subset_name],
                create_dataset = create_subset,
                create_dataset_params = {"subset_size": dataset_prompts_tokenized_subset_size},
                load_dataset = False,
                save_dataset = False,
            )

In [None]:
### Load prompts tokenized datasets subsets ###

# Subsets of every tokenized prompts dataset, read back from their files
datasets_prompts_tokenized_subsets = {}

for dataset_path in datasets_paths:
    datasets_prompts_tokenized_subsets[dataset_path] = {}

    for model_path in models_paths:
        datasets_prompts_tokenized_subsets[dataset_path][model_path] = {}

        for dataset_prompts_tokenize_subset_name in datasets_prompts_tokenized_subsets_sizes[dataset_path].keys():
            subset_path = datasets_prompts_tokenized_subsets_paths[dataset_path][model_path][dataset_prompts_tokenize_subset_name]
            datasets_prompts_tokenized_subsets[dataset_path][model_path][dataset_prompts_tokenize_subset_name] = load_dataset_from_file(subset_path)

In [None]:
### Create train, eval and test datasets ###

# Train / eval / test splits of every subset, keyed by dataset path, model path and subset name
datasets_prompts_tokenized_subsets_split = {}

for dataset_path in datasets_paths:
    datasets_prompts_tokenized_subsets_split[dataset_path] = {}

    for model_path in models_paths:
        datasets_prompts_tokenized_subsets_split[dataset_path][model_path] = {}

        for dataset_prompts_tokenize_subset_name in datasets_prompts_tokenized_subsets_sizes[dataset_path].keys():
            # The in-memory subset is split (load_dataset = False); the splits are not written to disk (save_dataset = False)
            datasets_prompts_tokenized_subsets_split[dataset_path][model_path][dataset_prompts_tokenize_subset_name] = create_dataset(
                dataset = datasets_prompts_tokenized_subsets[dataset_path][model_path][dataset_prompts_tokenize_subset_name],
                dataset_path = datasets_prompts_tokenized_subsets_paths[dataset_path][model_path][dataset_prompts_tokenize_subset_name],
                create_dataset = create_train_eval_test_datasets,
                create_dataset_params = {},
                load_dataset = False,
                save_dataset = False,
            )

In [None]:
### Models training methods ###

mse = evaluate.load("mse")

def compute_mse(eval_prediction):
    """Compute the mean squared error between the model outputs and the labels.

    Args:
        eval_prediction: Pair that unpacks into (logits, labels); the logits may be a tuple whose
            first element holds the predictions.

    Returns:
        The result of the "mse" metric.
    """
    logits, labels = eval_prediction
    if isinstance(logits, tuple): logits = logits[0]

    # The models are loaded with num_labels = 1, so the logits have one column per example:
    # flatten them to one value per example, the layout of the labels.
    # NOTE(review): assumes logits is a numpy array, as provided by the Trainer - confirm for other callers.
    predictions = logits.reshape(-1)

    return mse.compute(predictions = predictions, references = labels)

class TestEvalCallback(TrainerCallback):
    """Trainer callback that evaluates the "test" split during training and keeps the results.

    The test split is evaluated when training begins and every time a log containing "loss" is emitted.
    The accumulated results are written to "test_results.json" in the checkpoint folder on every save.
    """

    def __init__(self, dataset, model_trainer):
        self.test_results = []
        self.model_trainer = model_trainer
        self.test_dataset = dataset["test"]

    def _record_test_result(self, state):
        # Evaluate the test split and store the metrics together with the current training position
        metrics = self.model_trainer.evaluate(eval_dataset = self.test_dataset)
        entry = {"epoch": state.epoch, "step": state.global_step, "test_result": metrics}
        self.test_results.append(entry)
        print(entry)

    def on_train_begin(self, args, state, control, **kwargs):
        self._record_test_result(state)

    def on_log(self, args, state, control, logs = None, **kwargs):
        # Only logs carrying a "loss" entry trigger a test evaluation
        if not logs or "loss" not in logs:
            return

        self._record_test_result(state)

    def on_save(self, args, state, control, **kwargs):
        checkpoint_folder = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
        os.makedirs(checkpoint_folder, exist_ok = True)

        results_path = os.path.join(checkpoint_folder, "test_results.json")
        with open(results_path, "w") as results_file:
            json.dump(self.test_results, results_file, indent = 2)

def train_model(model_path: str, model_name: str, logging_steps: int, tokenized_dataset):
    """Train a model on the "train" split of `tokenized_dataset` and track its test results.

    Args:
        model_path: Path the model is loaded from; training resumes from it when it contains "/checkpoint-".
        model_name: Output directory of the training run.
        logging_steps: Number of steps between two logs.
        tokenized_dataset: Mapping with "train", "eval" and "test" splits.

    Returns:
        A (trained model, test results) tuple.
    """
    training_arguments = TrainingArguments(
        output_dir = model_name,
        eval_strategy = "steps",
        logging_steps = logging_steps,
        save_strategy = save_strategy,
        save_steps = save_steps,
        save_total_limit = save_total_limit,
        per_device_train_batch_size = per_device_train_batch_size,
        per_device_eval_batch_size = per_device_eval_batch_size,
        num_train_epochs = num_train_epochs,
        seed = shuffle_seed,
        data_seed = shuffle_seed,
    )

    model_trainer = Trainer(
        model_init = (lambda: load_model(model_path)),
        args = training_arguments,
        train_dataset = tokenized_dataset["train"],
        eval_dataset = tokenized_dataset["eval"],
        compute_metrics = compute_mse,
    )

    # The callback needs the trainer to evaluate the test split, so it is registered after construction
    model_tester = TestEvalCallback(tokenized_dataset, model_trainer)
    model_trainer.add_callback(model_tester)

    # None (the default of `train`) starts a regular training run
    resume_from_checkpoint = model_path if "/checkpoint-" in model_path else None
    model_trainer.train(resume_from_checkpoint = resume_from_checkpoint)

    return (model_trainer.model, model_tester.test_results)

In [None]:
### Train models ###

# Trained model and test results of every (dataset path, model path, subset name)
trained_models = {}
trained_models_subsets_datasets_tests_results = {}

for dataset_path in datasets_paths:
    trained_models[dataset_path] = {}
    trained_models_subsets_datasets_tests_results[dataset_path] = {}

    for model_path in models_paths:
        trained_models[dataset_path][model_path] = {}
        trained_models_subsets_datasets_tests_results[dataset_path][model_path] = {}

        for (dataset_prompts_tokenized_subset_split_name, dataset_prompts_tokenized_subset_split) in datasets_prompts_tokenized_subsets_split[dataset_path][model_path].items():
            # "Datasets/<...>.parquet" -> "Models/<...>": output directory of the training run
            model_name = "".join(datasets_prompts_tokenized_subsets_paths[dataset_path][model_path][dataset_prompts_tokenized_subset_split_name].replace("Datasets/", "Models/").split(".")[:-1])

            # `model_path` keys the dictionaries of this loop, so it must not be overwritten:
            # the path the model is actually loaded from is kept in its own variable
            if load_checkpoint > 0: model_source_path = model_name + f"/checkpoint-{load_checkpoint}"
            else: model_source_path = model_path

            logging_steps = datasets_subsets_logging_steps[dataset_path][dataset_prompts_tokenized_subset_split_name]

            print(f"### Training model ({model_name}) ###")
            trained_model, model_test_results = train_model(model_source_path, model_name, logging_steps, dataset_prompts_tokenized_subset_split)

            trained_models[dataset_path][model_path][dataset_prompts_tokenized_subset_split_name] = trained_model
            trained_models_subsets_datasets_tests_results[dataset_path][model_path][dataset_prompts_tokenized_subset_split_name] = model_test_results

In [None]:
### Load trained models ###

# Trained model of every (dataset path, model path, subset name), loaded from checkpoint `load_checkpoint`
trained_models = {}

for dataset_path in datasets_paths:
    trained_models[dataset_path] = {}

    for model_path in models_paths:
        trained_models[dataset_path][model_path] = {}

        for dataset_prompts_tokenized_subset_name in datasets_prompts_tokenized_subsets_sizes[dataset_path].keys():
            # "Datasets/<...>.parquet" -> "Models/<...>/checkpoint-<load_checkpoint>"
            checkpoint_path = "".join(datasets_prompts_tokenized_subsets_paths[dataset_path][model_path][dataset_prompts_tokenized_subset_name].replace("Datasets/", "Models/").split(".")[:-1]) + f"/checkpoint-{load_checkpoint}"
            trained_models[dataset_path][model_path][dataset_prompts_tokenized_subset_name] = load_model(checkpoint_path)