In [20]:
### Init ###

# Packages
from typing import Callable, Any, Dict, List
from functools import reduce

from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import evaluate

import os

from multiprocessing import Pool

# Constants

# Prompt construction: every upper-case placeholder listed in the features is
# substituted into the template when the prompts dataset is built.
earthquake_prompt_features = ["DATE", "PLACE", "LATITUDE", "LONGITUDE", "DEPTH"]
earthquake_prompt_template = (
    "on DATE utc, an earthquake struck at PLACE. "
    "the epicenter was located at latitude LATITUDE, longitude LONGITUDE, "
    "with a depth of DEPTH km beneath the earth's surface."
)

# Splitting: fraction kept for training, then fraction of the remainder kept for evaluation
train_test_split = 0.6
eval_test_split = 0.5

# Training arguments
logging_steps = 500
save_strategy = "epoch"
save_steps = 2000
save_total_limit = 3
num_train_epochs = 3
per_device_train_batch_size = 8
per_device_eval_batch_size = per_device_train_batch_size

# Seed shared by shuffling, splitting and training
shuffle_seed = 42

# Worker processes handed to `Dataset.map`
num_proc = os.cpu_count()

# Models paths (uncomment an entry to include that model)
models_paths = [
    "distilbert/distilbert-base-uncased", # Distilbert-base-uncased: 67M params
    # "FacebookAI/roberta-base", # Roberta-base: 110M params
    # "google-bert/bert-base-uncased", # Bert-base-uncased: 110M params
    # "google-bert/bert-large-uncased", # Bert-large-uncased: 340M params
]

# Datasets paths (uncomment an entry to include that dataset)
datasets_paths = [
    "Datasets/Earthquakes-180d-filtered.csv", # Earthquakes-180d Dataset
    # "Datasets/Earthquakes-1990-2023-filtered.csv", # Earthquakes-1990-2023 Dataset
]

# Source dataset path -> prompts dataset path
datasets_prompts_paths = {
    source_path: source_path.replace("filtered", "prompts")
    for source_path in datasets_paths
}

# Source dataset path -> model path -> tokenized prompts dataset path
datasets_prompts_tokenized_paths = {
    source_path: {
        model: datasets_prompts_paths[source_path].replace(".csv", f"-{model.lower()}-tokenized.parquet")
        for model in models_paths
    }
    for source_path in datasets_paths
}

# Source dataset path -> subset name -> subset size (number of rows)
datasets_prompts_tokenized_subsets_sizes = {
    datasets_paths[0]: {"18K": 18000},
    # datasets_paths[1]: {"1M": int(1e6), "2M": int(2e6), "3M": int(3e6)},
    # datasets_paths[0]: {"1M": int(1e6)},
}

# Source dataset path -> model path -> subset name -> subset file path
datasets_prompts_tokenized_subsets_paths = {
    source_path: {
        model: {
            subset_name: datasets_prompts_tokenized_paths[source_path][model].replace(".parquet", "-" + subset_name + ".parquet")
            for subset_name in datasets_prompts_tokenized_subsets_sizes[source_path]
        }
        for model in models_paths
    }
    for source_path in datasets_paths
}

In [21]:
### Methods ###

def load_dataset_from_file(dataset_path: str):
    """Load a single data file with `load_dataset` and return its "train" split.

    The builder name handed to `load_dataset` is the text after the last "."
    of `dataset_path`; the path itself is passed as `data_files`.
    """
    file_format = dataset_path.rsplit(".", 1)[-1]
    loaded = load_dataset(file_format, data_files = dataset_path)
    return loaded["train"]

def create_dataset(dataset, dataset_path: str, create_dataset: Callable, create_dataset_params: Dict[str, Any],
                   load_dataset: bool = True, save_dataset: bool = True):
    """Build a dataset with a transformation callable and optionally persist it.

    Args:
        dataset: The input dataset object or, when `load_dataset` is True, the
            path of the file to load it from.
        dataset_path: Destination path. It is shown in the progress messages
            and is the file written when `save_dataset` is True.
        create_dataset: Callable invoked as
            `create_dataset(dataset, **create_dataset_params)`.
        create_dataset_params: Keyword arguments forwarded to `create_dataset`.
        load_dataset: When True, `dataset` is read with `load_dataset_from_file` first.
        save_dataset: When True, the result is written to `dataset_path`.

    Returns:
        Whatever `create_dataset` returned.
    """
    print(f"Start of creation of dataset ({dataset_path})")

    # Load dataset
    if load_dataset: dataset = load_dataset_from_file(dataset)

    # Create dataset
    dataset = create_dataset(dataset, **create_dataset_params)

    # Save dataset
    if save_dataset:
        # The destination can sit in a sub-directory that does not exist yet
        # (model identifiers embedded in the file name contain "/").
        parent_directory = os.path.dirname(dataset_path)
        if parent_directory: os.makedirs(parent_directory, exist_ok = True)

        # Write the format announced by the extension, so that a ".parquet" file
        # really is parquet and can be read back by the matching loader.
        # Any other extension keeps the CSV writer.
        if os.path.splitext(dataset_path)[1].lower() == ".parquet": dataset.to_parquet(dataset_path)
        else: dataset.to_csv(dataset_path)

    print(f"End of creation of dataset ({dataset_path})")

    return dataset

def create_prompts_dataset(dataset, prompt_template: str, prompt_features: List[str], target_feature: str):
    """Turn every row of `dataset` into a text prompt plus its label.

    Args:
        dataset: Dataset whose rows are indexable by column name.
        prompt_template: Text containing the placeholders to fill.
        prompt_features: Placeholder names as they appear in the template; the
            value for each comes from the column named like the lower-cased placeholder.
        target_feature: Column copied into the "labels" field.

    Returns:
        The mapped dataset with all of the original columns removed, leaving
        the "prompt" and "labels" fields produced per row.
    """
    # Snapshot taken before mapping: these are the columns dropped at the end
    original_columns = dataset.column_names

    def row_to_prompt(instance):
        # Stringify and lower-case every value up front, keyed by placeholder
        values = {placeholder: str(instance[placeholder.lower()]).lower() for placeholder in prompt_features}

        # Fill only the first occurrence of each placeholder
        prompt = prompt_template
        for placeholder, value in values.items():
            prompt = prompt.replace(placeholder, value, 1)

        return {"prompt": prompt, "labels": instance[target_feature]}

    mapped = dataset.map(row_to_prompt, num_proc = num_proc)
    return mapped.remove_columns(original_columns)

def create_train_eval_test_datasets(dataset):
    """Split `dataset` into "train", "eval" and "test" parts.

    The first split holds out `1 - train_test_split` of the rows; that held-out
    part is split again, with `1 - eval_test_split` of it becoming "test" and
    the rest "eval". Both splits use `shuffle_seed`.

    Returns:
        A `DatasetDict` with the keys "train", "eval" and "test".
    """
    first_split = dataset.train_test_split(test_size = 1 - train_test_split, seed = shuffle_seed)
    held_out = first_split["test"]
    second_split = held_out.train_test_split(test_size = 1 - eval_test_split, seed = shuffle_seed)

    parts = {
        "train": first_split["train"],
        "eval": second_split["train"],
        "test": second_split["test"],
    }
    return DatasetDict(parts)

def create_tokenized_dataset(dataset, tokenizer):
    """Tokenize the "prompt" column of `dataset` in batches.

    Each batch is passed to `tokenizer` with `padding = "max_length"` and
    `truncation = True`; the mapping runs with `num_proc` worker processes.
    """
    def tokenize_batch(batch):
        return tokenizer(batch["prompt"], padding = "max_length", truncation = True)

    return dataset.map(tokenize_batch, batched = True, num_proc = num_proc)

def create_subset(dataset, subset_size: int):
    """Return the first rows of a seeded shuffle of `dataset`.

    At most `subset_size` rows are selected; when the dataset is smaller than
    that, every row is kept. The shuffle uses `shuffle_seed`.
    """
    # Never ask for more rows than the dataset holds
    rows_to_keep = min(subset_size, len(dataset))

    shuffled = dataset.shuffle(seed = shuffle_seed)
    return shuffled.select(range(rows_to_keep))

In [22]:
### Create prompts datasets ###

# One prompts dataset per source file: the source CSV is loaded, converted to
# prompts with "magnitude" as the label, and saved to its prompts path
# (the load / save defaults of `create_dataset` apply).
datasets_prompts = {
    source_path: create_dataset(
        source_path,
        datasets_prompts_paths[source_path],
        create_prompts_dataset,
        {
            "prompt_template": earthquake_prompt_template,
            "prompt_features": earthquake_prompt_features,
            "target_feature": "magnitude",
        },
    )
    for source_path in datasets_paths
}

Start of creation of dataset (Datasets/Earthquakes-180d-prompts.csv)


Map (num_proc=16):   0%|          | 0/17976 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

End of creation of dataset (Datasets/Earthquakes-180d-prompts.csv)


In [None]:
### Load prompts datasets ###

# Reload the saved prompts datasets. `load_dataset_from_file` is used instead of
# calling `load_dataset` on the bare path: it passes the file as `data_files` to
# the builder named by the extension and returns the "train" split, i.e. the
# same kind of object the creation cell stores in `datasets_prompts`.
datasets_prompts = {dataset_path: load_dataset_from_file(datasets_prompts_paths[dataset_path]) for dataset_path in datasets_paths}

In [23]:
### Load models ###

# Model path -> model with a single-output sequence-classification head
models = {}
# Model path -> matching tokenizer
models_tokenizers = {}

for model_path in models_paths:
    # `device_map = {"": 0}` places the whole model on device 0
    models[model_path] = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels = 1, device_map = {"": 0})
    models_tokenizers[model_path] = AutoTokenizer.from_pretrained(model_path)

    # The tokenizer's length limit lives in `model_max_length`; a plain `max_length`
    # attribute is not consulted for truncation or "max_length" padding.
    # The limit is only ever lowered to the size of the position-embedding table,
    # never raised above what the tokenizer itself declares.
    models_tokenizers[model_path].model_max_length = min(models_tokenizers[model_path].model_max_length,
                                                         models[model_path].config.max_position_embeddings)

    print(f"### ({model_path}) Model Configuration ###")
    print(models[model_path].config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### (distilbert/distilbert-base-uncased) Model Configuration ###
DistilBertConfig {
  "_name_or_path": "distilbert/distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.45.2",
  "vocab_size": 30522
}



In [24]:
### Create prompts tokenized datasets ###

# Source dataset path -> model path -> tokenized prompts dataset.
# The in-memory prompts dataset is used as input (nothing is loaded from disk)
# and the tokenized result is saved to its tokenized path.
datasets_prompts_tokenized = {
    source_path: {
        model: create_dataset(
            datasets_prompts[source_path],
            datasets_prompts_tokenized_paths[source_path][model],
            create_tokenized_dataset,
            {"tokenizer": models_tokenizers[model]},
            load_dataset = False,
            save_dataset = True,
        )
        for model in models_paths
    }
    for source_path in datasets_paths
}

Start of creation of dataset (Datasets/Earthquakes-180d-prompts-distilbert/distilbert-base-uncased-tokenized.parquet)


Map (num_proc=16):   0%|          | 0/17976 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

End of creation of dataset (Datasets/Earthquakes-180d-prompts-distilbert/distilbert-base-uncased-tokenized.parquet)


In [None]:
### Load prompts tokenized datasets ###

# datasets_prompts_tokenized = {dataset_path: {model_path: load_dataset(datasets_prompts_tokenized_paths[dataset_path][model_path])
#                               for model_path in models_paths} for dataset_path in datasets_paths}

Creating parquet from Arrow format:   0%|          | 0/3125 [00:00<?, ?ba/s]

8745124887

In [25]:
### Create prompts tokenized datasets subsets ###

# Source dataset path -> model path -> subset name -> sampled subset.
# Subsets are taken from the in-memory tokenized datasets and are not written
# to disk; the subset path only labels the progress messages.
datasets_prompts_tokenized_subsets = {
    source_path: {
        model: {
            subset_name: create_dataset(
                datasets_prompts_tokenized[source_path][model],
                datasets_prompts_tokenized_subsets_paths[source_path][model][subset_name],
                create_subset,
                {"subset_size": subset_size},
                load_dataset = False,
                save_dataset = False,
            )
            for subset_name, subset_size in datasets_prompts_tokenized_subsets_sizes[source_path].items()
        }
        for model in models_paths
    }
    for source_path in datasets_paths
}

Start of creation of dataset (Datasets/Earthquakes-180d-prompts-distilbert/distilbert-base-uncased-tokenized-18K.parquet)
End of creation of dataset (Datasets/Earthquakes-180d-prompts-distilbert/distilbert-base-uncased-tokenized-18K.parquet)


In [None]:
### Load prompts tokenized datasets subsets ###

# datasets_prompts_tokenized_subsets = {dataset_path: {model_path: {dataset_prompts_tokenize_subset_name:
#                                       load_dataset(datasets_prompts_tokenized_subsets_paths[dataset_path][model_path][dataset_prompts_tokenize_subset_name])
#                                       for dataset_prompts_tokenize_subset_name in datasets_prompts_tokenized_subsets_sizes.keys()}
#                                       for model_path in models_paths} for dataset_path in datasets_paths}

# print(datasets_prompts_tokenized_subsets)

{'distilbert/distilbert-base-uncased': {'18K': Dataset({
    features: ['prompt', 'magnitude', 'input_ids', 'attention_mask'],
    num_rows: 17976
})}, 'FacebookAI/roberta-base': {'18K': Dataset({
    features: ['prompt', 'magnitude', 'input_ids', 'attention_mask'],
    num_rows: 17976
})}, 'google-bert/bert-base-uncased': {'18K': Dataset({
    features: ['prompt', 'magnitude', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 17976
})}, 'google-bert/bert-large-uncased': {'18K': Dataset({
    features: ['prompt', 'magnitude', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 17976
})}}
{'distilbert/distilbert-base-uncased': {'18K': 'Datasets/Earthquakes-180d-prompts-distilbert/distilbert-base-uncased-tokenized-18K.csv'}, 'FacebookAI/roberta-base': {'18K': 'Datasets/Earthquakes-180d-prompts-facebookai/roberta-base-tokenized-18K.csv'}, 'google-bert/bert-base-uncased': {'18K': 'Datasets/Earthquakes-180d-prompts-google-bert/bert-base-uncased-tokenized-18K.csv'},

In [26]:
### Create train, eval and test datasets ###

# Source dataset path -> model path -> subset name -> train / eval / test splits.
# Every subset is split in memory; nothing is loaded from or saved to disk.
datasets_prompts_tokenized_subsets_split = {
    source_path: {
        model: {
            subset_name: create_dataset(
                datasets_prompts_tokenized_subsets[source_path][model][subset_name],
                datasets_prompts_tokenized_subsets_paths[source_path][model][subset_name],
                create_train_eval_test_datasets,
                {},
                load_dataset = False,
                save_dataset = False,
            )
            for subset_name in datasets_prompts_tokenized_subsets_sizes[source_path]
        }
        for model in models_paths
    }
    for source_path in datasets_paths
}

# print(datasets_prompts_tokenized_subsets_split)

Start of creation of dataset (Datasets/Earthquakes-180d-prompts-distilbert/distilbert-base-uncased-tokenized-18K.parquet)
End of creation of dataset (Datasets/Earthquakes-180d-prompts-distilbert/distilbert-base-uncased-tokenized-18K.parquet)


In [27]:
### Models training methods ###

# Load the "mse" metric once at module level; `compute_mse` calls its `compute` method
mse = evaluate.load("mse")

def compute_mse(eval_prediction):
    """Score an evaluation pass with the module-level `mse` metric.

    Args:
        eval_prediction: A pair that unpacks into (predictions, labels).

    Returns:
        The result of `mse.compute` for those predictions and labels.
    """
    predictions, references = eval_prediction

    # When the predictions arrive as a tuple, only its first element is scored
    predictions = predictions[0] if isinstance(predictions, tuple) else predictions

    return mse.compute(predictions = predictions, references = references)

def train_model(model, model_name: str, tokenized_dataset):
    """Fine-tune `model` on the "train" split and evaluate it on the "eval" split.

    Args:
        model: Model handed to the `Trainer`.
        model_name: Name appended to "Models/" to form the output directory.
        tokenized_dataset: Mapping with at least the "train" and "eval" splits.

    Returns:
        The `Trainer` after training has finished, so the caller can keep using
        it (for example to evaluate another split or to save the model).
        Earlier versions returned nothing; callers that ignore the return value
        are unaffected.
    """
    training_arguments = TrainingArguments(
        output_dir = "Models/" + model_name,
        eval_strategy = "steps",
        logging_steps = logging_steps,
        save_strategy = save_strategy,
        save_steps = save_steps,
        save_total_limit = save_total_limit,
        per_device_train_batch_size = per_device_train_batch_size,
        per_device_eval_batch_size = per_device_eval_batch_size,
        auto_find_batch_size = True,
        num_train_epochs = num_train_epochs,
        seed = shuffle_seed,
        data_seed = shuffle_seed)

    trainer = Trainer(
        model = model,
        args = training_arguments,
        train_dataset = tokenized_dataset["train"],
        eval_dataset = tokenized_dataset["eval"],
        compute_metrics = compute_mse,
    )
    trainer.train()

    # Hand the trainer back instead of discarding it
    return trainer

In [30]:
### Train models ###

# Train every configured model on every split subset of every dataset.
# NOTE(review): the same `models[model_path]` object is passed to each run, so with
# more than one subset or dataset a later run presumably starts from the weights
# left by the previous one — confirm that this is intended.
for dataset_path in datasets_paths:
    for model_path in models_paths:
        subsets_split = datasets_prompts_tokenized_subsets_split[dataset_path][model_path]

        for dataset_prompts_tokenized_subset_split_name, dataset_prompts_tokenized_subset_split in subsets_split.items():
            # Run name: the model path, then the subset file path without its
            # first directory and without its extension
            subset_path = datasets_prompts_tokenized_subsets_paths[dataset_path][model_path][dataset_prompts_tokenized_subset_split_name]
            path_without_root = subset_path.split("/", maxsplit = 1)[1]
            path_without_extension = "".join(path_without_root.split(".")[:-1])
            model_name = model_path + "/" + path_without_extension

            print(f"### Training model ({model_name}) ###")
            train_model(models[model_path], model_name, dataset_prompts_tokenized_subset_split)

### Training model (distilbert/distilbert-base-uncased/Earthquakes-180d-prompts-distilbert/distilbert-base-uncased-tokenized-18K) ###


  0%|          | 0/4047 [00:00<?, ?it/s]

{'loss': 0.4883, 'grad_norm': 4.4373602867126465, 'learning_rate': 4.382258463059056e-05, 'epoch': 0.37}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.3410135805606842, 'eval_mse': 0.3410135798185388, 'eval_runtime': 98.1031, 'eval_samples_per_second': 36.645, 'eval_steps_per_second': 4.587, 'epoch': 0.37}
{'loss': 0.3226, 'grad_norm': 4.988509178161621, 'learning_rate': 3.7645169261181124e-05, 'epoch': 0.74}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.30101853609085083, 'eval_mse': 0.3010185140652373, 'eval_runtime': 118.857, 'eval_samples_per_second': 30.246, 'eval_steps_per_second': 3.786, 'epoch': 0.74}
{'loss': 0.2792, 'grad_norm': 3.1520936489105225, 'learning_rate': 3.1467753891771684e-05, 'epoch': 1.11}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.2462080419063568, 'eval_mse': 0.24620805719349464, 'eval_runtime': 123.3587, 'eval_samples_per_second': 29.143, 'eval_steps_per_second': 3.648, 'epoch': 1.11}
{'loss': 0.2508, 'grad_norm': 12.74351692199707, 'learning_rate': 2.5290338522362245e-05, 'epoch': 1.48}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.24487605690956116, 'eval_mse': 0.24487604719475026, 'eval_runtime': 123.7295, 'eval_samples_per_second': 29.055, 'eval_steps_per_second': 3.637, 'epoch': 1.48}
{'loss': 0.2388, 'grad_norm': 8.427264213562012, 'learning_rate': 1.9112923152952806e-05, 'epoch': 1.85}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.21749182045459747, 'eval_mse': 0.21749181982826066, 'eval_runtime': 118.6388, 'eval_samples_per_second': 30.302, 'eval_steps_per_second': 3.793, 'epoch': 1.85}
{'loss': 0.22, 'grad_norm': 2.6051442623138428, 'learning_rate': 1.2935507783543367e-05, 'epoch': 2.22}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.2152383178472519, 'eval_mse': 0.21523830849211972, 'eval_runtime': 114.7588, 'eval_samples_per_second': 31.327, 'eval_steps_per_second': 3.921, 'epoch': 2.22}
{'loss': 0.2164, 'grad_norm': 2.0292117595672607, 'learning_rate': 6.7580924141339264e-06, 'epoch': 2.59}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.21650002896785736, 'eval_mse': 0.21650005185066418, 'eval_runtime': 113.6435, 'eval_samples_per_second': 31.634, 'eval_steps_per_second': 3.96, 'epoch': 2.59}
{'loss': 0.2008, 'grad_norm': 1.9443855285644531, 'learning_rate': 5.806770447244872e-07, 'epoch': 2.97}


  0%|          | 0/450 [00:00<?, ?it/s]

{'eval_loss': 0.20560233294963837, 'eval_mse': 0.20560233078109713, 'eval_runtime': 123.1276, 'eval_samples_per_second': 29.197, 'eval_steps_per_second': 3.655, 'epoch': 2.97}
{'train_runtime': 3450.3417, 'train_samples_per_second': 9.377, 'train_steps_per_second': 1.173, 'train_loss': 0.27654232541690615, 'epoch': 3.0}


In [31]:
# Show the device that holds the first configured model
print(models[models_paths[0]].device)

cuda:0
