WANDB ISSUES <br>
https://github.com/wandb/wandb/issues/4534 <br>
To delete runs older than N hours. Note that N must be greater than the duration of the longest run, to make sure you do not delete an active run <br>

`wandb sync --clean-old-hours N` <br>

To avoid cleaning the online run folders, add the flag below. More details in the W&B documentation: https://docs.wandb.ai/ref/cli/wandb-sync <br>

`--no-include-online` <br>

To clean up the `.cache` directory, where artifact files are cached to boost performance <br>

`wandb artifact cache cleanup 10GB` <br>
https://docs.wandb.ai/ref/cli/wandb-artifact/wandb-artifact-cache/wandb-artifact-cache-cleanup <br>
https://community.wandb.ai/t/how-to-speed-up-batch-delete-of-files-artifacts/4251/6

WANDB directing files to /tmp via ipykernel <br>
https://scipy-ipython.readthedocs.io/en/latest/install/kernel_install.html <br>

To register an ipykernel, first activate the virtual environment, then install ipykernel, and then register the kernel <br>
`python -m pip install ipykernel` <br>
`python -m ipykernel install --name {MACHINE_NAME} --display-name "{DISPLAY_NAME}" --user`<br>
`python -m ipykernel install --name=mari_kernel --user` <br>

To remove kernel <br>
`jupyter kernelspec uninstall {KERNEL_NAME}`<br>

In [1]:
import os 
# print(os.environ["TMPDIR"])
# print(os.environ["TEMP"])
# print(os.environ["TMP"])

# print(os.environ["TRANSFORMERS_CACHE"])
# print(os.environ["HF_DATASETS_CACHE"])
# print(os.environ["HF_HOME"])

# KeyError

In [2]:
# Point the W&B-related environment variables at explicit directories.
# NOTE(review): presumably this keeps wandb from writing to /tmp when run from
# the ipykernel (see the notes above) - confirm each variable name against the
# W&B environment-variable documentation.
# Run directory, cache and config all share the evals directory on /gaueko0 ...
os.environ["WANDB_DIR"] = "/gaueko0/users/nmishra/multiling_fludetection/evals"
os.environ["WANDB_CACHE_DIR"] = "/gaueko0/users/nmishra/multiling_fludetection/evals"
os.environ["WANDB_CONFIG_DIR"] = "/gaueko0/users/nmishra/multiling_fludetection/evals"
# ... while the artifact and data variables share a directory on /gscratch3.
os.environ["WANDB_ARTIFACT_DIR"] = "/gscratch3/users/nmishra/gridsearch_revisedcateg"
os.environ["WANDB_ARTIFACT_LOCATION"] = "/gscratch3/users/nmishra/gridsearch_revisedcateg"
os.environ["WANDB_DATA_DIR"] = "/gscratch3/users/nmishra/gridsearch_revisedcateg"

In [3]:
# Echo the W&B directory variables set in the previous cell; each lookup raises
# KeyError if that cell has not been run.
# os.environ["WANDB_SILENT"] = "true" # /home/my_user/code
print(os.environ["WANDB_DIR"]) # /gaueko0/users/nmishra/multiling_fludetection
print(os.environ["WANDB_CACHE_DIR"])
print(os.environ["WANDB_CONFIG_DIR"])
print(os.environ["WANDB_ARTIFACT_DIR"])
print(os.environ["WANDB_ARTIFACT_LOCATION"])
print(os.environ["WANDB_DATA_DIR"])

# Order GPUs by PCI bus id and expose only device 0 to this process.
# NOTE(review): these must be set before CUDA is initialised to take effect - confirm.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0" #,5,6"
print(os.environ["CUDA_VISIBLE_DEVICES"])

/gaueko0/users/nmishra/multiling_fludetection/evals
/gaueko0/users/nmishra/multiling_fludetection/evals
/gaueko0/users/nmishra/multiling_fludetection/evals
/gscratch3/users/nmishra/gridsearch_revisedcateg
/gscratch3/users/nmishra/gridsearch_revisedcateg
/gscratch3/users/nmishra/gridsearch_revisedcateg
0


In [4]:
!nvidia-smi

Thu Mar 28 11:14:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:01:00.0 Off |                    0 |
| N/A   23C    P0              59W / 500W |      4MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          On  | 00000000:41:00.0 Off |  

In [5]:
# -*- coding: utf-8 -*-
"""
1. Read annotated multilingual ILI data using CustomDataset.
2. Convert encoded features and labels to dataset objects for integration with transformers model training.

""" 

import sys
import json
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from datasets import Dataset

    
class CustomDataset(object):
    """Annotated multilingual ILI tweets loaded from a CSV file.

    The CSV is read with pandas and must contain a ``tweet`` column and a
    ``final_annotation`` column, exposed as ``self.tweets`` and
    ``self.labels``; the full frame is kept in ``self.data``.

    Args:
        file_name: Path of the CSV file to read.
        savepath: Optional directory in which ``getsplitidx`` stores
            ``lang_split_idx.json``. The directory is created if it does not
            exist. When ``None`` the split indices are not written to disk.
    """

    def __init__(self, file_name, savepath=None):
        self._file_name = file_name

        # Always define the attribute: getsplitidx() tests it, and leaving it
        # unset for savepath=None made that test raise AttributeError.
        self._savepath = None
        if savepath is not None:
            self._savepath = Path(savepath)
            self._savepath.mkdir(parents=True, exist_ok=True)

        self.data = pd.read_csv(self._file_name)
        self.tweets = self.data['tweet']
        self.labels = self.data['final_annotation']

    def __len__(self):
        """Return the number of labelled tweets.

        Exits the interpreter (``SystemExit``) when ``tweets`` and ``labels``
        no longer have the same length.
        """
        n_tweets, n_labels = len(self.tweets), len(self.labels)
        if n_tweets != n_labels:
            # sys.exit() raises SystemExit on its own; no extra `raise` needed.
            sys.exit(f"Number of tweets({n_tweets}) and its labels({n_labels}) do not match.")
        return n_labels

    def __getitem__(self, idx):
        """Return the ``(tweet, label)`` pair at positional index ``idx``."""
        return self.tweets.iloc[idx], self.labels.iloc[idx]

    def getsplitidx(self, test_split=0.2, valid_split=None, group='lang', stratify_label='final_annotation'):
        """Build per-group stratified train/test(/valid) index lists.

        The data is grouped by ``group``; each group is split with
        ``train_test_split`` stratified on ``stratify_label``. No
        ``random_state`` is passed, so the splits differ between calls.

        Args:
            test_split: ``test_size`` used for the train/test split.
            valid_split: Optional ``test_size`` used to carve a validation
                set out of the training part. ``None`` skips that step.
            group: Column to group by before splitting.
            stratify_label: Column whose class distribution is preserved.

        Returns:
            dict mapping each group value to ``{'train_idx', 'test_idx'}``
            (plus ``'valid_idx'`` when ``valid_split`` is given); values are
            lists of DataFrame index labels. The dict is also dumped to
            ``lang_split_idx.json`` when a save path was configured.
        """
        lang_split_idx = {}
        for grp, grp_df in self.data.groupby(group):
            train, test = train_test_split(grp_df, test_size=test_split, stratify=grp_df[stratify_label])
            valid = None
            if valid_split is not None:
                train, valid = train_test_split(train, test_size=valid_split, stratify=train[stratify_label])

            print(f"\nDistribution of classes in train set in {grp}\n{train[stratify_label].value_counts()}")
            # The original string contained a stray "\{" that printed a backslash.
            print(f"Distribution of classes in test set in {grp}\n{test[stratify_label].value_counts()}")

            grp_split = {'train_idx': train.index.values.tolist(),
                         'test_idx': test.index.values.tolist()}
            if valid is not None:
                print(f"Distribution of classes in valid set in {grp}\n{valid[stratify_label].value_counts()}\n")
                grp_split['valid_idx'] = valid.index.values.tolist()
            lang_split_idx[grp] = grp_split

        if self._savepath is not None:
            with open(self._savepath.joinpath("lang_split_idx.json"), "w") as f:
                json.dump(lang_split_idx, f)
        return lang_split_idx

# https://huggingface.co/transformers/v3.5.1/custom_datasets.html    
class EncodedDataset(torch.utils.data.Dataset): # torch
    """Torch dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from feature name to an indexable collection
    and ``labels`` is an indexable collection of the same length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoded feature at ``idx`` plus a ``'labels'`` entry."""
        item = {}
        for name, values in self.encodings.items():
            item[name] = values[idx]
        item['labels'] = self.labels[idx]
        return item

    def __getdatasetsplits__(self, indices):
        """Wrap the items selected by ``indices`` in a ``datasets.Dataset``."""
        return Dataset.from_dict(self[indices])

    def splitdata(self, split_indices):
        """Return one ``datasets.Dataset`` per index collection, in order."""
        return [self.__getdatasetsplits__(index_group) for index_group in split_indices]

In [6]:
# -*- coding: utf-8 -*-
"""
1. Process data based on model checkpoint and configurations provided in final_configs.json 

"""

from collections import Counter

import numpy as np
from sklearn.preprocessing import LabelEncoder

from transformers import AutoTokenizer
from torch.utils.data import Subset, DataLoader

class DataProcessor(object):
    """Tokenize tweets and encode labels for one model checkpoint.

    Args:
        config: Mapping with at least ``'MODEL_CHECKPOINT'`` and ``'MAX_LEN'``.
            The mapping is stored by reference and ``'MAX_LEN'`` is lowered
            in place to 128 for checkpoints whose name contains ``'bernice'``.
        encoder: Object exposing ``fit_transform`` used to encode labels.
            Defaults to a fresh ``LabelEncoder`` per instance (a default built
            in the signature would be shared by every ``DataProcessor``).
        return_type_ids: Forwarded as ``return_token_type_ids`` to the tokenizer.
    """

    def __init__(self, config, encoder=None, return_type_ids=False):
        self._config = config
        self._encoder = LabelEncoder() if encoder is None else encoder
        self._return_type_ids = return_type_ids

        self.model_checkpoint = self._config['MODEL_CHECKPOINT']
        max_len = self._config['MAX_LEN']
        if max_len is not None and max_len > 128 and 'bernice' in self.model_checkpoint:
            self._config['MAX_LEN'] = 128
            print(f"Max length for {self.model_checkpoint} set to 128 by default")
        print(f"Final configurations for processing training + validation data\n{self._config}")

    def label_encoder(self, target):
        """Fit the configured encoder on ``target`` and return the encoded labels."""
        return self._encoder.fit_transform(target)

    def tokenizer(self):
        """Load the tokenizer that belongs to ``self.model_checkpoint``."""
        # statistical tokenizer # subwords, chunks of words
        return AutoTokenizer.from_pretrained(
            self.model_checkpoint,
            # use_fast=False selects the slow (pure Python) tokenizer rather
            # than the Rust-backed fast implementation.
            use_fast=False,
            # max_length is passed when encoding, not when instantiating the tokenizer
        )

    def feature_encoder(self, features):
        """Tokenize ``features`` into padded, truncated PyTorch tensors.

        ``features`` is expected to be a pandas Series (it is cast with
        ``astype(str)``); sequences are truncated to ``config['MAX_LEN']``.
        """
        tokenizer = self.tokenizer()

        feature_encodings = tokenizer.batch_encode_plus(
            features.astype(str).values.tolist(),
            padding=True,
            truncation=True,
            max_length=self._config['MAX_LEN'],
            # is_split_into_words=True, # added for multilingual versions refer 4624.err
            # return_attention_mask=True,
            return_token_type_ids=self._return_type_ids,
            return_tensors='pt',
        )
        print(f"Dimensions of encoded features: {feature_encodings['input_ids'].shape}")
        print(f"Encoding contains: {list(feature_encodings.keys())}")
        return feature_encodings

    def encoded_data(self, features, labels):
        """Return ``(encoded_features, encoded_labels)`` for the given data.

        Raises:
            ValueError: If the number of encoded examples and labels differ.
                Previously this case only printed a message and returned
                ``None``, which made callers fail later while unpacking.
        """
        encoded_features = self.feature_encoder(features)
        encoded_labels = self.label_encoder(labels)
        n_features = encoded_features['input_ids'].shape[0]
        n_labels = encoded_labels.shape[0]
        if n_features != n_labels:
            raise ValueError(
                "encoded features and labels do not have same length "
                f"({n_features} != {n_labels})"
            )
        return encoded_features, encoded_labels

In [7]:
import time
import shutil

import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import EarlyStoppingCallback, TrainerCallback

import evaluate
# from evaluater import evaluation_display

import wandb

def compute_metrics(eval_pred, eval_metric="f1"):
    """Score predictions with a macro-averaged ``evaluate`` metric.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the argmax over the last axis of the logits. The metric named by
    ``eval_metric`` is loaded on every call and its result dict is returned.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    scorer = evaluate.load(eval_metric)
    return scorer.compute(
        predictions=predicted_classes,
        references=references,
        average="macro",
    )

class ModelTrainer(object):
    """Fine-tune a sequence-classification checkpoint inside a W&B run.

    Args:
        model_checkpoint: Hub id or path of the pretrained model; the part
            after the last ``/`` becomes ``self.model_name``.
        train_dataset: Dataset handed to the ``Trainer`` for training.
        valid_dataset: Dataset handed to the ``Trainer`` for evaluation.
        test_dataset: Held-out dataset; stored but not used by this class.
        tokenizer: Tokenizer handed to the ``Trainer``.
        savepath: ``pathlib.Path``; ``savepath/models`` is created and
            receives checkpoints, ``savepath/logs`` receives the logs.
        cachepath: Cache directory used when loading the pretrained model.
        target_names: Optional sequence of class names; its length sets
            ``num_labels``. When ``None`` the checkpoint's own label count
            is left untouched.
        lang_to_train: Tag included in the output directory names.
    """

    def __init__(self, model_checkpoint, train_dataset, valid_dataset, test_dataset,
                 tokenizer, savepath, cachepath, target_names=None, lang_to_train='all'):
        self._model_checkpoint = model_checkpoint
        self.model_name = self._model_checkpoint.split("/")[-1]

        self._train_dataset = train_dataset
        self._valid_dataset = valid_dataset
        # Previously accepted and silently dropped; kept for later evaluation.
        self._test_dataset = test_dataset

        self._tokenizer = tokenizer

        self._cachepath = cachepath
        self._savepath = savepath
        self.modelpath = self._savepath.joinpath('models')
        self.modelpath.mkdir(parents=True, exist_ok=True)

        self._target_names = target_names
        # len(None) used to crash for the documented default target_names=None.
        self.num_labels = len(target_names) if target_names is not None else None

        self._lang_to_train = lang_to_train

    def get_model(self):
        """Load a fresh classification model; used as the Trainer's ``model_init``."""
        model_kwargs = {'cache_dir': self._cachepath}
        if self.num_labels is not None:
            model_kwargs['num_labels'] = self.num_labels
        return AutoModelForSequenceClassification.from_pretrained(self._model_checkpoint, **model_kwargs)

    def train_eval(self, config=None, metric_name="f1"):
        """Train and evaluate once with the hyperparameters of the current W&B run.

        ``wandb.config`` must provide ``learning_rate``, ``batch_size`` and
        ``epochs``. Training and evaluation happen inside the
        ``wandb.init`` context so the run stays open until they finish
        (the original code left the context before the Trainer was built).
        The checkpoint directory is deleted afterwards to free disk space.

        Args:
            config: Default configuration forwarded to ``wandb.init``.
            metric_name: Metric used to select the best model.

        Returns:
            The metrics dict returned by ``Trainer.evaluate()``.
        """
        run_dir_name = f"{self.model_name}-{self._lang_to_train}-finetuned"
        out_dir = self.modelpath.joinpath(run_dir_name)
        log_dir = self.modelpath.parent.joinpath('logs').joinpath(run_dir_name)
        log_dir.mkdir(parents=True, exist_ok=True)

        with wandb.init(config=config):
            config = wandb.config
            print(f"\nTraining {self.model_name} using with configurations:\n{config}")

            # https://discuss.huggingface.co/t/cuda-out-of-memory-when-using-trainer-with-compute-metrics/2941/22
            args = TrainingArguments(
                save_total_limit=2,
                output_dir=str(out_dir),
                overwrite_output_dir=True,
                logging_dir=str(log_dir),

                learning_rate=config.learning_rate,
                per_device_train_batch_size=config.batch_size,
                per_device_eval_batch_size=config.batch_size,
                num_train_epochs=config.epochs,

                evaluation_strategy="epoch",
                save_strategy="epoch",
                logging_strategy='epoch',

                eval_accumulation_steps=1,
                metric_for_best_model=metric_name,
                push_to_hub=False,
                load_best_model_at_end=True,
            )

            # https://huggingface.co/docs/transformers/main_classes/trainer#trainer
            trainer = Trainer(
                model_init=self.get_model,
                args=args,
                train_dataset=self._train_dataset,
                eval_dataset=self._valid_dataset,
                tokenizer=self._tokenizer,
                compute_metrics=compute_metrics,
                # stop after 3 evaluations without improvement (threshold 0.0)
                callbacks=[EarlyStoppingCallback(3, 0.0)],
            )

            torch.cuda.empty_cache()

            trainer.train()
            metrics = trainer.evaluate()

        print(f"free space by deleting: {out_dir}")
        shutil.rmtree(out_dir, ignore_errors=True)
        return metrics

# mytrainer = trainer.train_eval()

In [10]:
# -*- coding: utf-8 -*-
"""
The script for wrapper function to run multilingual_ILI_classification_wandb.py
"""

import time
import shutil
import logging as log
import wandb 

# from reader import EncodedDataset
# from preprocessor import DataProcessor
# from trainer_wandb import ModelTrainer

def getsplit(lang_split_idx, key='train_idx'):
    """Flatten the ``key`` index lists of every language into a single list.

    ``lang_split_idx`` maps a language to a dict of index lists; the lists
    stored under ``key`` are concatenated in the mapping's iteration order.
    """
    per_language = []
    for splits in lang_split_idx.values():
        per_language.append(splits[key])

    merged = []
    for indices in per_language:
        merged.extend(indices)
    return merged
    
def mlm_evaluation(lang_split_idx, tweets, params, sweep_config, split_path, dirname, lang_to_train, cache_path, 
                   hyperparams=None, save=False, n_searches=5):
    """Run a W&B sweep for every model checkpoint on the selected languages.

    Args:
        lang_split_idx: Mapping language -> dict with ``'train_idx'``,
            ``'valid_idx'`` and ``'test_idx'`` index lists.
        tweets: Dataset object exposing a ``data`` DataFrame with ``tweet``
            and ``final_annotation`` columns.
        params: Dict with ``'MODEL_CHECKPOINT'`` (list of checkpoints),
            ``'MAX_LEN'`` and ``'TARGET_NAMES'``.
        sweep_config: W&B sweep configuration; must also carry ``'entity'``
            and ``'project'``.
        split_path: ``pathlib.Path`` used as the trainer's save path and as
            the folder for the saved test split.
        dirname: Prefix of the saved test-split file name.
        lang_to_train: Language tag used in file names and messages.
        cache_path: Cache directory forwarded to the trainer.
        hyperparams: Unused; kept for interface compatibility.
        save: When true, write the test rows to ``split_path`` as CSV.
        n_searches: Number of sweep runs executed per checkpoint.

    Failures while training one checkpoint are logged and the loop moves on
    to the next checkpoint.
    """
    # get train, valid and test split for selected languages
    train_idx = getsplit(lang_split_idx, key='train_idx')
    valid_idx = getsplit(lang_split_idx, key='valid_idx')
    test_idx = getsplit(lang_split_idx, key='test_idx')
    print(f"Distribution of data in train, validation and test splits: {len(train_idx)}, {len(valid_idx)}, {len(test_idx)}")

    # save test set for evaluation later
    if save:
        test_df = tweets.data.iloc[test_idx]
        test_df.rename_axis('index').to_csv(split_path.joinpath(f"{dirname}_{lang_to_train}.csv"))

    for model_checkpoint in params['MODEL_CHECKPOINT']:
        # Derive the display name from the checkpoint itself. Reading it from
        # `trainer` after the try-block raised NameError (or reported the
        # previous checkpoint) whenever the trainer could not be constructed.
        model_name = model_checkpoint.split("/")[-1]
        model_params = {'MODEL_CHECKPOINT': model_checkpoint,
                        'MAX_LEN': params['MAX_LEN'],
                        'TARGET_NAMES': params['TARGET_NAMES'],
                        }

        # convert data to encoded features and labels
        start_time = time.time()
        processor = DataProcessor(model_params, return_type_ids=True)
        feature_encodings, label_encodings = processor.encoded_data(tweets.data['tweet'], tweets.data['final_annotation'])

        # obtain encoded train, valid and test data as dataset object
        encoded_dataset = EncodedDataset(feature_encodings, label_encodings)
        train_dataset, valid_dataset, test_dataset = encoded_dataset.splitdata([train_idx, valid_idx, test_idx])

        try:
            # one sweep per checkpoint; each agent run calls trainer.train_eval
            trainer = ModelTrainer(model_checkpoint, train_dataset, valid_dataset, test_dataset, processor.tokenizer(),
                                   split_path, cache_path, model_params['TARGET_NAMES'], lang_to_train)
            sweep_id = wandb.sweep(sweep=sweep_config, entity=sweep_config['entity'], project=sweep_config['project'])
            wandb.agent(sweep_id, trainer.train_eval, count=n_searches)
            wandb.finish()
            print(f"COMPLETED - {model_name} trained on {lang_to_train} languages")

            # delete wandb folder related to this model checkpoint
            wandb_dir = split_path.parent.joinpath('wandb')
            print(f"free space by deleting: {wandb_dir}")
            shutil.rmtree(wandb_dir, ignore_errors=True)

        except Exception as error:
            # Best effort: report the failure and continue with the next checkpoint.
            print("An error occurred:", error)
            log.exception('Failed')

        execution_time = time.time() - start_time
        print(f"Total execution time to finetune {model_name} on {lang_to_train} language(s) is {execution_time}\n")

In [11]:
# -*- coding: utf-8 -*-
"""
The script to run to perform gridsearch using wandb
"""

import os
import time
from pathlib import Path
import argparse
import json
import yaml
import logging as log

from dotenv import load_dotenv
# import pprint

import numpy as np
import torch

# from reader import CustomDataset, EncodedDataset
# from preprocessor import DataProcessor
# from trainer import hyperparams, ModelTrainer
# from evaluater import PredictionEvaluater
# from wandb_wrapper import mlm_evaluation

# args
# Input files and output locations for the grid search.
DATA_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/data/all/alldata_revised.csv")
PARAMS_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/params.json")
CONFIG_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/sweep_config.yml")
OUT_PATH = Path("/gaueko0/users/nmishra/multiling_fludetection/evals/gridsearch_revisedcateg")
WANDB_DATA_PATH = Path("/gscratch3/users/nmishra/gridsearch_revisedcateg")

# Load variables from a .env file into the environment.
# NOTE(review): presumably this supplies the W&B credentials since wandb.login() is disabled - confirm.
load_dotenv()
# wandb.login()

# os.environ["TRANSFORMERS_CACHE"] = str(OUT_PATH.joinpath(".cache"))
# os.environ["HF_DATASETS_CACHE"] = str(OUT_PATH.joinpath(".cache"))
# os.environ['HF_HOME'] = str(OUT_PATH.joinpath(".cache"))
# os.environ["WANDB_CACHE_DIR"] = str(OUT_PATH.joinpath(".cache"))

# ensure outpath exists if not create
OUT_PATH.mkdir(parents=True, exist_ok=True)

# WANDB_DATA_PATH is rebound from a Path to a str here, with OUT_PATH's stem
# appended as a sub-directory. Re-running this cell without re-running the
# constant definitions above fails, because a str has no .joinpath().
WANDB_DATA_PATH = str(WANDB_DATA_PATH.joinpath(OUT_PATH.stem))
# Run directory, cache and config share OUT_PATH; artifacts and data share WANDB_DATA_PATH.
os.environ["WANDB_DIR"] = os.environ["WANDB_CACHE_DIR"] = os.environ["WANDB_CONFIG_DIR"] = str(OUT_PATH)
os.environ["WANDB_ARTIFACT_DIR"] = os.environ["WANDB_ARTIFACT_LOCATION"] = os.environ["WANDB_DATA_DIR"] = WANDB_DATA_PATH 
print(f"wandb artifacts and data saved in \n{WANDB_DATA_PATH } \nand rest in {OUT_PATH}\n")

# read data
# The data file's own directory is passed as the dataset's save path.
data_path = DATA_FILE.parent
tweets = CustomDataset(DATA_FILE, data_path)
print(f"Number of tweets in data: {tweets.__len__()}")
print(f"Distribution of classes in all data {tweets.labels.value_counts()}")

# Model/training parameters; the JSON encodes "no maximum length" as the string 'None'.
with open (PARAMS_FILE, "r") as f:
    params = json.load(f)
params['MAX_LEN'] = None if params['MAX_LEN']=='None' else params['MAX_LEN']
# Class names are the sorted unique label values found in the data.
params['TARGET_NAMES'] = np.unique(tweets.labels).tolist()
print(f"Configuration setup: {params}\n")  

# W&B sweep definition (search space under the 'parameters' key).
with open (CONFIG_FILE, "r") as f:
    sweep_config = yaml.safe_load(f)
print(f"Configuration setup for gridsearch: {sweep_config['parameters']}\n")

wandb artifacts and data saved in 
/gscratch3/users/nmishra/gridsearch_revisedcateg/gridsearch_revisedcateg 
and rest in /gaueko0/users/nmishra/multiling_fludetection/evals/gridsearch_revisedcateg

Number of tweets in data: 4284
Distribution of classes in all data final_annotation
3. Not Related to ILI or COVID-19 Infection    2587
1. Likely ILI infection                        1697
Name: count, dtype: int64
Configuration setup: {'MODEL_CHECKPOINT': ['jhu-clsp/bernice', 'cardiffnlp/twitter-xlm-roberta-base', 'bert-base-multilingual-uncased', 'microsoft/mdeberta-v3-base', 'xlm-roberta-base'], 'LANG': 'all', 'BATCH_SIZE': 16, 'LEARNING_RATE': 3.74e-05, 'EPOCHS': 6, 'MAX_LEN': 128, 'SPLITS': [[0.6, 0.2, 0.2]], 'TARGET_NAMES': ['1. Likely ILI infection', '3. Not Related to ILI or COVID-19 Infection']}

Configuration setup for gridsearch: {'learning_rate': {'distribution': 'log_uniform_values', 'min': 1e-05, 'max': 0.1}, 'batch_size': {'values': [8, 16, 32]}, 'epochs': {'value': 1}}



In [13]:
# For every configured split: load its saved per-language indices, run the
# sweep on the selected languages together, then once per individual language.
for split in params['SPLITS']:

    # cache dir
    # Sibling of OUT_PATH; it does not depend on `split`, so it is the same on every iteration.
    cache_path = OUT_PATH.parent.joinpath('.cache')
    cache_path.mkdir(parents=True, exist_ok=True)
    print(f"Cache in {cache_path}")

    # data split index info
    # Directory name is "testset" followed by the split fractions joined with "_".
    dirname = f"testset{'_'.join([str(i) for i in split])}"
    split_path = OUT_PATH.joinpath(dirname)
    print(f"Reading data split index from: {split_path}")
    # split_idx.json must already exist in split_path; this cell does not create it.
    with open(split_path.joinpath('split_idx.json'), 'r') as f:
        split_idx = json.load(f) 
        
    # determine languages for which to get split index
    # params['LANG'] is either 'all' or a comma-separated list of language keys.
    if params['LANG']=='all':
        languages = [i for i in split_idx]
    else:
        languages = [i for i in split_idx if i in params['LANG'].split(',')]
    
    # train on all languages and then on each language
    lang_split_idx = {i:split_idx[i] for i in languages}
    print(f"Training data used for {params['LANG']} languages")
    mlm_evaluation(lang_split_idx, tweets, params, sweep_config, split_path, dirname, params['LANG'], cache_path)
    
    # NOTE(review): when params['LANG'] names a single language, this loop
    # repeats the sweep above on the same subset - confirm that is intended.
    for lang_to_train in languages:
        #if lang_to_train=='fr' or lang_to_train=='it':
        print(f"\nTraining data used for {lang_to_train} language")
        lang_split_idx = {}
        lang_split_idx[lang_to_train] = split_idx[lang_to_train]
        mlm_evaluation(lang_split_idx, tweets, params, sweep_config, split_path, dirname, lang_to_train, cache_path)

In [9]:
# Report the visible CUDA device(s) and the memory PyTorch currently holds on
# device 0; nothing is printed (and `device` is not defined) without CUDA.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(torch.cuda.get_device_name())
    print(torch.cuda.device_count())
    # Byte counts are converted to GiB (1024**3) and rounded to one decimal.
    print('Allocated Memory:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached Memory:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Tesla V100-PCIE-32GB
1
Allocated Memory: 6.2 GB
Cached Memory:    9.2 GB
The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.
