# Table of contents
1. [Run Setting](#runsettings)
2. [Reader](#reader)
3. [Processor](#processor)
4. [Trainer](#trainer)
5. [Evaluater](#evaluater)
6. [Wrapper](#wrapper)
7. [Load Data](#loaddata)
8. [Do Training](#dotrianing)

## Configurations to run the script
<a name="runsettings"></a>

- THIS RUN IS A RESULT OF TRANING USING BEST COMBINATION OF HYPERPARAMETERS
- THE DATA HAS BEEN ENCODED BY TAKING THE ENTIRE DATA FOR ALL OR PER LANG (TRAIN+VALID+TEST)
- HYPERPARAMETERS ARE OBTAINED BY TRANING WITH DATA ENCODED THROUGH THE ENTIRE DATA FOR ALL LANGUAGES
- ALL & EACH
- RESULTS in 8927.out

In [1]:
!wandb disabled

W&B disabled.


In [2]:
import os
# os.environ["WANDB_SILENT"] = "true"

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
print(os.environ["CUDA_VISIBLE_DEVICES"])

# os.environ["TOKENIZERS_PARALLELISM"] = "false" 

0


In [3]:
! CUDA_LAUNCH_BLOCKING=1

CUDA_LAUNCH_BLOCKING=1: Command not found.


In [4]:
!nvidia-smi

Fri Mar  8 14:44:57 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.98                 Driver Version: 535.98       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A30                     Off | 00000000:3B:00.0 Off |                    0 |
| N/A   25C    P0              24W / 165W |     18MiB / 24576MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A30                     Off | 00000000:D8:00.0 Off |  

## Reader 
<a name="reader"></a>


In [5]:
# -*- coding: utf-8 -*-
"""
1. Read annotated multilingual ILI data using CustomDataset.
2. Convert encoded features and labels to dataset objects for integration with transformers model training.

""" 

import sys
import json
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from datasets import Dataset

    
class CustomDataset(object):
    def __init__(self, file_name, savepath=None):
        
        self._file_name = file_name
        if savepath is not None:
            self._savepath = Path(savepath)
            self._savepath.mkdir(parents=True, exist_ok=True)
        
        self.data =  pd.read_csv(self._file_name)
        self.tweets = self.data['tweet']
        self.labels = self.data['final_annotation']

    def __len__(self):    
        if len(self.tweets) != len(self.labels):
            raise sys.exit(f"Number of tweets({len(self.tweets)}) and its labels({len(self.labels)}) do not match.")
        else:
            return len(self.labels)
        
    def __getitem__(self, idx):
        tweet = self.tweets.iloc[idx] 
        label = self.labels.iloc[idx] 
        return tweet, label
    
    def getsplitidx(self, test_split=0.2, valid_split=None, group='lang', stratify_label='final_annotation'):
        
        # group day by language and then perform stratified split by categories and save indices as json
        lang_split_idx = {}
        for grp, grp_df in self.data.groupby(group): 
            train, test = train_test_split(grp_df, test_size=test_split, stratify=grp_df[stratify_label])
            if valid_split is not None:
                train, valid = train_test_split(train, test_size=valid_split, stratify=train[stratify_label])
        
            print(f"\nDistribution of classes in train set in {grp}\n{train[stratify_label].value_counts()}")
            print(f"Distribution of classes in test set in {grp}\n\{test[stratify_label].value_counts()}")
            lang_split_idx[grp] = {'train_idx':train.index.values.tolist(), 
                                   'test_idx':test.index.values.tolist()
                                  }
            if valid_split is not None:
                print(f"Distribution of classes in valid set in {grp}\n{valid[stratify_label].value_counts()}\n")
                lang_split_idx[grp] = {'train_idx':train.index.values.tolist(), 
                                       'test_idx':test.index.values.tolist(), 
                                       'valid_idx':valid.index.values.tolist()
                                      }
                
        if self._savepath is not None:
            with open(self._savepath.joinpath("lang_split_idx.json"), "w")  as f:
                json.dump(lang_split_idx, f)
        return lang_split_idx

# https://huggingface.co/transformers/v3.5.1/custom_datasets.html    
class EncodedDataset(torch.utils.data.Dataset): # torch
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # item = {key: torch.tensor(val[idx]).clone().detach() for key, val in self.encodings.items()}
        # item['labels'] = torch.tensor(self.labels[idx]).clone().detach()
        
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)
    
    def __getdatasetsplits__(self, indices):
        split = self.__getitem__(indices)
        dataset = Dataset.from_dict(split)
        return dataset
    
    def splitdata(self, split_indices):
        allsplits = []
        for i in split_indices:
            allsplits.append(self.__getdatasetsplits__(i))
        return allsplits

## Preprocessor 
<a name="processor"></a>

In [6]:
# 2-*- coding: utf-8 -*-
"""
1. Process data based on model checkpoint and configurations provided in final_configs.json 

"""

from collections import Counter

import numpy as np
from sklearn.preprocessing import LabelEncoder

from transformers import AutoTokenizer
from torch.utils.data import Subset, DataLoader

class DataProcessor(object):
    
    def __init__(self, config, encoder=LabelEncoder(), return_type_ids=False):
        
        self._config = config
        self._encoder = encoder
        self._return_type_ids = return_type_ids
        
        self.model_checkpoint = self._config['MODEL_CHECKPOINT']
        if ((self._config['MAX_LEN'] is not None and self._config['MAX_LEN']>128) and ('bernice' in self.model_checkpoint)):
            self._config['MAX_LEN'] = 128
            print(f"Max length for {self.model_checkpoint} set to 128 by default")
        print(f"\nFinal configurations for processing training + validation data\n{self._config}")

    def label_encoder(self, target):
        le = self._encoder
        return le.fit_transform(target)

    def tokenizer(self):    
        # statistical tokenizer # subwords, chunks of words 
        return AutoTokenizer.from_pretrained(self.model_checkpoint, 
                                             use_fast = False,    # use one of the fast tokenizers (backed by Rust), available for almost all models
                                             # max_length=self._config['MAX_LEN'] # pass max length only when encoding not when instantiating the tokenizer
                                             )
    
    def feature_encoder(self, features):
        tokenizer = self.tokenizer()

        feature_encodings = tokenizer.batch_encode_plus(
            features.astype(str).values.tolist(), 
            padding=True, 
            truncation=True, 
            max_length=self._config['MAX_LEN'],
            # is_split_into_words=True, # added for multilingual versions refer 4624.err
            # return_attention_mask=True,
            return_token_type_ids=self._return_type_ids, 
            return_tensors='pt',
            )
        print(f"Dimensions of encoded features: {feature_encodings['input_ids'].shape}")
        print(f"Encoding contains: {[i for i in feature_encodings.keys()]}")
        return feature_encodings

    def encoded_data(self, features, labels):
        encoded_features = self.feature_encoder(features)
        encoded_labels = self.label_encoder(labels)
        if encoded_features['input_ids'].shape[0] == encoded_labels.shape[0]:
            return encoded_features, encoded_labels
        else:
            print("encoded features and labels do not have same length")

## Trainer 
<a name="trainer"></a>

In [7]:
# 3-*- coding: utf-8 -*-
"""
1. Train model once the best hyperparameters from final_configs.json that were identified using classification_wandb.py
2. Get predictions from the trained model with boolen flag

"""

import time
import sys
import shutil
from types import SimpleNamespace

import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import EarlyStoppingCallback, TrainerCallback

import evaluate
# from evaluater import evaluation_display

hyperparams = ['MODEL_CHECKPOINT','BATCH_SIZE','LEARNING_RATE','EPOCHS','MAX_LEN']

# class PrintClassificationCallback(TrainerCallback):
#     def on_evaluate(self, args, state, control, logs=None, **kwargs):
#         print("Called after evaluation phase")

class ModelTrainer(object):

    def __init__(self, run_config, train_dataset, valid_dataset, test_dataset, 
                 tokenizer, savepath, cachepath, lang_to_train='all'):     

        self._run_config = run_config
        if not all(k in self._run_config.keys() for k in hyperparams):
            sys.exit(f"provide all required hyperparams: {hyperparams}")
            
        self.model_checkpoint = self._run_config['MODEL_CHECKPOINT']
        self.model_name = self.model_checkpoint.split('/')[-1]
    
        self._train_dataset = train_dataset
        self._valid_dataset = valid_dataset
        self._test_dataset = test_dataset
        self._train_dataset.cleanup_cache_files()

        self._tokenizer = tokenizer
        
        self._cachepath = cachepath
        self.modelpath = savepath.joinpath('models')
        self.modelpath.mkdir(parents=True, exist_ok=True)

        self._lang_to_train = lang_to_train
        self.num_labels = len(self._run_config['TARGET_NAMES'])
        print(f"{self.num_labels} classes in {self._lang_to_train} language")

    def get_model(self):
        model = AutoModelForSequenceClassification.from_pretrained(self.model_checkpoint,
                                                                   num_labels = self.num_labels,
                                                                   cache_dir = self._cachepath,
                                                                #    output_attentions=False,
                                                                #    output_hidden_states=False,
                                                                #    ignore_mismatched_sizes=True,
                                                                )
        # print(model)
        return model
        
    def compute_metrics(self, eval_pred, eval_metric="f1"):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        # evaluation_display(labels, predictions, self.num_labels, self._target_names) # print only for now
        metric = evaluate.load(eval_metric)
        return metric.compute(predictions=predictions, references=labels, average="macro")

    def train_eval(self, get_pred=False, metric_name="f1", hyperparam_search=False):
        
        out_dir = self.modelpath.joinpath(f"{self.model_name}-{self._lang_to_train}-finetuned")
        print(f"Model to be saved in {out_dir}")
        log_dir = self.modelpath.parent.joinpath('logs').joinpath(f"{self.model_name}-{self._lang_to_train}-finetuned")
        log_dir.mkdir(parents=True, exist_ok=True)
        
        config = SimpleNamespace(**{i.lower():j for i,j in self._run_config.items() if i in hyperparams})
        print(f"\nFinal configurations for training the model:\n{config}")
        
        # attributes to customize the training
        args = TrainingArguments(
            save_total_limit=2,
            output_dir=str(out_dir),
            overwrite_output_dir = True,
            
            learning_rate = config.learning_rate,
            per_device_train_batch_size = config.batch_size,
            per_device_eval_batch_size = config.batch_size,
            num_train_epochs = config.epochs,
            # weight_decay = config.weight_decay,
            
            evaluation_strategy = "epoch",
            save_strategy = "epoch",   
            logging_strategy = 'epoch',
            
            logging_steps= 1,
            eval_accumulation_steps = 1,
            
            metric_for_best_model = metric_name,
            load_best_model_at_end = True,
            
            push_to_hub = False, # push the model to the Hub regularly during training
            # report_to='wandb',  # turn on wandb logging
            )

        # https://huggingface.co/docs/transformers/main_classes/trainer#trainer
        # https://github.com/huggingface/transformers/blob/v4.35.2/src/transformers/trainer.py#L231
        trainer = Trainer(
            model_init = self.get_model,
            args = args,
            train_dataset = self._train_dataset,
            eval_dataset = self._valid_dataset,
            tokenizer = self._tokenizer,
            compute_metrics = self.compute_metrics,
            callbacks = [EarlyStoppingCallback(3, 0.0)]
            )
        
        torch.cuda.empty_cache()
        # print(torch.cuda.memory_summary(device=None, abbreviated=True))
        
        # try later setup hyperparam here instead of wandb
        # # https://huggingface.co/docs/transformers/hpo_train
        # if hyperparam_search:
        #     best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")
        #     print(f"best run for {self._model_checkpoint} is {best_run.hyperparameters.items()}")
        #     for n, v in best_run.hyperparameters.items():
        #         setattr(trainer.args, n, v)

        trainer.train()    
        print(f"free space by deleting: {out_dir}")
        shutil.rmtree(out_dir, ignore_errors=True)
        
        if get_pred:
            print(f"\nGetting Predictions on Test dataset")
            logits, labels, metrics = trainer.predict(self._test_dataset)
            predictions = np.argmax(logits, axis=-1)
            return trainer, (labels, predictions)    
        else:
            return trainer

## Evaluater 
<a name="evaluater"></a>

In [21]:
## 4-*- coding: utf-8 -*-
"""
1. Evaluate the model performance per language.
2. Evaluate performance per category during each epoch.

"""

import numpy as np

import evaluate
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import multilabel_confusion_matrix, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt

def evaluation_display(y, y_pred, labels_map=None, plot=False):
    
    if labels_map is not None:
        labels_idx = [k for k,v in labels_map.items() if k in y.unique()]
        labels_name = [v for k,v in labels_map.items() if k in y.unique()]
    else:
        labels_idx = None
        labels_name = None
        
    f1 = f1_score(y, y_pred, labels=labels_idx, average='macro')
    acc = accuracy_score(y, y_pred)
    class_report = classification_report(y, y_pred, labels=labels_idx, target_names=labels_name)

    # y_score = pred probabilityes
    # fpr, tpr, _ = roc_curve(y_test, y_score)
    # roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
    # cm = multilabel_confusion_matrix(y, y_pred)
    cm = confusion_matrix(y, y_pred, normalize='true')
    
    if plot:
        print(f"f1: {f1}\nacc: {acc}\n{class_report}")
        fig, ax = plt.subplots(figsize=(4,7))
        cm_disp = ConfusionMatrixDisplay(cm).plot(ax=ax, cmap='Blues', colorbar=False)
        c_bar = fig.add_axes([ax.get_position().x1+0.01, ax.get_position().y0, 0.05, ax.get_position().height])
        plt.colorbar(cm_disp.im_,  cax=c_bar)
        plt.show()
    else:
        print(f"f1: {f1}\nacc: {acc}\n{class_report}\n{cm}")
    
class PredictionEvaluater(object):

    def __init__(self, prediction_set, target_names=None, savepath=None, model_name=None):     

        self._labels, self._predictions = prediction_set
        
        self._target_names = target_names
        self.num_labels = len(target_names)
        self.label_map = {k:v for k,v in zip(range(self.num_labels), self._target_names)}
        print(f"Number of labels in target_names is {self.num_labels}")

        self.savepath = savepath.joinpath('predictions')
        self.savepath.mkdir(parents=True, exist_ok=True)
        
        self._model_name = model_name if model_name is not None else "model"
        
    def compute_metrics(self, eval_pred, eval_metric="accuracy"):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        metric = evaluate.load(eval_metric)
        return metric.compute(predictions=preds, references=labels)
    
    def append_predictions(self, test_df):
        # append prediction and encoded labels to original test data
        df = test_df.copy()
        df[f"{self._model_name}_prediction"] = self._predictions
        df["annotation"] = self._labels
        if self.savepath is not None:
            sub_df = df[[f"{self._model_name}_prediction","annotation"]]
            sub_df.rename_axis('index').to_csv(self.savepath.joinpath(f"{self._model_name}_predictions.csv"))
        return df
        
    def evaluation_report(self, test_df, lang_eval=True):
        df = self.append_predictions(test_df)
        evaluation_display(df["annotation"], df[f"{self._model_name}_prediction"], self.label_map)
        # if language evaluation is required
        if lang_eval:
            for lang, lang_df in df.groupby("lang"):
                print(f"\nEvaluation for language: {lang}")
                evaluation_display(lang_df["annotation"], lang_df[f"{self._model_name}_prediction"], self.label_map)

## Wrapper 
<a name="wrapper"></a>

In [22]:
# -*- coding: utf-8 -*-
"""
The script for wrapper function to run multilingual_ILI_classification.py
"""

import time
import sys
import logging as log

import pandas as pd

# from reader import EncodedDataset
# from preprocessor import DataProcessor
# from trainer import hyperparams, ModelTrainer
# from evaluater import PredictionEvaluater

def getsplitidx(lang_split_idx, key='train_idx'):
    idx_list = [v[key] for k,v in lang_split_idx.items()]
    # print(len(idx_list))
    idx_list = [i for eachlist in idx_list for i in eachlist]
    return idx_list

def getsplit(lang_split_idx, tweets, savepath, save=False):
    
    # get train, valid and test split for selected languages
    train_idx = getsplitidx(lang_split_idx, key='train_idx')
    valid_idx = getsplitidx(lang_split_idx, key='valid_idx')
    test_idx = getsplitidx(lang_split_idx, key='test_idx')
    print(f"Distribution of data in train, validation and test splits: {len(train_idx)}, {len(valid_idx)}, {len(test_idx)}")
    
    # save test set for evaluation later
    train_df = tweets.data.iloc[train_idx]
    valid_df = tweets.data.iloc[valid_idx]
    test_df = tweets.data.iloc[test_idx]
    if save:
        test_df.rename_axis('index').to_csv(savepath)

    return train_df, valid_df, test_df
    
def mlm_evaluation(train_df, valid_df, test_df, config, lang, lang_eval, split_path, cache_path, hyperparams=hyperparams):
    
    # ensure all parameters for trianing exists
    CONFIG = {k.upper():v for k,v in config.items()}
    if not all(k in CONFIG.keys() for k in hyperparams):
        sys.exit(f"provide all required hyperparams: {hyperparams}. Received only {CONFIG.keys()}")
        
    # combine data for preprocessing
    alldata = pd.concat([train_df, valid_df, test_df])
    original_index = alldata.index
    alldata.reset_index(inplace=True)
    
    start_time = time.time()
    try:
        # process data into encoded features and labels and then to dataset object
        print(f"Working with {CONFIG['MODEL_CHECKPOINT']}")
        processor = DataProcessor(CONFIG, return_type_ids=True)
        feature_encodings, label_encodings = processor.encoded_data(alldata['tweet'], alldata['final_annotation'])
        encoded_dataset = EncodedDataset(feature_encodings, label_encodings)
    
        # for now unable to use hugging face trainer without validation set
        # refer to Training arguments and then change if required to train on train+valid data
        
        # train with data splits and configurations provided
        n, n_train, n_valid, n_test = alldata.shape[0], train_df.shape[0], valid_df.shape[0], test_df.shape[0]
        train_dataset, valid_dataset, test_dataset = encoded_dataset.splitdata([range(n_train), 
                                                                                range(n_train, n_train+n_valid), 
                                                                                range(n_train+n_valid, n)
                                                                               ])
        print(f"Distribution of data splits for {lang} language is {train_dataset.shape}, {valid_dataset.shape}, {test_dataset.shape}")
        trainer = ModelTrainer(CONFIG, train_dataset, valid_dataset, test_dataset, processor.tokenizer(), split_path, cache_path, lang)
        model, prediction_set = trainer.train_eval(get_pred=True)
        print(f"\n{trainer.model_name} trained on {trainer._lang_to_train} languages")
            
        # evaluate on test set
        evaluater = PredictionEvaluater(prediction_set, CONFIG['TARGET_NAMES'], split_path, f"{trainer.model_name}_{lang}")
        evaluater.evaluation_report(test_df, lang_eval)
     
    except Exception as error:
        print("An error occurred:", error)
        log.exception('Failed')
        pass 
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"\nTotal execution time to finetune {CONFIG['MODEL_CHECKPOINT']} on {lang} language(s) is {execution_time}")    

## Load Data 
<a name="loaddata"></a>

In [23]:
# -*- coding: utf-8 -*-
"""
The script to run model once the best hyperparameters are identified using classification_wandb.py
Update required in final_configs.json 
"""

import time
from pathlib import Path 
import argparse
import json
import logging as log
from copy import deepcopy
import numpy as np

# from reader import CustomDataset
# from wrapper import mlm_evaluation

# # args
# parser = argparse.ArgumentParser(description="Twitter Meta Analysis")
# parser.add_argument("--data_file", type=str, help="File name including directory where data resides.")
# parser.add_argument("--params_file", type=str, help="File where parameters to run the model are provided.")
# parser.add_argument("--output_dir", type=str, help="Directory where output are to be stores.")
# args = parser.parse_args()

# DATA_FILE = Path(args.data_file)
# PARAMS_FILE = Path(args.params_file)
# OUT_PATH = Path(args.output_dir)

DATA_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/data/all/alldata.csv")
PARAMS_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/params.tsv")
OUT_PATH = Path("/gaueko0/users/nmishra/multiling_fludetection/eval_new")

# read data
data_path = DATA_FILE.parent
tweets = CustomDataset(DATA_FILE, data_path)
print(f"Number of tweets in data: {tweets.__len__()}")
print(f"Distribution of classes in all data {tweets.labels.value_counts()}")

# hyperparameters
params = pd.read_csv(PARAMS_FILE, sep='\t')
# params['split'].apply(ast.literal_eval)
print(f"Configuration setup read from {PARAMS_FILE}")   

# where MLMs are cached
cache_path = OUT_PATH.parent.joinpath('.cache')
cache_path.mkdir(parents=True, exist_ok=True)
print(f"Cache in {cache_path}")

Number of tweets in data: 4400
Distribution of classes in all data final_annotation
3. Not Related to ILI or COVID-19 Infection       2492
1. Likely ILI infection                           1553
4. Ambiguous/Unsure                                238
2. Likely COVID-19 Infection (after 2020 only)     117
Name: count, dtype: int64
Configuration setup read from /gaueko0/users/nmishra/multiling_fludetection/params.tsv
Cache in /gaueko0/users/nmishra/multiling_fludetection/.cache


## Do Training 
<a name="dotraining"></a>

In [24]:
# read data split index
SPLITS = params['split'].unique()
for split in SPLITS:
    dirname = f"testset{split.replace(',','_')}"
    split_path = OUT_PATH.joinpath(dirname)
    print(f"Reading data split index from: {split_path}")
    with open(split_path.joinpath('split_idx.json'), 'r') as f:
        split_idx = json.load(f) 
   
    for lang, lang_params in params.groupby('lang'):      
        # get split idx 
        if lang=='all':
            # for all languages in the list
            print(f"\nTrain using data from all languages")
            languages = [i for i in params['lang'].unique() if i!= 'all']
            lang_split_idx = {i:split_idx[i] for i in languages}
            lang_eval = True
        else:
            # for each language in the list
            print(f"\nTrain using data from {lang} languages")
            lang_split_idx = {}
            lang_split_idx[lang] = split_idx[lang]
            lang_eval = False

        # get train, valid and test split for selected languages
        train_df, valid_df, test_df = getsplit(lang_split_idx, tweets, split_path.joinpath(f"{dirname}_{lang}.csv"))
        
        # get training parameters for models and train
        target_names = sorted(test_df['final_annotation'].unique().tolist()) # because english lang has only three labels
        training_params = lang_params.to_dict(orient='records')
        for config in training_params:
            config['target_names'] = target_names
            mlm_evaluation(train_df, valid_df, test_df, config, lang, lang_eval, split_path, cache_path)

Reading data split index from: /gaueko0/users/nmishra/multiling_fludetection/eval_new/testset0.6_0.2_0.2

Train using data from all languages
Distribution of data in train, validation and test splits: 2640, 880, 880
Working with cardiffnlp/twitter-xlm-roberta-base

Final configurations for processing training + validation data
{'LANG': 'all', 'SPLIT': '0.6,0.2,0.2', 'N_LABELS': 4, 'MAX_LEN': 128, 'MODEL_CHECKPOINT': 'cardiffnlp/twitter-xlm-roberta-base', 'BATCH_SIZE': 32, 'EPOCHS': 1, 'LEARNING_RATE': 3.571430010972717e-05, 'TARGET_NAMES': ['1. Likely ILI infection', '2. Likely COVID-19 Infection (after 2020 only)', '3. Not Related to ILI or COVID-19 Infection', '4. Ambiguous/Unsure']}
Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for all language is (2640, 4), (880, 4), (880, 4)




4 classes in all language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/eval_new/testset0.6_0.2_0.2/models/twitter-xlm-roberta-base-all-finetuned

Final configurations for training the model:
namespace(max_len=128, model_checkpoint='cardiffnlp/twitter-xlm-roberta-base', batch_size=32, epochs=1, learning_rate=3.571430010972717e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [18]:
# # read data split index
# splits = params['split'].unique()
# for split in splits:
    
#     cache_path = OUT_PATH.parent.joinpath('.cache')
#     cache_path.mkdir(parents=True, exist_ok=True)
#     print(f"Cache in {cache_path}")

#     dirname = f"testset{'_'.join([str(i) for i in split])}"
#     split_path = OUT_PATH.joinpath(dirname)
#     print(f"Reading data split index from: {split_path}")
#     with open(split_path.joinpath('split_idx.json'), 'r') as f:
#         split_idx = json.load(f) 
        
#     # determine languages for which to get split index
#     if params['LANG']=='all':
#         languages = [i for i in split_idx]
#     else:
#         languages = [i for i in split_idx if i in params['LANG'].split(',')]
    
#     # # train on all languages and then on each language
#     # lang_split_idx = {i:split_idx[i] for i in languages}
#     # print(f"Training data used for {params['LANG']} languages")
#     # mlm_evaluation(lang_split_idx, tweets, params, split_path, dirname, params['LANG'], cache_path)
    
#     for lang_to_train in languages:
#         if lang_to_train=='es': #or lang_to_train=='it':
#             print(f"\nTraining data used for {lang_to_train} language")
#             lang_split_idx = {}
#             lang_split_idx[lang_to_train] = split_idx[lang_to_train]
#             mlm_evaluation(lang_split_idx, tweets, params, split_path, dirname, lang_to_train, cache_path)

In [25]:
# # read data split index
# for split in params['SPLITS'][:1]:
    
#     cache_path = OUT_PATH.parent.joinpath('.cache')
#     cache_path.mkdir(parents=True, exist_ok=True)
#     print(f"Cache in {cache_path}")

#     dirname = f"testset{'_'.join([str(i) for i in split])}"
#     split_path = OUT_PATH.joinpath(dirname)
#     print(f"Reading data split index from: {split_path}")
#     with open(split_path.joinpath('split_idx.json'), 'r') as f:
#         split_idx = json.load(f) 
        
#     # determine languages for which to get split index
#     if params['LANG']=='all':
#         languages = [i for i in split_idx]
#     else:
#         languages = [i for i in split_idx if i in params['LANG'].split(',')]
    
#     # train on all languages and then on each language
#     lang_split_idx = {i:split_idx[i] for i in languages}
#     print(f"Training data used for {params['LANG']} languages")
#     # mlm_evaluation(lang_split_idx, tweets, params, split_path, dirname, params['LANG'], cache_path)
    
#     for lang_to_train in languages:
#         print(f"\nTraining data used for {lang_to_train} language")
#         lang_split_idx = {}
#         lang_split_idx[lang_to_train] = split_idx[lang_to_train]
#         mlm_evaluation(lang_split_idx, tweets, params, split_path, dirname, lang_to_train, cache_path)