# Table of contents
1. [Run Setting](#runsettings)
2. [Reader](#reader)
    1. [Sub paragraph](#subparagraph1)
3. [Processor](#processor)
4. [Trainer](#trainer)
5. [Evaluater](#evaluater)
6. [Wrapper](#wrapper)
7. [Load Data](#loaddata)
8. [Do Training](#dotrianing)

## Configurations to run the script
<a name="runsettings"></a>
Details can be added here.

In [1]:
!wandb disabled

W&B disabled.


In [2]:
import os
# os.environ["WANDB_SILENT"] = "true"

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
print(os.environ["CUDA_VISIBLE_DEVICES"])

# os.environ["TOKENIZERS_PARALLELISM"] = "false" 

0


In [3]:
!CUDA_LAUNCH_BLOCKING=1

CUDA_LAUNCH_BLOCKING=1: Command not found.


In [4]:
!nvidia-smi

Sat Mar  9 12:32:22 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:01:00.0 Off |                    0 |
| N/A   24C    P0              59W / 500W |      4MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          On  | 00000000:41:00.0 Off |  

## Reader 
<a name="reader"></a>


In [6]:
# -*- coding: utf-8 -*-
"""
1. Read annotated multilingual ILI data using CustomDataset.
2. Convert encoded features and labels to dataset objects for integration with transformers model training.

""" 

import sys
import json
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from datasets import Dataset

    
class CustomDataset(object):
    def __init__(self, file_name, savepath=None):
        
        self._file_name = file_name
        if savepath is not None:
            self._savepath = Path(savepath)
            self._savepath.mkdir(parents=True, exist_ok=True)
        
        self.data =  pd.read_csv(self._file_name)
        self.tweets = self.data['tweet']
        self.labels = self.data['final_annotation']

    def __len__(self):    
        if len(self.tweets) != len(self.labels):
            raise sys.exit(f"Number of tweets({len(self.tweets)}) and its labels({len(self.labels)}) do not match.")
        else:
            return len(self.labels)
        
    def __getitem__(self, idx):
        tweet = self.tweets.iloc[idx] 
        label = self.labels.iloc[idx] 
        return tweet, label
    
    def getsplitidx(self, test_split=0.2, valid_split=None, group='lang', stratify_label='final_annotation'):
        
        # group day by language and then perform stratified split by categories and save indices as json
        lang_split_idx = {}
        for grp, grp_df in self.data.groupby(group): 
            train, test = train_test_split(grp_df, test_size=test_split, stratify=grp_df[stratify_label])
            if valid_split is not None:
                train, valid = train_test_split(train, test_size=valid_split, stratify=train[stratify_label])
        
            print(f"\nDistribution of classes in train set in {grp}\n{train[stratify_label].value_counts()}")
            print(f"Distribution of classes in test set in {grp}\n\{test[stratify_label].value_counts()}")
            lang_split_idx[grp] = {'train_idx':train.index.values.tolist(), 
                                   'test_idx':test.index.values.tolist()
                                  }
            if valid_split is not None:
                print(f"Distribution of classes in valid set in {grp}\n{valid[stratify_label].value_counts()}\n")
                lang_split_idx[grp] = {'train_idx':train.index.values.tolist(), 
                                       'test_idx':test.index.values.tolist(), 
                                       'valid_idx':valid.index.values.tolist()
                                      }
                
        if self._savepath is not None:
            with open(self._savepath.joinpath("lang_split_idx.json"), "w")  as f:
                json.dump(lang_split_idx, f)
        return lang_split_idx

# https://huggingface.co/transformers/v3.5.1/custom_datasets.html    
class EncodedDataset(torch.utils.data.Dataset): # torch
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # item = {key: torch.tensor(val[idx]).clone().detach() for key, val in self.encodings.items()}
        # item['labels'] = torch.tensor(self.labels[idx]).clone().detach()
        
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)
    
    def __getdatasetsplits__(self, indices):
        split = self.__getitem__(indices)
        dataset = Dataset.from_dict(split)
        return dataset
    
    def splitdata(self, split_indices):
        allsplits = []
        for i in split_indices:
            allsplits.append(self.__getdatasetsplits__(i))
        return allsplits

## Preprocessor 
<a name="processor"></a>

In [7]:
# 2-*- coding: utf-8 -*-
"""
1. Process data based on model checkpoint and configurations provided in final_configs.json 

"""

from collections import Counter

import numpy as np
from sklearn.preprocessing import LabelEncoder

from transformers import AutoTokenizer
from torch.utils.data import Subset, DataLoader

class DataProcessor(object):
    
    def __init__(self, config, encoder=LabelEncoder(), return_type_ids=False):
        
        self._config = config
        self._encoder = encoder
        self._return_type_ids = return_type_ids
        
        self.model_checkpoint = self._config['MODEL_CHECKPOINT']
        if ((self._config['MAX_LEN'] is not None and self._config['MAX_LEN']>128) and ('bernice' in self.model_checkpoint)):
            self._config['MAX_LEN'] = 128
            print(f"Max length for {self.model_checkpoint} set to 128 by default")
        print(f"\nFinal configurations for processing training + validation data\n{self._config}")

    def label_encoder(self, target):
        le = self._encoder
        return le.fit_transform(target)

    def tokenizer(self):    
        # statistical tokenizer # subwords, chunks of words 
        return AutoTokenizer.from_pretrained(self.model_checkpoint, 
                                             use_fast = False,    # use one of the fast tokenizers (backed by Rust), available for almost all models
                                             # max_length=self._config['MAX_LEN'] # pass max length only when encoding not when instantiating the tokenizer
                                             )
    
    def feature_encoder(self, features):
        tokenizer = self.tokenizer()

        feature_encodings = tokenizer.batch_encode_plus(
            features.astype(str).values.tolist(), 
            padding=True, 
            truncation=True, 
            max_length=self._config['MAX_LEN'],
            # is_split_into_words=True, # added for multilingual versions refer 4624.err
            # return_attention_mask=True,
            return_token_type_ids=self._return_type_ids, 
            return_tensors='pt',
            )
        print(f"Dimensions of encoded features: {feature_encodings['input_ids'].shape}")
        print(f"Encoding contains: {[i for i in feature_encodings.keys()]}")
        return feature_encodings

    def encoded_data(self, features, labels):
        encoded_features = self.feature_encoder(features)
        encoded_labels = self.label_encoder(labels)
        if encoded_features['input_ids'].shape[0] == encoded_labels.shape[0]:
            return encoded_features, encoded_labels
        else:
            print("encoded features and labels do not have same length")

## Trainer 
<a name="trainer"></a>

In [8]:
# 3-*- coding: utf-8 -*-
"""
1. Train model once the best hyperparameters from final_configs.json that were identified using classification_wandb.py
2. Get predictions from the trained model with boolen flag

"""

import time
import sys
import shutil
from types import SimpleNamespace

import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import EarlyStoppingCallback, TrainerCallback

import evaluate
# from evaluater import evaluation_display

hyperparams = ['MODEL_CHECKPOINT','BATCH_SIZE','LEARNING_RATE','EPOCHS','MAX_LEN']

# class PrintClassificationCallback(TrainerCallback):
#     def on_evaluate(self, args, state, control, logs=None, **kwargs):
#         print("Called after evaluation phase")

class ModelTrainer(object):

    def __init__(self, run_config, train_dataset, valid_dataset, test_dataset, 
                 tokenizer, savepath, cachepath, lang_to_train='all'):     

        self._run_config = run_config
        if not all(k in self._run_config.keys() for k in hyperparams):
            sys.exit(f"provide all required hyperparams: {hyperparams}")
            
        self.model_checkpoint = self._run_config['MODEL_CHECKPOINT']
        self.model_name = self.model_checkpoint.split('/')[-1]
    
        self._train_dataset = train_dataset
        self._valid_dataset = valid_dataset
        self._test_dataset = test_dataset
        self._train_dataset.cleanup_cache_files()

        self._tokenizer = tokenizer

        self._savepath = savepath
        self.modelpath = self._savepath.joinpath('models')
        self.modelpath.mkdir(parents=True, exist_ok=True)
        self._cachepath = cachepath

        self._lang_to_train = lang_to_train
        # self._target_names = self._run_config['TARGET_NAMES']
        self.num_labels = len(self._run_config['TARGET_NAMES'])
        print(f"{self.num_labels} classes in {self._lang_to_train} language")

    def get_model(self):
        model = AutoModelForSequenceClassification.from_pretrained(self.model_checkpoint,
                                                                   num_labels = self.num_labels,
                                                                   cache_dir = self._cachepath,
                                                                #    output_attentions=False,
                                                                #    output_hidden_states=False,
                                                                #    ignore_mismatched_sizes=True,
                                                                )
        # print(model)
        return model
        
    def compute_metrics(self, eval_pred, eval_metric="f1"):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        # evaluation_display(labels, predictions, self.num_labels, self._target_names) # print only for now
        metric = evaluate.load(eval_metric)
        return metric.compute(predictions=predictions, references=labels, average="macro")

    def train_eval(self, get_pred=False, metric_name="f1", hyperparam_search=False):
        
        out_dir = self.modelpath.joinpath(f"{self.model_name}-{self._lang_to_train}-finetuned")
        print(f"Model to be saved in {out_dir}")
        log_dir = self.modelpath.parent.joinpath('logs').joinpath(f"{self.model_name}-{self._lang_to_train}-finetuned")
        log_dir.mkdir(parents=True, exist_ok=True)
        
        config = SimpleNamespace(**{i.lower():j for i,j in self._run_config.items() if i in hyperparams})
        print(f"\nTraining model using with configurations:\n{config}")
        
        # attributes to customize the training
        args = TrainingArguments(
            save_total_limit=2,
            output_dir=str(out_dir),
            overwrite_output_dir = True,
            logging_dir = str(log_dir),
            
            learning_rate = config.learning_rate,
            per_device_train_batch_size = config.batch_size,
            per_device_eval_batch_size = config.batch_size,
            num_train_epochs = config.epochs,
            # weight_decay = config.weight_decay,
            
            evaluation_strategy = "epoch",
            save_strategy = "epoch",   
            logging_strategy = 'epoch',
            
            # logging_steps= 1,
            eval_accumulation_steps = 1,
            
            metric_for_best_model = metric_name,
            load_best_model_at_end = True,
            
            push_to_hub = False, # push the model to the Hub regularly during training
            # report_to='wandb',  # turn on wandb logging
            )

        # https://huggingface.co/docs/transformers/main_classes/trainer#trainer
        # https://github.com/huggingface/transformers/blob/v4.35.2/src/transformers/trainer.py#L231
        trainer = Trainer(
            model_init = self.get_model,
            args = args,
            train_dataset = self._train_dataset,
            eval_dataset = self._valid_dataset,
            tokenizer = self._tokenizer,
            compute_metrics = self.compute_metrics,
            callbacks = [EarlyStoppingCallback(3, 0.0)]
            )
        
        torch.cuda.empty_cache()
        # print(torch.cuda.memory_summary(device=None, abbreviated=True))
        
        # try later setup hyperparam here instead of wandb
        # # https://huggingface.co/docs/transformers/hpo_train
        # if hyperparam_search:
        #     best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")
        #     print(f"best run for {self._model_checkpoint} is {best_run.hyperparameters.items()}")
        #     for n, v in best_run.hyperparameters.items():
        #         setattr(trainer.args, n, v)

        trainer.train()
        trainer.evaluate()
        print(f"free space by deleting: {out_dir}")
        shutil.rmtree(out_dir, ignore_errors=True)
        
        if get_pred:
            print(f"\nGetting Predictions on Test dataset")
            logits, labels, metrics = trainer.predict(self._test_dataset)
            predictions = np.argmax(logits, axis=-1)
            return trainer, (labels, predictions)    
        else:
            return trainer

## Evaluater 
<a name="evaluater"></a>

In [10]:
## 4-*- coding: utf-8 -*-
"""
1. Evaluate the model performance per language.
2. Evaluate performance per category during each epoch.

"""

import numpy as np

import evaluate
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import multilabel_confusion_matrix, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt

def evaluation_display(y, y_pred, labels_map=None, plot=False):
    
    if labels_map is not None:
        labels_idx = [k for k,v in labels_map.items() if k in y.unique()]
        labels_name = [v for k,v in labels_map.items() if k in y.unique()]
    else:
        labels_idx = None
        labels_name = None
        
    f1 = f1_score(y, y_pred, labels=labels_idx, average='macro')
    acc = accuracy_score(y, y_pred)
    class_report = classification_report(y, y_pred, labels=labels_idx, target_names=labels_name)

    # y_score = pred probabilityes
    # fpr, tpr, _ = roc_curve(y_test, y_score)
    # roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
    # cm = multilabel_confusion_matrix(y, y_pred)
    cm = confusion_matrix(y, y_pred, normalize='true')
    
    if plot:
        print(f"f1: {f1}\nacc: {acc}\n{class_report}")
        fig, ax = plt.subplots(figsize=(4,7))
        cm_disp = ConfusionMatrixDisplay(cm).plot(ax=ax, cmap='Blues', colorbar=False)
        c_bar = fig.add_axes([ax.get_position().x1+0.01, ax.get_position().y0, 0.05, ax.get_position().height])
        plt.colorbar(cm_disp.im_,  cax=c_bar)
        plt.show()
    else:
        print(f"f1: {f1}\nacc: {acc}\n{class_report}\n{cm}")
    
class PredictionEvaluater(object):

    def __init__(self, prediction_set, target_names=None, savepath=None, model_name=None):     

        self._labels, self._predictions = prediction_set
        
        self._target_names = target_names
        self.num_labels = len(target_names)
        self.label_map = {k:v for k,v in zip(range(self.num_labels), self._target_names)}
        print(f"Number of labels in target_names is {self.num_labels}")

        self.savepath = savepath.joinpath('predictions')
        self.savepath.mkdir(parents=True, exist_ok=True)
        
        self._model_name = model_name if model_name is not None else "model"
        
    def compute_metrics(self, eval_pred, eval_metric="accuracy"):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        metric = evaluate.load(eval_metric)
        return metric.compute(predictions=preds, references=labels)
    
    def append_predictions(self, test_df):
        # append prediction and encoded labels to original test data
        df = test_df.copy()
        df[f"{self._model_name}_prediction"] = self._predictions
        df["annotation"] = self._labels
        if self.savepath is not None:
            sub_df = df[[f"{self._model_name}_prediction","annotation"]]
            sub_df.rename_axis('index').to_csv(self.savepath.joinpath(f"{self._model_name}_predictions.csv"))
        return df
        
    def evaluation_report(self, test_df, lang_eval=True):
        df = self.append_predictions(test_df)
        evaluation_display(df["annotation"], df[f"{self._model_name}_prediction"], self.label_map)
        # if language evaluation is required
        if lang_eval:
            for lang, lang_df in df.groupby("lang"):
                print(f"\nEvaluation for language: {lang}")
                evaluation_display(lang_df["annotation"], lang_df[f"{self._model_name}_prediction"], self.label_map)

## Wrapper 
<a name="wrapper"></a>

In [11]:
# -*- coding: utf-8 -*-
"""
The script for wrapper function to run multilingual_ILI_classification.py
"""

import time
import logging as log

import pandas as pd

# from reader import EncodedDataset
# from preprocessor import DataProcessor
# from trainer import hyperparams, ModelTrainer
# from evaluater import PredictionEvaluater

def getsplitidx(lang_split_idx, key='train_idx'):
    idx_list = [v[key] for k,v in lang_split_idx.items()]
    idx_list = [i for eachlist in idx_list for i in eachlist]
    return idx_list

def mlm_evaluation(lang_split_idx, tweets, config, split_path, cache_path, lang, lang_eval, hyperparams=hyperparams, save=False):

    # obtain datasplit index
    train_idx = getsplitidx(lang_split_idx, key='train_idx')
    valid_idx = getsplitidx(lang_split_idx, key='valid_idx')
    test_idx = getsplitidx(lang_split_idx, key='test_idx')
    print(f"Distribution of data in train, validation and test splits: {len(train_idx)}, {len(valid_idx)}, {len(test_idx)}")

    test_df = tweets.data.iloc[test_idx]
    if save:
        test_df.rename_axis('index').to_csv(split_path.joinpath(f"{split_path.stem}_{lang}.csv"))
        # add target names into config outside of the function
        # config['target_names'] = sorted(test_df['final_annotation'].unique().tolist())
    
    # ensure all parameters for trianing exists
    CONFIG = {k.upper():v for k,v in config.items()}
    if not all(k in CONFIG.keys() for k in hyperparams):
        sys.exit(f"provide all required hyperparams: {hyperparams}. Received only {CONFIG.keys()}")
    
    start_time = time.time()
    try:
        # encode the data using the model checkpoint
        print(f"Working with {CONFIG['MODEL_CHECKPOINT']}")
        processor = DataProcessor(CONFIG, return_type_ids=True)
        feature_encodings, label_encodings = processor.encoded_data(tweets.data['tweet'], tweets.data['final_annotation'])
    
        # obtain encoded train, valid and test data as dataset object
        encoded_dataset = EncodedDataset(feature_encodings, label_encodings)
        train_dataset, valid_dataset, test_dataset = encoded_dataset.splitdata([train_idx, valid_idx, test_idx])
        print(f"Distribution of data splits for {lang} language is {train_dataset.shape}, {valid_dataset.shape}, {test_dataset.shape}")

        trainer = ModelTrainer(CONFIG, train_dataset, valid_dataset, test_dataset, processor.tokenizer(), split_path, cache_path, lang)
        model, prediction_set = trainer.train_eval(get_pred=True)
        print(f"\n{trainer.model_name} trained on {trainer._lang_to_train} languages")
        
        # delete wandb and model folder related to this model checkpoint
        print(f"free space by deleting: {cache_path.parent.joinpath('models')}")
        shutil.rmtree(cache_path.parent.joinpath('models'), ignore_errors=True)
            
        # evaluate on test set
        evaluater = PredictionEvaluater(prediction_set, CONFIG['TARGET_NAMES'], split_path, f"{trainer.model_name}_{lang}")
        evaluater.evaluation_report(test_df, lang_eval)
     
    except Exception as error:
        print("An error occurred:", error)
        log.exception('Failed')
        pass 
        
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Total execution time to finetune {trainer.model_name} on {trainer._lang_to_train} language(s) is {execution_time}")

## Load Data 
<a name="loaddata"></a>

In [12]:
# -*- coding: utf-8 -*-
"""
The script to run model once the best hyperparameters are identified using classification_wandb.py
Update required in final_configs.json 
"""

import time
from pathlib import Path 
import argparse
import json
import logging as log
from copy import deepcopy
import numpy as np

# from reader import CustomDataset
# from wrapper import mlm_evaluation

# # args
# parser = argparse.ArgumentParser(description="Twitter Meta Analysis")
# parser.add_argument("--data_file", type=str, help="File name including directory where data resides.")
# parser.add_argument("--params_file", type=str, help="File where parameters to run the model are provided.")
# parser.add_argument("--output_dir", type=str, help="Directory where output are to be stores.")
# args = parser.parse_args()

# DATA_FILE = Path(args.data_file)
# PARAMS_FILE = Path(args.params_file)
# OUT_PATH = Path(args.output_dir)

DATA_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/data/all/alldata.csv")
PARAMS_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/params.tsv")
OUT_PATH = Path("/gaueko0/users/nmishra/multiling_fludetection/evalnew")

# read data
data_path = DATA_FILE.parent
tweets = CustomDataset(DATA_FILE, data_path)
print(f"Number of tweets in data: {tweets.__len__()}")
print(f"Distribution of classes in all data {tweets.labels.value_counts()}")

# hyperparameters
params = pd.read_csv(PARAMS_FILE, sep='\t')
target_names = np.unique(tweets.labels).tolist()
# params['split'].apply(ast.literal_eval)
print(f"Configuration setup read from {PARAMS_FILE}")   

# where MLMs are cached
cache_path = OUT_PATH.parent.joinpath('.cache')
cache_path.mkdir(parents=True, exist_ok=True)
print(f"Cache in {cache_path}")

Number of tweets in data: 4400
Distribution of classes in all data final_annotation
3. Not Related to ILI or COVID-19 Infection       2492
1. Likely ILI infection                           1553
4. Ambiguous/Unsure                                238
2. Likely COVID-19 Infection (after 2020 only)     117
Name: count, dtype: int64
Configuration setup read from /gaueko0/users/nmishra/multiling_fludetection/params.tsv
Cache in /gaueko0/users/nmishra/multiling_fludetection/.cache


## Do Training 
<a name="dotraining"></a>

In [13]:
# read data split index
SPLITS = params['split'].unique()
for split in SPLITS:
    dirname = f"testset{split.replace(',','_')}"
    split_path = OUT_PATH.joinpath(dirname)
    print(f"Reading data split index from: {split_path}")
    with open(split_path.joinpath('split_idx.json'), 'r') as f:
        split_idx = json.load(f) 
   
    for lang, lang_params in params.groupby('lang'):  
        # if lang=='all' or lang=='en':
        # get split idx 
        if lang=='all':
            # for all languages in the list
            print(f"\nTrain using data from all languages")
            languages = [i for i in params['lang'].unique() if i!= 'all']
            lang_split_idx = {i:split_idx[i] for i in languages}
            lang_eval = True
        else:
            # for each language in the list
            print(f"\nTrain using data from {lang} languages")
            lang_split_idx = {}
            lang_split_idx[lang] = split_idx[lang]
            lang_eval = False
        
        # get training parameters for models and train
        training_params = lang_params.to_dict(orient='records')
        for config in training_params:
            config['target_names'] = target_names
            print(config)
            mlm_evaluation(lang_split_idx, tweets, config, split_path, cache_path, lang, lang_eval)

Reading data split index from: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2

Train using data from all languages
{'lang': 'all', 'split': '0.6,0.2,0.2', 'n_labels': 4, 'max_len': 128, 'model_checkpoint': 'cardiffnlp/twitter-xlm-roberta-base', 'batch_size': 8, 'epochs': 7, 'learning_rate': 3.571430010972717e-05, 'target_names': ['1. Likely ILI infection', '2. Likely COVID-19 Infection (after 2020 only)', '3. Not Related to ILI or COVID-19 Infection', '4. Ambiguous/Unsure']}
Distribution of data in train, validation and test splits: 2640, 880, 880
Working with cardiffnlp/twitter-xlm-roberta-base

Final configurations for processing training + validation data
{'LANG': 'all', 'SPLIT': '0.6,0.2,0.2', 'N_LABELS': 4, 'MAX_LEN': 128, 'MODEL_CHECKPOINT': 'cardiffnlp/twitter-xlm-roberta-base', 'BATCH_SIZE': 8, 'EPOCHS': 7, 'LEARNING_RATE': 3.571430010972717e-05, 'TARGET_NAMES': ['1. Likely ILI infection', '2. Likely COVID-19 Infection (after 2020 only)', '3. Not Relat

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.7204,0.689699,0.400502
2,0.5313,0.684544,0.437963
3,0.4311,0.94375,0.445802
4,0.3345,1.027272,0.477503
5,0.227,1.171391,0.478214
6,0.1681,1.199613,0.508798
7,0.1096,1.280594,0.526703


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/twitter-xlm-roberta-base-all-finetuned

Getting Predictions on Test dataset

twitter-xlm-roberta-base trained on all languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.4798927260120708
acc: 0.7579545454545454
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.74      0.79      0.77       310
2. Likely COVID-19 Infection (after 2020 only)       0.08      0.13      0.10        23
   3. Not Related to ILI or COVID-19 Infection       0.89      0.82      0.85       499
                           4. Ambiguous/Unsure       0.20      0.21      0.20        48

                                      accuracy                           0.76       880
                                     macro avg       0.48      0.49      0.48

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for all language is (2640, 4), (880, 4), (880, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in all language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bernice-all-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='jhu-clsp/bernice', batch_size=8, epochs=6, learning_rate=1.9960876362570808e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/bernice and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/bernice and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.7072,0.613383,0.404659
2,0.5523,0.611158,0.412632
3,0.4268,0.76719,0.43218
4,0.3415,0.83046,0.501196
5,0.265,0.985996,0.504652
6,0.2026,1.016502,0.52168


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bernice-all-finetuned

Getting Predictions on Test dataset

bernice trained on all languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.5121537668758525
acc: 0.7806818181818181
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.73      0.79      0.76       310
2. Likely COVID-19 Infection (after 2020 only)       0.33      0.17      0.23        23
   3. Not Related to ILI or COVID-19 Infection       0.88      0.86      0.87       499
                           4. Ambiguous/Unsure       0.19      0.19      0.19        48

                                      accuracy                           0.78       880
                                     macro avg       0.53      0.50      0.51       880
                       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for all language is (2640, 4), (880, 4), (880, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in all language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bert-base-multilingual-uncased-all-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='bert-base-multilingual-uncased', batch_size=32, epochs=10, learning_rate=4.714003352774175e-05)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.7768,0.754878,0.373203
2,0.5997,0.680295,0.387319
3,0.4686,0.764457,0.387798
4,0.347,0.907304,0.416787
5,0.2681,0.943916,0.463397
6,0.2145,1.095098,0.499128
7,0.1428,1.256063,0.466335
8,0.098,1.366857,0.48395
9,0.0849,1.295521,0.476338


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bert-base-multilingual-uncased-all-finetuned

Getting Predictions on Test dataset

bert-base-multilingual-uncased trained on all languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.459322938942049
acc: 0.7454545454545455
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.68      0.80      0.74       310
2. Likely COVID-19 Infection (after 2020 only)       0.22      0.22      0.22        23
   3. Not Related to ILI or COVID-19 Infection       0.85      0.80      0.83       499
                           4. Ambiguous/Unsure       0.09      0.04      0.06        48

                                      accuracy                           0.75       880
                                     macro avg       0.46      0.4

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for all language is (2640, 4), (880, 4), (880, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in all language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/mdeberta-v3-base-all-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='microsoft/mdeberta-v3-base', batch_size=16, epochs=10, learning_rate=2.2015563091912367e-05)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.7807,0.658797,0.403842
2,0.5825,0.60913,0.40463
3,0.4903,0.637993,0.410753
4,0.4059,0.740199,0.436017
5,0.3299,0.762898,0.482017
6,0.2798,0.859788,0.473248
7,0.2164,0.922791,0.5138
8,0.1728,0.94102,0.510304
9,0.136,1.029872,0.513877
10,0.1196,1.026103,0.518787


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/mdeberta-v3-base-all-finetuned

Getting Predictions on Test dataset

mdeberta-v3-base trained on all languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.5081723537379206
acc: 0.7761363636363636
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.75      0.77      0.76       310
2. Likely COVID-19 Infection (after 2020 only)       0.18      0.13      0.15        23
   3. Not Related to ILI or COVID-19 Infection       0.87      0.86      0.86       499
                           4. Ambiguous/Unsure       0.25      0.27      0.26        48

                                      accuracy                           0.78       880
                                     macro avg       0.51      0.51      0.51       880
     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for all language is (2640, 4), (880, 4), (880, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in all language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/xlm-roberta-base-all-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='xlm-roberta-base', batch_size=16, epochs=8, learning_rate=4.129082702898313e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.823,0.79633,0.384125
2,0.6638,0.693392,0.404646
3,0.5595,0.818553,0.398558
4,0.468,0.804479,0.400163
5,0.3918,0.799781,0.439727
6,0.3225,0.871129,0.446703
7,0.2491,1.085259,0.487868
8,0.2011,1.093572,0.468153


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/xlm-roberta-base-all-finetuned

Getting Predictions on Test dataset

xlm-roberta-base trained on all languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.4976210163721508
acc: 0.7647727272727273
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.74      0.74      0.74       310
2. Likely COVID-19 Infection (after 2020 only)       0.29      0.09      0.13        23
   3. Not Related to ILI or COVID-19 Infection       0.86      0.86      0.86       499
                           4. Ambiguous/Unsure       0.22      0.31      0.26        48

                                      accuracy                           0.76       880
                                     macro avg       0.53      0.50      0.50       880
     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for de language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in de language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/twitter-xlm-roberta-base-de-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='cardiffnlp/twitter-xlm-roberta-base', batch_size=32, epochs=6, learning_rate=6.303128537215267e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,1.0298,0.928089,0.313727
2,0.9061,0.870404,0.36892
3,0.7844,0.886828,0.371
4,0.6785,0.849672,0.3666
5,0.5931,0.876039,0.366675
6,0.5227,0.895525,0.379856


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/twitter-xlm-roberta-base-de-finetuned

Getting Predictions on Test dataset

twitter-xlm-roberta-base trained on de languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.35286275803303924
acc: 0.675
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.65      0.82      0.73        89
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         5
   3. Not Related to ILI or COVID-19 Infection       0.70      0.67      0.69        93
                           4. Ambiguous/Unsure       0.00      0.00      0.00        13

                                      accuracy                           0.68       200
                                     macro avg       0.34      0.37      0.35       200
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for de language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in de language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bernice-de-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='jhu-clsp/bernice', batch_size=32, epochs=5, learning_rate=1.931835053890701e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/bernice and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/bernice and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.9693,0.835027,0.356207
2,0.8158,0.816125,0.365919
3,0.7252,0.848576,0.372958
4,0.6957,0.813648,0.363857
5,0.6719,0.809641,0.379391


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bernice-de-finetuned

Getting Predictions on Test dataset

bernice trained on de languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.3766845804051261
acc: 0.72
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.67      0.88      0.76        89
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         5
   3. Not Related to ILI or COVID-19 Infection       0.79      0.71      0.75        93
                           4. Ambiguous/Unsure       0.00      0.00      0.00        13

                                      accuracy                           0.72       200
                                     macro avg       0.36      0.40      0.38       200
                                  weigh

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for de language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in de language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bert-base-multilingual-uncased-de-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='bert-base-multilingual-uncased', batch_size=16, epochs=3, learning_rate=1.760419522770641e-05)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,1.0176,0.834531,0.368878
2,0.8038,0.823919,0.374184
3,0.7291,0.828555,0.369429


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bert-base-multilingual-uncased-de-finetuned

Getting Predictions on Test dataset

bert-base-multilingual-uncased trained on de languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.36629330984946473
acc: 0.7
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.66      0.84      0.74        89
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         5
   3. Not Related to ILI or COVID-19 Infection       0.76      0.70      0.73        93
                           4. Ambiguous/Unsure       0.00      0.00      0.00        13

                                      accuracy                           0.70       200
                                     macro avg       0.35      0.39      0.37    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for de language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in de language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/mdeberta-v3-base-de-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='microsoft/mdeberta-v3-base', batch_size=32, epochs=6, learning_rate=1.846007640647933e-05)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,1.1573,0.970841,0.29967
2,0.9399,0.874204,0.361381
3,0.8588,0.831795,0.366767
4,0.8017,0.809946,0.36695
5,0.7519,0.824134,0.361454
6,0.7185,0.809338,0.37731


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/mdeberta-v3-base-de-finetuned

Getting Predictions on Test dataset

mdeberta-v3-base trained on de languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.37434554973822
acc: 0.715
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.71      0.81      0.75        89
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         5
   3. Not Related to ILI or COVID-19 Infection       0.72      0.76      0.74        93
                           4. Ambiguous/Unsure       0.00      0.00      0.00        13

                                      accuracy                           0.71       200
                                     macro avg       0.36      0.39      0.37       200
                      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for de language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in de language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/xlm-roberta-base-de-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='xlm-roberta-base', batch_size=32, epochs=6, learning_rate=2.216664432392652e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,1.1416,0.963592,0.304526
2,0.9441,0.887931,0.31536
3,0.844,0.79915,0.371807
4,0.7932,0.838556,0.360831
5,0.7708,0.82797,0.363758
6,0.748,0.807485,0.377155


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/xlm-roberta-base-de-finetuned

Getting Predictions on Test dataset

xlm-roberta-base trained on de languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.3743043561696412
acc: 0.715
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.70      0.79      0.74        89
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         5
   3. Not Related to ILI or COVID-19 Infection       0.73      0.78      0.76        93
                           4. Ambiguous/Unsure       0.00      0.00      0.00        13

                                      accuracy                           0.71       200
                                     macro avg       0.36      0.39      0.37       200
                    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for en language is (120, 4), (40, 4), (40, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in en language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/twitter-xlm-roberta-base-en-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='cardiffnlp/twitter-xlm-roberta-base', batch_size=32, epochs=6, learning_rate=0.000148748398571)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,1.1408,0.815792,0.478632
2,0.7679,0.706175,0.513228
3,0.5255,0.758964,0.495681
4,0.3342,0.708006,0.529915
5,0.2286,0.777931,0.511185
6,0.1712,0.773696,0.512478


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/twitter-xlm-roberta-base-en-finetuned

Getting Predictions on Test dataset

twitter-xlm-roberta-base trained on en languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.5464912280701754
acc: 0.8
                                             precision    recall  f1-score   support

                    1. Likely ILI infection       0.75      0.83      0.79        18
3. Not Related to ILI or COVID-19 Infection       0.85      0.85      0.85        20
                        4. Ambiguous/Unsure       0.00      0.00      0.00         2

                                   accuracy                           0.80        40
                                  macro avg       0.53      0.56      0.55        40
                               weighted avg       0.76      0.80      0.78        40

[[0.83333333 0.16666667 0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for en language is (120, 4), (40, 4), (40, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in en language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bernice-en-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='jhu-clsp/bernice', batch_size=16, epochs=2, learning_rate=0.0001038648882532)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/bernice and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/bernice and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.9967,0.819641,0.474074
2,0.7501,0.744547,0.474074


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bernice-en-finetuned

Getting Predictions on Test dataset

bernice trained on en languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.526688815060908
acc: 0.775
                                             precision    recall  f1-score   support

                    1. Likely ILI infection       0.76      0.72      0.74        18
3. Not Related to ILI or COVID-19 Infection       0.78      0.90      0.84        20
                        4. Ambiguous/Unsure       0.00      0.00      0.00         2

                                   accuracy                           0.78        40
                                  macro avg       0.52      0.54      0.53        40
                               weighted avg       0.74      0.78      0.75        40

[[0.72222222 0.27777778 0.        ]
 [0.1        0.9       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for en language is (120, 4), (40, 4), (40, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in en language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bert-base-multilingual-uncased-en-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='bert-base-multilingual-uncased', batch_size=32, epochs=7, learning_rate=0.0001048140505428)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,1.1126,0.875221,0.320503
2,0.7557,0.73098,0.459259
3,0.5422,0.676612,0.508021
4,0.3377,0.646902,0.529915
5,0.255,0.682519,0.529915
6,0.2416,0.678648,0.512281
7,0.1707,0.656467,0.529915


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bert-base-multilingual-uncased-en-finetuned

Getting Predictions on Test dataset

bert-base-multilingual-uncased trained on en languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.5122807017543859
acc: 0.75
                                             precision    recall  f1-score   support

                    1. Likely ILI infection       0.70      0.78      0.74        18
3. Not Related to ILI or COVID-19 Infection       0.80      0.80      0.80        20
                        4. Ambiguous/Unsure       0.00      0.00      0.00         2

                                   accuracy                           0.75        40
                                  macro avg       0.50      0.53      0.51        40
                               weighted avg       0.72      0.75      0.73        40

[[0.77777778 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for en language is (120, 4), (40, 4), (40, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in en language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/mdeberta-v3-base-en-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='microsoft/mdeberta-v3-base', batch_size=8, epochs=7, learning_rate=6.214702150620456e-05)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,1.0421,0.870699,0.222222
2,0.8749,0.826572,0.372327
3,0.7158,0.662797,0.495614
4,0.4499,0.685149,0.529825
5,0.4344,0.616562,0.541752
6,0.2734,0.604029,0.512415
7,0.2106,0.587469,0.545736


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/mdeberta-v3-base-en-finetuned

Getting Predictions on Test dataset

mdeberta-v3-base trained on en languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.5291005291005292
acc: 0.775
                                             precision    recall  f1-score   support

                    1. Likely ILI infection       0.78      0.78      0.78        18
3. Not Related to ILI or COVID-19 Infection       0.77      0.85      0.81        20
                        4. Ambiguous/Unsure       0.00      0.00      0.00         2

                                   accuracy                           0.78        40
                                  macro avg       0.52      0.54      0.53        40
                               weighted avg       0.74      0.78      0.75        40

[[0.77777778 0.22222222 0.        ]
 [0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for en language is (120, 4), (40, 4), (40, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in en language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/xlm-roberta-base-en-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='xlm-roberta-base', batch_size=16, epochs=9, learning_rate=7.64366123017573e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,1.1462,0.896001,0.293836
2,0.9126,0.830331,0.513228
3,0.8554,0.83371,0.445897
4,0.7117,0.679897,0.492137
5,0.4924,0.677506,0.506063


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/xlm-roberta-base-en-finetuned

Getting Predictions on Test dataset

xlm-roberta-base trained on en languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.5108695652173912
acc: 0.75
                                             precision    recall  f1-score   support

                    1. Likely ILI infection       0.64      1.00      0.78        18
3. Not Related to ILI or COVID-19 Infection       1.00      0.60      0.75        20
                        4. Ambiguous/Unsure       0.00      0.00      0.00         2

                                   accuracy                           0.75        40
                                  macro avg       0.55      0.53      0.51        40
                               weighted avg       0.79      0.75      0.73        40

[[1.  0.  0. ]
 [0.4 0.6 0. ]
 [1.  0.  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for es language is (720, 4), (240, 4), (240, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in es language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/twitter-xlm-roberta-base-es-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='cardiffnlp/twitter-xlm-roberta-base', batch_size=8, epochs=4, learning_rate=7.96269320820549e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.8469,0.659761,0.415786
2,0.5807,0.896962,0.41303
3,0.441,0.861651,0.429762
4,0.3067,0.971187,0.481389


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/twitter-xlm-roberta-base-es-finetuned

Getting Predictions on Test dataset

twitter-xlm-roberta-base trained on es languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.479972742986305
acc: 0.7541666666666667
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.81      0.86      0.83        99
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00        12
   3. Not Related to ILI or COVID-19 Infection       0.88      0.78      0.83       116
                           4. Ambiguous/Unsure       0.18      0.46      0.26        13

                                      accuracy                           0.75       240
                                     macro avg       0.47      0.52      0.48   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for es language is (720, 4), (240, 4), (240, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in es language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bernice-es-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='jhu-clsp/bernice', batch_size=32, epochs=10, learning_rate=7.48302766200615e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/bernice and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/bernice and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.7987,0.70353,0.414793
2,0.5984,0.660438,0.425108
3,0.4626,0.738729,0.461911
4,0.2808,0.899517,0.411844
5,0.2003,0.991726,0.529071
6,0.1324,0.892603,0.540339
7,0.0817,0.964098,0.523959
8,0.0426,1.053349,0.566636
9,0.0216,1.083146,0.579948
10,0.0183,1.126551,0.520382


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bernice-es-finetuned

Getting Predictions on Test dataset

bernice trained on es languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.4922927094920282
acc: 0.7458333333333333
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.78      0.74      0.76        99
2. Likely COVID-19 Infection (after 2020 only)       0.21      0.33      0.26        12
   3. Not Related to ILI or COVID-19 Infection       0.87      0.87      0.87       116
                           4. Ambiguous/Unsure       0.08      0.08      0.08        13

                                      accuracy                           0.75       240
                                     macro avg       0.49      0.50      0.49       240
                         

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in es language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bert-base-multilingual-uncased-es-finetuned

Training model using with configurations:
namespace(max_len=256, model_checkpoint='bert-base-multilingual-uncased', batch_size=8, epochs=6, learning_rate=2.7742632399963146e-05)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.8305,0.789398,0.381447
2,0.6232,0.644513,0.422769
3,0.4931,0.780122,0.42259
4,0.3912,0.719148,0.524001
5,0.272,0.9139,0.522999
6,0.1864,0.919595,0.495032


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bert-base-multilingual-uncased-es-finetuned

Getting Predictions on Test dataset

bert-base-multilingual-uncased trained on es languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.48388761131942815
acc: 0.7625
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.81      0.76      0.78        99
2. Likely COVID-19 Infection (after 2020 only)       0.25      0.17      0.20        12
   3. Not Related to ILI or COVID-19 Infection       0.78      0.91      0.84       116
                           4. Ambiguous/Unsure       0.25      0.08      0.12        13

                                      accuracy                           0.76       240
                                     macro avg       0.52      0.48      0.48 

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in es language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/mdeberta-v3-base-es-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='microsoft/mdeberta-v3-base', batch_size=8, epochs=6, learning_rate=3.407229021688368e-05)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.8461,0.660019,0.425224
2,0.6341,0.628733,0.415928
3,0.5052,0.704086,0.429093
4,0.4375,0.692092,0.423414
5,0.3115,0.835954,0.417578
6,0.244,0.763887,0.420507


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/mdeberta-v3-base-es-finetuned

Getting Predictions on Test dataset

mdeberta-v3-base trained on es languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.40194926045646
acc: 0.7625
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.75      0.81      0.78        99
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00        12
   3. Not Related to ILI or COVID-19 Infection       0.77      0.89      0.83       116
                           4. Ambiguous/Unsure       0.00      0.00      0.00        13

                                      accuracy                           0.76       240
                                     macro avg       0.38      0.42      0.40       240
                     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for es language is (720, 4), (240, 4), (240, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in es language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/xlm-roberta-base-es-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='xlm-roberta-base', batch_size=16, epochs=10, learning_rate=3.032823977759674e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.9947,0.740398,0.390296
2,0.7476,0.744391,0.389717
3,0.6313,0.649676,0.41602
4,0.4977,0.753618,0.422457
5,0.4521,0.765868,0.423413
6,0.4078,0.77688,0.426414
7,0.2915,0.905793,0.441432
8,0.2193,1.325014,0.409384
9,0.2023,1.047434,0.485983
10,0.1736,1.058876,0.46425


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/xlm-roberta-base-es-finetuned

Getting Predictions on Test dataset

xlm-roberta-base trained on es languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.45149993684476447
acc: 0.7416666666666667
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.80      0.78      0.79        99
2. Likely COVID-19 Infection (after 2020 only)       0.13      0.25      0.17        12
   3. Not Related to ILI or COVID-19 Infection       0.84      0.84      0.84       116
                           4. Ambiguous/Unsure       0.00      0.00      0.00        13

                                      accuracy                           0.74       240
                                     macro avg       0.44      0.47      0.45       240
      

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in fr language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/twitter-xlm-roberta-base-fr-finetuned

Training model using with configurations:
namespace(max_len=256, model_checkpoint='cardiffnlp/twitter-xlm-roberta-base', batch_size=8, epochs=4, learning_rate=2.596766847433272e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.8724,0.693124,0.406695
2,0.6122,0.639585,0.406146
3,0.4709,0.759359,0.401231
4,0.3656,0.782254,0.409177


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/twitter-xlm-roberta-base-fr-finetuned

Getting Predictions on Test dataset

twitter-xlm-roberta-base trained on fr languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.4123742593358137
acc: 0.79
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.71      0.89      0.79        79
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         4
   3. Not Related to ILI or COVID-19 Infection       0.86      0.85      0.86       103
                           4. Ambiguous/Unsure       0.00      0.00      0.00        14

                                      accuracy                           0.79       200
                                     macro avg       0.39      0.44      0.41       200
     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for fr language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in fr language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bernice-fr-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='jhu-clsp/bernice', batch_size=16, epochs=6, learning_rate=8.106216723781869e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/bernice and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/bernice and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.907,0.799015,0.373198
2,0.7338,0.753206,0.400757
3,0.5004,0.755886,0.396743
4,0.3674,0.995617,0.390587
5,0.2563,1.080687,0.449032
6,0.1757,1.148916,0.430731


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bernice-fr-finetuned

Getting Predictions on Test dataset

bernice trained on fr languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.45317733990147785
acc: 0.78
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.74      0.84      0.79        79
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         4
   3. Not Related to ILI or COVID-19 Infection       0.88      0.85      0.87       103
                           4. Ambiguous/Unsure       0.18      0.14      0.16        14

                                      accuracy                           0.78       200
                                     macro avg       0.45      0.46      0.45       200
                                  weig

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for fr language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in fr language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bert-base-multilingual-uncased-fr-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='bert-base-multilingual-uncased', batch_size=8, epochs=10, learning_rate=1.4333262623697738e-05)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.9165,0.770221,0.369052
2,0.6998,0.734959,0.389338
3,0.5961,0.748348,0.393076
4,0.5092,0.952224,0.38953
5,0.4389,0.947798,0.396856
6,0.3754,0.986961,0.387959
7,0.3406,0.967719,0.425148
8,0.2365,1.047758,0.404023
9,0.2067,1.101188,0.422254
10,0.186,1.119583,0.422254


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bert-base-multilingual-uncased-fr-finetuned

Getting Predictions on Test dataset

bert-base-multilingual-uncased trained on fr languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.4282846866557274
acc: 0.765
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.71      0.81      0.76        79
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         4
   3. Not Related to ILI or COVID-19 Infection       0.82      0.85      0.84       103
                           4. Ambiguous/Unsure       0.33      0.07      0.12        14

                                      accuracy                           0.77       200
                                     macro avg       0.47      0.43      0.43   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for fr language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in fr language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/mdeberta-v3-base-fr-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='microsoft/mdeberta-v3-base', batch_size=16, epochs=6, learning_rate=9.84639301079326e-05)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.8923,0.861852,0.342025
2,0.72,0.703606,0.391584
3,0.5349,0.920646,0.361323
4,0.4145,0.913368,0.406365
5,0.3022,0.938805,0.464089
6,0.2261,1.057584,0.427003


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/mdeberta-v3-base-fr-finetuned

Getting Predictions on Test dataset

mdeberta-v3-base trained on fr languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.46972427464408
acc: 0.76
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.70      0.89      0.78        79
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         4
   3. Not Related to ILI or COVID-19 Infection       0.94      0.76      0.84       103
                           4. Ambiguous/Unsure       0.24      0.29      0.26        14

                                      accuracy                           0.76       200
                                     macro avg       0.47      0.48      0.47       200
                       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for fr language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in fr language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/xlm-roberta-base-fr-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='xlm-roberta-base', batch_size=8, epochs=8, learning_rate=4.2896922812818176e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.9587,0.814152,0.377078
2,0.8226,0.80908,0.371063
3,0.6978,0.776382,0.383744
4,0.6087,0.910088,0.386629
5,0.5422,0.948769,0.383281
6,0.4979,1.123249,0.391582
7,0.408,1.183037,0.391515
8,0.3787,1.258636,0.394022


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/xlm-roberta-base-fr-finetuned

Getting Predictions on Test dataset

xlm-roberta-base trained on fr languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.38171291116580885
acc: 0.735
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.69      0.75      0.72        79
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         4
   3. Not Related to ILI or COVID-19 Infection       0.77      0.85      0.81       103
                           4. Ambiguous/Unsure       0.00      0.00      0.00        14

                                      accuracy                           0.73       200
                                     macro avg       0.36      0.40      0.38       200
                   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for it language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in it language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/twitter-xlm-roberta-base-it-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='cardiffnlp/twitter-xlm-roberta-base', batch_size=8, epochs=6, learning_rate=2.121329094057246e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.597,0.50045,0.36087
2,0.4579,0.508427,0.368859
3,0.3935,0.620608,0.377941
4,0.3103,0.677238,0.374489
5,0.2613,0.692259,0.368137
6,0.2394,0.709148,0.370205


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/twitter-xlm-roberta-base-it-finetuned

Getting Predictions on Test dataset

twitter-xlm-roberta-base trained on it languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.39732142857142855
acc: 0.885
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.58      0.72      0.64        25
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         2
   3. Not Related to ILI or COVID-19 Infection       0.94      0.95      0.95       167
                           4. Ambiguous/Unsure       0.00      0.00      0.00         6

                                      accuracy                           0.89       200
                                     macro avg       0.38      0.42      0.40       200
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for it language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in it language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bernice-it-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='jhu-clsp/bernice', batch_size=32, epochs=7, learning_rate=2.7820794319855572e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/bernice and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/bernice and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.6592,0.496709,0.226776
2,0.438,0.45873,0.383921
3,0.3526,0.490001,0.375658
4,0.3119,0.476418,0.377612
5,0.2571,0.539293,0.375658


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bernice-it-finetuned

Getting Predictions on Test dataset

bernice trained on it languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.39261450799912334
acc: 0.88
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.59      0.68      0.63        25
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         2
   3. Not Related to ILI or COVID-19 Infection       0.93      0.95      0.94       167
                           4. Ambiguous/Unsure       0.00      0.00      0.00         6

                                      accuracy                           0.88       200
                                     macro avg       0.38      0.41      0.39       200
                                  weig

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for it language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in it language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bert-base-multilingual-uncased-it-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='bert-base-multilingual-uncased', batch_size=16, epochs=9, learning_rate=1.5020978833449689e-05)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.74,0.526748,0.226776
2,0.4369,0.480151,0.334034
3,0.3432,0.479292,0.358618
4,0.2621,0.475527,0.356988
5,0.2024,0.579857,0.347601
6,0.1571,0.672185,0.343505


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/bert-base-multilingual-uncased-it-finetuned

Getting Predictions on Test dataset

bert-base-multilingual-uncased trained on it languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.35294117647058826
acc: 0.84
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.48      0.52      0.50        25
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         2
   3. Not Related to ILI or COVID-19 Infection       0.90      0.93      0.91       167
                           4. Ambiguous/Unsure       0.00      0.00      0.00         6

                                      accuracy                           0.84       200
                                     macro avg       0.34      0.36      0.35   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for it language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in it language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/mdeberta-v3-base-it-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='microsoft/mdeberta-v3-base', batch_size=32, epochs=4, learning_rate=3.399329304887933e-05)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.8327,0.579679,0.226776
2,0.5197,0.457521,0.278532
3,0.405,0.415625,0.383138
4,0.3385,0.445562,0.379199


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/mdeberta-v3-base-it-finetuned

Getting Predictions on Test dataset

mdeberta-v3-base trained on it languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.3607547917388936
acc: 0.84
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.44      0.64      0.52        25
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         2
   3. Not Related to ILI or COVID-19 Infection       0.93      0.91      0.92       167
                           4. Ambiguous/Unsure       0.00      0.00      0.00         6

                                      accuracy                           0.84       200
                                     macro avg       0.34      0.39      0.36       200
                     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Dimensions of encoded features: torch.Size([4400, 128])
Encoding contains: ['input_ids', 'token_type_ids', 'attention_mask']
Distribution of data splits for it language is (600, 4), (200, 4), (200, 4)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


4 classes in it language
Model to be saved in /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/xlm-roberta-base-it-finetuned

Training model using with configurations:
namespace(max_len=128, model_checkpoint='xlm-roberta-base', batch_size=8, epochs=8, learning_rate=1.089678819474072e-05)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.7806,0.571764,0.226776
2,0.5326,0.449375,0.226776
3,0.4447,0.431033,0.386957
4,0.3896,0.496138,0.383138
5,0.3382,0.551049,0.371672
6,0.287,0.620056,0.379412


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/evalnew/testset0.6_0.2_0.2/models/xlm-roberta-base-it-finetuned

Getting Predictions on Test dataset

xlm-roberta-base trained on it languages
free space by deleting: /gaueko0/users/nmishra/multiling_fludetection/models
Number of labels in target_names is 4
f1: 0.37454778204486006
acc: 0.865
                                                precision    recall  f1-score   support

                       1. Likely ILI infection       0.54      0.60      0.57        25
2. Likely COVID-19 Infection (after 2020 only)       0.00      0.00      0.00         2
   3. Not Related to ILI or COVID-19 Infection       0.92      0.95      0.93       167
                           4. Ambiguous/Unsure       0.00      0.00      0.00         6

                                      accuracy                           0.86       200
                                     macro avg       0.36      0.39      0.37       200
                   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# # read data split index
# splits = params['split'].unique()
# for split in splits:
    
#     cache_path = OUT_PATH.parent.joinpath('.cache')
#     cache_path.mkdir(parents=True, exist_ok=True)
#     print(f"Cache in {cache_path}")

#     dirname = f"testset{'_'.join([str(i) for i in split])}"
#     split_path = OUT_PATH.joinpath(dirname)
#     print(f"Reading data split index from: {split_path}")
#     with open(split_path.joinpath('split_idx.json'), 'r') as f:
#         split_idx = json.load(f) 
        
#     # determine languages for which to get split index
#     if params['LANG']=='all':
#         languages = [i for i in split_idx]
#     else:
#         languages = [i for i in split_idx if i in params['LANG'].split(',')]
    
#     # # train on all languages and then on each language
#     # lang_split_idx = {i:split_idx[i] for i in languages}
#     # print(f"Training data used for {params['LANG']} languages")
#     # mlm_evaluation(lang_split_idx, tweets, params, split_path, dirname, params['LANG'], cache_path)
    
#     for lang_to_train in languages:
#         if lang_to_train=='es': #or lang_to_train=='it':
#             print(f"\nTraining data used for {lang_to_train} language")
#             lang_split_idx = {}
#             lang_split_idx[lang_to_train] = split_idx[lang_to_train]
#             mlm_evaluation(lang_split_idx, tweets, params, split_path, dirname, lang_to_train, cache_path)

In [25]:
# # read data split index
# for split in params['SPLITS'][:1]:
    
#     cache_path = OUT_PATH.parent.joinpath('.cache')
#     cache_path.mkdir(parents=True, exist_ok=True)
#     print(f"Cache in {cache_path}")

#     dirname = f"testset{'_'.join([str(i) for i in split])}"
#     split_path = OUT_PATH.joinpath(dirname)
#     print(f"Reading data split index from: {split_path}")
#     with open(split_path.joinpath('split_idx.json'), 'r') as f:
#         split_idx = json.load(f) 
        
#     # determine languages for which to get split index
#     if params['LANG']=='all':
#         languages = [i for i in split_idx]
#     else:
#         languages = [i for i in split_idx if i in params['LANG'].split(',')]
    
#     # train on all languages and then on each language
#     lang_split_idx = {i:split_idx[i] for i in languages}
#     print(f"Training data used for {params['LANG']} languages")
#     # mlm_evaluation(lang_split_idx, tweets, params, split_path, dirname, params['LANG'], cache_path)
    
#     for lang_to_train in languages:
#         print(f"\nTraining data used for {lang_to_train} language")
#         lang_split_idx = {}
#         lang_split_idx[lang_to_train] = split_idx[lang_to_train]
#         mlm_evaluation(lang_split_idx, tweets, params, split_path, dirname, lang_to_train, cache_path)