# Model fine-tuning

## Initialization

#### Setup project paths:

In [None]:
import os
import sys
from os.path import join, exists
from py_markdown_table.markdown_table import markdown_table

from tqdm.notebook import tqdm

# Enable tqdm progress bars for pandas (`progress_apply` etc.)
tqdm.pandas()

# ANSI highlighting escape codes: https://stackoverflow.com/a/21786287
h_red = '\x1b[1;30;41m'
h_green = '\x1b[1;30;42m'
h_yellow = '\x1b[1;30;43m'
h_stop = '\x1b[0m'

## Setup project paths:
# Every path below is anchored at the directory the notebook was started from.
project_path = os.getcwd()
models_path = join(project_path, "Models")

datasets_path = join(project_path, "Source datasets")
multicw_path = join(project_path, 'Final-dataset')
multiclaim_path = join(datasets_path, "MultiClaim")
lesa_dst_dir = join(datasets_path, 'LESA-EACL-2021')

# Make the project's helper modules importable. The path is anchored at
# `project_path` (instead of the bare relative 'Tools') so the imports keep
# resolving even if the working directory changes later in the session.
sys.path.insert(1, join(project_path, 'Tools'))
from preprocess import to_structured
from data_filtration import filter_dataframe

## Datasets

### MultiCW dataset

In [None]:
# Load the MultiCW dataset splits
import pandas as pd
from os.path import join

# Dataset directory, relative to the current working directory.
multicw_path = join("Final-dataset")


def _load_multicw_split(split: str, columns: list) -> pd.DataFrame:
    """Read `multicw-<split>.csv`, cast `label` to int, keep *columns* and drop incomplete rows.

    A new DataFrame is returned at every step (no in-place mutation of a
    column-selected frame), which avoids pandas' SettingWithCopyWarning.
    """
    frame = pd.read_csv(join(multicw_path, f"multicw-{split}.csv")).astype({'label': 'int'})
    return frame[columns].dropna()


# Language codes present in the full dataset.
languages = pd.read_csv(join(multicw_path, 'multicw.csv'))['lang'].unique()

multicw_train = _load_multicw_split('train', ['text', 'label'])
multicw_dev = _load_multicw_split('dev', ['text', 'label'])
# The test split keeps `style` so results can be broken down by writing style.
multicw_test = _load_multicw_split('test', ['text', 'label', 'style'])

print(multicw_test.shape[0])

### MultiCW dataset - preprocessed

In [None]:
def _with_structured_text(frame):
    """Return a copy of *frame* whose `text` column was passed through `to_structured`.

    All other columns (`label`, `style`, ...) are kept, because the evaluation
    cells index the preprocessed sets by `label` and `style`.
    """
    structured = frame.copy()
    # Same call pattern as used for the manually obtained samples.
    # NOTE(review): assumes to_structured() returns one text per input row, in
    # the same order -- confirm against Tools/preprocess.py.
    structured['text'] = to_structured(frame['text'])
    return structured


print('Preprocessing MultiCW train set')
multicw_train_norm = _with_structured_text(multicw_train)
print('Preprocessing MultiCW validation set')
multicw_dev_norm = _with_structured_text(multicw_dev)
print('Preprocessing MultiCW test set')
multicw_test_norm = _with_structured_text(multicw_test)
print('Done')

### Manually obtained samples from AFP-fact-checks

In [None]:
import pandas as pd

# Placeholder samples as (text, label, url) records.
manual_rows = [
    ("Example text 1", "positive", "http://example.com/1"),
    ("Example text 2", "negative", "http://example.com/2"),
    ("Example text 3", "neutral", "http://example.com/3"),
]

# Start from an empty frame and append the records one by one via `loc`.
manual_test = pd.DataFrame(columns=["text", "label", "url"])
for row_id, row_values in enumerate(manual_rows):
    manual_test.loc[row_id] = list(row_values)

# Rewrite the sample texts with the same preprocessing helper used elsewhere.
print('Preprocessing Manually obtained samples:')
manual_test['text'] = to_structured(manual_test['text'])

## Models

### LESA model (EACL-2021)

In [None]:
import sys
import os
import shutil
import warnings
from typing import Callable
import numpy as np
import pandas as pd

from keras_preprocessing.sequence import pad_sequences
from pandas import DataFrame
from sklearn.metrics import classification_report
from keras.optimizers import Adam
from tqdm import tqdm
from transformers import BertTokenizer, AutoTokenizer

# Make the LESA helper module (`loader`) importable.
# NOTE(review): `lesa_model_path` is not defined in any cell visible in this
# notebook (the setup cell defines `models_path` and `lesa_dst_dir` only) --
# it must be defined before this cell runs, otherwise this line raises NameError.
sys.path.insert(1, lesa_model_path)

# Feature extraction, tokenizer/embedding loading and model-building helpers
# provided by the LESA code base.
from loader import sent2feature2ngram, ParentPositions, tokenize_sentences, create_attention_masks, \
    load_embedding_matrix, load_tokenizer, ind_model_noisy, ind_model_semi, ind_model_structured, final

# ANSI Highlighting: https://stackoverflow.com/a/21786287
# `gh_start` / `rh_start` open a green / red highlighted span, `h_stop` resets it.
h_stop = '\x1b[0m'
gh_start = '\x1b[1;30;42m'
rh_start = '\x1b[1;30;41m'


class LESAClaimModel():
    """ LESA: Linguistic Encapsulation and Semantic Amalgamation Based Generalised Claim Detection from Online
    Content accepted at EACL 2021. ArXiv paper [link](https://arxiv.org/abs/2101.11891)

    The wrapper owns three auxiliary models (noisy / semi-noisy / structured) and the
    combined final model, together with the tokenizers and tag-embedding matrices
    loaded from `lesa_model_path`.
    """

    def __init__(self):
        # Sequence length used for the DEP / parent-POS / TAG inputs.
        self.max_len = 30
        self.batch_size = 32

        self.final_model = None
        self.noisy_model = None
        self.semi_model = None
        self.structured_model = None

        self.noisy = None
        self.noisy_dev = None
        self.semi = None
        self.semi_dev = None
        self.struct = None
        self.struct_dev = None

        # When False, train_model() reuses the weights stored in `semantic_base`;
        # when True, it saves the current auxiliary-model weights instead.
        self.update_semantic_dbs = False

        # Set default semantic embedding datasets
        # self.set_semantic_datasets()

        # Init tokenizers as class variables
        print('Loading tokenizers: ', end='')

        # Original BERT model
        # model = os.path.join(lesa_model_path, 'bert-base')
        # self.bert_tokenizer_transformer = BertTokenizer.from_pretrained(model, local_files_only=True)

        # Multilingual BERT model
        model = "bert-base-multilingual-cased"
        self.bert_tokenizer_transformer = AutoTokenizer.from_pretrained(model)

        # EMBEDDINGS
        self.noisy_embedding_matrix_tag = load_embedding_matrix(lesa_model_path, 'embedding_matrix_tag_noisy.pickle')
        self.noisy_vocab_size_tag = self.noisy_embedding_matrix_tag.shape[0]  # 5363

        self.semi_embedding_matrix_tag = load_embedding_matrix(lesa_model_path, 'embedding_matrix_tag_semi.pickle')
        self.semi_vocab_size_tag = self.semi_embedding_matrix_tag.shape[0]  # 6137

        self.structured_embedding_matrix_tag = load_embedding_matrix(lesa_model_path, 'embedding_matrix_tag_structured.pickle')
        self.structured_vocab_size_tag = self.structured_embedding_matrix_tag.shape[0]  # 6048

        # PARENT POS TOKENIZER
        self.tokenizer_dep_parent_noisy = load_tokenizer(lesa_model_path, 'tokenizer_dep_parent_noisy.pickle')
        self.num_words_dep_parent_noisy = self.tokenizer_dep_parent_noisy.num_words  # 100

        self.tokenizer_dep_parent_semi = load_tokenizer(lesa_model_path, 'tokenizer_dep_parent_semi.pickle')
        self.num_words_dep_parent_semi = self.tokenizer_dep_parent_semi.num_words  # 200

        self.tokenizer_dep_parent_structured = load_tokenizer(lesa_model_path, 'tokenizer_dep_parent_structured.pickle')
        self.num_words_dep_parent_structured = self.tokenizer_dep_parent_structured.num_words  # 200

        # LABEL TOKENIZER
        self.tokenizer_dep_noisy = load_tokenizer(lesa_model_path, 'tokenizer_dep_noisy.pickle')
        self.num_words_dep_noisy = self.tokenizer_dep_noisy.num_words  # 6300

        self.tokenizer_dep_semi = load_tokenizer(lesa_model_path, 'tokenizer_dep_semi.pickle')
        self.num_words_dep_semi = self.tokenizer_dep_semi.num_words  # 7300

        self.tokenizer_dep_structured = load_tokenizer(lesa_model_path, 'tokenizer_dep_structured.pickle')
        self.num_words_dep_structured = self.tokenizer_dep_structured.num_words  # 7400

        # TAG TOKENIZER
        self.tokenizer_tag_noisy = load_tokenizer(lesa_model_path, 'tokenizer_tag_noisy.pickle')
        self.tokenizer_tag_semi = load_tokenizer(lesa_model_path, 'tokenizer_tag_semi.pickle')
        self.tokenizer_tag_structured = load_tokenizer(lesa_model_path, 'tokenizer_tag_structured.pickle')
        print('ok')

        # Builds the auxiliary models as a side effect and returns the combined model.
        self.final_model = self._init_model()

    def load_model(self, model_name='lesa2021') -> bool:
        """
        Load the weights of the three auxiliary models and of the final model.

        :param model_name: Name of the directory inside `models_path` that holds the `.h5` weight files.
        :return: True when all weights were loaded, False when the directory is missing or loading failed.
        """

        os.makedirs(models_path, exist_ok=True)

        model_dir = os.path.join(models_path, model_name)
        print(os.path.join(os.getcwd(), model_dir))
        if not os.path.exists(model_dir):
            print(rh_start + 'Invalid path!' + h_stop)
            return False

        print(f'Loading {model_name} model: ', end='')
        try:
            self.noisy_model.load_weights(os.path.join(model_dir, '_dep_noisy.h5'))
            self.semi_model.load_weights(os.path.join(model_dir, '_dep_semi.h5'))
            self.structured_model.load_weights(os.path.join(model_dir, '_dep_structured.h5'))

            self.final_model.load_weights(os.path.join(model_dir, '_bert_comb.h5'))
        except Exception as e:
            # Best effort: callers fall back to fine-tuning when this returns False,
            # but the reason is reported instead of being silently discarded.
            print(rh_start + " failed!" + h_stop)
            print(f'{type(e).__name__}: {e}')
            return False

        print(gh_start + " ok" + h_stop)
        return True

    def _predict_labels(self, dataset: DataFrame) -> list:
        """Run the final model on `dataset['text']` and return one predicted class index per row."""
        x = self.semantic_embeddings(dataset)

        print('Running classification:')
        metrics = self.final_model.predict(x)
        # The class scores are read from the last element of the prediction output.
        results = [np.argmax(el) for el in metrics[-1]]
        print('Done.')
        return results

    def detect_claims(self, test_set: DataFrame, verbose=False) -> tuple:
        """
        Performs inference on the testing data and evaluates results.

        :param test_set: DataFrame with a `text` and a `label` column.
        :param verbose: When True, print the classification of every sentence.
        :return: Tuple `(report_dict, report_text)` produced by sklearn's `classification_report`.
        """

        results = self._predict_labels(test_set)

        # Print sentences with classifications
        if verbose:
            for i, text in enumerate(test_set['text']):
                verdict = gh_start + "is" if results[i] == 1 else rh_start + "is not"
                print(f"LESA classification: '{text}' {verdict} a claim." + h_stop)

        # compare against ground-truth
        ground_truth = test_set['label']
        report = classification_report(ground_truth, results, output_dict=True)
        report_str = str(classification_report(ground_truth, results))

        return report, report_str

    def inference(self, texts: list[str]) -> list[bool]:
        """
        Classify raw texts without evaluating against labels.

        Mirrors the `inference()` method of the other detector classes in this notebook.

        :param texts: List of strings to classify.
        :return: One boolean per text: True if classified as a claim (class 1), else False.
        :raises ValueError: If `texts` is not a list or contains non-string items.
        """
        if not isinstance(texts, list):
            raise ValueError("Input must be a list of strings.")
        if not all(isinstance(t, str) for t in texts):
            raise ValueError("All items in input list must be strings.")
        if not texts:
            return []

        predicted = self._predict_labels(DataFrame({'text': texts}))
        return [bool(label == 1) for label in predicted]

    def train_model(self, train_set: DataFrame, dev_set: DataFrame, epochs=1, learn_rate=3e-5, model_name='', lang='en'):
        """
        Train the LESA-2021 model with the given parameters. The training consists of two phases:
        - Pre-training of semantic modules with noisy, semi-noisy and structured data respectively
        - Fine-tuning of the main BERT model together with the semantic model on the training data

        The weights are always written to `<models_path>/<model_name>-<lang>-<epochs>e/`.

        :param learn_rate: Learning rate.
        :param train_set: Training dataset (`text` and `label` columns).
        :param dev_set: Validation dataset (`text` and `label` columns).
        :param epochs: Number of training epochs.
        :param model_name: Prefix of the directory the weights are saved to.
        :param lang: Training dataset language(s). Only used as part of the directory name.
        :return: None. The trained model is kept in `self.final_model`.
        """

        model_name = f"{model_name}-{lang}-{epochs}e"
        path = os.path.join(models_path, model_name)
        semantic_path = os.path.join(lesa_model_path, 'semantic_base')
        os.makedirs(path, exist_ok=True)

        if not self.update_semantic_dbs:
            # Make sure that semantic models are loaded
            shutil.copyfile(src=os.path.join(semantic_path, '_dep_noisy.h5'), dst=os.path.join(path, '_dep_noisy.h5'))
            shutil.copyfile(src=os.path.join(semantic_path, '_dep_semi.h5'), dst=os.path.join(path, '_dep_semi.h5'))
            shutil.copyfile(src=os.path.join(semantic_path, '_dep_structured.h5'), dst=os.path.join(path, '_dep_structured.h5'))

            self.noisy_model.load_weights(os.path.join(semantic_path, '_dep_noisy.h5'))
            self.semi_model.load_weights(os.path.join(semantic_path, '_dep_semi.h5'))
            self.structured_model.load_weights(os.path.join(semantic_path, '_dep_structured.h5'))
        else:
            # If we use custom semantic models, make sure they are saved within the same folder as the model file
            self.noisy_model.save_weights(os.path.join(path, '_dep_noisy.h5'))
            print('Model saved to: ', os.path.join(path, '_dep_noisy.h5'))

            self.semi_model.save_weights(os.path.join(path, '_dep_semi.h5'))
            print('Model saved to: ', os.path.join(path, '_dep_semi.h5'))

            self.structured_model.save_weights(os.path.join(path, '_dep_structured.h5'))
            print('Model saved to: ', os.path.join(path, '_dep_structured.h5'))

        print('Train Final model:')
        x = self.semantic_embeddings(train_set)
        dev = self.semantic_embeddings(dev_set)

        # NOTE(review): only the optimizer is replaced here; presumably `final()`
        # returns an already compiled model -- confirm in the LESA loader module.
        self.final_model.optimizer = Adam(learning_rate=learn_rate)
        self.final_model.fit(x=x, y=train_set['label'], batch_size=self.batch_size, epochs=epochs,
                             validation_data=(dev, dev_set['label']))
        self.final_model.save_weights(os.path.join(path, '_bert_comb.h5'))
        print('Model saved to: ', os.path.join(path, '_bert_comb.h5'))

    def _init_model(self):
        """Build the three auxiliary models (stored on `self`) and return the combined final model."""
        print("Initializing model architecture: ")

        # aux CLAIMS-2023 model
        self.noisy_model = ind_model_noisy(embed_dim=20, num_heads=5, ff_dim=128,
                                           maxlen=self.max_len, vocab_label=self.num_words_dep_noisy,
                                           vocab_parent_pos=self.num_words_dep_parent_noisy)

        self.semi_model = ind_model_semi(embed_dim=20, num_heads=5, ff_dim=128,
                                         maxlen=self.max_len, vocab_label=self.num_words_dep_semi,
                                         vocab_parent_pos=self.num_words_dep_parent_semi)

        self.structured_model = ind_model_structured(embed_dim=20, num_heads=5, ff_dim=128,
                                                     maxlen=self.max_len, vocab_label=self.num_words_dep_structured,
                                                     vocab_parent_pos=self.num_words_dep_parent_structured)

        # Tag-embedding settings handed to `final()` for each data style.
        parameters_dict_noisy = {
            "vocab_size_tag": self.noisy_vocab_size_tag,
            "EMBEDDING_DIM_TAG": 20,
            "embedding_matrix_tag": self.noisy_embedding_matrix_tag,
            "maxlen_tag": self.max_len
        }

        parameters_dict_semi = {
            "vocab_size_tag": self.semi_vocab_size_tag,
            "EMBEDDING_DIM_TAG": 20,
            "embedding_matrix_tag": self.semi_embedding_matrix_tag,
            "maxlen_tag": self.max_len
        }

        parameters_dict_structured = {
            "vocab_size_tag": self.structured_vocab_size_tag,
            "EMBEDDING_DIM_TAG": 20,
            "embedding_matrix_tag": self.structured_embedding_matrix_tag,
            "maxlen_tag": self.max_len
        }

        final_model = final(lesa_model_path, self.noisy_model, self.semi_model, self.structured_model, parameters_dict_noisy,
                            parameters_dict_semi, parameters_dict_structured, max_seq_length=60)

        print('ok')

        return final_model

    def semantic_embeddings(self, dataset: DataFrame):
        """
        Build the input dictionary expected by the final model.

        :param dataset: DataFrame with a `text` column. It is copied, the caller's frame is not modified.
        :return: Dict of numpy arrays keyed by the model input names (DEP labels, parent POS, TAG and BERT inputs).
        """
        # Work on a copy so the helper columns below do not leak into the caller's frame.
        dataset = dataset.copy()

        print("Getting dependency and POS tags...")

        # Processing DEP tags
        progress = tqdm(dataset['text'].copy())
        progress.set_description('sentence2pos tags:')
        dataset.loc[:, 'DEP'] = [sent2feature2ngram(row) for row in progress]

        # Processing parent POS tags
        progress = tqdm(dataset['text'].copy())
        progress.set_description('Parent POS tags:')
        dataset.loc[:, "parent_pos"] = [ParentPositions(row) for row in progress]

        # Processing TAG tags
        progress = tqdm(dataset['text'].copy())
        progress.set_description('sentence2tag tags:')
        dataset.loc[:, 'TAG'] = [sent2feature2ngram(row, feature="TAG") for row in progress]

        print("POS Tags: complete!")

        # COMMON TEST REP | DEP
        print("Getting dependency and DEP tags...")
        noisy = self.tokenizer_dep_noisy.texts_to_sequences(dataset['DEP'])
        semi = self.tokenizer_dep_semi.texts_to_sequences(dataset['DEP'])
        structured = self.tokenizer_dep_structured.texts_to_sequences(dataset['DEP'])

        p_noisy = self.tokenizer_dep_parent_noisy.texts_to_sequences(dataset['parent_pos'])
        p_semi = self.tokenizer_dep_parent_semi.texts_to_sequences(dataset['parent_pos'])
        p_structured = self.tokenizer_dep_parent_structured.texts_to_sequences(dataset['parent_pos'])

        noisy = pad_sequences(noisy, maxlen=self.max_len)
        semi = pad_sequences(semi, maxlen=self.max_len)
        structured = pad_sequences(structured, maxlen=self.max_len)

        p_noisy = pad_sequences(p_noisy, maxlen=self.max_len)
        p_semi = pad_sequences(p_semi, maxlen=self.max_len)
        p_structured = pad_sequences(p_structured, maxlen=self.max_len)

        print("DEP Tags: complete!")

        # BERT REP: TEST
        print("Creating BERT Embeddings...")
        input_ids = tokenize_sentences(dataset['text'], self.bert_tokenizer_transformer, 60)
        input_ids = pad_sequences(input_ids, maxlen=60, dtype="long", value=0, truncating="post", padding="post")
        attention_masks = create_attention_masks(input_ids)

        # COMMON TEST REP | TAG
        tag_noisy = self.tokenizer_tag_noisy.texts_to_sequences(dataset['TAG'])
        tag_semi = self.tokenizer_tag_semi.texts_to_sequences(dataset['TAG'])
        tag_structured = self.tokenizer_tag_structured.texts_to_sequences(dataset['TAG'])

        # Clamp token ids that fall outside the noisy tag-embedding vocabulary.
        # NOTE(review): only the noisy inputs are clamped, and the DEP / parent-POS
        # ids are clamped against the *tag* vocabulary size -- confirm this is intended.
        noisy[noisy >= self.noisy_vocab_size_tag] = self.noisy_vocab_size_tag - 1
        p_noisy[p_noisy >= self.noisy_vocab_size_tag] = self.noisy_vocab_size_tag - 1
        tag_noisy = np.array([np.array(x) for x in tqdm(tag_noisy)], dtype=object)
        for arrays in tag_noisy:
            arrays[arrays >= self.noisy_vocab_size_tag] = self.noisy_vocab_size_tag - 1

        tag_noisy = pad_sequences(tag_noisy, maxlen=self.max_len)
        tag_semi = pad_sequences(tag_semi, maxlen=self.max_len)
        tag_structured = pad_sequences(tag_structured, maxlen=self.max_len)

        x = {"label_noisy": np.array(noisy), "parent_pos_noisy": np.array(p_noisy),
             "label_semi": np.array(semi), "parent_pos_semi": np.array(p_semi),
             "label_structured": np.array(structured),
             "parent_pos_structured": np.array(p_structured),
             "inp_noisy": np.array(tag_noisy),
             "inp_semi": np.array(tag_semi),
             "inp_structured": np.array(tag_structured),
             'input_word_ids': np.array(input_ids), 'input_masks': np.array(attention_masks)}

        return x




### XLM-RoBERTa

In [None]:
# Build a standalone text-classifier preprocessor from the XLM-RoBERTa preset.
# The XLMRobertaModel class creates its own preprocessor in __init__, so this
# module-level instance is not referenced by the other cells visible here.
import keras_hub
import keras
preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
    "xlm_roberta_base_multi",
    sequence_length=256 # Optional.
)

In [None]:
import sys
import os
import shutil
import warnings
from typing import Callable
import numpy as np
import pandas as pd

import keras_hub
import keras

from keras_preprocessing.sequence import pad_sequences
from pandas import DataFrame
from sklearn.metrics import classification_report
from keras.optimizers import Adam
from tqdm import tqdm
from transformers import BertTokenizer, AutoTokenizer


class XLMRobertaModel():
    """ Fine-tuning and inference wrapper around the KerasHub XLM-RoBERTa text classifier. """

    def __init__(self):
        # Token sequence length handed to the preprocessor.
        self.max_len = 256
        self.batch_size = 32

        # Set by load_model() / train_model().
        self.final_model = None

        self.preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
            'xlm_roberta_base_multi',
            sequence_length=self.max_len
        )

    def _require_model(self) -> None:
        """Raise a descriptive error when no model has been loaded or trained yet."""
        if self.final_model is None:
            raise RuntimeError('No model available: call load_model() or train_model() first.')

    def load_model(self, model_name) -> bool:
        """
        Load fine-tuned weights from `Models/<model_name>.weights.h5`.

        `self.final_model` is only replaced when loading succeeds, so a failed
        attempt never leaves an untrained classifier behind.

        :param model_name: Checkpoint name without the `.weights.h5` suffix.
        :return: True on success, False if the file is missing or loading failed.
        """
        weights_path = os.path.join('Models', f'{model_name}.weights.h5')
        if not os.path.exists(weights_path):
            # Checked up front so the preset is not instantiated for nothing.
            print(f"Error loading model: weights file not found at: {weights_path}")
            return False

        try:
            # 1. Instantiate the model with the known preset (architecture)
            model = keras_hub.models.XLMRobertaTextClassifier.from_preset(
                'xlm_roberta_base_multi',
                num_classes=2,
                preprocessor=self.preprocessor,
                dropout=0.2
            )

            # 2. Load the weights manually from the path
            model.load_weights(weights_path)
        except Exception as e:
            print(f"Error loading model: {e}")
            return False

        self.final_model = model
        print('Model loaded successfully.')
        return True

    def detect_claims(self, test_set: DataFrame, verbose=False) -> tuple:
        """
        Performs inference on the testing data and evaluates results.

        :param test_set: DataFrame with a `text` and a `label` column.
        :param verbose: When True, print the classification of every sentence.
        :return: Tuple `(report_dict, report_text)` produced by sklearn's `classification_report`.
        :raises RuntimeError: If no model has been loaded or trained.
        """
        self._require_model()

        print('Running classification:')
        metrics = self.final_model.predict(x=test_set['text'].to_numpy(), batch_size=self.batch_size)

        # Predicted class = index of the highest score in every row.
        results = np.argmax(metrics, axis=1)
        print('Done.')
        # Print sentences with classifications
        if verbose:
            for i, text in enumerate(test_set['text']):
                verdict = gh_start + "is" if results[i] == 1 else rh_start + "is not"
                print(f"XLM-RoBERTa classification: '{text}' {verdict} a claim." + h_stop)

        # Compare against ground-truth
        ground_truth = test_set['label'].to_numpy()
        report = classification_report(ground_truth, results, output_dict=True)
        report_str = str(classification_report(ground_truth, results))

        return report, report_str

    def train_model(self, train_set: DataFrame, dev_set: DataFrame, epochs=1, learn_rate=3e-5, model_name='', lang='en'):
        """
        Train the XLMRoberta model with the given parameters.

        The run is stored under the name `<model_name>-<lang>-<epochs>e`:
        - the full model in `Models/LESA/models/<name>/model.keras`;
        - the weights in `Models/<name>.weights.h5`, the file `load_model()` reads.

        :param learn_rate: Learning rate.
        :param train_set: Training dataset (`text` and `label` columns).
        :param dev_set: Validation dataset (`text` and `label` columns).
        :param epochs: Number of training epochs.
        :param model_name: Prefix of the name the model is saved under.
        :param lang: Training dataset language(s). Only used as part of the saved name.
        :return: None. The trained model is kept in `self.final_model`.
        """

        model_name_path = f"{model_name}-{lang}-{epochs}e"
        path = os.path.join('Models', 'LESA', 'models', model_name_path)
        os.makedirs(path, exist_ok=True)

        self.final_model = keras_hub.models.XLMRobertaTextClassifier.from_preset(
            'xlm_roberta_base_multi',
            num_classes=len(train_set['label'].unique()),
            preprocessor=self.preprocessor,
            dropout=0.2
        )

        # Fine-tune the whole network.
        for layer in self.final_model.layers:
            layer.trainable = True

        self.final_model.compile(
            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer=keras.optimizers.Adam(learn_rate),
            jit_compile=True,
        )

        self.final_model.fit(x=train_set['text'].to_numpy(), y=train_set['label'].to_numpy(), batch_size=self.batch_size,
                             epochs=epochs, validation_data=(dev_set['text'].to_numpy(), dev_set['label'].to_numpy()))

        model_save_path = os.path.join(path, 'model.keras')
        self.final_model.save(model_save_path)

        # Also store the weights where load_model() looks for them; without this
        # a freshly trained model could never be found again by load_model().
        self.final_model.save_weights(os.path.join('Models', f'{model_name_path}.weights.h5'))

    def inference(self, texts: list[str]) -> list[bool]:
        """
        Returns a list of booleans: True if classified as a claim (class 1), else False.

        :param texts: List of strings to classify.
        :raises ValueError: If `texts` is not a list or contains non-string items.
        :raises RuntimeError: If no model has been loaded or trained.
        """
        if not isinstance(texts, list):
            raise ValueError("Input must be a list of strings.")
        if not all(isinstance(t, str) for t in texts):
            raise ValueError("All items in input list must be strings.")
        if not texts:
            return []
        self._require_model()

        predictions = self.final_model.predict(texts)  # Must be list[str]
        classifications = np.argmax(predictions, axis=1)  # shape (batch_size,)

        return (classifications == 1).tolist()


### mDeBERTa

In [None]:
# Build a standalone text-classifier preprocessor from the multilingual DeBERTa-v3 preset.
# The MDeBertaModel class creates its own preprocessor in __init__, so this
# module-level instance is not referenced by the other cells visible here.
import keras_hub
import keras
preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
    "deberta_v3_base_multi",
    sequence_length=256 # Optional.
)

In [None]:
import sys
import os
import shutil
import warnings
from typing import Callable
import numpy as np
import pandas as pd

import keras_hub
import keras

from keras_preprocessing.sequence import pad_sequences
from pandas import DataFrame
from sklearn.metrics import classification_report
from keras.optimizers import Adam
from tqdm import tqdm
from transformers import BertTokenizer, AutoTokenizer
from tensorflow.keras.optimizers.schedules import CosineDecay


class MDeBertaModel():
    """ Fine-tuning and inference wrapper around the KerasHub multilingual DeBERTa-v3 classifier. """

    def __init__(self, model_name=''):
        # `model_name` is accepted for backward compatibility; it is not used here.
        self.max_len = 256
        self.batch_size = 32
        # Set by load_model() / train_model().
        self.final_model = None

        self.preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
            'deberta_v3_base_multi',
            sequence_length=self.max_len
        )

    def _require_model(self) -> None:
        """Raise a descriptive error when no model has been loaded or trained yet."""
        if self.final_model is None:
            raise RuntimeError('No model available: call load_model() or train_model() first.')

    def load_model(self, model_name) -> bool:
        """
        Load a full saved model from `Models/<model_name>.keras`.

        :param model_name: Checkpoint name without the `.keras` suffix.
        :return: True on success, False if the file is missing or loading failed.
        """
        try:
            model_path = os.path.join('Models', model_name + '.keras')

            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Saved model not found at: {model_path}")

            print(f"Loading full model from: {model_path}")
            self.final_model = keras.models.load_model(model_path)
            print("Model loaded successfully.")
            return True
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            return False

    def detect_claims(self, test_set: DataFrame, verbose=False) -> tuple:
        """
        Performs inference on the testing data and evaluates results.

        :param test_set: DataFrame with a `text` and a `label` column.
        :param verbose: When True, print the classification of every sentence.
        :return: Tuple `(report_dict, report_text)` produced by sklearn's `classification_report`.
        :raises RuntimeError: If no model has been loaded or trained.
        """
        self._require_model()

        print('Running classification:')
        metrics = self.final_model.predict(x=test_set['text'].to_numpy(), batch_size=self.batch_size)

        # Predicted class = index of the highest score in every row.
        results = np.argmax(metrics, axis=1)
        print('Done.')
        # Print sentences with classifications
        if verbose:
            for i, text in enumerate(test_set['text']):
                verdict = gh_start + "is" if results[i] == 1 else rh_start + "is not"
                print(f"mDeBERTa classification: '{text}' {verdict} a claim." + h_stop)

        # Compare against ground-truth
        ground_truth = test_set['label'].to_numpy()
        report = classification_report(ground_truth, results, output_dict=True)
        report_str = str(classification_report(ground_truth, results))

        return report, report_str

    def train_model(self, train_set: DataFrame, dev_set: DataFrame, epochs=1, learn_rate=3e-5, model_name='', lang='en', final_learn_rate_fraction=0.5):
        """
        Train the mDeBerta model with the given parameters.

        The learning rate follows a cosine decay from `learn_rate` down to
        `learn_rate * final_learn_rate_fraction` over the whole training run.
        The full model is saved to `Models/<model_name>-<lang>-<epochs>e.keras`,
        the location `load_model()` reads from.

        :param learn_rate: Initial learning rate.
        :param train_set: Training dataset (`text` and `label` columns).
        :param dev_set: Validation dataset (`text` and `label` columns).
        :param epochs: Number of training epochs.
        :param model_name: Prefix of the file name the model is saved under.
        :param lang: Training dataset language(s). Only used as part of the file name.
        :param final_learn_rate_fraction: Final learning rate as a fraction of `learn_rate`.
        :return: None. The trained model is kept in `self.final_model`.
        """

        self.final_model = keras_hub.models.DebertaV3Classifier.from_preset(
            'deberta_v3_base_multi',
            num_classes=len(train_set['label'].unique()),
            preprocessor=self.preprocessor,
            dropout=0.2
        )

        # Fine-tune the whole network.
        for layer in self.final_model.layers:
            layer.trainable = True

        # Number of optimizer steps of the whole run. Ceiling division counts the
        # final partial batch of every epoch, and the lower bound of 1 keeps the
        # schedule valid for datasets smaller than one batch.
        steps_per_epoch = -(-len(train_set) // self.batch_size)
        decay_steps = max(1, steps_per_epoch * epochs)

        lr_schedule = CosineDecay(
            initial_learning_rate=learn_rate,
            decay_steps=decay_steps,
            alpha=final_learn_rate_fraction  # Minimum learning rate as a fraction of the initial one
        )

        self.final_model.compile(
            loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
            jit_compile=True
        )

        self.final_model.fit(x=train_set['text'].to_numpy(), y=train_set['label'].to_numpy(), batch_size=self.batch_size,
                             epochs=epochs, validation_data=(dev_set['text'].to_numpy(), dev_set['label'].to_numpy()))

        # Save next to the checkpoints load_model() reads. The original code used an
        # undefined `path` here, which raised NameError after training had finished.
        os.makedirs('Models', exist_ok=True)
        model_save_path = os.path.join('Models', f"{model_name}-{lang}-{epochs}e.keras")
        self.final_model.save(model_save_path)

    def inference(self, texts: list[str]) -> list[bool]:
        """
        Returns a list of booleans: True if classified as a claim (class 1), else False.

        :param texts: List of strings to classify.
        :raises ValueError: If `texts` is not a list or contains non-string items.
        :raises RuntimeError: If no model has been loaded or trained.
        """
        if not isinstance(texts, list):
            raise ValueError("Input must be a list of strings.")
        if not all(isinstance(t, str) for t in texts):
            raise ValueError("All items in input list must be strings.")
        if not texts:
            return []
        self._require_model()

        predictions = self.final_model.predict(texts)  # Must be list[str]
        classifications = np.argmax(predictions, axis=1)  # shape (batch_size,)

        return (classifications == 1).tolist()

## Experiments

### Models fine-tuning on MultiCW dataset
- Fine-tuning of XLM-RoBERTa, mDeBERTa and LESA models on the MultiCW train set
- Evaluation on MultiCW test set

In [None]:
model = None
detector = None
models = ['xlm', 'mdb', 'lesa']

# Fine-tuning settings. Note: works well with a small learning rate (e.g. 3e-6).
finetune_epochs = 5
finetune_learn_rate = 2e-6
# Learning-rate tag embedded in the checkpoint names ('2e6' stands for 2e-6).
checkpoint_tag = '2e6'

# Every step below runs once per model. Previously the load / fine-tune /
# evaluate code sat outside the loop, so only the last detector was processed.
for model in models:
    if model == 'xlm':
        print(f'{h_green}XLM-RoBERTa model:{h_stop}')
        detector = XLMRobertaModel()
    elif model == 'mdb':
        print(f'{h_green}mDBERTa model:{h_stop}')
        detector = MDeBertaModel()
    elif model == 'lesa':
        print(f'{h_green}LESA model:{h_stop}')
        detector = LESAClaimModel()

    checkpoint = f'{model}-multicw-{checkpoint_tag}-{finetune_epochs}e'
    if not detector.load_model(model_name=checkpoint):
        print(f'{h_yellow}No model found. Initiating fine-tuning:{h_stop}')
        # train_model() names its output "<model_name>-<lang>-<epochs>e" and uses
        # `lang` for nothing else, so the tag is passed in that slot to make the
        # saved name equal to `checkpoint` (the name probed above and by the
        # evaluation cells).
        detector.train_model(multicw_train, multicw_dev, epochs=finetune_epochs, learn_rate=finetune_learn_rate,
                             lang=checkpoint_tag, model_name=f'{model}-multicw')

    print(f'{h_green}MultiCW overall:{h_stop}')
    _, report = detector.detect_claims(multicw_test)
    print(report)

    test_noisy = multicw_test.loc[multicw_test['style']=='noisy']
    _, report = detector.detect_claims(test_noisy, verbose=False)
    print(f'{h_yellow}MultiCW Noisy Part:{h_stop}')
    print(report)

    test_strut = multicw_test.loc[multicw_test['style']=='struct']
    _, report = detector.detect_claims(test_strut, verbose=False)
    print(f'{h_yellow}MultiCW Structured Part:{h_stop}')
    print(report)

### Fine-tuned models evaluation with preprocessing
- Use Qwen2.5-7b model to convert all the samples to a structured writing style before evaluation

In [None]:
model = None
detector = None
models = ['xlm', 'mdb', 'lesa']

# Every step below runs once per model. Previously the evaluation code sat
# outside the loop, so only the last detector was evaluated.
for model in models:
    if model == 'xlm':
        print(f'{h_green}XLM-RoBERTa model:{h_stop}')
        detector = XLMRobertaModel()
    elif model == 'mdb':
        print(f'{h_green}mDBERTa model:{h_stop}')
        detector = MDeBertaModel()
    elif model == 'lesa':
        print(f'{h_green}LESA model:{h_stop}')
        detector = LESAClaimModel()

    # Explicit check instead of `assert`, which is skipped when Python runs with -O.
    if not detector.load_model(model_name=f'{model}-multicw-2e6-5e'):
        raise RuntimeError(f'{h_red}No model found!{h_stop}')

    print(f'{h_green}MultiCW overall:{h_stop}')
    _, report = detector.detect_claims(multicw_test_norm)
    print(report)

    test_noisy = multicw_test_norm.loc[multicw_test_norm['style']=='noisy']
    _, report = detector.detect_claims(test_noisy, verbose=False)
    print(f'{h_yellow}MultiCW Noisy Part:{h_stop}')
    print(report)

    test_strut = multicw_test_norm.loc[multicw_test_norm['style']=='struct']
    _, report = detector.detect_claims(test_strut, verbose=False)
    print(f'{h_yellow}MultiCW Structured Part:{h_stop}')
    print(report)

### Out-domain evaluation
- Evaluation of the fine-tuned models on manually obtained samples from factcheck.afp.com, with the preprocessing applied

In [None]:
model = None
detector = None
models = ['xlm', 'mdb', 'lesa']

# inference() expects a plain list of strings, not a DataFrame.
manual_texts = manual_test['text'].tolist()

# Every step below runs once per model. Previously the evaluation code sat
# outside the loop, so only the last detector was evaluated.
for model in models:
    if model == 'xlm':
        print(f'{h_green}XLM-RoBERTa model:{h_stop}')
        detector = XLMRobertaModel()
    elif model == 'mdb':
        print(f'{h_green}mDBERTa model:{h_stop}')
        detector = MDeBertaModel()
    elif model == 'lesa':
        print(f'{h_green}LESA model:{h_stop}')
        detector = LESAClaimModel()

    # Explicit check instead of `assert`, which is skipped when Python runs with -O.
    if not detector.load_model(model_name=f'{model}-multicw-2e6-5e'):
        raise RuntimeError(f'{h_red}No model found!{h_stop}')

    # Not every detector class is guaranteed to expose inference(); report and
    # move on instead of aborting the remaining models.
    if not hasattr(detector, 'inference'):
        print(f'{h_yellow}The {model} detector does not implement inference(); skipping.{h_stop}')
        continue

    results = detector.inference(manual_texts)
    for text, worthy in zip(manual_texts, results):
        if worthy:
            print(f'{h_green}True{h_stop}: {text}')
        else:
            print(f'{h_red}False{h_stop}: {text}')
