# An In-depth Evaluation of Approaches to Text Classification (IDEATC)

## III. Neural Supervised Baselines

_This notebook establishes baselines for text classification using neural supervised learning approaches: fastText, a Convolutional Neural Network (CNN) and a fine-tuned DeBERTa Transformer._

### Libraries

In [None]:
# standard library
import os
from pathlib import Path

# data wrangling
import datasets

# deep learning
from transformers import AutoTokenizer

# local packages
import src
from src.experiments import supervised
from src.frameworks import fasttext, pytorch, transformers

# processed datasets are read from, and experiment results written to,
# sibling folders under `data` in the parent directory
LOAD_PATH_DATASET = Path(os.pardir) / 'data' / 'processed'
SAVE_PATH_RESULTS = Path(os.pardir) / 'data' / 'results'

## I. fastText

In [None]:
# Run the fastText baseline once for every processed dataset on disk.
for dataset_dir in LOAD_PATH_DATASET.glob('*processed*'):
    dataset_dict = datasets.load_from_disk(dataset_dir)
    experiment_kwargs = {
        'dataset_dict': dataset_dict,
        'feature': 'text',
        'get_model': fasttext.models.get_fasttext,
        'search_params': {},  # auto-tuned internally
        'optimisation': fasttext.optimisation,
        'sample_sizes': src.experiments.utils.get_sample_sizes(dataset_dict['train']),
        'progress_bar': True,
        'experiment_id': 'fasttext',
        # results are stored in a folder named after the dataset folder
        'save_path': SAVE_PATH_RESULTS / dataset_dir.name,
    }
    supervised.run_experiment(**experiment_kwargs)
print('Done!')

## II. Convolutional Neural Network (CNN)

In [None]:
# load the pretrained tokeniser published under the 'albert-base-v2' checkpoint
tokeniser = AutoTokenizer.from_pretrained('albert-base-v2')
# bare expression: shown as the cell output when run in the notebook
tokeniser.vocab_size

In [None]:
def tokenise(example: dict) -> dict:
    """Encode the 'text' field of `example` with the module-level `tokeniser`.

    Sequences are truncated to at most 1024 tokens and are left unpadded.
    """
    encoding_options = {'padding': False, 'truncation': True, 'max_length': 1024}
    return tokeniser(example['text'], **encoding_options)

In [None]:
# Run the CNN baseline once for every processed dataset on disk.
for path in LOAD_PATH_DATASET.glob('*processed*'):
    dataset = datasets.load_from_disk(path)
    # sample sizes are derived from the training split before it is tokenised
    sample_sizes = src.experiments.utils.get_sample_sizes(dataset['train'])
    # batched=True hands the tokeniser lists of texts rather than one example
    # per call; padding is disabled, so every text is still encoded on its own
    # and the resulting `input_ids` are the same as with per-example mapping
    dataset = dataset.map(tokenise, batched=True).with_format('torch')

    def get_model():
        """Return a CNN from `pytorch.models.get_cnn` for the current dataset.

        The number of classes is read from the label names of the training
        split and the vocabulary size from the tokeniser.
        """
        return pytorch.models.get_cnn(
            num_class=len(dataset['train'].features['label'].names),
            vocab_size=tokeniser.vocab_size,
        )

    # built anew on every iteration so each run receives its own dict
    params = {
        'epochs': 50,  # max number of epochs
        'patience': 3,  # for early stopping
        'batch_size': 32,
    }
    supervised.run_experiment(
        dataset_dict=dataset,
        feature='input_ids',
        get_model=get_model,
        search_params=params,
        optimisation=pytorch.optimisation,
        sample_sizes=sample_sizes,
        progress_bar=True,
        experiment_id='cnn',
        save_path=SAVE_PATH_RESULTS.joinpath(path.name),
    )
# completion marker, matching the other experiment cells
print('Done!')

## III. Transformer (DeBERTa)

In [None]:
# Run the DeBERTa baseline once for every processed dataset on disk.
for dataset_dir in LOAD_PATH_DATASET.glob('*processed*'):
    dataset_dict = datasets.load_from_disk(dataset_dir)
    available_sizes = src.experiments.utils.get_sample_sizes(dataset_dict['train'])
    # keep a single sample size: the largest available one, capped at 2**15
    capped_size = min(2**15, available_sizes[-1])

    def get_model():
        """Return the 'microsoft/deberta-v3-small' model for the current dataset.

        The number of classes is read from the label names of the training split.
        """
        label_names = dataset_dict['train'].features['label'].names
        return transformers.models.get_transformer(
            model_name='microsoft/deberta-v3-small',
            num_class=len(label_names),
        )

    experiment_kwargs = {
        'dataset_dict': dataset_dict,
        'feature': 'text',
        'get_model': get_model,
        'search_params': {'batch_size': 16},  # uses early stopping internally
        'optimisation': transformers.optimisation,
        'sample_sizes': [capped_size],
        'max_runs': 1,
        'progress_bar': True,
        'experiment_id': 'deberta_v3_small_finetuned',
        # results are stored in a folder named after the dataset folder
        'save_path': SAVE_PATH_RESULTS / dataset_dir.name,
    }
    supervised.run_experiment(**experiment_kwargs)
print('Done!')

## IV. Sanity Check

In [None]:
# inspect what the experiment cells saved under SAVE_PATH_RESULTS
# NOTE(review): presumably reports the best score per experiment/dataset —
# confirm against src.experiments.utils.show_best_results
src.experiments.utils.show_best_results(SAVE_PATH_RESULTS)