# An In-depth Evaluation of Approaches to Text Classification (IDEATC)

## V. Reporting Results

_This notebook creates the output tables and figures for the paper._

### Libraries

In [None]:
# standard library
import os
from pathlib import Path

# data wrangling
import datasets
import pandas as pd
import xarray as xr

# machine learning
from scipy.special import softmax
from sklearn.metrics import classification_report, top_k_accuracy_score

# local packages
import src

# input/output folders, all expressed relative to the parent of the working directory
LOAD_PATH_DATASET = Path(os.pardir) / 'data' / 'processed'
LOAD_PATH_PROMPTING = Path(os.pardir) / 'data' / 'prompting'
SAVE_PATH_RESULTS = Path(os.pardir) / 'data' / 'results'
SAVE_PATH_FIGURES = Path(os.pardir) / 'figures'

## I. Overall Performance

In [None]:
# load the best results, then append the row-wise mean over all columns as 'macroaverage'
best_results = src.experiments.utils.show_best_results(SAVE_PATH_RESULTS)
macroaverage = best_results.mean(axis=1).rename('macroaverage')
df_table = best_results.join(macroaverage).round(2)
df_table

In [None]:
# reference row: the fine-tuned model
series_supervised = df_table.loc['deberta_v3_small_finetuned']

# column-wise maximum over the three zero-shot rows
zeroshot_ids = [
    'deberta_v3_small_zeroshot',
    'deberta_v3_xsmall_zeroshot',
    'deberta_v3_base_zeroshot',
]
series_zeroshot = df_table.loc[zeroshot_ids].max()

In [None]:
# best zero-shot score as a percentage of the fine-tuned score, shown as a one-row table
# (append `.to_clipboard(index=False)` to the last line to copy it)
ratio = series_zeroshot / series_supervised
ratio_table = ratio.to_frame().T
(ratio_table * 100).round(1)

In [None]:
# copy the results table to the system clipboard; index=False leaves out the row labels
df_table.to_clipboard(index=False)

## II. Learning Curves

In [None]:
# display name -> processed dataset folder; every entry is built from LOAD_PATH_DATASET
# so the data location is configured in a single place
name2path = {
    'Rotten Tomatoes': LOAD_PATH_DATASET.joinpath('rotten_tomatoes_processed'),
    'IMDb': LOAD_PATH_DATASET.joinpath('imdb_processed'),
    'Yelp-2': LOAD_PATH_DATASET.joinpath('yelp_polarity_processed'),
    'Yelp-5': LOAD_PATH_DATASET.joinpath('yelp_review_full_processed'),
    'SST-5': LOAD_PATH_DATASET.joinpath('setfit_sst5_processed'),
    'Dynasent (R2)': LOAD_PATH_DATASET.joinpath('dynabench_dynasent_processed'),
    'AG News': LOAD_PATH_DATASET.joinpath('ag_news_processed'),
    '20 Newsgroups': LOAD_PATH_DATASET.joinpath('20_newsgroups_processed'),
    'DBpedia14': LOAD_PATH_DATASET.joinpath('dbpedia_14_processed'),
    'Web of Science': LOAD_PATH_DATASET.joinpath('web_of_science_processed'),
}

In [None]:
# read the metrics of every dataset (looked up by folder name) and plot them in one figure
display_names = list(name2path)
list_df_metrics = []
for dataset_path in name2path.values():
    list_df_metrics.append(src.experiments.utils.read_metrics(SAVE_PATH_RESULTS, dataset_path.name))

fig = src.plotting.plot_performance_overall_all(list_df_metrics, display_names)
fig.update_layout(height=600, width=1200)
fig.write_image(SAVE_PATH_FIGURES / 'figure_1.svg')

In [None]:
# class order shared by both five-point sentiment datasets
sentiment_order = ('Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive')

dataset_names = ('yelp_review_full_processed', 'setfit_sst5_processed')
for idx, dataset_name in enumerate(dataset_names, start=1):
    df_metrics = src.experiments.utils.read_metrics(SAVE_PATH_RESULTS, dataset_name)
    fig = src.plotting.plot_performance_by_class(
        df_metrics=df_metrics,
        per_row=5,
        target_order=list(sentiment_order),  # fresh list for every call
    )
    fig.update_layout(height=360, width=1200)
    # one file per dataset: figure_2_1.svg, figure_2_2.svg
    fig.write_image(SAVE_PATH_FIGURES / f'figure_2_{idx}.svg')

In [None]:
# (dataset folder, panels per row, figure height) for each sub-figure
figure_specs = (
    ('ag_news_processed', 4, 360),
    ('dbpedia_14_processed', 5, 600),
)
dataset_names = tuple(spec[0] for spec in figure_specs)
for idx, (dataset_name, per_row, height) in enumerate(figure_specs, start=1):
    df_metrics = src.experiments.utils.read_metrics(SAVE_PATH_RESULTS, dataset_name)
    fig = src.plotting.plot_performance_by_class(df_metrics=df_metrics, per_row=per_row)
    fig.update_layout(height=height, width=1200)
    # one file per dataset: figure_3_1.svg, figure_3_2.svg
    fig.write_image(SAVE_PATH_FIGURES / f'figure_3_{idx}.svg')

## III. Top k Accuracy

In [None]:
dataset_name = '20_newsgroups_processed'
# read the saved logits from the configured results folder instead of a hard-coded relative path
scores = xr.open_dataarray(SAVE_PATH_RESULTS.joinpath(f'{dataset_name}_deberta_v3_base_zeroshot_logits.nc'))
# matching processed dataset; its test split provides the ground-truth labels
dataset = datasets.load_from_disk(LOAD_PATH_DATASET.joinpath(dataset_name))

In [None]:
# for multilabel classification, use softmax over contradiction and entailment within each example;
# the second column (entailment) gives the desired probabilities
# NOTE: kept under its own name so this 3-D array never replaces the 2-D multiclass `probs`
# that the classification report relies on
probs_multilabel = softmax(scores.loc[:, :, ['contradiction', 'entailment']], axis=2)
probs_multilabel.shape

In [None]:
# for multiclass classification, keep only the entailment logits and apply softmax along axis 1
entailment_logits = scores.sel(classes='entailment')
probs = softmax(entailment_logits, axis=1)
probs.shape

In [None]:
# compare the test labels with the highest-probability class of each example
y_true = dataset['test']['label']
y_pred = probs.argmax(axis=1)
print(classification_report(y_true=y_true, y_pred=y_pred))

In [None]:
# top-k accuracy (k = 1..5) for every saved logits file of the datasets listed below
dataset_names = {
    'ag_news_processed': 'AG News',
    '20_newsgroups_processed': '20 Newsgroups',
    'dbpedia_14_processed': 'DBpedia14',
    'web_of_science_processed': 'Web of Science',
}
records = list()
for dataset_name in dataset_names:
    dataset = datasets.load_from_disk(LOAD_PATH_DATASET.joinpath(dataset_name))
    # the labels depend on neither the experiment nor k, so read them once per dataset
    y_true = dataset['test']['label']
    # sorted() keeps the row order reproducible; plain glob order depends on the filesystem
    for path in sorted(SAVE_PATH_RESULTS.glob(f'{dataset_name}*.nc')):
        # '<dataset>_processed_<experiment>_logits.nc' -> '<experiment>'
        experiment_id = path.name.split('processed_')[-1].split('_logits')[0]
        scores = xr.open_dataarray(path)
        probs = softmax(scores.sel(classes='entailment'), axis=1)
        for k in range(1, 6):
            accuracy = top_k_accuracy_score(y_true=y_true, y_score=probs, k=k)
            record = {
                'experiment_id': experiment_id,
                'dataset': dataset_name,
                'k': k,
                'top_k_accuracy': accuracy,
            }
            records.append(record)
df_accuracy = pd.DataFrame(records)
del records
# assign the replaced columns back: `df[col].replace(..., inplace=True)` acts on a temporary
# Series and leaves the frame untouched under pandas Copy-on-Write
df_accuracy['experiment_id'] = df_accuracy['experiment_id'].replace(src.plotting.get_name_map())
df_accuracy['dataset'] = df_accuracy['dataset'].replace(dataset_names)
print('Shape:', df_accuracy.shape)
display(df_accuracy.head())

In [None]:
# draw the top-k accuracy figure and store it as figure 4
fig = src.plotting.plot_top_k_accuracy(df_accuracy)
fig.update_layout(height=360, width=1200)
fig.write_image(SAVE_PATH_FIGURES / 'figure_4.svg')

## IV. Prompting Experiments

In [None]:
# prompt templates; the list position is the number that follows '_prompt_' in the experiment id
prompts = [
    'This example is {}.',
    '{}',
    'This example expresses a {} sentiment.',
    'This example expresses a {} feeling.',
    'This example expresses a {} attitude.',
    'This example expresses a {} opinion.',
]

metric_files = sorted(LOAD_PATH_PROMPTING.glob('*dynabench_dynasent_processed*metrics.csv'))
df_prompts = pd.concat([pd.read_csv(path) for path in metric_files])
# '<experiment>_prompt_<index>' -> ['<experiment>', '<index>']
id_parts = df_prompts['experiment_id'].str.split('_prompt_')
df_prompts['prompt'] = id_parts.str.get(1).astype(int).replace(dict(enumerate(prompts)))
# assign the replaced column back: `df[col].replace(..., inplace=True)` acts on a temporary
# Series and leaves the frame untouched under pandas Copy-on-Write
df_prompts['experiment_id'] = id_parts.str.get(0).replace(src.plotting.get_name_map())
# keep only the 'total_weighted' rows and the columns needed for plotting
df_prompts = df_prompts.query('target == "total_weighted"')[['experiment_id', 'prompt', 'precision', 'recall', 'fscore']].copy()
print('Shape:', df_prompts.shape)
display(df_prompts.head())

In [None]:
# plot the prompt comparison currently held in df_prompts and store it as figure 5.1
figure_path = SAVE_PATH_FIGURES / 'figure_5_1.svg'
fig = src.plotting.plot_prompt_performance(df_prompts)
fig.write_image(figure_path)

In [None]:
# prompt templates; the list position is the number that follows '_prompt_' in the experiment id
prompts = [
    'This example is {}.',
    '{}',
    'This example is about {}.',
    'This main topic of this text is {}.',
    'This example is World News, Sports...',
]

metric_files = sorted(LOAD_PATH_PROMPTING.glob('*ag_news_processed*metrics.csv'))
df_prompts = pd.concat([pd.read_csv(path) for path in metric_files])
# '<experiment>_prompt_<index>' -> ['<experiment>', '<index>']
id_parts = df_prompts['experiment_id'].str.split('_prompt_')
df_prompts['prompt'] = id_parts.str.get(1).astype(int).replace(dict(enumerate(prompts)))
# assign the replaced column back: `df[col].replace(..., inplace=True)` acts on a temporary
# Series and leaves the frame untouched under pandas Copy-on-Write
df_prompts['experiment_id'] = id_parts.str.get(0).replace(src.plotting.get_name_map())
# keep only the 'total_weighted' rows and the columns needed for plotting
df_prompts = df_prompts.query('target == "total_weighted"')[['experiment_id', 'prompt', 'precision', 'recall', 'fscore']].copy()
print('Shape:', df_prompts.shape)
display(df_prompts.head())

In [None]:
# plot the prompt comparison currently held in df_prompts and store it as figure 5.2
figure_path = SAVE_PATH_FIGURES / 'figure_5_2.svg'
fig = src.plotting.plot_prompt_performance(df_prompts)
fig.write_image(figure_path)