# Using ClassificationEvaluator Class

In [1]:
import os
import sys
import pandas as pd

# Lift pandas' display limits so column text, rows and columns are never truncated
for display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [2]:
# Main folder that holds the checked-out repositories.
# Set the LLM_TOOLBOX_REPOS_DIR environment variable to run this notebook on
# another machine; the fallback keeps the original location so existing setups
# continue to work unchanged.
local_path = os.environ.get('LLM_TOOLBOX_REPOS_DIR', '/Users/paulaleonova/repos/')

In [6]:
# To import the evaluator class, the folder containing its module must first be
# added to the interpreter's module search path.
folder_name = "llm-toolbox/evaluation/multiple_labels/utils"
utils_path = os.path.join(local_path, folder_name)

# Only append when missing, so re-running this cell does not keep adding
# duplicate entries to sys.path.
if utils_path not in sys.path:
    sys.path.append(utils_path)

In [7]:
# List the files and folders inside the utils directory to confirm that the
# evaluator module is where the import expects it
utils_dir = os.path.join(local_path, folder_name)
contents = os.listdir(utils_dir)

# Show the directory listing
print(contents)

['classification_evaluator.py']


In [26]:
from classification_evaluator import ClassificationEvaluator

In [9]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed, so edits to classification_evaluator.py are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Import Data

### Labels

In [10]:
# Label lookup table: one row per label, pairing its numeric id with its text
label_df = pd.DataFrame({
    'label_id': [1, 2, 3],
    'label': ['Alpha', 'Beta', 'Gamma'],
})

# Show the resulting frame
label_df

Unnamed: 0,label_id,label
0,1,Alpha
1,2,Beta
2,3,Gamma


### Predictions

In [11]:
# Predicted (document id, label, score) triples; a document may appear several
# times, once per predicted label
pred_rows = [
    (123, 'Alpha', 3),
    (123, 'Beta', 4),
    (123, 'Gamma', 1),
    (145, 'Alpha', 2),
    (165, 'Alpha', 5),
    (175, 'Beta', 4),
    (175, 'Gamma', 2),
]
pred_df = pd.DataFrame(pred_rows, columns=['id', 'label', 'score'])

# Show the resulting frame
pred_df

Unnamed: 0,id,label,score
0,123,Alpha,3
1,123,Beta,4
2,123,Gamma,1
3,145,Alpha,2
4,165,Alpha,5
5,175,Beta,4
6,175,Gamma,2


### Validation

In [12]:
# Validation (document id, label, score) triples that the predictions are
# evaluated against
val_rows = [
    (123, 'Alpha', 5),
    (123, 'Beta', 5),
    (145, 'Gamma', 5),
    (165, 'Alpha', 5),
    (175, 'Alpha', 5),
]
val_df = pd.DataFrame(val_rows, columns=['id', 'label', 'score'])

val_df

Unnamed: 0,id,label,score
0,123,Alpha,5
1,123,Beta,5
2,145,Gamma,5
3,165,Alpha,5
4,175,Alpha,5


## Terminology Refresher

METRICS OVERVIEW
- **Micro-Averaging**: Aggregate the contributions of all classes to compute the average metric. In a multi-class classification setup, micro-average is preferable if you suspect there might be class imbalance
- **Macro-Averaging**: Compute the metric independently for each class and then take the average (hence treating all classes equally)
- **Hamming Loss**: Fraction of labels that are incorrectly predicted, i.e., the number of wrong labels divided by the total number of labels (the lower the value, the better the model)

- **Recall (sensitivity)**: Of all the labels that are actually positive, the fraction the model finds: TP / (TP + FN). To improve recall, minimize false negatives. In guard dog terms, that means if you’re unsure, bark at it, just in case. Don’t let any rodent thieves sneak by on your watch! High recall means few false negatives.

- **Precision**: Of all the labels the model predicts as positive, the fraction that are correct: TP / (TP + FP). Precision is basically “Never bark unless you’re sure.” To improve precision, minimize false positives. Preston, the guard dog, won’t bark at something unless he’s certain it’s a burglar. High precision means few false positives.


## Metric Results

In [13]:
# Build the evaluator from the label lookup table and the validation labels,
# telling it which columns hold the label text and the document id
evaluator = ClassificationEvaluator(
    label_df=label_df,
    label_column_name="label",
    align_doc_label_df=val_df,
    align_doc_id_column_name="id",
)


In [14]:
# Get the document ids and label texts to evaluate over. The call also prints
# the number of unique docs and labels it found (see the cell output).
doc_id_list, label_text_list = evaluator.create_lists_of_docs_and_labels()

Number of Unique Docs: 4
Number of Unique Labels: 3
[123, 145, 165, 175]


In [15]:
# Sanity check: the document ids returned by the evaluator
print(doc_id_list)

[123, 145, 165, 175]


In [16]:
# Sanity check: the label texts returned by the evaluator
print(label_text_list)

['Alpha', 'Beta', 'Gamma']


In [17]:
# Expand to one row per (document, label) combination; the `is_actual_value`
# column in the result flags the pairs that are present in the validation data
actual_doc_labels_cx_df = evaluator.cross_docs_and_labels(
    doc_id_list=doc_id_list,
    label_text_list=label_text_list,
    align_doc_label_df_subset=val_df,
    align_doc_id_column_name='id',
    align_doc_label_column_name='label',
)
actual_doc_labels_cx_df

Unnamed: 0,doc_id,label_text,is_actual_value
0,123,Alpha,1
1,123,Beta,1
2,123,Gamma,0
3,145,Alpha,0
4,145,Beta,0
5,145,Gamma,1
6,165,Alpha,1
7,165,Beta,0
8,165,Gamma,0
9,175,Alpha,1


In [18]:
# Compare the predictions with the actual labels. The call prints the Hamming
# loss and returns three values: the per-label metrics report, the number of
# labels per document, and a third value that this section does not use.
# The unused value is bound to a descriptive name rather than `_`, because
# IPython itself uses `_` for the most recent cell output.
report_cdf_long_plus, num_labels_per_doc_df, _unused_cx_plus_pred = evaluator.create_classification_confusion_table(
    actual_doc_labels_cx_df=actual_doc_labels_cx_df,
    pred_doc_label_df=pred_df,
    pred_doc_id_column_name='id',
    pred_doc_label_column_name='label',
    pred_doc_score_column_name='score',
    # NOTE(review): presumably the score cut-off for counting a prediction;
    # confirm the exact comparison inside ClassificationEvaluator
    threshold_val=0,
)

# The counts below match the number of rows per document in pred_df
# (e.g. doc 123 -> 3, doc 175 -> 2), not val_df, so the heading says "Predicted".
print("Predicted Label Count")
print(num_labels_per_doc_df)


Hamming Loss: 0.5
Validation Label Count
   doc_id  NumLabels
0     123          3
3     175          2
1     145          1
2     165          1


In [19]:
# Per-label metrics, followed by the micro / macro / weighted / samples average rows
report_cdf_long_plus

Unnamed: 0,label_text,precision,recall,f1-score,support,num false positives,num false negatives,num true positives,num true negative
0,Alpha,0.67,0.67,0.67,3.0,1.0,1.0,2.0,0.0
1,Beta,0.5,1.0,0.67,1.0,1.0,0.0,1.0,2.0
2,Gamma,0.0,0.0,0.0,1.0,2.0,1.0,0.0,1.0
3,micro avg,0.43,0.6,0.5,5.0,4.0,2.0,3.0,3.0
4,macro avg,0.39,0.56,0.44,5.0,4.0,2.0,3.0,3.0
5,weighted avg,0.5,0.6,0.53,5.0,0.33,0.17,0.25,0.25
6,samples avg,0.42,0.5,0.45,5.0,0.33,0.17,0.25,0.25


In [32]:
# Attach each label's numeric id to its metrics. Starting from label_df with a
# left join keeps exactly one row per known label, so the aggregate "avg" rows
# of the report are not carried over. The redundant `label` join key is dropped
# because `label_text` already holds the same text.
report_cdf_long_plus_id = (
    label_df[['label_id', 'label']]
    .merge(report_cdf_long_plus, how='left', left_on='label', right_on='label_text')
    .drop(columns='label')
)

report_cdf_long_plus_id

Unnamed: 0,label_id,label_text,precision,recall,f1-score,support,num false positives,num false negatives,num true positives,num true negative
0,1,Alpha,0.67,0.67,0.67,3.0,1.0,1.0,2.0,0.0
1,2,Beta,0.5,1.0,0.67,1.0,1.0,0.0,1.0,2.0
2,3,Gamma,0.0,0.0,0.0,1.0,2.0,1.0,0.0,1.0


# Combined

In [30]:
# Initialize ClassificationEvaluator
evaluator = ClassificationEvaluator(val_df, 'id', label_df, 'label')

# Create lists of docs and labels
doc_id_list, label_text_list = evaluator.create_lists_of_docs_and_labels()

# Cross docs and labels
esd = evaluator.cross_docs_and_labels(doc_id_list, label_text_list, val_df, 'id', 'label')

# Create classification confusion table
cdf_long_plus_adj, num_labels_per_doc_df, actual_doc_labels_cx_plus_pred = evaluator.create_classification_confusion_table(
    esd, pred_df, 'id', 'label', 'score'
)


print("\nNumber of Labels per Document:")
print(num_labels_per_doc_df)


Number of Unique Docs: 4
Number of Unique Labels: 3

Hamming Loss: 0.5

Number of Labels per Document:
   doc_id  NumLabels
0     123          3
3     175          2
1     145          1
2     165          1


In [31]:
# Sample output
print("Classification Confusion Table:")
cdf_long_plus_adj

Classification Confusion Table:


Unnamed: 0,label_text,precision,recall,f1-score,support,num false positives,num false negatives,num true positives,num true negative
0,Alpha,0.67,0.67,0.67,3.0,1.0,1.0,2.0,0.0
1,Beta,0.5,1.0,0.67,1.0,1.0,0.0,1.0,2.0
2,Gamma,0.0,0.0,0.0,1.0,2.0,1.0,0.0,1.0
3,micro avg,0.43,0.6,0.5,5.0,4.0,2.0,3.0,3.0
4,macro avg,0.39,0.56,0.44,5.0,4.0,2.0,3.0,3.0
5,weighted avg,0.5,0.6,0.53,5.0,0.33,0.17,0.25,0.25
6,samples avg,0.42,0.5,0.45,5.0,0.33,0.17,0.25,0.25
