In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer

from cleanlab.classification import CleanLearning
from cleanlab.lexical_quality.lexical_quality import LexicalQualityEvaluator

  from tqdm.autonotebook import tqdm, trange


First, we load the original banking intent classification datataset from the cleanlab example.

In [2]:
data = pd.read_csv("https://s.cleanlab.ai/banking-intent-classification.csv")
data.head()

Unnamed: 0,text,label
0,i accidentally made a payment to a wrong accou...,cancel_transfer
1,"i no longer want to transfer funds, can we can...",cancel_transfer
2,"cancel my transfer, please.",cancel_transfer
3,i want to revert this mornings transaction.,cancel_transfer
4,i just realised i made the wrong payment yeste...,cancel_transfer


We now want to get a baseline reading of label accuracy to compare the accuracy score of the labels with and without our lexical filters or algorithm modifications.

In [3]:
raw_texts, raw_labels = data["text"].values, data["label"].values
raw_train_texts, raw_test_texts, raw_train_labels, raw_test_labels = train_test_split(raw_texts, raw_labels, test_size=0.1)

encoder = LabelEncoder()
encoder.fit(raw_train_labels)

transformer = SentenceTransformer("google/electra-small-discriminator")

train_texts = transformer.encode(raw_train_texts)
test_texts = transformer.encode(raw_test_texts)

train_labels = encoder.transform(raw_train_labels)
test_labels = encoder.transform(raw_test_labels)

baseline_model = LogisticRegression(max_iter=400)
baseline_model.fit(X=train_texts, y=train_labels)

preds = baseline_model.predict(test_texts)
baseline_og = accuracy_score(test_labels, preds)
print(f"\n Test accuracy of original model: {baseline_og}")

No sentence-transformers model found with name google/electra-small-discriminator. Creating a new one with mean pooling.



 Test accuracy of original model: 0.83


Using our new LexicalQualityEvaluator, we can add three columns to the dataframe.
We add a spelling_score, quality_score and combined_lexical_score to the dataframe.

In [4]:
data["spelling_score"] = [LexicalQualityEvaluator(text).get_spelling_score() for text in data["text"]]
data["quality_score"] = [LexicalQualityEvaluator(text).get_quality_score() for text in data["text"]]
data["combined_lexical_score"] = [LexicalQualityEvaluator(text).get_combined_quality_score() for text in data["text"]]

data.head()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Unnamed: 0,text,label,spelling_score,quality_score,combined_lexical_score
0,i accidentally made a payment to a wrong accou...,cancel_transfer,1.0,0.972254,0.986127
1,"i no longer want to transfer funds, can we can...",cancel_transfer,1.0,0.884922,0.942461
2,"cancel my transfer, please.",cancel_transfer,1.0,0.980948,0.990474
3,i want to revert this mornings transaction.,cancel_transfer,1.0,0.352956,0.676478
4,i just realised i made the wrong payment yeste...,cancel_transfer,0.970588,0.904916,0.937752


We can then use our new metrics to filter the data.
For example, we can first sort the dataset by combined lexical score and then find any texts with a score of less than 0.6 and remove them from the dataset.

In [5]:
combined_lexical_score_threshold = 0.6

filtered_data = data.sort_values(by="combined_lexical_score", ascending=True)

unfiltered_size = data.size
filtered_data = filtered_data[filtered_data["combined_lexical_score"] >= combined_lexical_score_threshold]
filtered_size = filtered_data.size

filtered_data.head()

Unnamed: 0,text,label,spelling_score,quality_score,combined_lexical_score
893,can i use top-up with apple pay?,apple_pay_or_google_pay,0.857143,0.343864,0.600504
232,i was spending cash with my card and got a fee.,card_payment_fee_charged,1.0,0.201074,0.600537
871,how to activate google pay for top up?,apple_pay_or_google_pay,0.875,0.3321,0.60355
219,why was i charged extra when paying with card?,card_payment_fee_charged,1.0,0.212546,0.606273
930,"i received my american express in apple pay, i...",apple_pay_or_google_pay,0.947368,0.272377,0.609873


In [6]:
print(f"{unfiltered_size - filtered_size} texts with a combined lexical score of less than {combined_lexical_score_threshold} removed from the dataset")

295 texts with a combined lexical score of less than 0.6 removed from the dataset


Now we can train a new regression model and compare the labelling accuracy score after applying lexical filtering.

In [7]:
filtered_raw_texts, filtered_raw_labels = filtered_data["text"].values, filtered_data["label"].values
filtered_raw_train_texts, filtered_raw_test_texts, filtered_raw_train_labels, filtered_raw_test_labels = train_test_split(filtered_raw_texts, filtered_raw_labels, test_size=0.1)

encoder = LabelEncoder()
encoder.fit(filtered_raw_train_labels)

transformer = SentenceTransformer("google/electra-small-discriminator")

filtered_train_texts = transformer.encode(filtered_raw_train_texts)
filtered_test_texts = transformer.encode(filtered_raw_test_texts)

filtered_train_labels = encoder.transform(filtered_raw_train_labels)
filtered_test_labels = encoder.transform(filtered_raw_test_labels)

filtered_model = LogisticRegression(max_iter=400)
filtered_model.fit(X=filtered_train_texts, y=filtered_train_labels)

filtered_preds = filtered_model.predict(filtered_test_texts)
filtered_og = accuracy_score(filtered_test_labels, filtered_preds)
print(f"\n Test accuracy of filtered model: {filtered_og}")

No sentence-transformers model found with name google/electra-small-discriminator. Creating a new one with mean pooling.



 Test accuracy of filtered model: 0.7368421052631579


In [8]:
percentage_change = ((filtered_og - baseline_og) / baseline_og) * 100
print(f"The baseline accuracy score was {baseline_og:.2f}, with the filtering applied our accuracy score is now {filtered_og:.2f}. Lexical filtering has {'increased' if percentage_change > 0 else 'decreased'} the label accuracy by {percentage_change:.2f}%")

The baseline accuracy score was 0.83, with the filtering applied our accuracy score is now 0.74. Lexical filtering has decreased the label accuracy by -11.22%


The next thing we want to do is incorporate our combined lexical score into cleanlab's find label issues algorithm. But first we can use their algorithm without modifications to see the number of labels which are flagged as low quality, remove them from the dataset and then get another baseline accuracy.

In [9]:
model = LogisticRegression(max_iter=400)

cv_n_folds = 5
cl = CleanLearning(model, cv_n_folds=cv_n_folds)

label_issues = cl.find_label_issues(X=train_texts, labels=train_labels)
label_issues.head()

Unnamed: 0,is_label_issue,label_quality,given_label,predicted_label
0,False,0.969299,6,6
1,False,0.738626,4,4
2,False,0.285632,0,8
3,False,0.975937,2,2
4,False,0.102667,3,5


In [10]:
identified_issues = label_issues[label_issues["is_label_issue"] == True]
lowest_quality_labels = label_issues["label_quality"].argsort()[:10].to_numpy()

print(
    f"cleanlab found {len(identified_issues)} potential label errors in the dataset.\n"
    f"Here are indices of the top 10 most likely errors: \n {lowest_quality_labels}"
)

cleanlab found 41 potential label errors in the dataset.
Here are indices of the top 10 most likely errors: 
 [383 669 359 485 104  60 196 727 394 816]


In [11]:
cl.fit(X=train_texts, labels=train_labels, label_issues=cl.get_label_issues())

pred_labels = cl.predict(test_texts)
acc_cl = accuracy_score(test_labels, pred_labels)
print(f"Test accuracy of cleanlab's model: {acc_cl}")

Test accuracy of cleanlab's model: 0.82


Now, we can use our own modified find label issues algoritm with the lexical scoress argument flag to combine our combined lexical score with cleanlab's 

In [12]:
model = LogisticRegression(max_iter=400)

cv_n_folds = 5
cl = CleanLearning(model, cv_n_folds=cv_n_folds)

lexical_scores = np.array([LexicalQualityEvaluator(text).get_combined_quality_score() for text in raw_train_texts])
label_issues = cl.find_label_issues(X=train_texts, labels=train_labels, lexical_scores=lexical_scores)

cl.fit(X=train_texts, labels=train_labels, label_issues=cl.get_label_issues())

pred_labels = cl.predict(test_texts)
combined_acc_cl = accuracy_score(test_labels, pred_labels)
print(f"Test accuracy of cleanlab's model with combined lexical scores: {combined_acc_cl}")

Test accuracy of cleanlab's model with combined lexical scores: 0.83


In [13]:
percentage_change = ((combined_acc_cl - acc_cl) / acc_cl) * 100
print(
    f"The baseline accuracy score was {acc_cl:.2f}, with the lexical score applied our accuracy score is now {combined_acc_cl:.2f}. The new algorithm has {'increased' if percentage_change > 0 else 'decreased'} the label accuracy by {percentage_change:.2f}%"
)

The baseline accuracy score was 0.82, with the lexical score applied our accuracy score is now 0.83. The new algorithm has increased the label accuracy by 1.22%


In conclusion, the tests suggest that the methods used to determine the spelling accuracy and lexical quality of the given texts are not very accurate in the first place. This combined with the simple approach of averaging the lexical quality score with cleanlab's confidence score resulted in little improvement.
To further improve the accuracy, a better model could be used to more accurately determine the lexical quality of the given texts, as well as a more considered approach when incorporating these scores with the existing algorithm.
