# Bad labels

Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks

https://arxiv.org/abs/2103.14749


In [13]:
%pip install cleanlab

Collecting cleanlab
  Downloading cleanlab-1.0-py2.py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 1.9 MB/s eta 0:00:011
Installing collected packages: cleanlab
Successfully installed cleanlab-1.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# Load the GoEmotions CSV into a DataFrame; the path is relative to the
# current working directory, so run the notebook from its own folder.
df = pd.read_csv("./../data/goemotions/full_dataset/goemotions_1.csv")

In [2]:
# Inspect the column names to see which label columns are available.
df.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [3]:
# Show the full text of each comment instead of truncating long cells.
pd.set_option('display.max_colwidth', None)

# Peek at two rows labelled with excitement. The fixed seed keeps the
# displayed rows identical across "Restart & Run All".
df.loc[df['excitement'] == 1, ['text', 'excitement']].sample(2, random_state=42)

Unnamed: 0,text,excitement
51781,Happy birthday!,1
60473,I’m feeling very excited !,1


In [4]:
# Class imbalance: count how many rows carry each value of the `excitement` label.

df['excitement'].value_counts()

0    68100
1     1900
Name: excitement, dtype: int64

In [5]:
# Features are the raw comment text; the target is the 0/1 `excitement` column.
X = df['text']
y = df['excitement']

# Bag-of-words counts feeding a logistic regression. `class_weight='balanced'`
# is used because the positive class is rare in this dataset.
pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)

pipe.fit(X, y)


Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

## Predicted probabilities

In [6]:
# Per-row class probabilities from the fitted pipeline, one column per class.
pipe.predict_proba(X)

array([[0.81906041, 0.18093959],
       [0.87337141, 0.12662859],
       [0.99887467, 0.00112533],
       ...,
       [0.9576734 , 0.0423266 ],
       [0.89403891, 0.10596109],
       [0.97989256, 0.02010744]])

In [7]:
# Probability of the first class in `pipe.classes_`, one value per row of `df`.
probas = pipe.predict_proba(X)[:, 0]

# Keep the rows the model is least sure about (probability close to 0.5).
# The boolean mask lines up with `df` row by row because
# probas.shape[0] == df.shape[0].
df.loc[(probas > 0.45) & (probas < 0.55), ['text', 'excitement']].head(7)

Unnamed: 0,text,excitement
8,that's adorable asf,0
46,"If there’s a pattern, yes.",0
107,My fans on patreon will be rewarded soon,0
154,"Ones with close ties to SA, anyway. An escaped apostate won't exactly be itching to run home.",0
158,I really like this ring so I’m glad to hear that.,0
262,OMG THOSE TINY SHOES! *desire to boop snoot intensifies*,0
362,This. I relate to this. So much. Almost too much.,0


## Model disagreement

In [8]:
# Shape of the subset of rows where the fitted model's prediction differs
# from the recorded label (first element is the number of disagreements).
df.loc[df['excitement'] != pipe.predict(X)].shape

(5314, 37)

In [9]:
def correct_class_confidence(X, y, mod):
    """
    Gives the predicted confidence (or proba) that a model assigns
    to the given label `y` of every row.

    Parameters
    ----------
    X : inputs accepted by `mod.predict_proba`.
    y : sequence of labels, one per row of `X`. Labels are matched to rows
        by position, so a pandas Series with any index (filtered, shuffled,
        non-integer) is handled correctly.
    mod : fitted classifier exposing `predict_proba` and `classes_`.

    Returns
    -------
    list of float
        `values[i]` is the probability `mod` gives to label `y[i]` for row `i`.

    Raises
    ------
    ValueError
        If `y` does not have exactly one label per predicted row.
    KeyError
        If a label in `y` is not present in `mod.classes_`.
    """
    import numpy as np

    probas = np.asarray(mod.predict_proba(X))
    # Strip any pandas index so that labels are aligned with rows positionally.
    labels = np.asarray(y)
    if len(labels) != len(probas):
        raise ValueError(
            f"y has {len(labels)} labels but the model produced "
            f"{len(probas)} rows of probabilities"
        )

    # Column j of `probas` belongs to `mod.classes_[j]`; build the lookup once
    # instead of once per row.
    column_of = {cls: j for j, cls in enumerate(mod.classes_)}
    columns = np.array([column_of[label] for label in labels], dtype=np.intp)

    # Pick, for every row, the probability in the column of its given label.
    return probas[np.arange(len(labels)), columns].tolist()

In [10]:
# Attach the confidence the model gives to each recorded label, keep the rows
# the model disagrees with, and list the least-confident rows that are
# currently labelled 0.
(df
  .assign(confidence=correct_class_confidence(X, y, pipe))
  .loc[lambda d: d['excitement'] != pipe.predict(d['text']),
       ['text', 'excitement', 'confidence']]
  .sort_values(by="confidence")
  .query("excitement == 0")
  .head(20))

Unnamed: 0,text,excitement,confidence
5676,I am inexplicably excited by [NAME]. I get so excited by how he curls passes,0,0.000148
42757,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0,0.000187
28707,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0,0.000187
24756,Sounds like a fun game. Our home game around here is .05/.10. Its fun but not very exciting.,0,0.000262
44459,So no replays for arsenal penalty calls.. Cool cool cool cool cool cool cool cool,0,0.000594
69395,"Wow, your posting history is a real... interesting ride.",0,0.000719
20823,"Wow, your posting history is a real... interesting ride.",0,0.000719
2001,No different than people making a big deal about their team winning the super bowl. People find it interesting.,0,0.000741
30921,"Hey congrats!! That's amazing, you've done such amazing progress! Hope you have a great day :)",0,0.000812
39475,"I just read your list and now I can't wait, either!! Hurry up with the happy, relieved and peaceful onward and upward!! Congratulations😎",0,0.001129


## Cleanlab

In [11]:
from cleanlab.pruning import get_noise_indices

# Ask cleanlab which rows look mislabelled; the result is used below as
# positional row indices into `df`.
# NOTE(review): `pipe` was fit on this same X/y, so these probabilities are
# in-sample. cleanlab's documentation asks for out-of-sample probabilities
# (e.g. cross-validated) — confirm whether that matters for this demo.
ordered_label_errors = get_noise_indices(
    s=y,  # the labels as recorded in the dataset (possibly noisy)
    psx=pipe.predict_proba(X),  # predicted class probabilities for every row
    sorted_index_method='prob_given_label',  # presumably orders by the probability of the given label — verify in cleanlab docs
)

In [12]:
# Look up the flagged rows by position and show the first 20 with their current label.
df[['text', 'excitement']].iloc[ordered_label_errors].head(20)

Unnamed: 0,text,excitement
5676,I am inexplicably excited by [NAME]. I get so excited by how he curls passes,0
28707,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0
42757,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0
24756,Sounds like a fun game. Our home game around here is .05/.10. Its fun but not very exciting.,0
44459,So no replays for arsenal penalty calls.. Cool cool cool cool cool cool cool cool,0
20823,"Wow, your posting history is a real... interesting ride.",0
69395,"Wow, your posting history is a real... interesting ride.",0
2001,No different than people making a big deal about their team winning the super bowl. People find it interesting.,0
30921,"Hey congrats!! That's amazing, you've done such amazing progress! Hope you have a great day :)",0
39475,"I just read your list and now I can't wait, either!! Hurry up with the happy, relieved and peaceful onward and upward!! Congratulations😎",0


## Noisy label learning

In [13]:
from cleanlab.classification import LearningWithNoisyLabels
from sklearn.linear_model import LogisticRegression

# Build a fresh, unfitted pipeline and hand it to cleanlab's wrapper.
# NOTE(review): sklearn estimators take `sample_weight` (singular) in `fit`;
# confirm whether cleanlab needs the wrapped classifier to accept it.
fresh_pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)
lnl = LearningWithNoisyLabels(clf=fresh_pipe)

# Pay attention: the labels are passed through `s=`, not `y=`.
lnl.fit(X=X, s=y.values)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

In [14]:
# Baseline for the comparison: the same pipeline, trained directly on the
# labels exactly as they are recorded.
new_pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)

new_pipe.fit(X, y)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

In [15]:
# Rows where the noise-aware model and the plain model predict differently.
# The fixed seed makes the draw reproducible for a given set of disagreements.
df.loc[lnl.predict(X) != new_pipe.predict(X), ['text', 'excitement']].sample(5, random_state=42)

Unnamed: 0,text,excitement
12853,"Any respectable head shop (pot accessory store) or tobacconist (cigar shop) will have them. Good luck, love that friggin pipe!",1
13727,Yep! It's still busy. Slightly less crowded but still fun!,0
16716,Right on. The cities are where you can find some of the most virgin girls.,1
31208,Yes! It's hard to believe that some people like warm toilet seats enough to pay for it as a feature.,0
38582,"I have seen quite a few posts in this subreddit on the topic, I would start by searching those.",1
