In [37]:
from datasets import load_dataset
from transformers import AutoTokenizer

original_ds = (
    load_dataset('jziebura/polish_youth_slang_classification')
    .filter(lambda x: x['znaczenie wyrazów slangowych'] is not None)
    .map(
        lambda x: {
            'word': x['słowo slangowe'].strip(),
            'meaning': x['znaczenie wyrazów slangowych'].strip(),
            'text': x['tekst'].strip()}
    )
)

In [38]:
ds = (
    original_ds
    .map(lambda x: {'text': " - ".join([x['word'], x['meaning']])})
    .rename_column('sentyment', 'label')
    .class_encode_column('label')
)

In [39]:
import pandas as pd

df = pd.concat(ds[split].to_pandas() for split in ds.keys())

In [43]:
from transformers import AutoTokenizer
AutoTokenizer.from_pretrained('gpt2').vocab_size

50257

In [40]:
df['label'].value_counts()

label
1    2767
0    1561
2    1093
Name: count, dtype: int64

In [29]:
from sklearn.model_selection import cross_val_predict
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
import numpy as np


def get_initial_model_data(texts: np.ndarray, labels: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    clf = LogisticRegression(max_iter=1000, random_state=0)
    pred_probs = cross_val_predict(
        clf,
        embeddings,
        labels,
        cv=5,
        method="predict_proba",
    )

    return embeddings, pred_probs

In [30]:
from typing import Callable
import pandas as pd


def merge_duplicate_sets(df, merge_key: str):
    """Generate group keys for each row, then merge intersecting sets.

    :param df: DataFrame with columns 'is_near_duplicate_issue' and 'near_duplicate_sets'
    :param merge_key: Name of the column to store the merged sets
    """

    df[merge_key] = df.apply(construct_group_key, axis=1)
    merged_sets = consolidate_sets(df[merge_key].tolist())
    df[merge_key] = df[merge_key].map(
        lambda x: next(s for s in merged_sets if x.issubset(s))
    )
    return df

def construct_group_key(row):
    """Convert near_duplicate_sets into a frozenset and include the row's own index."""
    return frozenset(row['near_duplicate_sets']).union({row.name})

def consolidate_sets(sets_list):
    """Merge sets if they intersect."""

    # Convert the input list of frozensets to a list of mutable sets
    sets_list = [set(item) for item in sets_list]

    # A flag to keep track of whether any sets were merged in the current iteration
    merged = True

    # Continue the merging process as long as we have merged some sets in the previous iteration
    while merged:
        merged = False
        new_sets = []

        # Iterate through each set in our list
        for current_set in sets_list:
            # Skip empty sets
            if not current_set:
                continue

            # Find all sets that have an intersection with the current set
            intersecting_sets = [s for s in sets_list if s & current_set]

            # If more than one set intersects, set the merged flag to True
            if len(intersecting_sets) > 1:
                merged = True

            # Merge all intersecting sets into one set
            merged_set = set().union(*intersecting_sets)
            new_sets.append(merged_set)

            # Empty the sets we've merged to prevent them from being processed again
            for s in intersecting_sets:
                sets_list[sets_list.index(s)] = set()

        # Replace the original sets list with the new list of merged sets
        sets_list = new_sets

    # Convert the merged sets back to frozensets for the output
    return [frozenset(item) for item in sets_list]

def lowest_score_strategy(sub_df):
    """Keep the row with the lowest near_duplicate_score."""
    return sub_df['near_duplicate_score'].idxmin()


def filter_near_duplicates(data: pd.DataFrame, strategy_fn: Callable = lowest_score_strategy, **strategy_kwargs):
    """
    Given a dataframe with columns 'is_near_duplicate_issue' and 'near_duplicate_sets',
    return a series of boolean values where True indicates the rows to be removed.
    The strategy_fn determines which rows to keep within each near_duplicate_set.

    :param data: DataFrame with is_near_duplicate_issue and near_duplicate_sets columns
    :param strategy_fn: Function to determine which rows to keep within each near_duplicate_set
    :return: Series of boolean values where True indicates rows to be removed.
    """

    # Filter out rows where 'is_near_duplicate_issue' is True to get potential duplicates
    duplicate_rows = data.query("is_near_duplicate_issue").copy()

    # Generate group keys for each row and merge intersecting sets
    group_key = "sets"
    duplicate_rows = merge_duplicate_sets(duplicate_rows, merge_key=group_key)

    # Use the strategy function to determine the indices of the rows to keep for each group
    to_keep_indices = duplicate_rows.groupby(group_key).apply(strategy_fn, **strategy_kwargs).explode().values

    # Produce a boolean series indicating which rows should be removed
    to_remove = ~data.index.isin(to_keep_indices)

    return to_remove

In [32]:
from cleanlab import Datalab

for i in range(10):
    texts = df["text"].values
    labels = df["label"].values

    embeddings, pred_probs = get_initial_model_data(texts, labels)
    lab = Datalab(df, label_name='label', task='classification')
    lab.find_issues(pred_probs=pred_probs, features=embeddings)
    lab.report()

    df_deduplicated = df.copy()

    duplicate_issues = lab.get_issues("near_duplicate")

    label_issues = lab.get_issues("label")
    label_issues = label_issues[label_issues["is_label_issue"]]

    duplicates_count = len(duplicate_issues[duplicate_issues["is_near_duplicate_issue"] == True])
    label_issues_count = len(label_issues)

    if duplicates_count:
        print(filter_near_duplicates(duplicate_issues) )
        duplicate_issues = duplicate_issues[filter_near_duplicates(duplicate_issues) & duplicate_issues["is_near_duplicate_issue"]]
        df_deduplicated = df_deduplicated.drop(df_deduplicated[df_deduplicated.index.isin(duplicate_issues.index)].index, axis='index')

    if label_issues_count < 40 and duplicates_count < 40:
        break

    label_issues = label_issues[label_issues.index.isin(df_deduplicated.index)]
    idxs = label_issues.index.tolist()
    pred_labels = label_issues["predicted_label"]

    df_fixed = df_deduplicated.copy()
    df_fixed.loc[idxs, "label"] = pred_labels

    df = df_fixed.reset_index(drop=True)

Batches:   0%|          | 0/170 [00:00<?, ?it/s]

Finding null issues ...
Finding label issues ...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Finding outlier issues ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding class_imbalance issues ...
Finding underperforming_group issues ...

Audit complete. 2959 issues found in the dataset.
Dataset Information: num_examples: 5421, num_classes: 3

Here is a summary of various issues found in your data:

    issue_type  num_issues
         label        1469
near_duplicate        1450
       outlier          40

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


----------------------- label issues -----------------------

About this issue:
	Examples whose given label is estimated to be potentially incorrect
    (e.g. due to annotation error) are flagged as having label issues.
    

Number of examples with this issue: 1469
Overall dataset q

Batches:   0%|          | 0/139 [00:00<?, ?it/s]

Finding null issues ...
Finding label issues ...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Finding outlier issues ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding class_imbalance issues ...
Finding underperforming_group issues ...

Audit complete. 983 issues found in the dataset.
Dataset Information: num_examples: 4440, num_classes: 3

Here is a summary of various issues found in your data:

    issue_type  num_issues
         label         525
near_duplicate         395
       outlier          62
       non_iid           1

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


----------------------- label issues -----------------------

About this issue:
	Examples whose given label is estimated to be potentially incorrect
    (e.g. due to annotation error) are flagged as having label issues.
    

Number of examples with this iss

Batches:   0%|          | 0/131 [00:00<?, ?it/s]

Finding null issues ...
Finding label issues ...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Finding outlier issues ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding class_imbalance issues ...
Finding underperforming_group issues ...

Audit complete. 143 issues found in the dataset.
Dataset Information: num_examples: 4181, num_classes: 3

Here is a summary of various issues found in your data:

    issue_type  num_issues
         label          75
       outlier          66
near_duplicate           2

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


----------------------- label issues -----------------------

About this issue:
	Examples whose given label is estimated to be potentially incorrect
    (e.g. due to annotation error) are flagged as having label issues.
    

Number of examples with this issue: 75
Overall dataset qual

Batches:   0%|          | 0/131 [00:00<?, ?it/s]

Finding null issues ...
Finding label issues ...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Finding outlier issues ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding class_imbalance issues ...
Finding underperforming_group issues ...

Audit complete. 91 issues found in the dataset.
Dataset Information: num_examples: 4180, num_classes: 3

Here is a summary of various issues found in your data:

issue_type  num_issues
   outlier          66
     label          25

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


---------------------- outlier issues ----------------------

About this issue:
	Examples that are very different from the rest of the dataset 
    (i.e. potentially out-of-distribution or rare/anomalous instances).
    

Number of examples with this issue: 66
Overall dataset quality in terms of this issue: 0.3641

Examples

In [33]:
import zstandard as zstd
import json
import io
def filter_dataset(output_path):
    with open(output_path, 'wb') as outfile:
        cctx = zstd.ZstdCompressor()
        writer = cctx.stream_writer(outfile)

        for idx, entry in df_fixed.iterrows():
            filtered = {
                'word': entry['słowo slangowe'],
                'meaning': entry['znaczenie wyrazów slangowych'],
                'text': entry["tekst"],
                'label': entry.label,
            }
            writer.write((json.dumps(filtered) + '\n').encode('utf-8'))

        writer.close()

filter_dataset('polish_youth_slang_classification_filtered_last.json.zst')

In [36]:
df['label'].value_counts()

label
1    2727
0    1191
2     262
Name: count, dtype: int64