In [None]:
pip install py-AutoClean matplotlib datasets cleanlab scikit-learn jenga ftfy pandas numpy setuptools ucimlrepo category_encoders -q

In [None]:
import sys
import os

# Make the parent directory importable so the `src.*` modules imported in later
# cells resolve. Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

## Load original data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from AutoClean import AutoClean
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.utils import resample

# Load the dataset used throughout the notebook (path is relative to the
# notebook's working directory).
df = pd.read_csv("adult_for_manual_edit.csv")

# Columns treated as numeric: scaled by the preprocessor defined below and
# targeted by the numeric corruption / cleaning steps in later cells.
numeric_features = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
def resample_df(df, n_samples=10000, class_labels=('<=50K', '>50K'), random_state=42):
    """Draw a class-balanced, shuffled sample from ``df``.

    The last column of ``df`` is treated as the target. For every label in
    ``class_labels`` exactly ``n_samples`` rows are drawn without replacement,
    the per-class samples are concatenated and the result is shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame whose last column holds the class label.
    n_samples : int, default 10000
        Number of rows to draw for each class.
    class_labels : sequence, default ('<=50K', '>50K')
        Target values to sample; rows with any other target value are left out.
    random_state : int, default 42
        Seed used for both the per-class sampling and the final shuffle.

    Returns
    -------
    X : pandas.DataFrame
        The sampled rows without the target column.
    y : pandas.Series
        The target column of the sampled rows (same index as ``X``).

    Raises
    ------
    ValueError
        Raised by ``sklearn.utils.resample`` when a class holds fewer than
        ``n_samples`` rows, because sampling is done without replacement.
    """
    # Use the detected target column consistently instead of assuming it is
    # literally named 'income'.
    target_column = df.columns[-1]

    per_class_samples = [
        resample(
            df[df[target_column] == label],
            replace=False,
            n_samples=n_samples,
            random_state=random_state,
        )
        for label in class_labels
    ]

    # Shuffle so the classes are interleaved rather than stacked block-wise.
    df_balanced = pd.concat(per_class_samples).sample(frac=1, random_state=random_state)

    X = df_balanced.drop(columns=[target_column])
    y = df_balanced[target_column]

    return X, y

# Balanced feature matrix / target used for the baseline experiments.
X, y = resample_df(df)

# Columns treated as categorical (one-hot encoded by the preprocessor).
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex']

numeric_transformer = StandardScaler()
# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Shared preprocessing step reused by both pipelines below. Only the columns
# named in the two feature lists are passed to a transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    # sparse_threshold=0 makes the transformer return a dense array.
    sparse_threshold=0
)

## Train - Logistic Regression

In [None]:
# Hold out 30% of the balanced data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Preprocessing + logistic regression in a single estimator.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(class_weight="balanced", random_state=42)),
])

# Baseline: fit on the uncorrupted training split and score on the held-out split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
accuracy_reg_og = accuracy_score(y_test, y_pred)

print(f"Accuracy on CSV: {accuracy_reg_og:.4f}")
print(classification_report(y_test, y_pred))

## Corrupt

In [None]:
import src.corruption.inject as inject

# Work on a copy so the balanced feature frame X stays untouched.
X_copy = X.copy()

# Same column list as defined in the data-loading cell (repeated here verbatim).
numeric_features = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
# Force the numeric columns to numeric dtype; values that cannot be parsed become NaN.
X_copy[numeric_features] = X_copy[numeric_features].apply(pd.to_numeric, errors='coerce')

def all_numerical_corruptions_with_y(X, y, numeric_columns=None):
    """Run ``inject.all_numerical_corruptions`` on features and target together.

    The features and the target are joined column-wise into one frame, the
    corruption routine is applied to that frame, and the result is split back
    into features and target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; it is copied, so the caller's object is not modified here.
    y : pandas.Series or pandas.DataFrame
        Target. A Series is converted to a one-column frame named after the
        Series (or ``'income'`` when the Series has no truthy name); for a
        frame, its first column is used as the target.
    numeric_columns : list, optional
        Columns handed to the corruption routine. When ``None``, every column
        of ``X`` with a numeric dtype is used.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Corrupted features (target column removed) and the target column taken
        from the corrupted frame.
    """
    features = X.copy()
    target = y.to_frame(name=y.name or 'income') if isinstance(y, pd.Series) else y

    combined = pd.concat([features, target], axis=1)
    target_name = target.columns[0]

    if numeric_columns is None:
        is_numeric = pd.api.types.is_numeric_dtype
        numeric_columns = [name for name in features.columns if is_numeric(features[name])]

    corrupted = inject.all_numerical_corruptions(combined, columns=numeric_columns)
    # Restore the original column order before splitting features and target.
    corrupted = corrupted[combined.columns]

    return corrupted.drop(columns=[target_name]), corrupted[target_name]

# Apply the numeric corruptions to the numeric columns only.
X_corrupted, y_corrupted = all_numerical_corruptions_with_y(X_copy, y, numeric_features)

# Missing values in every column are replaced with the string '0' before splitting.
X_train, X_test, y_train, y_test = train_test_split(X_corrupted.fillna('0'), y_corrupted, test_size=0.3, random_state=42)

# Refit the current pipeline (clf) on the numerically corrupted data and score it
# on the corrupted hold-out split.
clf.fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)
accuracy_reg_cor=accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_reg_cor:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:
# Layer categorical corruptions on top of the numerically corrupted features:
# category shift first, then typos.
shifted = inject.category_shift(X_corrupted, columns=categorical_features)
typos = inject.category_typo(shifted, columns=categorical_features)

# Cast the categorical columns to plain object dtype before injecting missing values.
for col in categorical_features:
    typos[col] = typos[col].astype("object")

# NOTE(review): fraction=0.25 is presumably the share of values blanked out —
# confirm against src.corruption.inject.missing_values.
# This rebinds X_corrupted, so re-running this cell stacks corruptions on top of
# already-corrupted data; re-run the previous cell first.
X_corrupted = inject.missing_values(typos, categorical_features, fraction=0.25)

# Missing values in every column are replaced with the string '0' before splitting.
X_train, X_test, y_train, y_test = train_test_split(X_corrupted.fillna('0'), y_corrupted, test_size=0.3, random_state=42)

clf.fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)
# Accuracy with numeric and categorical corruption applied; used as the
# "Corrupted" Logistic Regression bar in the plotting cell.
accuracy_reg_cat=accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy_reg_cat:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

### Clean

In [None]:
# From here on X / y refer to the corrupted data (same objects, not copies).
X, y = X_corrupted, y_corrupted
from src.corruption import clean_num

# Same column list as defined in the data-loading cell (repeated here verbatim).
numeric_features = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
# NOTE(review): run_num_clean presumably cleans the numeric columns and returns
# the cleaned features/target; its exact behaviour and its use of clf are
# defined in src.corruption.clean_num — confirm there.
X_clean_noisy_removed, y_clean_noisy_removed = clean_num.run_num_clean(numeric_features, X, y, clf)

X_train, X_test, y_train, y_test = train_test_split(X_clean_noisy_removed, y_clean_noisy_removed, test_size=0.3, random_state=42)

# Refit on the cleaned data and score on the cleaned hold-out split.
clf.fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)
accuracy_reg_cl=accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy_reg_cl:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# NOTE(review): the trailing positional True presumably switches on the
# Cleanlab-based step (the result is reported as "Accuracy cleanlab" below) —
# confirm against run_num_clean's signature.
X_clean_noisy_removed, y_clean_noisy_removed = clean_num.run_num_clean(numeric_features, X, y, clf, True)

X_train, X_test, y_train, y_test = train_test_split(X_clean_noisy_removed, y_clean_noisy_removed, test_size=0.3, random_state=42)

clf.fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)
accuracy_reg_cleanlab=accuracy_score(y_test, y_pred)

# Row count before cleaning, accuracy after cleaning, and how many rows the
# cleaning step removed.
print(len(X))
print(f"Accuracy cleanlab: {accuracy_reg_cleanlab:.4f}")
print("Dropped ", len(X_corrupted) - len(X_clean_noisy_removed))

## Train - HistGradientBoostingClassifier

In [None]:
# Start again from a fresh balanced sample of the raw data and hold out 30%.
X, y = resample_df(df)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Same preprocessing as before, now feeding a gradient-boosting classifier.
clf = Pipeline([
    ("preprocessor", preprocessor),
    (
        "classifier",
        HistGradientBoostingClassifier(class_weight="balanced", random_state=42),
    ),
])

# Baseline: fit on the uncorrupted training split and score on the held-out split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
accuracy_gb_og = accuracy_score(y_test, y_pred)

print(f"Accuracy on CSV: {accuracy_gb_og:.4f}")
print(classification_report(y_test, y_pred))

### Corrupt

In [None]:
# Force the numeric columns to numeric dtype; unparseable values become NaN.
X[numeric_features] = X[numeric_features].apply(pd.to_numeric, errors='coerce')

# Numeric corruptions first, then categorical ones: shift, typos, missing
# values and category defaults.
X_corrupted, y_corrupted = all_numerical_corruptions_with_y(X, y, numeric_features)
shifted = inject.category_shift(X_corrupted, columns=categorical_features)
typos = inject.category_typo(shifted, columns=categorical_features)

# Cast the categorical columns to plain object dtype before injecting missing values.
for col in categorical_features:
    typos[col] = typos[col].astype("object")

X_corrupted = inject.missing_values(typos, categorical_features, fraction=0.5)
X_corrupted = inject.category_default(X_corrupted, categorical_features)

# Split the corrupted data before fitting. Previously the model was fitted on
# *all* corrupted rows and then scored on the clean X_test left over from the
# baseline cell, so the test rows were part of the training data and the score
# was not comparable to the Logistic Regression "Corrupted" figure.
X_train, X_test, y_train, y_test = train_test_split(
    X_corrupted, y_corrupted, test_size=0.3, random_state=42
)

clf.fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)
accuracy_gb_cor = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy_gb_cor:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

### Clean

In [None]:
# NOTE(review): run_num_clean presumably cleans the numeric columns and returns
# the cleaned features/target; its exact behaviour and its use of clf are
# defined in src.corruption.clean_num — confirm there.
X_clean_noisy_removed, y_clean_noisy_removed = clean_num.run_num_clean(numeric_features, X_corrupted, y_corrupted, clf)

X_train, X_test, y_train, y_test = train_test_split(X_clean_noisy_removed, y_clean_noisy_removed, test_size=0.3, random_state=42)

# Refit the current pipeline on the cleaned data and score on the cleaned hold-out split.
clf.fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)

accuracy_gb_cl=accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy_gb_cl:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:

# NOTE(review): the trailing positional True presumably switches on the
# Cleanlab-based step (this accuracy is plotted as "Cleaned (Cleanlab)") —
# confirm against run_num_clean's signature.
X_clean_noisy_removed, y_clean_noisy_removed = clean_num.run_num_clean(numeric_features, X_corrupted, y_corrupted, clf, True)

X_train, X_test, y_train, y_test = train_test_split(X_clean_noisy_removed, y_clean_noisy_removed, test_size=0.3, random_state=42)

clf.fit(X_train, y_train.values.ravel())
y_pred = clf.predict(X_test)

accuracy_gb_cleanlab=accuracy_score(y_test, y_pred)
# Row count before cleaning and how many rows the cleaning step removed.
print(len(X_corrupted))
print("Dropped ", len(X_corrupted) - len(X_clean_noisy_removed))
print(f"Accuracy: {accuracy_gb_cleanlab:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

## Plotting

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# One entry per experiment stage, in the order the bars are drawn.
labels = [
    "Clean",
    "Corrupted",
    "Cleaned\n(no Cleanlab)",
    "Cleaned\n(Cleanlab)",
]
logreg_acc = [
    accuracy_reg_og,
    accuracy_reg_cat,
    accuracy_reg_cl,
    accuracy_reg_cleanlab,
]
histgb_acc = [
    accuracy_gb_og,
    accuracy_gb_cor,
    accuracy_gb_cl,
    accuracy_gb_cleanlab,
]

x = np.arange(len(labels))
width = 0.3

fig, ax = plt.subplots()

# Grouped bars: each model's bar is shifted half a bar width off the tick.
for offset, accuracies, model_name in (
    (-width / 2, logreg_acc, "Logistic Regression"),
    (width / 2, histgb_acc, "HistGradientBoosting"),
):
    ax.bar(x + offset, accuracies, width, label=model_name)

ax.set_ylabel("Accuracy")
ax.set_title("Effect of Corruption and Cleaning on Model Accuracy")
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.set_ylim(0, 1)
ax.legend()

plt.tight_layout()
plt.show()


## Appendix: manual data malformation

This section was previously used to generate a large batch of corruptions. It relies on `numeric_features` and `categorical_features` defined earlier in the notebook.

In [None]:
import random
import csv
import pandas as pd

input_csv = "adult_for_manual_edit.csv"
output_csv = "adult_corrupted_text_2.csv"

# Probability of applying each individual corruption to a cell.
fraction_corrupt = 0.1
chars_to_inject = ['#','@','!','x','a']
# Probability of forcing a numeric value to be negative.
negate_fraction = 0.1

# Dedicated seeded generator: the corrupted file is reproducible across runs
# and the global `random` state is left untouched.
rng = random.Random(42)

# Read the whole CSV as raw strings so cells can be rewritten freely.
with open(input_csv, newline='', encoding='utf-8') as f:
    reader = list(csv.reader(f))
    header = reader[0]
    rows = reader[1:]

col_to_idx = {col: i for i, col in enumerate(header)}
print(col_to_idx)

# Numeric corruptions. Every cell that parses as a number is rewritten as the
# string form of a float (e.g. "39" becomes "39.0"), even when no corruption fires.
for row in rows:
    for col_name in numeric_features:
        try:
            col_idx = col_to_idx[col_name]
            val = float(row[col_idx])
        except (KeyError, IndexError, ValueError):
            # Column missing from the header, row too short, or cell not
            # numeric: leave the cell as it is.
            continue

        # Negate some values
        if rng.random() < negate_fraction:
            val = -abs(val)

        # Multiply by random factor
        if rng.random() < fraction_corrupt:
            val = val * rng.uniform(0.5, 10.0)

        # Replace some with '?'
        if rng.random() < fraction_corrupt:
            row[col_idx] = '?'
            continue

        # Inject random char
        if rng.random() < fraction_corrupt:
            row[col_idx] = str(val) + rng.choice(chars_to_inject)
        else:
            row[col_idx] = str(val)

# Categorical corruptions: insert one or two random characters at random positions.
for row in rows:
    for col_name in categorical_features:
        if rng.random() < fraction_corrupt:
            col_idx = col_to_idx[col_name]
            val = str(row[col_idx])
            n_chars = rng.choice([1,2])
            for _ in range(n_chars):
                # randint is inclusive, so the character may also be appended at the end.
                pos = rng.randint(0, len(val))
                val = val[:pos] + rng.choice(chars_to_inject) + val[pos:]
            row[col_idx] = val

with open(output_csv, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(rows)

# Reload the corrupted file and split it into features / target.
# Note: this rebinds the notebook-level X and y.
df_corrupt = pd.read_csv(output_csv)

y = df_corrupt['income']
X = df_corrupt.drop(columns=['income'])

