In [1]:
import pandas

In [2]:
# Column labels for the credit data file; they are paired positionally with
# the file's columns when it is loaded.
features = [
    'checking account balance',
    'duration',
    'credit history',
    'purpose',
    'amount',
    'savings',
    'employment',
    'installment',
    'marital status',
    'other debtors',
    'residence time',
    'property',
    'age',
    'other installments',
    'housing',
    'credits',
    'job',
    'persons',
    'phone',
    'foreign',
]

# Label given to the final column; it is used as the prediction target.
target = 'repaid'

Data taken from https://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29

In [3]:
# Load the space-separated data file, labelling its columns with the feature
# names followed by the target name.
df = pandas.read_csv(
    '../../data/credit/german.data',
    names=features + [target],
    sep=' ',
)

In [4]:
# Columns treated as plain numbers; they are left out of the one-hot encoding.
numerical_features = [
    'age', 'residence time', 'installment', 'amount',
    'duration', 'persons', 'credits',
]
# Every remaining feature, kept in the order of `features`.
# NOTE(review): despite the name, these are the non-numerical columns (the
# ones later handed to get_dummies), not quantitative ones.
quantitative_features = [name for name in features if name not in numerical_features]

In [5]:
# One-hot encode the non-numerical columns; every other column of df
# (including the target) is carried over unchanged.
X = pandas.get_dummies(data=df, columns=quantitative_features)

In [6]:
# All columns of the encoded frame except the target, i.e. the model inputs.
encoded_features = [column for column in X.columns if column != target]

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, make_scorer

In [12]:
# Baseline: accuracy of a default logistic regression on the clean data,
# one score per fold of a 10-fold cross-validation.
cv_accuracy = cross_val_score(
    estimator=LogisticRegression(),
    X=X[encoded_features],
    y=X[target],
    scoring=make_scorer(accuracy_score),
    cv=10,
)

In [14]:
# Mean and standard deviation of the per-fold accuracies.
(cv_accuracy.mean(), cv_accuracy.std())

(0.7470000000000001, 0.04583666654546335)

# Add noise by flipping random indicators

In [17]:
# Sanity check of bitwise XOR on 0/1 values: the result is 1 only when the
# two bits differ.
[left ^ right for left, right in [(0, 0), (1, 0), (0, 1), (1, 1)]]

[0, 1, 1, 0]

In [18]:
# Share of rows whose value is flipped in each indicator column (0.1 = 10%).
flip_fraction = 0.1

In [20]:
import numpy as np

In [21]:
# Work on a separate copy so the clean encoded frame X stays untouched.
X_noise = X.copy(deep=True)

In [35]:
# Flip a random `flip_fraction` of the rows in every one-hot indicator column.
#
# The noisy frame is rebuilt from the clean frame X on every run, so executing
# this cell more than once does not stack additional flips on top of earlier
# ones, and the generator is seeded so the noisy scores can be reproduced.
noise_seed = 0
noise_rng = np.random.RandomState(noise_seed)

X_noise = X.copy()
n_rows = X_noise.shape[0]
n_flips = int(n_rows * flip_fraction)

for c in X_noise.columns:
    # get_dummies names indicator columns "<feature>_<value>"; requiring the
    # trailing underscore keeps a numerical column that merely shares a prefix
    # with a categorical feature from being treated as an indicator.
    if any(c.startswith(i + '_') for i in quantitative_features):
        # 0/1 mask with exactly n_flips ones at random row positions.
        mask = np.zeros(n_rows)
        mask[:n_flips] = 1
        noise_rng.shuffle(mask)
        # Adding the mask modulo 2 inverts the masked 0/1 values (XOR).
        X_noise[c] = (X_noise[c] + mask) % 2

In [36]:
# Same evaluation as the baseline, but on the frame with flipped indicators:
# one accuracy score per fold of a 10-fold cross-validation.
cv_accuracy = cross_val_score(
    estimator=LogisticRegression(),
    X=X_noise[encoded_features],
    y=X_noise[target],
    scoring=make_scorer(accuracy_score),
    cv=10,
)

In [37]:
# Mean and standard deviation of the per-fold accuracies on the noisy data.
(cv_accuracy.mean(), cv_accuracy.std())

(0.711, 0.031128764832546757)