# Coursework Assignment: Bias in AI

Link to dataset used (given in 'Project Suggestions'): https://www.kaggle.com/kabure/german-credit-data-with-risk

## Setup

In [263]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from random import randint

In [264]:
# load the data
df = pd.read_csv("./data/german_credit_data.csv", index_col=0)

# make a copy of the original data
original_df = df.copy()

In [265]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 85.9+ KB


In [266]:
df.describe()

Unnamed: 0,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0
mean,35.546,1.904,3271.258,20.903
std,11.375469,0.653614,2822.736876,12.058814
min,19.0,0.0,250.0,4.0
25%,27.0,2.0,1365.5,12.0
50%,33.0,2.0,2319.5,18.0
75%,42.0,2.0,3972.25,24.0
max,75.0,3.0,18424.0,72.0


In [267]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


## Data analysis

In [268]:
# create age group column
df['Age_Group'] = np.nan

df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Age_Group
0,67,male,2,own,,little,1169,6,radio/TV,good,
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,
2,49,male,1,own,little,,2096,12,education,good,
3,45,male,2,free,little,little,7882,42,furniture/equipment,good,
4,53,male,2,free,little,little,4870,24,car,bad,


In [269]:
for col in [df]:
    print(col)

     Age     Sex  Job Housing Saving accounts Checking account  Credit amount  \
0     67    male    2     own             NaN           little           1169   
1     22  female    2     own          little         moderate           5951   
2     49    male    1     own          little              NaN           2096   
3     45    male    2    free          little           little           7882   
4     53    male    2    free          little           little           4870   
..   ...     ...  ...     ...             ...              ...            ...   
995   31  female    1     own          little              NaN           1736   
996   40    male    3     own          little           little           3857   
997   38    male    2     own          little              NaN            804   
998   23    male    2    free          little           little           1845   
999   27    male    2     own        moderate         moderate           4576   

     Duration              

In [270]:
# populate age group column
for col in [df]:
    col.loc[(col['Age'] > 18) & (col['Age'] <= 29), 'Age_Group'] = 'Young'
    col.loc[(col['Age'] > 29) & (col['Age'] <= 40), 'Age_Group'] = 'Young Adults'
    col.loc[(col['Age'] > 40) & (col['Age'] <= 55), 'Age_Group'] = 'Senior'
    col.loc[col['Age'] > 55, 'Age_Group'] = 'Elder' 
    
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Age_Group
0,67,male,2,own,,little,1169,6,radio/TV,good,Elder
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,Young
2,49,male,1,own,little,,2096,12,education,good,Senior
3,45,male,2,free,little,little,7882,42,furniture/equipment,good,Senior
4,53,male,2,free,little,little,4870,24,car,bad,Senior


In [271]:
df['Sex'].value_counts()

male      690
female    310
Name: Sex, dtype: int64

In [272]:
df['Age_Group'].value_counts()

Young           371
Young Adults    355
Senior          203
Elder            71
Name: Age_Group, dtype: int64

In [273]:
def print_stats(df):
    print('mean: ', [round(num, 1) for num in list(df.mean())[1:]])
    print('variance: ', [round(num, 1) for num in list(df.var())[1:]])
    print('\n')
    zips = []
    for col in ['Housing', 'Saving accounts', 'Checking account',
       'Purpose', 'Risk']:
       vals = df[col].value_counts().values
       inds = df[col].value_counts().index
       z = list(zip(inds, vals))[:3]
       if col == 'Risk':
           z.append('-')
       zips.append(z)
       print(z)
       print('\n')
    print(list(zip(zips[0], zips[1], zips[2], zips[3], zips[4])))

In [274]:
df_male = df[df['Sex'] == 'male']
print_stats(df_male)

mean:  [1.9, 3448.0, 21.6]
variance:  [0.4, 8412806.3, 154.7]


[('own', 517), ('free', 89), ('rent', 84)]


[('little', 409), ('moderate', 71), ('quite rich', 47)]


[('little', 186), ('moderate', 183), ('rich', 43)]


[('car', 243), ('radio/TV', 195), ('furniture/equipment', 107)]


[('good', 499), ('bad', 191), '-']


[(('own', 517), ('little', 409), ('little', 186), ('car', 243), ('good', 499)), (('free', 89), ('moderate', 71), ('moderate', 183), ('radio/TV', 195), ('bad', 191)), (('rent', 84), ('quite rich', 47), ('rich', 43), ('furniture/equipment', 107), '-')]


In [275]:
df_female = df[df['Sex'] == 'female']
print_stats(df_female)

mean:  [1.8, 2877.8, 19.4]
variance:  [0.5, 6776346.3, 122.1]


[('own', 196), ('rent', 95), ('free', 19)]


[('little', 194), ('moderate', 32), ('rich', 19)]


[('little', 88), ('moderate', 86), ('rich', 20)]


[('car', 94), ('radio/TV', 85), ('furniture/equipment', 74)]


[('good', 201), ('bad', 109), '-']


[(('own', 196), ('little', 194), ('little', 88), ('car', 94), ('good', 201)), (('rent', 95), ('moderate', 32), ('moderate', 86), ('radio/TV', 85), ('bad', 109)), (('free', 19), ('rich', 19), ('rich', 20), ('furniture/equipment', 74), '-')]


In [276]:
df_Young = df[df['Age_Group'] == 'Young']
print_stats(df_Young)

mean:  [1.8, 3089.0, 20.8]
variance:  [0.3, 7261837.7, 142.6]


[('own', 248), ('rent', 113), ('free', 10)]


[('little', 242), ('moderate', 42), ('quite rich', 19)]


[('little', 115), ('moderate', 112), ('rich', 24)]


[('radio/TV', 117), ('car', 102), ('furniture/equipment', 84)]


[('good', 234), ('bad', 137), '-']


[(('own', 248), ('little', 242), ('little', 115), ('radio/TV', 117), ('good', 234)), (('rent', 113), ('moderate', 42), ('moderate', 112), ('car', 102), ('bad', 137)), (('free', 10), ('quite rich', 19), ('rich', 24), ('furniture/equipment', 84), '-')]


In [277]:
df_YoungAdults = df[df['Age_Group'] == 'Young Adults']
print_stats(df_YoungAdults)

mean:  [2.0, 3375.5, 21.5]
variance:  [0.4, 7646336.1, 139.2]


[('own', 278), ('free', 39), ('rent', 38)]


[('little', 201), ('moderate', 41), ('quite rich', 24)]


[('moderate', 100), ('little', 81), ('rich', 18)]


[('car', 128), ('radio/TV', 93), ('furniture/equipment', 58)]


[('good', 264), ('bad', 91), '-']


[(('own', 278), ('little', 201), ('moderate', 100), ('car', 128), ('good', 264)), (('free', 39), ('moderate', 41), ('little', 81), ('radio/TV', 93), ('bad', 91)), (('rent', 38), ('quite rich', 24), ('rich', 18), ('furniture/equipment', 58), '-')]


In [278]:
df_Senior = df[df['Age_Group'] == 'Senior']
print_stats(df_Senior)

mean:  [1.9, 3366.4, 20.2]
variance:  [0.4, 7986564.4, 146.1]


[('own', 143), ('free', 40), ('rent', 20)]


[('little', 117), ('moderate', 16), ('quite rich', 15)]


[('little', 57), ('moderate', 39), ('rich', 15)]


[('car', 79), ('radio/TV', 51), ('furniture/equipment', 36)]


[('good', 150), ('bad', 53), '-']


[(('own', 143), ('little', 117), ('little', 57), ('car', 79), ('good', 150)), (('free', 40), ('moderate', 16), ('moderate', 39), ('radio/TV', 51), ('bad', 53)), (('rent', 20), ('quite rich', 15), ('rich', 15), ('furniture/equipment', 36), '-')]


In [279]:
df_Elder = df[df['Age_Group'] == 'Elder']
print_stats(df_Elder)

mean:  [1.8, 3430.4, 20.5]
variance:  [0.7, 13329819.2, 192.5]


[('own', 44), ('free', 19), ('rent', 8)]


[('little', 43), ('rich', 5), ('quite rich', 5)]


[('little', 21), ('moderate', 18), ('rich', 6)]


[('car', 28), ('radio/TV', 19), ('business', 9)]


[('good', 52), ('bad', 19), '-']


[(('own', 44), ('little', 43), ('little', 21), ('car', 28), ('good', 52)), (('free', 19), ('rich', 5), ('moderate', 18), ('radio/TV', 19), ('bad', 19)), (('rent', 8), ('quite rich', 5), ('rich', 6), ('business', 9), '-')]


In [280]:
for group in [df_male, df_female, df_Young, df_YoungAdults, df_Senior, df_Elder]:
    print(round(sum(group['Risk'] == 'bad')/len(group) * 100, 1))

27.7
35.2
36.9
25.6
26.1
26.8


## Conventional Implementation

### Data cleaning

In [281]:
# Check missing values in our dataframe
df.isnull().sum().sort_values(ascending=False)

Checking account    394
Saving accounts     183
Age                   0
Sex                   0
Job                   0
Housing               0
Credit amount         0
Duration              0
Purpose               0
Risk                  0
Age_Group             0
dtype: int64

In [282]:
df.drop(['Checking account', 'Saving accounts'], axis=1, inplace=True)

In [283]:
df.isnull().sum().sort_values(ascending=False)

Age              0
Sex              0
Job              0
Housing          0
Credit amount    0
Duration         0
Purpose          0
Risk             0
Age_Group        0
dtype: int64

In [284]:
# Create set of only independant variables by dropping Risk
X = df.drop(['Risk'], axis=1)
X.head()

Unnamed: 0,Age,Sex,Job,Housing,Credit amount,Duration,Purpose,Age_Group
0,67,male,2,own,1169,6,radio/TV,Elder
1,22,female,2,own,5951,48,radio/TV,Young
2,49,male,1,own,2096,12,education,Senior
3,45,male,2,free,7882,42,furniture/equipment,Senior
4,53,male,2,free,4870,24,car,Senior


In [285]:
# Create a series of outcome variable only
y = df['Risk']
y.head()

0    good
1     bad
2    good
3    good
4     bad
Name: Risk, dtype: object

### Naive split

In [286]:
# split datasets into training and test subsets for both X and y using sklearn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=5)

In [287]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue `The rows are marked `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

In [288]:
# Scikit-Learn does not handle dataframes in pipeline so we will create our own class.
# Reference: Hands-On Machine Learning
from sklearn.base import BaseEstimator, TransformerMixin
# Create a class to select numerical or cateogrical columns.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit (self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [289]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

numeric_train_df = X_train.select_dtypes(exclude=['object'])
numeric_test_df = X_test.select_dtypes(exclude=['object'])

categorical_train_df = X_train.select_dtypes(['object'])
categorical_test_df = X_test.select_dtypes(['object'])

numerical_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(numeric_train_df.columns.values.tolist())),
    ("std_scaler", StandardScaler()),
])

categorical_pipeline = Pipeline([
    ('select_categoric', DataFrameSelector(categorical_train_df.columns.values.tolist())),
    ('encoding', CategoricalEncoder(encoding='onehot-dense'))
])

# Combine both pipelines
main_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', numerical_pipeline),
    ('cat_pipeline', categorical_pipeline)
])

X_train_scaled = main_pipeline.fit_transform(X_train)
X_test_scaled = main_pipeline.fit_transform(X_test)

In [290]:
from sklearn.preprocessing import LabelEncoder

encode = LabelEncoder()
y_train_scaled = encode.fit_transform(y_train)
y_test_scaled = encode.fit_transform(y_test)

In [291]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Implement gridsearchcv to see which are our best p

params = {'C': [0.75, 0.85, 0.95, 1], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'degree': [3, 4, 5]}

svc_clf = SVC(random_state=42)

grid_search_cv = GridSearchCV(svc_clf, params)
grid_search_cv.fit(X_train_scaled, y_train_scaled)

GridSearchCV(estimator=SVC(random_state=42),
             param_grid={'C': [0.75, 0.85, 0.95, 1], 'degree': [3, 4, 5],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [292]:
grid_search_cv.best_estimator_

SVC(C=0.95, degree=4, kernel='poly', random_state=42)

In [293]:
grid_search_cv.best_params_


{'C': 0.95, 'degree': 4, 'kernel': 'poly'}

In [294]:
svc_clf = grid_search_cv.best_estimator_
svc_clf.fit(X_train_scaled, y_train_scaled)

SVC(C=0.95, degree=4, kernel='poly', random_state=42)

In [295]:
svc_clf.score(X_train_scaled, y_train_scaled)

0.8078078078078078

In [296]:
from sklearn.model_selection import cross_val_score

# Let's make sure the data is not overfitting
svc_clf = SVC(kernel='rbf', C=1, random_state=42)
scores = cross_val_score(svc_clf, X_train_scaled, y_train_scaled)
scores.mean()

0.7026820783301537

In [297]:
from sklearn.metrics import accuracy_score

svc_clf.fit(X_train_scaled, y_train_scaled)
y_pred = svc_clf.predict(X_test_scaled)

# Accuracy score
round(accuracy_score(y_test_scaled, y_pred) * 100, 1)

72.5

### Stratified split

In [298]:
# Feature Engineering (We cannot delete the missing values because we have too litle information)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit

df["Sex"].value_counts()

male      690
female    310
Name: Sex, dtype: int64

In [299]:
df_male_sample = df_male.sample(310)
df_male_sample

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Age_Group
981,33,male,3,rent,little,,4844,48,business,bad,Young Adults
726,47,male,1,own,quite rich,,1316,15,radio/TV,good,Senior
104,26,male,2,rent,,,2445,12,car,good,Young
6,53,male,2,own,quite rich,,2835,24,furniture/equipment,good,Senior
19,31,male,2,own,quite rich,,3430,24,radio/TV,good,Young Adults
...,...,...,...,...,...,...,...,...,...,...,...
587,21,male,1,own,little,little,1289,12,furniture/equipment,good,Young
383,26,male,2,own,little,rich,1330,12,car,good,Young
465,63,male,2,own,little,little,2924,24,car,good,Elder
699,40,male,3,rent,little,rich,1905,15,education,good,Young Adults


In [300]:
df_equal_sex = df_female.append(df_male_sample)
df_equal_sex

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Age_Group
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,Young
10,25,female,2,rent,little,moderate,1295,12,car,bad,Young
11,24,female,2,rent,little,little,4308,48,business,bad,Young
12,22,female,2,own,little,moderate,1567,12,radio/TV,good,Young
14,28,female,2,rent,little,little,1403,15,car,good,Young
...,...,...,...,...,...,...,...,...,...,...,...
587,21,male,1,own,little,little,1289,12,furniture/equipment,good,Young
383,26,male,2,own,little,rich,1330,12,car,good,Young
465,63,male,2,own,little,little,2924,24,car,good,Elder
699,40,male,3,rent,little,rich,1905,15,education,good,Young Adults


In [301]:
df_equal_sex = df_equal_sex.reset_index(drop=True)
df_equal_sex

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Age_Group
0,22,female,2,own,little,moderate,5951,48,radio/TV,bad,Young
1,25,female,2,rent,little,moderate,1295,12,car,bad,Young
2,24,female,2,rent,little,little,4308,48,business,bad,Young
3,22,female,2,own,little,moderate,1567,12,radio/TV,good,Young
4,28,female,2,rent,little,little,1403,15,car,good,Young
...,...,...,...,...,...,...,...,...,...,...,...
615,21,male,1,own,little,little,1289,12,furniture/equipment,good,Young
616,26,male,2,own,little,rich,1330,12,car,good,Young
617,63,male,2,own,little,little,2924,24,car,good,Elder
618,40,male,3,rent,little,rich,1905,15,education,good,Young Adults


In [302]:
df_equal_sex.drop(['Checking account', 'Saving accounts'], axis=1, inplace=True)

In [303]:
df_equal_sex.isnull().sum().sort_values(ascending=False)

Age              0
Sex              0
Job              0
Housing          0
Credit amount    0
Duration         0
Purpose          0
Risk             0
Age_Group        0
dtype: int64

In [304]:
print(len(df_equal_sex))
print(df_equal_sex["Sex"].value_counts())

620
female    310
male      310
Name: Sex, dtype: int64


In [305]:
stratified = StratifiedShuffleSplit(n_splits=1, test_size=1/3, random_state=42)

for train, test in stratified.split(df_equal_sex, df_equal_sex["Sex"]):
    strat_train = df_equal_sex.loc[train]
    strat_test = df_equal_sex.loc[test]
    
print('train', round(len(strat_train[strat_train['Sex'] == 'female']) * 100 / len(strat_train), 1)) 
print('test', round(len(strat_test[strat_test['Sex'] == 'female']) * 100 / len(strat_test), 1)) 

train 49.9
test 50.2


In [306]:
strat_test["Sex"].value_counts()

female    104
male      103
Name: Sex, dtype: int64

In [307]:
# Have our new train and test data
train = strat_train
test = strat_test


# Our features
X_train = train.drop('Risk', axis=1)
X_test = test.drop('Risk', axis=1)

# Our Labels we will use them later
y_train = train["Risk"]
y_test = test["Risk"]

In [308]:
numeric_train_df = X_train.select_dtypes(exclude=['object'])
numeric_test_df = X_test.select_dtypes(exclude=['object'])

categorical_train_df = X_train.select_dtypes(['object'])
categorical_test_df = X_test.select_dtypes(['object'])

numerical_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(numeric_train_df.columns.values.tolist())),
    ("std_scaler", StandardScaler()),
])

categorical_pipeline = Pipeline([
    ('select_categoric', DataFrameSelector(categorical_train_df.columns.values.tolist())),
    ('encoding', CategoricalEncoder(encoding='onehot-dense'))
])

# Combine both pipelines
main_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', numerical_pipeline),
    ('cat_pipeline', categorical_pipeline)
])

X_train_scaled = main_pipeline.fit_transform(X_train)
X_test_scaled = main_pipeline.fit_transform(X_test)

encode = LabelEncoder()
y_train_scaled = encode.fit_transform(y_train)
y_test_scaled = encode.fit_transform(y_test)

In [309]:
params = {'C': [0.75, 0.85, 0.95, 1], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'degree': [3, 4, 5]}

svc_clf = SVC(random_state=42)

grid_search_cv = GridSearchCV(svc_clf, params)
grid_search_cv.fit(X_train_scaled, y_train_scaled)

GridSearchCV(estimator=SVC(random_state=42),
             param_grid={'C': [0.75, 0.85, 0.95, 1], 'degree': [3, 4, 5],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [310]:
grid_search_cv.best_estimator_

SVC(C=0.95, kernel='poly', random_state=42)

In [311]:
grid_search_cv.best_params_

{'C': 0.95, 'degree': 3, 'kernel': 'poly'}

In [312]:
svc_clf = grid_search_cv.best_estimator_
svc_clf.fit(X_train_scaled, y_train_scaled)

SVC(C=0.95, kernel='poly', random_state=42)

In [313]:
svc_clf.score(X_train_scaled, y_train_scaled)

0.7917675544794189

In [314]:
# Let's make sure the data is not overfitting
svc_clf = SVC(kernel='rbf', C=1, random_state=42)
scores = cross_val_score(svc_clf, X_train_scaled, y_train_scaled)
scores.mean()

0.7166617690273288

In [315]:
svc_clf.fit(X_train_scaled, y_train_scaled)
y_pred = svc_clf.predict(X_test_scaled)

# Accuracy score
round(accuracy_score(y_test_scaled, y_pred) * 100, 1)

71.0

## Repair algorithm

In [316]:
df = df.drop(["Age"], axis=1)
df.head()

Unnamed: 0,Sex,Job,Housing,Credit amount,Duration,Purpose,Risk,Age_Group
0,male,2,own,1169,6,radio/TV,good,Elder
1,female,2,own,5951,48,radio/TV,bad,Young
2,male,1,own,2096,12,education,good,Senior
3,male,2,free,7882,42,furniture/equipment,good,Senior
4,male,2,free,4870,24,car,bad,Senior


In [317]:
from itertools import product 
from math import inf

# cartesian product
all_stratified_combinations = list(product(df['Sex'].unique(), df['Age_Group'].unique()))

combination_sizes = []
for combination in all_stratified_combinations:
    sex, age_group = combination
    combination_size = len(df[(df['Sex'] == sex) & (df['Age_Group'] == age_group)])
    combination_sizes.append(combination_size)
    if combination_size == 0:
        all_stratified_combinations.remove(combination)

number_of_quantiles = min(combination_sizes)

D_prime = df.copy()

In [318]:
Y_columns = list(D_prime.select_dtypes('number').columns)
Y_columns

['Job', 'Credit amount', 'Duration']

In [319]:
combination_sizes

[48, 200, 161, 281, 23, 171, 42, 74]

In [336]:
D_prime['Credit amount']

0      2333.0
1      2333.0
2      2333.0
3      2333.0
4      2333.0
        ...  
995    1736.0
996    3857.0
997     804.0
998    1845.0
999    4576.0
Name: Credit amount, Length: 1000, dtype: float64

In [333]:
from math import ceil

lmbda = 1

def repair(Y_columns, all_stratified_groups, number_of_quantiles, lmbda):
    quantile_unit = 1.0/number_of_quantiles
    column = 'Credit amount'
    for column in Y_columns:
        for quantile_num in range(1, number_of_quantiles - 1):
            median_values_at_quantile = []
            entries_at_quantile = []    
            for group in all_stratified_groups:
                sex, age_group = group
                df_group = df[column][(df['Sex'] == sex) & (df['Age_Group'] == age_group)]
                
                quantiles = np.array_split(df_group, number_of_quantiles)
                selected_quantile = quantiles[quantile_num - 1]

                median_values_at_quantile.append(selected_quantile.median())

                for entry_ID in list(selected_quantile.index):
                    entries_at_quantile.append(entry_ID)
            
            median_values_at_quantile.sort()
            target_index = ceil(len(median_values_at_quantile)/2) - 1 # pick smaller in the case of even length lists
            target_value = median_values_at_quantile[target_index]

            for entry_ID in entries_at_quantile:
                original_value = df.loc[entry_ID, column]
                repair_value = (1 - lmbda)*original_value + lmbda*target_value
                D_prime.loc[entry_ID, column] = repair_value

repair(Y_columns, all_stratified_combinations, number_of_quantiles, lmbda)


In [338]:
len(D_prime['Credit amount'].unique())

101

In [339]:
# Create set of only independant variables by dropping Risk
def run_naive_experiment(dataframe):
    X = dataframe.drop(['Risk'], axis=1)
    y = dataframe['Risk']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=5)

    X_train_scaled = main_pipeline.fit_transform(X_train)
    X_test_scaled = main_pipeline.fit_transform(X_test)

    svc_clf = SVC(random_state=42)
    
    grid_search_cv = GridSearchCV(svc_clf, params)
    grid_search_cv.fit(X_train_scaled, y_train_scaled)

    print(grid_search_cv.best_params_)

    svc_clf = grid_search_cv.best_estimator_
    svc_clf.fit(X_train_scaled, y_train_scaled)

Unnamed: 0,Sex,Job,Housing,Credit amount,Duration,Purpose,Age_Group
0,male,2.0,own,2333.0,18.0,radio/TV,Elder
1,female,2.0,own,2333.0,18.0,radio/TV,Young
2,male,2.0,own,2333.0,18.0,education,Senior
3,male,2.0,free,2333.0,18.0,furniture/equipment,Senior
4,male,2.0,free,2333.0,18.0,car,Senior


In [346]:
svc_clf = grid_search_cv.best_estimator_
svc_clf.fit(X_train_scaled, y_train_scaled)

SVC(C=0.95, kernel='poly', random_state=42)

In [347]:
svc_clf.score(X_train_scaled, y_train_scaled)

0.7917675544794189

In [349]:
# Let's make sure the data is not overfitting
svc_clf = SVC(kernel='poly', C=0.95, random_state=42)
scores = cross_val_score(svc_clf, X_train_scaled, y_train_scaled)
scores.mean()

0.7264178665883044

In [350]:
svc_clf.fit(X_train_scaled, y_train_scaled)
y_pred = svc_clf.predict(X_test_scaled)

# Accuracy score
round(accuracy_score(y_test_scaled, y_pred) * 100, 1)

68.6