# Coursework Assignment: Bias in AI

Link to dataset used (given in 'Project Suggestions'): https://www.kaggle.com/kabure/german-credit-data-with-risk

In [160]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [161]:
# load the data
df = pd.read_csv("./data/german_credit_data.csv", index_col=0)

# make a copy of the original data
original_df = df.copy()

In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 85.9+ KB


In [163]:
df.describe()

Unnamed: 0,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0
mean,35.546,1.904,3271.258,20.903
std,11.375469,0.653614,2822.736876,12.058814
min,19.0,0.0,250.0,4.0
25%,27.0,2.0,1365.5,12.0
50%,33.0,2.0,2319.5,18.0
75%,42.0,2.0,3972.25,24.0
max,75.0,3.0,18424.0,72.0


In [164]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [165]:
# create age group column
df['Age_Group'] = np.nan

df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Age_Group
0,67,male,2,own,,little,1169,6,radio/TV,good,
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,
2,49,male,1,own,little,,2096,12,education,good,
3,45,male,2,free,little,little,7882,42,furniture/equipment,good,
4,53,male,2,free,little,little,4870,24,car,bad,


In [166]:
for col in [df]:
    print(col)

     Age     Sex  Job Housing Saving accounts Checking account  Credit amount  \
0     67    male    2     own             NaN           little           1169   
1     22  female    2     own          little         moderate           5951   
2     49    male    1     own          little              NaN           2096   
3     45    male    2    free          little           little           7882   
4     53    male    2    free          little           little           4870   
..   ...     ...  ...     ...             ...              ...            ...   
995   31  female    1     own          little              NaN           1736   
996   40    male    3     own          little           little           3857   
997   38    male    2     own          little              NaN            804   
998   23    male    2    free          little           little           1845   
999   27    male    2     own        moderate         moderate           4576   

     Duration              

In [167]:
# populate age group column
for col in [df]:
    col.loc[(col['Age'] > 18) & (col['Age'] <= 29), 'Age_Group'] = 'Young'
    col.loc[(col['Age'] > 29) & (col['Age'] <= 40), 'Age_Group'] = 'Young Adults'
    col.loc[(col['Age'] > 40) & (col['Age'] <= 55), 'Age_Group'] = 'Senior'
    col.loc[col['Age'] > 55, 'Age_Group'] = 'Elder' 
    
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Age_Group
0,67,male,2,own,,little,1169,6,radio/TV,good,Elder
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,Young
2,49,male,1,own,little,,2096,12,education,good,Senior
3,45,male,2,free,little,little,7882,42,furniture/equipment,good,Senior
4,53,male,2,free,little,little,4870,24,car,bad,Senior


In [168]:
df['Sex'].value_counts()

male      690
female    310
Name: Sex, dtype: int64

In [169]:
df['Age_Group'].value_counts()

Young           371
Young Adults    355
Senior          203
Elder            71
Name: Age_Group, dtype: int64

In [170]:
def print_stats(df):
    print('mean: ', [round(num, 1) for num in list(df.mean())[1:]])
    print('variance: ', [round(num, 1) for num in list(df.var())[1:]])
    print('\n')
    zips = []
    for col in ['Housing', 'Saving accounts', 'Checking account',
       'Purpose', 'Risk']:
       vals = df[col].value_counts().values
       inds = df[col].value_counts().index
       z = list(zip(inds, vals))[:3]
       if col == 'Risk':
           z.append('-')
       zips.append(z)
       print(z)
       print('\n')
    print(list(zip(zips[0], zips[1], zips[2], zips[3], zips[4])))

In [171]:
df_male = df[df['Sex'] == 'male']
print_stats(df_male)

mean:  [1.9, 3448.0, 21.6]
variance:  [0.4, 8412806.3, 154.7]


[('own', 517), ('free', 89), ('rent', 84)]


[('little', 409), ('moderate', 71), ('quite rich', 47)]


[('little', 186), ('moderate', 183), ('rich', 43)]


[('car', 243), ('radio/TV', 195), ('furniture/equipment', 107)]


[('good', 499), ('bad', 191), '-']


[(('own', 517), ('little', 409), ('little', 186), ('car', 243), ('good', 499)), (('free', 89), ('moderate', 71), ('moderate', 183), ('radio/TV', 195), ('bad', 191)), (('rent', 84), ('quite rich', 47), ('rich', 43), ('furniture/equipment', 107), '-')]


In [172]:
df_female = df[df['Sex'] == 'female']
print_stats(df_female)

mean:  [1.8, 2877.8, 19.4]
variance:  [0.5, 6776346.3, 122.1]


[('own', 196), ('rent', 95), ('free', 19)]


[('little', 194), ('moderate', 32), ('rich', 19)]


[('little', 88), ('moderate', 86), ('rich', 20)]


[('car', 94), ('radio/TV', 85), ('furniture/equipment', 74)]


[('good', 201), ('bad', 109), '-']


[(('own', 196), ('little', 194), ('little', 88), ('car', 94), ('good', 201)), (('rent', 95), ('moderate', 32), ('moderate', 86), ('radio/TV', 85), ('bad', 109)), (('free', 19), ('rich', 19), ('rich', 20), ('furniture/equipment', 74), '-')]


In [173]:
df_Young = df[df['Age_Group'] == 'Young']
print_stats(df_Young)

mean:  [1.8, 3089.0, 20.8]
variance:  [0.3, 7261837.7, 142.6]


[('own', 248), ('rent', 113), ('free', 10)]


[('little', 242), ('moderate', 42), ('quite rich', 19)]


[('little', 115), ('moderate', 112), ('rich', 24)]


[('radio/TV', 117), ('car', 102), ('furniture/equipment', 84)]


[('good', 234), ('bad', 137), '-']


[(('own', 248), ('little', 242), ('little', 115), ('radio/TV', 117), ('good', 234)), (('rent', 113), ('moderate', 42), ('moderate', 112), ('car', 102), ('bad', 137)), (('free', 10), ('quite rich', 19), ('rich', 24), ('furniture/equipment', 84), '-')]


In [174]:
df_YoungAdults = df[df['Age_Group'] == 'Young Adults']
print_stats(df_YoungAdults)

mean:  [2.0, 3375.5, 21.5]
variance:  [0.4, 7646336.1, 139.2]


[('own', 278), ('free', 39), ('rent', 38)]


[('little', 201), ('moderate', 41), ('quite rich', 24)]


[('moderate', 100), ('little', 81), ('rich', 18)]


[('car', 128), ('radio/TV', 93), ('furniture/equipment', 58)]


[('good', 264), ('bad', 91), '-']


[(('own', 278), ('little', 201), ('moderate', 100), ('car', 128), ('good', 264)), (('free', 39), ('moderate', 41), ('little', 81), ('radio/TV', 93), ('bad', 91)), (('rent', 38), ('quite rich', 24), ('rich', 18), ('furniture/equipment', 58), '-')]


In [175]:
df_Senior = df[df['Age_Group'] == 'Senior']
print_stats(df_Senior)

mean:  [1.9, 3366.4, 20.2]
variance:  [0.4, 7986564.4, 146.1]


[('own', 143), ('free', 40), ('rent', 20)]


[('little', 117), ('moderate', 16), ('quite rich', 15)]


[('little', 57), ('moderate', 39), ('rich', 15)]


[('car', 79), ('radio/TV', 51), ('furniture/equipment', 36)]


[('good', 150), ('bad', 53), '-']


[(('own', 143), ('little', 117), ('little', 57), ('car', 79), ('good', 150)), (('free', 40), ('moderate', 16), ('moderate', 39), ('radio/TV', 51), ('bad', 53)), (('rent', 20), ('quite rich', 15), ('rich', 15), ('furniture/equipment', 36), '-')]


In [176]:
df_Elder = df[df['Age_Group'] == 'Elder']
print_stats(df_Elder)

mean:  [1.8, 3430.4, 20.5]
variance:  [0.7, 13329819.2, 192.5]


[('own', 44), ('free', 19), ('rent', 8)]


[('little', 43), ('rich', 5), ('quite rich', 5)]


[('little', 21), ('moderate', 18), ('rich', 6)]


[('car', 28), ('radio/TV', 19), ('business', 9)]


[('good', 52), ('bad', 19), '-']


[(('own', 44), ('little', 43), ('little', 21), ('car', 28), ('good', 52)), (('free', 19), ('rich', 5), ('moderate', 18), ('radio/TV', 19), ('bad', 19)), (('rent', 8), ('quite rich', 5), ('rich', 6), ('business', 9), '-')]


In [177]:
for group in [df_male, df_female, df_Young, df_YoungAdults, df_Senior, df_Elder]:
    print(round(sum(group['Risk'] == 'bad')/len(group) * 100, 1))

27.7
35.2
36.9
25.6
26.1
26.8


In [178]:
# Check missing values in our dataframe
df.isnull().sum().sort_values(ascending=False)

Checking account    394
Saving accounts     183
Age_Group             0
Risk                  0
Purpose               0
Duration              0
Credit amount         0
Housing               0
Job                   0
Sex                   0
Age                   0
dtype: int64

In [179]:
df.drop(['Checking account', 'Saving accounts'], axis=1, inplace=True)

In [180]:
df.isnull().sum().sort_values(ascending=False)

Age_Group        0
Risk             0
Purpose          0
Duration         0
Credit amount    0
Housing          0
Job              0
Sex              0
Age              0
dtype: int64

In [182]:
# Create set of only independant variables by dropping Risk
X = df.drop(['Risk'], axis=1)
X.head()

Unnamed: 0,Age,Sex,Job,Housing,Credit amount,Duration,Purpose,Age_Group
0,67,male,2,own,1169,6,radio/TV,Elder
1,22,female,2,own,5951,48,radio/TV,Young
2,49,male,1,own,2096,12,education,Senior
3,45,male,2,free,7882,42,furniture/equipment,Senior
4,53,male,2,free,4870,24,car,Senior


In [184]:
# Create a series of outcome variable only
y = df['Risk']
y.head()

0    good
1     bad
2    good
3    good
4     bad
Name: Risk, dtype: object

In [185]:
# split datasets into training and test subsets for both X and y using sklearn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=5)

In [186]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as a numeric array.
    The input to this transformer should be a matrix of integers or strings,
    denoting the values taken on by categorical (discrete) features.
    The features can be encoded using a one-hot aka one-of-K scheme
    (``encoding='onehot'``, the default) or converted to ordinal integers
    (``encoding='ordinal'``).
    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.
    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    Parameters
    ----------
    encoding : str, 'onehot', 'onehot-dense' or 'ordinal'
        The type of encoding to use (default is 'onehot'):
        - 'onehot': encode the features using a one-hot aka one-of-K scheme
          (or also called 'dummy' encoding). This creates a binary column for
          each category and returns a sparse matrix.
        - 'onehot-dense': the same as 'onehot' but returns a dense array
          instead of a sparse matrix.
        - 'ordinal': encode the features as ordinal integers. This results in
          a single column of integers (0 to n_categories - 1) per feature.
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:
        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories are sorted before encoding the data
          (used categories can be found in the ``categories_`` attribute).
    dtype : number type, default np.float64
        Desired dtype of output.
    handle_unknown : 'error' (default) or 'ignore'
        Whether to raise an error or ignore if a unknown categorical feature is
        present during transform (default is to raise). When this is parameter
        is set to 'ignore' and an unknown category is encountered during
        transform, the resulting one-hot encoded columns for this feature
        will be all zeros.
        Ignoring unknown categories is not supported for
        ``encoding='ordinal'``.
    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting. When
        categories were specified manually, this holds the sorted categories
        (in order corresponding with output of `transform`).
    Examples
    --------
    Given a dataset with three features and two samples, we let the encoder
    find the maximum value per feature and transform the data to a binary
    one-hot encoding.
    >>> from sklearn.preprocessing import CategoricalEncoder
    >>> enc = CategoricalEncoder(handle_unknown='ignore')
    >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
    ... # doctest: +ELLIPSIS
    CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
              encoding='onehot', handle_unknown='ignore')
    >>> enc.transform([[0, 1, 1], [1, 0, 4]]).toarray()
    array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.],
           [ 0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]])
    See also
    --------
    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
      integer ordinal features. The ``OneHotEncoder assumes`` that input
      features take on values in the range ``[0, max(feature)]`` instead of
      using the unique values.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.
        Returns
        -------
        self
        """

        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Transform X using one-hot encoding.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.
        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.
        """
        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue `The rows are marked `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

In [187]:
# Scikit-Learn does not handle dataframes in pipeline so we will create our own class.
# Reference: Hands-On Machine Learning
from sklearn.base import BaseEstimator, TransformerMixin
# Create a class to select numerical or cateogrical columns.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit (self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [188]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

numeric_train_df = X_train.select_dtypes(exclude=['object'])
numeric_test_df = X_test.select_dtypes(exclude=['object'])

categorical_train_df = X_train.select_dtypes(['object'])
categorical_test_df = X_test.select_dtypes(['object'])

numerical_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(numeric_train_df.columns.values.tolist())),
    ("std_scaler", StandardScaler()),
])

categorical_pipeline = Pipeline([
    ('select_categoric', DataFrameSelector(categorical_train_df.columns.values.tolist())),
    ('encoding', CategoricalEncoder(encoding='onehot-dense'))
])

# Combine both pipelines
main_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', numerical_pipeline),
    ('cat_pipeline', categorical_pipeline)
])

X_train_scaled = main_pipeline.fit_transform(X_train)
X_test_scaled = main_pipeline.fit_transform(X_test)

In [189]:
from sklearn.preprocessing import LabelEncoder

encode = LabelEncoder()
y_train_scaled = encode.fit_transform(y_train)
y_test_scaled = encode.fit_transform(y_test)

In [190]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Implement gridsearchcv to see which are our best p

params = {'C': [0.75, 0.85, 0.95, 1], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'degree': [3, 4, 5]}

svc_clf = SVC(random_state=42)

grid_search_cv = GridSearchCV(svc_clf, params)
grid_search_cv.fit(X_train_scaled, y_train_scaled)

GridSearchCV(estimator=SVC(random_state=42),
             param_grid={'C': [0.75, 0.85, 0.95, 1], 'degree': [3, 4, 5],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [191]:
grid_search_cv.best_estimator_

SVC(C=0.95, degree=4, kernel='poly', random_state=42)

In [192]:
grid_search_cv.best_params_


{'C': 0.95, 'degree': 4, 'kernel': 'poly'}

In [193]:
svc_clf = grid_search_cv.best_estimator_
svc_clf.fit(X_train_scaled, y_train_scaled)

SVC(C=0.95, degree=4, kernel='poly', random_state=42)

In [194]:
svc_clf.score(X_train_scaled, y_train_scaled)

0.8078078078078078

In [195]:
from sklearn.model_selection import cross_val_score

# Let's make sure the data is not overfitting
svc_clf = SVC(kernel='rbf', C=1, random_state=42)
scores = cross_val_score(svc_clf, X_train_scaled, y_train_scaled)
scores.mean()

0.7026820783301537

In [196]:
from sklearn.metrics import accuracy_score

svc_clf.fit(X_train_scaled, y_train_scaled)
y_pred = svc_clf.predict(X_test_scaled)

# Accuracy score
accuracy_score(y_test_scaled, y_pred)

0.7245508982035929