### Categorical encodings


In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2

In [5]:
# Load the credit data once. Later cells read from (and add columns to) a
# frame called `credit`, so it is defined here rather than left to whatever
# happens to be in the kernel. `X` is an independent copy used as the feature
# matrix, so columns added to `credit` later do not leak into `X`.
# NOTE(review): the CSV's saved row index comes through as an 'Unnamed: 0'
# column (see the output below) -- consider index_col=0 if it is not a feature.
credit = pd.read_csv('credit.csv')
X = credit.copy()
X.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,'<0',6,'critical/other existing credit',buy_radio_tv,1169,'no known savings','>=7',4,'male single',none,...,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,'0<=X<200',48,'existing paid',buy_radio_tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,...,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,...,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,'<0',42,'existing paid',buy_furniture_equipment,7882,'<100','4<=X<7',2,'male single',guarantor,...,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,'<0',24,'delayed previously',buy_new_car,4870,'<100','1<=X<4',3,'male single',none,...,'no known property',53,none,'for free',2,skilled,2,none,yes,bad


In [17]:
# Create numeric encoding for credit_history.
# The column is read from X (the frame loaded above) so this cell does not
# depend on a separately defined `credit` variable being left in the kernel.
credit_history_num = LabelEncoder().fit_transform(X['credit_history'])

# Create a new feature matrix including the numeric encoding.
# The encoded array is wrapped in a Series that reuses X's index and has an
# explicit name: concat(axis=1) aligns rows on index labels, and an unnamed
# Series would end up as a column called 0.
X_num = pd.concat(
  [X, pd.Series(credit_history_num, index=X.index, name='credit_history_num')],
  axis=1)

# Create new feature matrix with dummies for credit_history
# (one indicator column per distinct credit_history value)
X_hot = pd.concat([X, pd.get_dummies(X['credit_history'])], axis=1)

# Compare the number of features of the resulting DataFrames:
# label encoding adds one column, one-hot encoding adds one per category
print(X_hot.shape[1] > X_num.shape[1])

True


- We now have both encodings of `credit_history` at our fingertips: the label-encoded column in `X_num` and the one-hot (dummy) columns in `X_hot`.

In [20]:
# Inspect the one-hot feature matrix: the columns of X followed by one
# indicator column per credit_history value
X_hot

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,job,num_dependents,own_telephone,foreign_worker,class,'all paid','critical/other existing credit','delayed previously','existing paid','no credits/all paid'
0,'<0',6,'critical/other existing credit',buy_radio_tv,1169,'no known savings','>=7',4,'male single',none,...,skilled,1,yes,yes,good,0,1,0,0,0
1,'0<=X<200',48,'existing paid',buy_radio_tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,...,skilled,1,none,yes,bad,0,0,0,1,0
2,'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,...,'unskilled resident',2,none,yes,good,0,1,0,0,0
3,'<0',42,'existing paid',buy_furniture_equipment,7882,'<100','4<=X<7',2,'male single',guarantor,...,skilled,2,none,yes,good,0,0,0,1,0
4,'<0',24,'delayed previously',buy_new_car,4870,'<100','1<=X<4',3,'male single',none,...,skilled,2,none,yes,bad,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,'no checking',12,'existing paid',buy_furniture_equipment,1736,'<100','4<=X<7',3,'female div/dep/mar',none,...,'unskilled resident',1,none,yes,good,0,0,0,1,0
996,'<0',30,'existing paid',buy_used_car,3857,'<100','1<=X<4',4,'male div/sep',none,...,'high qualif/self emp/mgmt',1,yes,yes,good,0,0,0,1,0
997,'no checking',12,'existing paid',buy_radio_tv,804,'<100','>=7',4,'male single',none,...,skilled,1,none,yes,good,0,0,0,1,0
998,'<0',45,'existing paid',buy_radio_tv,1845,'<100','1<=X<4',4,'male single',none,...,skilled,1,yes,yes,bad,0,0,0,1,0


### Feature transformations
- The safest loan applications tend to request mid-range credit amounts. Values that are either too low or too high suggest high risk. This means that a non-linear relationship might exist between this variable and the class. 
- We want to test this hypothesis. 
- We will construct a non-linear transformation of the feature. Then, we will assess which of the two features is better at predicting the class, using `SelectKBest()` with the `chi2()` metric.
- Define a function that transforms a numeric vector by considering the absolute difference of each value from the average value of the vector.
- Apply this transformation to the `credit_amount` column of the dataset and store the result in a new column called `diff`.
- Create a `SelectKBest()` feature selector that picks one of the two columns, `credit_amount` and `diff`, using the `chi2()` metric.

In [28]:
# Absolute deviation of every value from the mean of the vector
def abs_diff(x):
    """Return the element-wise absolute difference between x and its mean.

    x is a numeric vector that supports subtraction of a scalar (for example
    a NumPy array or a pandas Series). The mean is taken with ``np.mean`` and
    the result has one entry per entry of x.
    """
    center = np.mean(x)
    deviation = x - center
    return np.abs(deviation)

# Transform the credit amount and keep the result on the data as `diff`
credit['diff'] = credit['credit_amount'].pipe(abs_diff)

# Build a chi2-scored selector that keeps exactly one feature and fit it on
# the two candidates, credit_amount and diff, against the class label
sk = SelectKBest(score_func=chi2, k=1).fit(
    credit[['credit_amount', 'diff']], credit['class'])

# Boolean mask over the candidates, in column order: True marks the kept one
sk.get_support()

array([ True, False])