## Feature Engineering for categorical features
- String encoding via `LabelEncoder`

In [29]:
import os
import pandas  as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib   import Path
from sklearn.preprocessing           import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# set root directory
path_root = Path("C:/Users/giann/data-science-core")
os.chdir(path_root)
print(f'- Root directory = {os.getcwd()}')

- Root directory = C:\Users\giann\data-science-core


In [24]:
# import dataset
path_dataset = path_root / 'dataset/credit.csv'
credit       = pd.read_csv(path_dataset)
credit.head(3)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,'<0',6,'critical/other existing credit',buy_radio_tv,1169,'no known savings','>=7',4,'male single',none,...,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,'0<=X<200',48,'existing paid',buy_radio_tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,...,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,...,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good


In [17]:
credit['checking_status'].unique()

array(["'<0'", "'0<=X<200'", "'no checking'", "'>=200'"], dtype=object)

### Label Encoding
Encode the string columns numerically using `LabelEncoder`. Watch out when using LabelEncoder as a feature because the ML algorithm would think that there is a natural order of the number. One way to handle this problem is using **One-Hot-Encoding**.

In [19]:
non_numeric_columns = sales_df.select_dtypes(include = 'object').columns
# Create a label encoder for each column. Encode the values
for column in non_numeric_columns:
    le = LabelEncoder()
    credit[column] = le.fit_transform(credit[column])
# Inspect the data types of the columns of the data frame
print(credit.dtypes)

checking_status           int32
duration                  int64
credit_history            int32
purpose                   int32
credit_amount             int64
savings_status            int32
employment                int32
installment_commitment    int64
personal_status           int32
other_parties             int32
residence_since           int64
property_magnitude        int32
age                       int64
other_payment_plans       int32
housing                   int32
existing_credits          int64
job                       int32
num_dependents            int64
own_telephone             int32
foreign_worker            int32
class                     int32
dtype: object


In [21]:
credit['checking_status'].unique()

array([1, 0, 3, 2], dtype=int64)

### Categorical Encoding

In [35]:
credit['purpose'].unique()

**One-hot Encoding**

In [36]:
Xhot = pd.get_dummies(credit['purpose'])
Xhot.head()

Unnamed: 0,business,buy_domestic_appliance,buy_furniture_equipment,buy_new_car,buy_radio_tv,buy_used_car,education,other,repairs,retraining
0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,1,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0


**Keyword Encoding**

In [41]:
vec = CountVectorizer()
credit['purpose'] = credit['purpose'].apply(lambda s: ' '.join(s.split('_')), 0)
dummy_matrix = vec.fit_transform(credit['purpose']).toarray()
pd.DataFrame(dummy_matrix, columns=vec.get_feature_names()).head()

Unnamed: 0,appliance,business,buy,car,domestic,education,equipment,furniture,new,other,radio,repairs,retraining,tv,used
0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0
4,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0
