# Feature Engineering
## Einfuehrung
## Exploration
## Transformation
## Konstruktion
## Selektion
## Implementation
### Data Frame

In [2]:
## preparation: import libraries and read data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
%matplotlib inline

datapath = '../3_data'
from os import chdir

chdir(datapath)

data = pd.read_csv('bank_data.csv', sep=';')

#### E1: Entfernen von Beobachtungen nach Bedingung

In [4]:
## remove every observation whose age is 100 or more (age >= 100)
rows_to_remove = data.index[data.age >= 100]
data = data.drop(index=rows_to_remove)

#### E2: Entfernen von Duplikaten

In [6]:
## drop fully duplicated rows and renumber the index from 0
data = data.drop_duplicates(ignore_index=True)

#### E3: Entfernen fragwürdiger Variablen

In [8]:
## columns to remove; alternative selection: ['default', 'poutcome', 'duration']
vars_to_drop = ['default', 'poutcome']
data = data.drop(columns=vars_to_drop)

#### E4: Einsetzen von Werten für NAs

In [10]:
## create lists of names of categorical and numerical variables
## (object dtype -> categorical, everything else -> numerical)
cat_vars = data.select_dtypes(include='object').columns.tolist()
num_vars = data.select_dtypes(exclude='object').columns.tolist()

## import SimpleImputer class
from sklearn.impute import SimpleImputer

## impute categorical variables with the most frequent value per column
## NOTE(review): cat_vars contains every object column, so an object-typed
##               target column would be imputed as well - confirm this is intended
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
## fit_transform returns a plain array; rebuild the frame with the ORIGINAL
## index so the column assignment aligns row by row even when data does not
## carry a default 0..n-1 index (otherwise misaligned rows become NaN)
data[cat_vars] = pd.DataFrame(
    imp_mode.fit_transform(data[cat_vars]),
    columns=cat_vars,
    index=data.index)

## impute numerical variables with the column median
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
data[num_vars] = pd.DataFrame(
    imp_median.fit_transform(data[num_vars]),
    columns=num_vars,
    index=data.index)

### Kategoriale Variablen

#### E5: Reduzieren der Kardinalität

In [13]:
## education: merge the level 'illiterate' into 'basic.4y'
data['education'] = data['education'].mask(
    data['education'].eq('illiterate'),
    'basic.4y')

#### Nummerisieren - Faktorisieren (Platzhalter)
hier kein Bedarf

#### E6: Nummerisieren - Ordinal Encodieren

In [16]:
## ordinal encoding of education, month and day_of_week
## outer key = column name, inner dict = category label -> integer rank
## labels that are not listed in a mapping are left unchanged by replace
replace_nums = {
    'education': {
        'basic.4y': 1,
        'basic.6y': 2,
        'basic.9y': 3,
        'professional.course': 4,
        'high.school': 5,
        'university.degree': 6
    },
    'month': {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr': 4,
        'may': 5,
        'jun': 6,
        'jul': 7,
        'aug': 8,
        'sep': 9,
        'oct': 10,
        'nov': 11,
        'dec': 12
    },
    'day_of_week': {
        'mon': 1,
        'tue': 2,
        'wed': 3,
        'thu': 4,
        'fri': 5
    }
}
## replace() on object columns only yields integer columns through silent
## downcasting, which pandas has deprecated; infer_objects() makes the
## conversion explicit so the encoded columns do not stay 'object' and get
## picked up again by a later object-dtype selection
data = data.replace(replace_nums).infer_objects()

#### E7: Nummerisieren - Binär Encodieren

In [18]:
## housing: 'no' -> 0, every other value -> 1
data['housing'] = data['housing'].ne('no').astype(int)

## contact: rename the column to contact_cellular, then
## 'cellular' -> 1, every other value -> 0
data = data.rename(columns={'contact': 'contact_cellular'})
data['contact_cellular'] = data['contact_cellular'].eq('cellular').astype(int)

#### E8: Nummerisieren - One-Hot Encodieren

In [20]:
## one-hot encoding (dummy variables, first level of each variable dropped)
## applied to all remaining object-typed columns except the target
target = 'y'
object_cols = data.select_dtypes(include=['object']).columns
sel_vars = object_cols.drop(target)
data = pd.get_dummies(
    data,
    columns=sel_vars,
    drop_first=True)

### Numerische Variablen

#### E9: Logarithmieren

In [23]:
## log10-transform duration and campaign
## shift each column by (- min + 1) first, so the smallest value maps to
## log10(1) = 0 and the argument of the logarithm is always >= 1;
## adding the minimum instead would not anchor the scale and would produce
## NaN for a negative minimum
data.duration = np.log10(data.duration - data.duration.min() + 1)
data.campaign = np.log10(data.campaign - data.campaign.min() + 1)

#### E10: Binär umcodieren

In [25]:
## pdays: the value 999 becomes 0, every other value becomes 1
data['pdays'] = data['pdays'].ne(999).astype(int)

## previous: values greater than 0 become 1, all others 0
data['previous'] = data['previous'].gt(0).astype(int)

### Andere Tätigkeiten

#### Konstruktion (Platzhalter)
hier kein Bedarf

#### E11: Bereinigen der Variablennamen

In [29]:
## replace every character that is not a letter, digit or underscore
## in the column names with an underscore
old_names = data.columns
new_names = old_names.str.replace('[^a-zA-Z0-9_]', '_', regex=True)
## apply all renames in a single call instead of one in-place rename per column
data = data.rename(columns=dict(zip(old_names, new_names)))

#### Standardisieren (Platzhalter)
hier kein Bedarf

#### E12: Speichern unter neuem Namen

In [32]:
## write the prepared data to bank_data_prep.csv in the current working directory
## parameters
##   sep = ','     (pandas default, spelled out here)
##   index = False (the default True would add an index column on the left)
data.to_csv(path_or_buf='bank_data_prep.csv', sep=',', index=False)