# Feature Engineering
# Klassifikation
# Regression
# Validierung und mehr
## Sampling und Resampling
## Validierungstechniken
## Grid Search und Random Search
## Performancemetriken
## Unbalancierte Daten
### Motivation und Vorbereitung 

In [2]:
## import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

In [3]:
## load the bank-additional-full data set and separate features from target
from os import chdir

datapath = '../3_data'
chdir(datapath)  ## the CSV below is read relative to this directory

data = pd.read_csv('bank-additional-full.csv', sep=';')
y_full = data['y']
X_full = data.drop(columns='y')

print('dim =', data.shape)
print(y_full.value_counts(normalize=True)) ## proportion of each class

dim = (41188, 21)
y
no     0.887346
yes    0.112654
Name: proportion, dtype: float64


In [4]:
## minimal feature engineering: one-hot encode the non-numerical features
## (drop_first=True removes the first level of every encoded feature)
X_full = pd.get_dummies(data=X_full, drop_first=True)

In [5]:
## split into train (2/3) and test (remaining 1/3) data with a fixed seed
from sklearn.model_selection import train_test_split

X_full_train, X_full_test, y_full_train, y_full_test = train_test_split(
    X_full, y_full, train_size=2/3, random_state=1234)

In [6]:
## helper to evaluate different sampling methods
##   trains a RandomForestClassifier model on the train data
##   prints
##     the model's internal score (accuracy) on the test data
##     the proportion of classes in the (resampled) train data

from sklearn.ensemble import RandomForestClassifier
def getResampledRfScore(X_train, y_train, X_test, y_test):
    """Fit a random forest on the given train data and print its evaluation.

    Prints the model's score on the test data, followed by the relative
    class frequencies of ``y_train``. Nothing is returned.

    Parameters
    ----------
    X_train, y_train : features and target used for fitting
        (``y_train`` must provide ``value_counts``).
    X_test, y_test : features and target used for scoring.
    """
    ## fixed seed so that repeated calls with the same data give the same forest
    forest = RandomForestClassifier(random_state=1234)
    forest.fit(X_train, y_train)

    test_score = forest.score(X_test, y_test)
    print('score ', test_score)

    class_shares = y_train.value_counts(normalize=True)
    print(class_shares)

In [7]:
## baseline: fit and score with the original (not resampled) train data
getResampledRfScore(
    X_train=X_full_train,
    y_train=y_full_train,
    X_test=X_full_test,
    y_test=y_full_test)

score  0.9131828113619811
y
no     0.886773
yes    0.113227
Name: proportion, dtype: float64


### Random under-sampling

In [9]:
from imblearn.under_sampling import RandomUnderSampler

## random under-sampling of the train data only; the test data stays untouched
rus = RandomUnderSampler(random_state=1234)
X_resampled_train, y_resampled_train = rus.fit_resample(
    X_full_train, y_full_train)

getResampledRfScore(
    X_train=X_resampled_train,
    y_train=y_resampled_train,
    X_test=X_full_test,
    y_test=y_full_test)

score  0.8481427530954115
y
no     0.5
yes    0.5
Name: proportion, dtype: float64


### Random over-sampling

In [11]:
from imblearn.over_sampling import RandomOverSampler

## random over-sampling of the train data only; the test data stays untouched
ros = RandomOverSampler(random_state=1234)
X_resampled_train, y_resampled_train = ros.fit_resample(
    X_full_train, y_full_train)

getResampledRfScore(
    X_train=X_resampled_train,
    y_train=y_resampled_train,
    X_test=X_full_test,
    y_test=y_full_test)

score  0.9043699927166788
y
no     0.5
yes    0.5
Name: proportion, dtype: float64


### Undersampling mit Tomek Links

In [13]:
from imblearn.under_sampling import TomekLinks

## under-sampling with Tomek links, applied to the train data only
tl = TomekLinks()
X_resampled_train, y_resampled_train =\
    tl.fit_resample(X_full_train, y_full_train)

getResampledRfScore(
    X_train=X_resampled_train,
    y_train=y_resampled_train,
    X_test=X_full_test,
    y_test=y_full_test)

found 0 physical cores < 1
  File "C:\Users\werne\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


score  0.9116533139111435
y
no     0.883063
yes    0.116937
Name: proportion, dtype: float64


### Oversampling mit SMOTE

In [15]:
from imblearn.over_sampling import SMOTE

## SMOTE generates its synthetic samples randomly; fix the seed (same value as
## for the other samplers and the model) so that the resampled train data and
## the resulting score are reproducible on every run
sm = SMOTE(random_state=1234)
X_resampled_train, y_resampled_train = sm.fit_resample(
    X_full_train, y_full_train)
getResampledRfScore(
    X_resampled_train, y_resampled_train, X_full_test, y_full_test)

score  0.902476329206118
y
no     0.5
yes    0.5
Name: proportion, dtype: float64


### Weights beim Trainieren

ref: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

* the formula used for `class_weight='balanced'`:

        n_samples / (n_classes * np.bincount(y))

* the class weights are calculated inversely proportional to the class frequencies in y

In [19]:
## random forest with class weights: class_weight='balanced'
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=1234,
).fit(X_full_train, y_full_train)
print(model.score(X_full_test, y_full_test))

0.9097596504005827


In [20]:
## with weights: the 'balanced' weights, set manually
##   weight = n_samples / (n_classes * class count), here with n_classes = 2
train_class_counts = y_full_train.value_counts()
n_no = train_class_counts['no']
n_yes = train_class_counts['yes']

n_train_total = len(y_full_train)
weight_no = n_train_total / (2 * n_no)
weight_yes = n_train_total / (2 * n_yes)
print(weight_no, weight_yes)

manual_class_weight = {'no': weight_no, 'yes': weight_yes}
model = RandomForestClassifier(
    n_estimators=100, class_weight=manual_class_weight, random_state=1234)

model.fit(X_full_train, y_full_train)
print(model.score(X_full_test, y_full_test))

0.5638424575957945 4.415889353489868
0.9097596504005827
