# Feature engineering

Przygotowanie do zajęć:
- najnowsza wersja scikit-learn: `pip install --upgrade scikit-learn`

In [1]:
import sklearn

In [2]:
# Author: Pedro Morales <part.morales@gmail.com>
#
# License: BSD 3 clause

from __future__ import print_function

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# Load the Titanic passenger table from a pinned revision on GitHub.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)

# Columns fed to the classifier.
#   numeric:     age, fare (floats)
#   categorical: embarked (strings 'C', 'S', 'Q'), sex (strings 'female',
#                'male'), pclass (ordinal integers 1, 2, 3)
numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']

print(data[numeric_features + categorical_features].head(15))

# Numeric branch: fill gaps with the median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with a constant label, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Preprocessing followed by the classifier gives one end-to-end estimator.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='lbfgs')),
])

y = data['survived']
X = data.drop(columns='survived')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

        age      fare embarked     sex  pclass
0   29.0000  211.3375        S  female       1
1    0.9167  151.5500        S    male       1
2    2.0000  151.5500        S  female       1
3   30.0000  151.5500        S    male       1
4   25.0000  151.5500        S  female       1
5   48.0000   26.5500        S    male       1
6   63.0000   77.9583        S  female       1
7   39.0000    0.0000        S    male       1
8   53.0000   51.4792        S  female       1
9   71.0000   49.5042        C    male       1
10  47.0000  227.5250        C    male       1
11  18.0000  227.5250        C  female       1
12  24.0000   69.3000        C  female       1
13  26.0000   78.8500        S  female       1
14  80.0000   30.0000        S    male       1
model score: 0.790


In [3]:
# `preprocessor` is the same object that sits inside the already fitted `clf`
# (Pipeline does not copy its steps when caching is off), so refitting it here
# on ten rows would silently change what `clf` predicts. Fit an unfitted copy
# instead; the displayed matrix is the same.
from sklearn.base import clone

pd.DataFrame(clone(preprocessor).fit_transform(X_train[:10]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.606584,-0.665334,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.296425,1.962814,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.130832,-0.669314,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,-1.439151,-0.668448,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,-0.130832,-0.39385,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
5,-1.320213,-0.671044,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
6,1.891116,-0.290032,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
7,-0.130832,-0.301105,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
8,-0.130832,-0.290032,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
9,0.701735,1.986345,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


### Zadanie 1

Zaimplementuj transformator, który wybiera z danych kolumny określonego typu.
* argument `column_type` - typ lub lista typów, które chcemy uwzględnić
* użyj metody pandasowej ramki danych `select_dtypes`

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnsSelectorByType(BaseEstimator, TransformerMixin):
    """
    Transformer to select columns of specified types.

    Exercise skeleton: the method bodies are placeholders (``...``) that are
    meant to be filled in.
    """

    def __init__(self):
        # Placeholder; the task statement asks for a `column_type` argument.
        ...

    def fit(self, X, y=None):
        # Placeholder for any fitting logic; returns the transformer itself.
        ...
        return self

    def transform(self, X):
        # Placeholder; the task statement suggests DataFrame.select_dtypes.
        ...

Rozwiązanie:

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnsSelectorByType(BaseEstimator, TransformerMixin):
    """
    Keep only the DataFrame columns whose dtype matches ``column_type``.

    Parameters
    ----------
    column_type : dtype, str, or list of those
        Passed unchanged as ``include`` to ``DataFrame.select_dtypes``.
    """

    def __init__(self, column_type):
        self.column_type = column_type

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` that have the configured dtype(s)."""
        wanted_dtypes = self.column_type
        return X.select_dtypes(include=wanted_dtypes)

In [None]:
# Selector that keeps the object-dtype columns.
col_sel = ColumnsSelectorByType(column_type=object)

In [None]:
# First five rows of the object-dtype columns of the raw frame.
X = col_sel.fit_transform(data)[:5]
print(X)

### Zadanie 2

Zaimplementuj transformer, który zamienia zmienne, w których odsetek braków danych przekracza `treshold` (ułamek z przedziału 0–1), na zmienne binarne z wartością 1 tam, gdzie jest dana wartość, oraz 0 tam, gdzie występuje brak.

In [None]:
class MissingIndicatorForSparseVariables(BaseEstimator, TransformerMixin):
    """
    Transformer to transform variables with more than treshold (%) missing values to binary - value/missing.

    Exercise skeleton: ``fit`` and ``transform`` are placeholders (``...``)
    that are meant to be filled in.
    """

    def __init__(self,treshold):
        # Missing-value threshold above which a column is turned into a flag.
        self.treshold = treshold

    def fit(self, X, y=None):
        # Placeholder for the fitting logic; returns the transformer itself.
        ...
        return self

    def transform(self, X):
        # Placeholder for the value/missing binarisation.
        ...

Rozwiązanie

In [5]:
class MissingIndicatorForSparseVariables(BaseEstimator, TransformerMixin):
    """
    Turn mostly-missing columns into 0/1 presence flags.

    A column is selected at fit time when its share of missing values (the
    mean of ``isnull()``, i.e. a fraction) is greater than ``treshold``.
    Selected columns are replaced by 1 where a value is present and 0 where it
    is missing; all other columns are passed through unchanged.
    """

    def __init__(self,treshold):
        self.treshold = treshold

    def fit(self,X,y=None):
        """Remember which columns of ``X`` exceed the missing-value threshold."""
        missing_share = X.isnull().mean()
        self.columns_to_transform = X.columns[missing_share > self.treshold]
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the remembered columns binarised."""
        Y = X.copy()
        sparse_columns = self.columns_to_transform
        Y[sparse_columns] = Y[sparse_columns].notnull().astype(int)
        return Y


In [None]:
# Binarise the columns of X whose share of missing values is above 0.3.
print(X)
MissingIndicatorForSparseVariables(0.3).fit_transform(X)

### Zadanie 3

Zaimplementuj transformer, który usuwa wybrane kolumny.

In [None]:
class DropColumns(BaseEstimator, TransformerMixin):
    """
    Transformer to drop specified columns.

    Exercise skeleton: the method bodies are placeholders (``...``) that are
    meant to be filled in.
    """

    def __init__(self):
        # Placeholder for storing which columns to drop.
        ...

    def fit(self, X, y=None):
        # Placeholder for any fitting logic; returns the transformer itself.
        ...
        return self

    def transform(self, X):
        # Placeholder for removing the chosen columns from X.
        ...

In [7]:
class DropColumns(BaseEstimator, TransformerMixin):
    """
    Remove a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns : label or list of labels
        Column names handed to ``DataFrame.drop``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame without the configured columns; ``X`` is not modified."""
        return X.drop(columns=self.columns)

In [None]:
# Raw frame without the name, boat and age columns.
DropColumns(columns=["name", "boat", "age"]).fit_transform(data)

### Zadanie 4

Zaimplementuj transformator, który redukuje zbiór wartości zmiennych nominalnych poprzez zastępowanie wartości występujących w co najwyżej `treshold` obserwacjach wartością `replace_value`, domyślnie równą `"rare_value"`.

Rozwiązanie

In [8]:
class ReduceRareValues(BaseEstimator, TransformerMixin):
    """
    Collapse infrequent values of every column into a single replacement value.

    ``fit`` records, for each column of the training frame, the values that
    occur in more than ``treshold`` rows. ``transform`` replaces every other
    non-missing value in those columns with ``replace_value``; missing values
    are left untouched.

    Parameters
    ----------
    treshold : number
        A value is kept only when its count in the fitted data is strictly
        greater than this.
    replace_value : object, default "rare_value"
        Value substituted for the values that are not kept.

    Attributes
    ----------
    values_to_leave : dict
        Set by ``fit``; maps each column name to the list of kept values.
    """

    def __init__(self, treshold, replace_value="rare_value"):
        self.treshold = treshold
        self.replace_value = replace_value

    def fit(self, X, y=None):
        """Record, per column of ``X``, the values frequent enough to keep."""
        self.values_to_leave = {}

        for column in X.columns:
            # value_counts() ignores missing values, so NaN is never "kept".
            counts = X[column].value_counts()
            self.values_to_leave[column] = list(
                counts[counts > self.treshold].index)

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the non-kept values replaced."""
        Y = X.copy()

        for column, most_frequent_values in self.values_to_leave.items():
            present_values = Y[column][Y[column].notnull()].unique()
            values_to_replace = np.setdiff1d(present_values,
                                             most_frequent_values)

            if len(values_to_replace) > 0:
                # Assign the result back instead of calling replace(inplace=True)
                # on the column selection: the in-place form acts on a
                # temporary object and does not update Y under pandas
                # Copy-on-Write.
                Y[column] = Y[column].replace(values_to_replace,
                                              self.replace_value)

        return Y

In [None]:
# NOTE(review): `X2` is not assigned anywhere in the visible notebook, so this
# cell would raise NameError on a fresh kernel - confirm where it should come
# from or whether the cell is a leftover.
X2

In [None]:
# Replace every value seen in at most 2000 training rows with 7.
ReduceRareValues(treshold=2000, replace_value=7).fit_transform(X_train)

Klasa pomocnicza:

In [9]:
class SimpleImputerWrapper(BaseEstimator, TransformerMixin):
    """
    ``SimpleImputer`` that returns a DataFrame instead of a NumPy array.

    The imputed values are wrapped back into a DataFrame that carries the
    column names seen in ``fit`` and the row index of the frame passed to
    ``transform``, so that DataFrame-based transformers can follow it in a
    pipeline.

    Parameters
    ----------
    strategy : str
        Forwarded to ``SimpleImputer(strategy=...)``.
    fill_value : object, default None
        Forwarded to ``SimpleImputer(fill_value=...)``.
    """

    def __init__(self, strategy, fill_value=None):
        # Constructor arguments are stored under their own names because
        # BaseEstimator.get_params() - and therefore clone(), set_params() and
        # repr() - looks them up as attributes.
        self.strategy = strategy
        self.fill_value = fill_value
        self.imputer = SimpleImputer(strategy=strategy,
                                     fill_value=fill_value)

    def fit(self, X, y=None):
        """Fit the wrapped imputer on ``X`` and remember its column names."""
        # Rebuilt from the current parameters so that changes made through
        # set_params() are honoured.
        self.imputer = SimpleImputer(strategy=self.strategy,
                                     fill_value=self.fill_value)
        self.imputer.fit(X, y)
        self.columns = X.columns
        return self

    def transform(self, X):
        """Impute ``X`` and return the result as a DataFrame."""
        # Keep the row labels of X; without `index=` the result would get a
        # fresh 0..n-1 index and no longer align with X or y.
        return pd.DataFrame(self.imputer.transform(X),
                            columns=self.columns,
                            index=X.index)

### Zadanie 5 

Z zaimplementowanych transformatorów skonstruuj pipeline do przetworzenia danych titanic od surowego zbioru do zbioru gotowego do modelowania i przetestuj model regresji logistycznej z domyślnymi parametrami. Pipeline ma przebiegać następująco:
1. Usuń kolumny: `body, boat, name, ticket, cabin, embarked, home.dest`
2. Podziel zbiór na zmienne numeryczne i kategoryczne - połącz oba po osobnym przetworzeniu

3a. Zmienne numeryczne - uzupełnij braki danych średnią

3b. Zmienne kategoryczne:
    - zmienne z brakami w ponad 50% obserwacji zamień na zmienne binarne
    - uzupełnij braki danych wartością `missing_value`
    - zredukuj wartości występujące w co najwyżej 20 obserwacjach
    - zakoduj te zmienne kodowaniem one-hot, zwracając macierz gęstą

```python
X = data.drop(["survived"],axis=1)
y = data.survived
```

In [10]:
# First ten rows of the raw Titanic frame.
data.head(n=10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S,3,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S,,,"Belfast, NI"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


In [11]:
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Separate the target from the predictors, then hold out a third of the rows.
y = data["survived"]
X = data.drop(columns=["survived"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [12]:
columns_to_drop = ['body', 'boat', 'name', 'ticket', 'cabin', 'embarked', 'home.dest']

# Numeric branch: keep the numeric columns and fill gaps with the column mean.
num_pipeline = Pipeline([
        ("select_numeric", ColumnsSelectorByType([np.number])),
        ("imputer", SimpleImputer(strategy="mean")),
    ])

# Categorical branch: keep the object columns, turn columns that are more
# than half missing into presence flags, fill the remaining gaps with a
# constant label, merge values seen in at most 20 rows, then one-hot encode.
cat_pipeline = Pipeline([
        ("select_cat", ColumnsSelectorByType(['object'])),
        ("to_binary", MissingIndicatorForSparseVariables(0.5)),
        ("imputer", SimpleImputerWrapper(strategy = 'constant',
                                         fill_value='missing_value')),
        ('reduce_rare', ReduceRareValues(20)),
        # `sparse_output` replaced the `sparse` argument in scikit-learn 1.2
        # (`sparse` was removed in 1.4); on versions older than 1.2 use
        # `sparse=False` instead.
        ('onehot', OneHotEncoder(handle_unknown='ignore',
                                 sparse_output=False))
    ])

# Drop the unused columns first, then run both branches on the remaining
# columns and concatenate their outputs side by side.
preprocess_pipeline = Pipeline([
    ('drop', DropColumns(columns_to_drop)),
    ('num_cat_merge', FeatureUnion([
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ]))
])

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [14]:
# Chain the preprocessing and a default logistic regression into one estimator.
model = make_pipeline(
    preprocess_pipeline,
    LogisticRegression()
)

model.fit(X_train, y_train)
# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the documented order keeps the call consistent with other metrics.
accuracy_score(y_test, model.predict(X_test))



0.8055555555555556