In [286]:
# Author: Pedro Morales <part.morales@gmail.com>
#
# License: BSD 3 clause

from __future__ import print_function

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# Read data from Titanic dataset.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)

# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

print(data[['age', 'fare','embarked', 'sex', 'pclass']].head(15))

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])

X = data.drop('survived', axis=1)
y = data['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

        age      fare embarked     sex  pclass
0   29.0000  211.3375        S  female       1
1    0.9167  151.5500        S    male       1
2    2.0000  151.5500        S  female       1
3   30.0000  151.5500        S    male       1
4   25.0000  151.5500        S  female       1
5   48.0000   26.5500        S    male       1
6   63.0000   77.9583        S  female       1
7   39.0000    0.0000        S    male       1
8   53.0000   51.4792        S  female       1
9   71.0000   49.5042        C    male       1
10  47.0000  227.5250        C    male       1
11  18.0000  227.5250        C  female       1
12  24.0000   69.3000        C  female       1
13  26.0000   78.8500        S  female       1
14  80.0000   30.0000        S    male       1
model score: 0.790


In [7]:
?SimpleImputer

In [2]:
pd.DataFrame(preprocessor.fit_transform(X_train[:10]))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.606584,-0.665334,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.296425,1.962814,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.130832,-0.669314,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,-1.439151,-0.668448,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,-0.130832,-0.39385,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
5,-1.320213,-0.671044,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
6,1.891116,-0.290032,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
7,-0.130832,-0.301105,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
8,-0.130832,-0.290032,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
9,0.701735,1.986345,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [5]:
preprocessor.

SyntaxError: invalid syntax (<ipython-input-5-0c1862a4e2bb>, line 1)

### Zadanie 0

Zaimplementuj standardScaler

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class MyStandardScaler(BaseEstimator, TransformerMixin):
    """
    Subtract mean and divide by std
    """
    
    def __init__(self, with_mean=True): 
        self.with_mean = with_mean
            
    def fit(self, X, y=None):
        
        if self.with_mean:
            self.means = X.mean(axis=0)
        self.stds = X.std(axis=0)
        
        return self

    def transform(self, X):
        if self.with_mean:
            return (X-self.means) / self.stds 
        return X / self.stds

In [4]:
a = pd.DataFrame(
        {"A":[1,2,3],
         "B":[5,6,7]}
)
a

Unnamed: 0,A,B
0,1,5
1,2,6
2,3,7


In [5]:
sc = MyStandardScaler(True)
sc

MyStandardScaler(with_mean=True)

In [6]:
sc.fit(a)

MyStandardScaler(with_mean=True)

In [7]:
sc.means

A    2.0
B    6.0
dtype: float64

In [8]:
sc.transform(a)

Unnamed: 0,A,B
0,-1.0,-1.0
1,0.0,0.0
2,1.0,1.0


### Zadanie 1

Zaimplementuj transformer, który usuwa wybrane kolumny.

In [279]:
class DropColumns(BaseEstimator, TransformerMixin):
    """
    Transformer to drop specified columns.
    """
    
    def __init__(self, columns): 
        self.columns = columns
            
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(columns = self.columns)

In [187]:
X.columns

Index(['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare',
       'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [11]:
drop = DropColumns(["pclass","name"])
drop.transform(X).columns

Index(['sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked',
       'boat', 'body', 'home.dest'],
      dtype='object')

### Zadanie 2

Zaimplementuj transformator, który wybiera z danych kolumny określonego typu.
* argument `column_type` - typ lub lista typów, które chcemy uwzględnić
* użyj metody pandasowej ramki danych `select_dtypes`

In [12]:
column_types = [np.int]
X.select_dtypes(column_types)[:5]

Unnamed: 0,pclass,sibsp,parch
0,1,0,0
1,1,1,2
2,1,1,2
3,1,1,2
4,1,1,2


In [280]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnsSelectorByType(BaseEstimator, TransformerMixin):
    """
    Transformer to select columns of specified types.
    """
    
    def __init__(self, column_type):    
        self.column_type = column_type
            
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.select_dtypes(self.column_type)

In [14]:
X.dtypes

pclass         int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [15]:
columns_select_types = ColumnsSelectorByType([np.int,np.object])
columns_select_types.transform(X).dtypes

pclass        int64
name         object
sex          object
sibsp         int64
parch         int64
ticket       object
cabin        object
embarked     object
boat         object
home.dest    object
dtype: object

Rozwiązanie:

### Zadanie 3

Zaimplementuj transormer, który zamienia zmienne, w których ponad `treshold` procent obserwacji zawiera brak danych, na zmienne binarne z wartościami 1, tam gdzie jest dana wartość oraz 0 tam, gdzie występuje brak.
* wykorzystaj `MissingIndicator` z podmodułu `sklearn.impute`

In [38]:
len(X)

1309

In [64]:
pd.Series([True]).astype(np.int)

0    1
dtype: int64

In [65]:
treshold = 0.3
columns_to_transform = X.columns[X.isnull().sum() / len(X) > treshold]
X[columns_to_transform].notnull().astype(np.int)[:5]
#Y = X.copy()
#Y[columns_to_transform] = Y[columns_to_transform].isnull().astype(np.int)
#Y[:5]

Unnamed: 0,cabin,boat,body,home.dest
0,1,1,0,1
1,1,1,0,1
2,1,0,0,1
3,1,0,1,1
4,1,0,0,1


In [281]:
class MissingIndicatorForSparse(BaseEstimator, TransformerMixin):
    """
    Transformer to transform variables with more than treshold (%) missing values to binary - value/missing.
    """
    
    def __init__(self, treshold):    
        self.treshold = treshold
            
    def fit(self, X, y=None):
        column_indicators = X.isnull().mean() > self.treshold
        self.columns_to_transform = X.columns[column_indicators]
        return self

    def transform(self, X):
        Y = X.copy()
        Y[self.columns_to_transform] = \
        Y[self.columns_to_transform].notnull().astype(np.int)
        return Y

In [75]:
miss_ind = MissingIndicatorForSparse(0.15)
miss_ind.fit(X)

MissingIndicatorForSparse(treshold=0.15)

In [76]:
miss_ind.transform(X)[:5]

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,1,S,1,0,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,1,S,1,0,1
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,1,S,0,0,1
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,1,S,0,1,1
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,1,S,0,0,1


In [69]:
X[:5]

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


Rozwiązanie

### Zadanie 4

Zaimplementuj transformator, który redukuje zbiór wartości zmiennych nominalnych poprzez zastępowanie wartości występujących w mniej niż `treshold` obserwacji wartością `replace_value`, domyślnie równą `"rare_value"`.

In [119]:
s = pd.Series(["a","b","a","a"], dtype="object")
df = pd.DataFrame({"s": s, "n": [1,2,3,4]})
df

Unnamed: 0,s,n
0,a,1
1,b,2
2,a,3
3,a,4


In [160]:
cat_columns = df.columns[df.dtypes == 'object']
cat_columns

Index(['s'], dtype='object')

In [127]:
df[cat_columns[0]].astype("category")

0    a
1    b
2    a
3    a
Name: s, dtype: category
Categories (2, object): [a, b]

In [161]:
columns = df.columns[df.dtypes == 'object']
treshold = 2
non_rares = {}
for column in columns:
    non_rares[column] = []
    # df[column].cat.add_categories(["rare_value"], inplace=True)
    cats = df[column].unique()
    for cat in cats:
        if (df[column] == cat).sum() >= treshold:
            non_rares[column].append(cat)
non_rares

{'s': ['a']}

Rozwiązanie

In [282]:
class ReduceRareValues(BaseEstimator, TransformerMixin):
    """
    Transformer to transform variables with more than treshold (%) missing values to binary - value/missing.
    """
    
    def __init__(self, treshold, replace_value = 'rare_value'):    
        self.treshold = treshold
        self.replace_value = replace_value
            
    def fit(self, X, y=None):
        self._obj_columns_ = X.columns[X.dtypes == 'object']
        self.non_rares = {}
        
        for column in self._obj_columns_:
            self.non_rares[column] = []
            cats = X[column].unique()
            for cat in cats:
                if (X[column] == cat).sum() >= treshold:
                    self.non_rares[column].append(cat)
                    
        return self

    def transform(self, X):
        Y = X.copy()
        for column, cats in self.non_rares.items():
            Y[column] = Y[column].astype("category")
            Y[column].cat.add_categories([self.replace_value], inplace=True)
            
            values_to_replace = np.setdiff1d(Y[column][Y[column].notnull()], \
                                             cats)
                    
            if len(values_to_replace):
                Y[column].replace(values_to_replace, self.replace_value, inplace=True)
#             print(values_to_replace)
#             Y[column].values not  
#             Y[column][inds] = "rare_value"

        return Y

In [252]:
pd.Series([None]).astype('category')

0    NaN
dtype: category
Categories (0, object): []

In [259]:
red_rares = ReduceRareValues(10)
red_rares.fit(X)

ReduceRareValues(replace_value='rare_value', treshold=10)

In [254]:
X.columns[X.dtypes == 'object']

Index(['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat', 'home.dest'], dtype='object')

In [255]:
red_rares._obj_columns_

Index(['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat', 'home.dest'], dtype='object')

In [256]:
red_rares.non_rares

{'name': ['Connolly, Miss. Kate', 'Kelly, Mr. James'],
 'sex': ['female', 'male'],
 'ticket': ['24160',
  '113781',
  '13502',
  '11769',
  'PC 17757',
  'PC 17477',
  '19877',
  'PC 17558',
  '11813',
  '13050',
  '11751',
  'PC 17483',
  '11967',
  'PC 17760',
  '36928',
  'PC 17608',
  '113505',
  '16966',
  '113798',
  'PC 17755',
  '113059',
  '113760',
  'W.E.P. 5734',
  '113806',
  '110152',
  '13508',
  '110465',
  'PC 17756',
  'WE/P 5735',
  'PC 17599',
  'F.C. 12750',
  '17474',
  '33638',
  'PC 17761',
  'PC 17485',
  '11767',
  '36947',
  '17421',
  '19950',
  'PC 17611',
  '13567',
  '112058',
  '113803',
  '113503',
  '112378',
  'PC 17593',
  '17453',
  'PC 17582',
  'PC 17759',
  '11765',
  'PC 17572',
  '113796',
  '36973',
  '12749',
  '17463',
  '111361',
  '113789',
  '19943',
  '113572',
  '17464',
  '11753',
  'PC 17592',
  'PC 17569',
  '113773',
  'PC 17604',
  '19928',
  '13236',
  '35273',
  'PC 17758',
  '113509',
  '113776',
  'PC 17585',
  'PC 17603',
  '1

In [260]:
red_rares.transform(X)[:5]

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,rare_value,female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,rare_value,male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,rare_value,female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,rare_value,male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,rare_value,female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [248]:
X[:5]

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### Zadanie 5 

Z zaimplementowanych transformatorów, skonstruuj pipeline do przetworzenia danych titanic od surowego zbioru do zbioru gotowego do modelowania i przetestuje model regresji logistycznej z domyślnymi parametrami. Pipeline ma przebiegać następująco:
1. Usuń kolumny: `body, boat, name, ticket, cabin, embarked, home.dest`
2. odziel zbiór na zmienne numeryczne i kategoryczne - połącz oba po osobnym przetworzeniu. Uzyć FeatureUnion

3a. Zmienne numeryczne - uzupełnij braki danych średnią

3b. Zmienne kategoryczne:
    - zmienne z brakami w ponad 50% obserwacji zamiań na zmienne binarne
    - uzupełnij braki danych wartością `missing_value`
    - zredukuj wartosci wystepujące w co najwyżej 20 obserwacjach
    - zakoduj te zmienne kodowaniem one-hot, zwracając macierz gęstą

In [300]:
class SimpleImputerWrapper(BaseEstimator, TransformerMixin):
    '''
    '''
    def __init__(self, strategy, fill_value = None):
        self.imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
    
    def fit(self, X, y=None):
        self.columns = X.columns
        self.imputer.fit(X,y)
        return self
    
    def transform(self, X, y=None):
        Y = self.imputer.transform(X)
        Y = pd.DataFrame(Y, columns = self.columns)
        return Y

In [305]:
SimpleImputerWrapper("constant", fill_value = 'dupa').fit_transform(pd.DataFrame({'a':['b','c', np.nan]}))

Unnamed: 0,a
0,b
1,c
2,dupa


In [None]:
data.head(10)

In [283]:
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

X = data.drop(["survived"],axis=1,inplace=False)
y = data.survived

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33)

In [None]:


flexibiuro0pc

20flexibiuro0pc15

In [314]:
from sklearn.linear_model import LinearRegression

In [330]:
num_pipeline = Pipeline([ ('', ColumnsSelectorByType([np.number]),
                          ('',  SimpleImputerWrapper(strategy='mean')
                        ])
objs_pipeline = Pipeline([('', ColumnsSelectorByType(['object'])),\
                          ('', MissingIndicatorForSparse(0.5)),\
                          ('', SimpleImputerWrapper(strategy='constant', fill_value='missing_value'),
                                                    ReduceRareValues(20)),\
                          ('', OneHotEncoder(handle_unknown='ignore',
                                             sparse = False))
                         ])

preprocess_pipeline = Pipeline([
    ('drop_cols', DropColumns(['body', 'boat', 'name', 'ticket', 'cabin', 'embarked', 'home.dest']) )
     ,
    ('do_nums_objs', FeatureUnion([('do_nums', num_pipeline),
                                   ('do_objs', objs_pipeline)
                                 ])
    )
    ])

pipeline = Pipeline([
    ('preprocess', preprocess_pipeline),
    ('model', LogisticRegression())    
])

In [340]:
preprocess_pipeline.fit(X_train)

Pipeline(memory=None,
     steps=[('drop_cols', DropColumns(columns=['body', 'boat', 'name', 'ticket', 'cabin', 'embarked', 'home.dest'])), ('do_nums_objs', FeatureUnion(n_jobs=None,
       transformer_list=[('do_nums', Pipeline(memory=None,
     steps=[('columnsselectorbytype', ColumnsSelectorByType(column_type=[<class 'nump...andle_unknown='ignore',
       n_values=None, sparse=False))]))],
       transformer_weights=None))])

In [341]:
preprocess_pipeline.transform(X_train)

array([[  3.        ,  25.        ,   0.        , ...,   7.925     ,
          0.        ,   1.        ],
       [  1.        ,  41.        ,   0.        , ..., 134.5       ,
          1.        ,   0.        ],
       [  3.        ,  30.05182319,   0.        , ...,   7.7333    ,
          0.        ,   1.        ],
       ...,
       [  3.        ,  30.05182319,   0.        , ...,   7.7333    ,
          1.        ,   0.        ],
       [  2.        ,  20.        ,   0.        , ...,  36.75      ,
          1.        ,   0.        ],
       [  3.        ,  32.        ,   1.        , ...,  15.5       ,
          1.        ,   0.        ]])

In [345]:
pipe = pipeline.fit(X_train, y_train)



In [351]:
pipe.score(X_test, y_test)

0.7900763358778626

In [353]:
accuracy_score(pipe.predict(X_test, y_test))

NameError: name 'accuracy_score' is not defined

In [346]:
pipe.predict(X_train)

array([0, 1, 0, ..., 1, 1, 1])

In [347]:
(~(pipe.predict(X_train) == y_train)).mean()

0.21203438395415472

In [None]:
from sklearn.model_selection