In [1]:
import pandas as pd

titanic_train = pd.read_csv('./datasets/train.csv')

In [2]:
X_train, y_train = titanic_train.drop("Survived", axis=1), titanic_train['Survived']

In [3]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
num_attr = ['Pclass, Age, SibSp, Parch, Fare']

In [5]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 59.2+ KB


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [7]:
import numpy as np

class AttribCombiner(BaseEstimator, TransformerMixin):
    def __init__(self):
        self = self
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        n_x = X[:, 2] + X[:, 3] + 1
        #X.drop(['SibSp', 'Parch'], axis=1)
        n_x = np.reshape(n_x, (-1,1))
        return np.append(X, n_x, axis=1)

In [129]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
        #("select_numeric", DataFrameSelector(["Pclass","Age", "SibSp", "Parch", "Fare"])),
        ("imputer", SimpleImputer(strategy="median")),
        ("combiner", AttribCombiner()),
    ])

In [130]:
x = num_pipeline.fit_transform(X_train)

In [131]:
x

array([[ 3.    , 22.    ,  1.    ,  0.    ,  7.25  ,  2.    ],
       [ 1.    , 38.    ,  1.    ,  0.    , 71.2833,  2.    ],
       [ 3.    , 26.    ,  0.    ,  0.    ,  7.925 ,  1.    ],
       ...,
       [ 3.    , 28.    ,  1.    ,  2.    , 23.45  ,  4.    ],
       [ 1.    , 26.    ,  0.    ,  0.    , 30.    ,  1.    ],
       [ 3.    , 32.    ,  0.    ,  0.    ,  7.75  ,  1.    ]])

In [20]:
class frequencyCell(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.mostFrequent = pd.Series([X[c].value_count[0] for c in X], index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.mostFrequent)

In [136]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, list(num_attr)),
    ("imputer", frequencyCell(), ["Sex", "Embarked"] ),
    ("cat_encoder", OneHotEncoder(), ["Sex", "Embarked"]),
])

In [137]:
x = full_pipeline.fit_transform(X_train)

ValueError: A given column is not a column of the dataframe