In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class DoubleAndMissingIndicator(BaseEstimator, TransformerMixin):
    """Append a 0/1 missing-value indicator column for each selected column.

    For every name ``col`` in ``numerical_columns``, :meth:`transform` adds a
    new column named ``col + '_nan'`` that holds 1 where the value in ``col``
    is missing and 0 otherwise. The input columns themselves are not changed.

    Parameters
    ----------
    numerical_columns : list of str
        Names of the columns to derive missing-value indicators from.
    """

    def __init__(self, numerical_columns):
        # Store the parameter untouched and derive nothing here, so that
        # get_params()/set_params()/clone() keep working as scikit-learn expects.
        self.numerical_columns = numerical_columns

    @property
    def transformed_columns(self):
        """Names of the indicator columns that :meth:`transform` adds.

        Computed on access so it always reflects the current
        ``numerical_columns`` (e.g. after ``set_params``).
        """
        return [col + '_nan' for col in self.numerical_columns]

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer has no fitted state."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one ``<col>_nan`` indicator per column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every column listed in ``numerical_columns``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with the integer indicator columns appended; ``X``
            itself is left unmodified.

        Raises
        ------
        KeyError
            If a column listed in ``numerical_columns`` is absent from ``X``.
        """
        X_transformed = X.copy()
        for col in self.numerical_columns:
            # Series.isna() works for any dtype (object, nullable ints, ...),
            # whereas np.isnan raises TypeError on non-numeric columns.
            X_transformed[col + '_nan'] = X[col].isna().astype(int)
        return X_transformed


In [3]:
# Read the raw dataset from the CSV file next to the notebook.
# NOTE(review): the displayed result contains an 'Unnamed: 0' column, which
# suggests the file stores its row index as the first column — confirm whether
# it should be read with index_col=0.
df = pd.read_csv("raw-data.csv")

# Columns that receive a '<name>_nan' missing-value indicator.
numerical_cols = [
    'pclass',
    'age',
    'sibsp',
    'parch',
    'fare',
]

# fit() on this transformer is a no-op, so transform() is called directly.
transformer = DoubleAndMissingIndicator(numerical_columns=numerical_cols)
df_transformed = transformer.transform(df)


In [4]:
# Preview up to the first 20 rows to check the appended '<col>_nan' indicator columns.
df_transformed.head(20)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked,title,pclass_nan,age_nan,sibsp_nan,parch_nan,fare_nan
0,1,1,female,29.0,0,0,211.3375,B5,S,Miss,0,0,0,0,0
1,1,1,male,0.9167,1,2,151.55,C22,S,Master,0,0,0,0,0
2,1,0,female,2.0,1,2,151.55,C22,S,Miss,0,0,0,0,0
3,1,0,male,30.0,1,2,151.55,C22,S,Mr,0,0,0,0,0
4,1,0,female,25.0,1,2,151.55,C22,S,Mrs,0,0,0,0,0
5,1,1,male,48.0,0,0,26.55,E12,S,Mr,0,0,0,0,0
6,1,1,female,63.0,1,0,77.9583,D7,S,Miss,0,0,0,0,0
7,1,0,male,39.0,0,0,0.0,A36,S,Mr,0,0,0,0,0
8,1,1,female,53.0,2,0,51.4792,C101,S,Mrs,0,0,0,0,0
9,1,0,male,71.0,0,0,49.5042,,C,Mr,0,0,0,0,0
