In [1]:
import numpy as np 
import pandas as pd 
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline



In [2]:
# Toy training frame: two numeric columns containing NaN gaps plus one
# categorical column.
data = dict(
    age=[40, 41, 45, 38, np.nan, np.nan, 56, 25, 30, 31, 40],
    income=[10, 20, 5, 8, 15, 30, 50, 3, 2, np.nan, np.nan],
    gender=[
        'male', 'male', 'female', 'male', 'female', 'female',
        'female', 'male', 'male', 'male', 'female',
    ],
)
X_train = pd.DataFrame(data)
X_train

Unnamed: 0,age,income,gender
0,40.0,10.0,male
1,41.0,20.0,male
2,45.0,5.0,female
3,38.0,8.0,male
4,,15.0,female
5,,30.0,female
6,56.0,50.0,female
7,25.0,3.0,male
8,30.0,2.0,male
9,31.0,,male


In [3]:
# Column means of the two numeric features, rounded to 2 decimals, as a dict.
X_train.loc[:, ['age', 'income']].mean().round(2).to_dict()

{'age': 38.44, 'income': 15.89}

In [4]:
# Re-display the training frame.
X_train

Unnamed: 0,age,income,gender
0,40.0,10.0,male
1,41.0,20.0,male
2,45.0,5.0,female
3,38.0,8.0,male
4,,15.0,female
5,,30.0,female
6,56.0,50.0,female
7,25.0,3.0,male
8,30.0,2.0,male
9,31.0,,male


In [5]:
# Scratch check: `in` on a list compares whole strings, so `s` must equal one
# of the entries exactly to count as included.
s = 'o d'
print('included' if s in ['omr', 'omd'] else 'excluded')

excluded


In [6]:
class OrganicScore(BaseEstimator, TransformerMixin):
    """Pipeline step whose behaviour is selected by ``transformation``.

    Supported modes:

    * ``'MeanImputer'`` - fill missing values in ``variables`` with the column
      means (rounded to 2 decimals) learned in ``fit``.
    * ``'TypeConverter'`` - cast the ``'age'`` column to ``float``.
    * ``'OrganicScoreCalc'`` - add ``'CrrScore_organic'`` computed as
      ``0.2 * age + 0.5 * income``.

    Parameters
    ----------
    transformation : str
        One of ``'TypeConverter'``, ``'OrganicScoreCalc'``, ``'MeanImputer'``.
    variables : list of str, optional
        Columns to impute. Required (as a list) when ``transformation`` is
        ``'MeanImputer'``; not used by the other modes.

    Attributes
    ----------
    imputer_dict_ : dict
        Column name -> rounded mean. Set by ``fit`` in ``'MeanImputer'`` mode only.

    Raises
    ------
    ValueError
        If ``transformation`` is not a supported mode, or if the mode is
        ``'MeanImputer'`` and ``variables`` is not a list.
    """

    # Single source of truth for the supported modes.
    _TRANSFORMATIONS = ('TypeConverter', 'OrganicScoreCalc', 'MeanImputer')

    def __init__(self, transformation, variables=None):
        if transformation not in self._TRANSFORMATIONS:
            # One message string: a stray comma here would turn the last mode
            # name into a second exception argument and garble the message.
            raise ValueError(
                '"transformation" has to take one of the following values: '
                'TypeConverter, OrganicScoreCalc, MeanImputer'
            )

        if transformation == 'MeanImputer' and not isinstance(variables, list):
            raise ValueError('variables should be a list and cannot be left empty')

        self.transformation = transformation
        self.variables = variables

    def fit(self, X, y=None):
        """Learn the rounded column means in ``'MeanImputer'`` mode.

        The other modes have nothing to learn; the method still exists so the
        object can be used as a scikit-learn pipeline step. ``y`` is ignored.

        Returns
        -------
        self
        """
        if self.transformation == 'MeanImputer':
            self.imputer_dict_ = X[self.variables].mean().round(2).to_dict()
        return self

    def transform(self, X):
        """Apply the selected transformation and return a new DataFrame.

        The input frame is copied first, so the caller's object is never
        modified regardless of the mode.
        """
        X = X.copy()
        if self.transformation == 'TypeConverter':
            X['age'] = X['age'].astype(float)
        elif self.transformation == 'OrganicScoreCalc':
            X['CrrScore_organic'] = 0.2 * X['age'] + 0.5 * X['income']
        elif self.transformation == 'MeanImputer':
            for var in self.variables:
                # Assign the filled column back: `X[var].fillna(..., inplace=True)`
                # acts on a temporary Series and is not reliable across pandas
                # versions (it is a no-op under Copy-on-Write).
                X[var] = X[var].fillna(self.imputer_dict_[var])
        return X

In [7]:
# Three chained steps, each an OrganicScore instance running in a different mode.
pm_pipe = Pipeline(steps=[
    # 1) impute missing age / income values with the mean
    ('mean_imputer', OrganicScore('MeanImputer', variables=['age', 'income'])),
    # 2) convert the data type of age to float
    ('type_conversion', OrganicScore('TypeConverter')),
    # 3) calculate the organic CRR score
    ('organic_CRR_score', OrganicScore('OrganicScoreCalc')),
])

In [8]:
# Small test frame; each of the two columns contains one missing value.
X_test = pd.DataFrame(dict(age=[8, 12, np.nan], income=[5, np.nan, 8]))
X_test

Unnamed: 0,age,income
0,8.0,5.0
1,12.0,
2,,8.0


In [9]:
# Fit every pipeline step on the training frame.
# NOTE(review): the later `fit_transform` cell fits the pipeline again, so this
# call looks redundant - confirm before removing.
pm_pipe.fit(X_train)

In [10]:
# Unrounded column means of the two numeric features.
X_train.loc[:, ['age', 'income']].mean()

age       38.444444
income    15.888889
dtype: float64

In [11]:
# Refit the pipeline on the training frame and keep the transformed result.
# NOTE(review): this rebinds `X_train`, so re-running the cell feeds the
# already-transformed frame back into the pipeline; a separate output name
# would keep the cell idempotent.
X_train = pm_pipe.fit_transform(X_train)

In [16]:
# Means learned by the imputer step during fitting.
pm_pipe.named_steps['mean_imputer'].imputer_dict_

{'age': 38.44, 'income': 15.89}

In [15]:
# Columns the imputer step was configured with.
pm_pipe.named_steps['mean_imputer'].variables

['age', 'income']

In [13]:
# Apply the fitted pipeline to the test frame (no refitting happens here).
# NOTE(review): this rebinds `X_test` to the transformed frame, so the raw
# test data is no longer available under this name afterwards.
X_test = pm_pipe.transform(X_test)

In [14]:
# Display the test frame after the pipeline transform.
X_test

Unnamed: 0,age,income,CrrScore_organic
0,8.0,5.0,4.1
1,12.0,15.89,10.345
2,38.44,8.0,11.688


In [4]:
class MeanImputer():
    """Fill missing values in selected columns with means learned in ``fit``.

    Parameters
    ----------
    variables : list of str
        Names of the columns to impute.

    Attributes
    ----------
    imputer_dict_ : dict
        Column name -> mean rounded to 2 decimals. Set by ``fit``.

    Raises
    ------
    ValueError
        If ``variables`` is not a list.
    """

    def __init__(self, variables):
        # Fail fast instead of printing a message and carrying on with input
        # that the column selection and loop below cannot handle.
        if not isinstance(variables, list):
            raise ValueError('variables should be a list and cannot be left empty')
        self.variables = variables

    def fit(self, X, y=None):
        """Store the rounded mean of every configured column; ``y`` is ignored."""
        self.imputer_dict_ = X[self.variables].mean().round(2).to_dict()
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced by the stored means."""
        X = X.copy()
        for var in self.variables:
            # Assign the filled column back: `X[var].fillna(..., inplace=True)`
            # acts on a temporary Series and is not reliable across pandas
            # versions (it is a no-op under Copy-on-Write).
            X[var] = X[var].fillna(self.imputer_dict_[var])
        return X

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the imputed copy of the same frame."""
        return self.fit(X).transform(X)