In [45]:
import pandas as pd
from inspect import isfunction

def check_str(user_input):
    if type(user_input) != str:
        raise TypeError("Expected input is str, received: " + str(user_input))
def check_str_list(user_input):
    if not type(user_input) in [str, list]:
        raise TypeError("Expected input is str/list, received: " + str(user_input))
    if type(user_input) == str:
        [check_str(_) for _ in user_input]
def check_fns(user_input):
    if not isfunction(user_input):
        raise TypeError("Expected input is function, received: " + str(user_input))

def config_mapper(engineering_config):
    '''
    To Apply engineering config to data columns
    '''
    for new_column, required_col, function in data_engineering_config:
        if type(required_col) == str:
            # ONLY ONE COLUMN
            new_df[new_column] = new_df[required_col].apply(function)
        if type(required_col) == list:
            # Multi-column - axis=1
            new_df[new_column] = new_df[required_col].apply(function, axis=1)


In [78]:
def config_checker(engineering_config, default_columns=[]):
    '''
    To check provided config is structure.
    
    1. New column does not exist till that point of time in data
    2. First values is always a String
    3. Second values is either a String or list of strings
    4. Third value is a function
    '''
    
    # checking column names are progressive
    newly_added_columns = []

    def check_col_req(req_col):
        all_known_cols = list(default_columns + newly_added_columns)
        if type(req_col) == str:
            return req_col in all_known_cols
        if type(req_col) == list:
            unknown_cols = [_ for _ in req_col if _ not in all_known_cols]
            if unknown_cols:
                return False
            else:
                return True
    
    for new_col, req_col, new_col_func in engineering_config:
        check_str(new_col)
        check_str_list(req_col)
        check_fns(new_col_func)
        
        if check_col_req(req_col):
            print('Identified new column name:' + new_col)
            newly_added_columns.append(new_col)
        else:
            raise KeyError('Missing/Mis-spelled Column: ' + str(req_col))


In [79]:
sample_data = {
    'col1': [100, 200],
    'col2': [0.0003, 0.0004],
    'col3': ['x-12313', 'y-12313'],
}

sample_df = pd.DataFrame(sample_data)

sample_engineering_config = [
    ('new_col1', 'col1', lambda x: x/100),
    ('new_col2', 'new_col1', lambda x: x * 100), # col8 does nt exist
]

config_checker(sample_engineering_config, default_columns=sample_df.columns.tolist())

Identified new column name:new_col1
Identified new column name:new_col2


In [88]:
set(list(map(len, sample_engineering_config))) == set([3])

True

In [80]:
config_checker(sample_engineering_config, default_columns=sample_df.columns.tolist())

Identified new column name:new_col1
Identified new column name:new_col2


In [57]:
# col10 does not exist
sample_engineering_config = [
    ('new_col1', 'col10', lambda x: x/100)
]
config_checker(sample_engineering_config)

KeyError: 'Missing/Mis-spelled Column: col10'

In [61]:
# Instead of column name - a number is there
sample_engineering_config = [
    ('new_col1', ['col1'], lambda x: x/100)
]
config_checker(sample_engineering_config)

KeyError: "Missing/Mis-spelled Column: ['col1']"

In [62]:
# Instead of column name - a number is there
sample_engineering_config = [
    ('new_col1', [12313], lambda x: x/100)
]
config_checker(sample_engineering_config)

KeyError: 'Missing/Mis-spelled Column: [12313]'

In [48]:
# Instead of column name - a number is there
sample_engineering_config = [
    ('new_col1', 12313, lambda x: x/100)
]
config_checker(sample_engineering_config)

TypeError: Expected input is str/list, received: 12313

In [50]:
# function name is required
sample_engineering_config = [
    ('new_col1', 'col1', 'i am not a function ;)')
]
config_checker(sample_engineering_config)

TypeError: Expected input is function, received: i am not a function ;)

In [54]:
sample_df['asdf']

KeyError: 'asdf'