## GroupbyTransformer

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

# Load the "iris" example dataset through seaborn's dataset loader; the
# printed shape and the head() preview are this cell's displayed output.
df = sns.load_dataset("iris")
print(df.shape)
df.head()

(150, 5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [0]:
class GroupbyTransformer():
    """Append group-wise aggregate statistics to a DataFrame as new columns.

    ``param_dict`` is a list of specification dicts. Each specification has:

    * ``'key'`` (required): list of column names to group by.
    * ``'var'``: list of column names to aggregate. When absent, ``self.var``
      is used; this class never sets that attribute itself, so a subclass or
      the caller has to provide it.
    * ``'agg'``: list of aggregations (names or callables) handed to the
      pandas groupby ``agg`` call. When absent, ``self.agg`` is used under
      the same caveat as ``self.var``.
    * ``'on'``: column(s) used to merge the aggregates back; defaults to
      ``'key'``.

    Generated columns are named ``<agg>_<var>_groupby_<key1>_<key2>...``.
    """

    def __init__(self, param_dict=None):
        # Stored as given (possibly None); iteration goes through _specs()
        # so a default-constructed transformer is a harmless no-op.
        self.param_dict = param_dict

    def _specs(self):
        """Return the specification list, treating ``None`` as empty."""
        return self.param_dict or []

    def _get_params(self, p_dict):
        """Unpack one specification into ``(key, var, agg, on)``.

        Raises ``KeyError`` when ``'key'`` is missing and ``AttributeError``
        when ``'var'``/``'agg'`` is missing and the instance has no matching
        fallback attribute.
        """
        key = p_dict['key']
        # Conditional expressions keep the fallback lazy: self.var/self.agg
        # are only touched when the specification really omits the entry.
        var = p_dict['var'] if 'var' in p_dict else self.var
        agg = p_dict['agg'] if 'agg' in p_dict else self.agg
        on = p_dict.get('on', key)
        return key, var, agg, on

    def _aggregate(self, dataframe):
        """Compute one aggregated frame per specification into ``self.features``.

        Returns ``self``.
        """
        self.features = []
        for param_dict in self._specs():
            key, var, agg, on = self._get_params(param_dict)
            # De-duplicate while keeping a deterministic column order.
            all_features = list(dict.fromkeys(key + var))
            new_features = self._get_feature_names(key, var, agg)
            features = dataframe[all_features].groupby(key)[
                var].agg(agg).reset_index()
            # Flatten the (var, agg) column pairs into the generated names;
            # _get_feature_names iterates var-major to match that layout.
            features.columns = key + new_features
            self.features.append(features)
        return self

    def _merge(self, dataframe, merge=True):
        """Attach the frames stored by ``_aggregate`` to ``dataframe``.

        With ``merge=True`` each aggregated frame is left-joined on ``on``.
        With ``merge=False`` the new columns are concatenated column-wise.
        Returns the resulting DataFrame.
        """
        for param_dict, features in zip(self._specs(), self.features):
            key, var, agg, on = self._get_params(param_dict)
            if merge:
                dataframe = dataframe.merge(features, how='left', on=on)
            else:
                # NOTE(review): concat pairs rows by index label, and the
                # aggregated frame was re-indexed by reset_index(); confirm
                # this branch is only used when that alignment is intended.
                new_features = self._get_feature_names(key, var, agg)
                dataframe = pd.concat([dataframe, features[new_features]], axis=1)
        return dataframe

    def transform(self, dataframe):
        """Aggregate ``dataframe`` and return it with the new columns merged in."""
        self._aggregate(dataframe)
        return self._merge(dataframe, merge=True)

    def _get_feature_names(self, key, var, agg):
        """Build the generated column names for one specification.

        Callables in ``agg`` contribute their ``__name__``; strings are used
        as they are. Names are ordered by variable first, then aggregation.
        """
        agg_names = [a if isinstance(a, str) else a.__name__ for a in agg]
        return ['_'.join([a, v, 'groupby'] + key) for v in var for a in agg_names]

    def get_feature_names(self):
        """Return (and cache on ``self.feature_names``) all generated names."""
        self.feature_names = []
        for param_dict in self._specs():
            key, var, agg, on = self._get_params(param_dict)
            self.feature_names += self._get_feature_names(key, var, agg)
        return self.feature_names

    def get_numerical_features(self):
        """Return the generated feature names (same list as ``get_feature_names``)."""
        return self.get_feature_names()

In [3]:
# Work on a copy so the loaded `df` is not modified by this section, then
# display the column names as the cell's output.
train = df.copy()
train.columns.values

array(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'], dtype=object)

In [4]:
# Aggregations shared by several specifications below.
stats = ['mean', 'sum', 'min', 'max', 'median']
# Temporary "_k" duplicates of the sepal columns; they are aggregated under
# those names and dropped again once the transform has run.
var = ['sepal_length_k', 'sepal_width_k']
for c in ('sepal_length', 'sepal_width'):
    train[c + "_k"] = train[c]

# One entry per group-by specification: group key, aggregated columns,
# and the aggregations to apply.
groupby_dict = [
    dict(key=['species'], var=['petal_length'], agg=['count']),
    dict(
        key=['species'],
        var=['sepal_length_k', 'sepal_width_k'],
        agg=stats + ["var"],
    ),
    dict(key=['species'], var=['petal_length', 'petal_width'], agg=stats),
]

groupby = GroupbyTransformer(param_dict=groupby_dict)
train = groupby.transform(train)
# Remove the temporary duplicates; the aggregated columns stay.
train.drop(columns=var, inplace=True)

train.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,count_petal_length_groupby_species,mean_sepal_length_k_groupby_species,sum_sepal_length_k_groupby_species,min_sepal_length_k_groupby_species,max_sepal_length_k_groupby_species,...,mean_petal_length_groupby_species,sum_petal_length_groupby_species,min_petal_length_groupby_species,max_petal_length_groupby_species,median_petal_length_groupby_species,mean_petal_width_groupby_species,sum_petal_width_groupby_species,min_petal_width_groupby_species,max_petal_width_groupby_species,median_petal_width_groupby_species
0,5.1,3.5,1.4,0.2,setosa,50,5.006,250.3,4.3,5.8,...,1.462,73.1,1.0,1.9,1.5,0.246,12.3,0.1,0.6,0.2
1,4.9,3.0,1.4,0.2,setosa,50,5.006,250.3,4.3,5.8,...,1.462,73.1,1.0,1.9,1.5,0.246,12.3,0.1,0.6,0.2
2,4.7,3.2,1.3,0.2,setosa,50,5.006,250.3,4.3,5.8,...,1.462,73.1,1.0,1.9,1.5,0.246,12.3,0.1,0.6,0.2


## DiffGroupbyTransformer

In [0]:
class DiffGroupbyTransformer(GroupbyTransformer):
    """Add "group aggregate minus row value" columns to a DataFrame.

    For every specification in ``param_dict`` and every (aggregation,
    variable) pair, ``transform`` reads the existing column
    ``<agg>_<var>_groupby_<keys>`` and writes
    ``diff_<agg>_<var>_groupby_<keys>``. The base columns must already be
    present in the frame passed to ``transform``.
    """

    def _aggregate(self):
        # Not supported: this transformer only derives columns from
        # aggregates that already exist in the frame.
        raise NotImplementedError

    def _merge(self):
        raise NotImplementedError

    @staticmethod
    def _agg_names(agg):
        """Map aggregations to name strings (callables via ``__name__``)."""
        return [a if isinstance(a, str) else a.__name__ for a in agg]

    def transform(self, dataframe):
        """Write the diff columns into ``dataframe`` in place and return it.

        Raises ``KeyError`` when a required base column is missing.
        """
        for param_dict in self.param_dict:
            key, var, agg, on = self._get_params(param_dict)
            # Normalise once so callable aggregations yield the same column
            # names here as in _get_feature_names instead of breaking join().
            for a in self._agg_names(agg):
                for v in var:
                    new_feature = '_'.join(['diff', a, v, 'groupby'] + key)
                    base_feature = '_'.join([a, v, 'groupby'] + key)
                    dataframe[new_feature] = dataframe[base_feature] - dataframe[v]
        return dataframe

    def _get_feature_names(self, key, var, agg):
        """Return the diff column names for one specification (variable-major)."""
        _agg = self._agg_names(agg)
        return ['_'.join(['diff', a, v, 'groupby'] + key) for v in var for a in _agg]

In [0]:
# Aggregations shared by the specifications below.
stats = ['mean', 'sum', 'min', 'max', 'median']

# Group-by specifications: group key, aggregated columns, aggregations.
groupby_dict = [
    dict(key=['species'], var=['petal_length'], agg=['count']),
    dict(
        key=['species'],
        var=['sepal_length', 'sepal_width'],
        agg=stats + ["var"],
    ),
    dict(key=['species'], var=['petal_length', 'petal_width'], agg=stats),
]

In [7]:
# Start from a fresh copy of the loaded data.
train_diff = df.copy()

# First add the group aggregates, then derive the diff columns from them;
# the diff transformer reads the aggregate columns created by the first step.
groupby = GroupbyTransformer(param_dict=groupby_dict)
train_diff = groupby.transform(train_diff)
diff = DiffGroupbyTransformer(param_dict=groupby_dict)
train_diff = diff.transform(train_diff)

train_diff.head()
                

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,count_petal_length_groupby_species,mean_sepal_length_groupby_species,sum_sepal_length_groupby_species,min_sepal_length_groupby_species,max_sepal_length_groupby_species,...,diff_mean_petal_length_groupby_species,diff_mean_petal_width_groupby_species,diff_sum_petal_length_groupby_species,diff_sum_petal_width_groupby_species,diff_min_petal_length_groupby_species,diff_min_petal_width_groupby_species,diff_max_petal_length_groupby_species,diff_max_petal_width_groupby_species,diff_median_petal_length_groupby_species,diff_median_petal_width_groupby_species
0,5.1,3.5,1.4,0.2,setosa,50,5.006,250.3,4.3,5.8,...,0.062,0.046,71.7,12.1,-0.4,-0.1,0.5,0.4,0.1,0.0
1,4.9,3.0,1.4,0.2,setosa,50,5.006,250.3,4.3,5.8,...,0.062,0.046,71.7,12.1,-0.4,-0.1,0.5,0.4,0.1,0.0
2,4.7,3.2,1.3,0.2,setosa,50,5.006,250.3,4.3,5.8,...,0.162,0.046,71.8,12.1,-0.3,-0.1,0.6,0.4,0.2,0.0
3,4.6,3.1,1.5,0.2,setosa,50,5.006,250.3,4.3,5.8,...,-0.038,0.046,71.6,12.1,-0.5,-0.1,0.4,0.4,0.0,0.0
4,5.0,3.6,1.4,0.2,setosa,50,5.006,250.3,4.3,5.8,...,0.062,0.046,71.7,12.1,-0.4,-0.1,0.5,0.4,0.1,0.0


## RatioGroupbyTransformer

In [0]:
class RatioGroupbyTransformer(GroupbyTransformer):
    """Add "row value divided by group aggregate" columns to a DataFrame.

    For every specification in ``param_dict`` and every (aggregation,
    variable) pair, ``transform`` reads the existing column
    ``<agg>_<var>_groupby_<keys>`` and writes
    ``ratio_<agg>_<var>_groupby_<keys>``. The base columns must already be
    present in the frame passed to ``transform``.
    """

    def _aggregate(self):
        # Not supported: this transformer only derives columns from
        # aggregates that already exist in the frame.
        raise NotImplementedError

    def _merge(self):
        raise NotImplementedError

    @staticmethod
    def _agg_names(agg):
        """Map aggregations to name strings (callables via ``__name__``)."""
        return [a if isinstance(a, str) else a.__name__ for a in agg]

    def transform(self, dataframe):
        """Write the ratio columns into ``dataframe`` in place and return it.

        Raises ``KeyError`` when a required base column is missing.
        """
        for param_dict in self.param_dict:
            key, var, agg, on = self._get_params(param_dict)
            # Normalise once so callable aggregations yield the same column
            # names here as in _get_feature_names instead of breaking join().
            for a in self._agg_names(agg):
                for v in var:
                    new_feature = '_'.join(['ratio', a, v, 'groupby'] + key)
                    base_feature = '_'.join([a, v, 'groupby'] + key)
                    dataframe[new_feature] = dataframe[v] / dataframe[base_feature]
        return dataframe

    def _get_feature_names(self, key, var, agg):
        """Return the ratio column names for one specification (variable-major)."""
        _agg = self._agg_names(agg)
        return ['_'.join(['ratio', a, v, 'groupby'] + key) for v in var for a in _agg]

In [9]:
# Start from a fresh copy of the loaded data.
train_ratio = df.copy()

# First add the group aggregates, then derive the ratio columns from them;
# the ratio transformer reads the aggregate columns created by the first step.
groupby = GroupbyTransformer(param_dict=groupby_dict)
train_ratio = groupby.transform(train_ratio)
ratio = RatioGroupbyTransformer(param_dict=groupby_dict)
train_ratio = ratio.transform(train_ratio)

train_ratio.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,count_petal_length_groupby_species,mean_sepal_length_groupby_species,sum_sepal_length_groupby_species,min_sepal_length_groupby_species,max_sepal_length_groupby_species,...,ratio_mean_petal_length_groupby_species,ratio_mean_petal_width_groupby_species,ratio_sum_petal_length_groupby_species,ratio_sum_petal_width_groupby_species,ratio_min_petal_length_groupby_species,ratio_min_petal_width_groupby_species,ratio_max_petal_length_groupby_species,ratio_max_petal_width_groupby_species,ratio_median_petal_length_groupby_species,ratio_median_petal_width_groupby_species
0,5.1,3.5,1.4,0.2,setosa,50,5.006,250.3,4.3,5.8,...,0.957592,0.813008,0.019152,0.01626,1.4,2.0,0.736842,0.333333,0.933333,1.0
1,4.9,3.0,1.4,0.2,setosa,50,5.006,250.3,4.3,5.8,...,0.957592,0.813008,0.019152,0.01626,1.4,2.0,0.736842,0.333333,0.933333,1.0
2,4.7,3.2,1.3,0.2,setosa,50,5.006,250.3,4.3,5.8,...,0.889193,0.813008,0.017784,0.01626,1.3,2.0,0.684211,0.333333,0.866667,1.0
3,4.6,3.1,1.5,0.2,setosa,50,5.006,250.3,4.3,5.8,...,1.025992,0.813008,0.02052,0.01626,1.5,2.0,0.789474,0.333333,1.0,1.0
4,5.0,3.6,1.4,0.2,setosa,50,5.006,250.3,4.3,5.8,...,0.957592,0.813008,0.019152,0.01626,1.4,2.0,0.736842,0.333333,0.933333,1.0
