In [4]:
import pandas as pd
import numpy as np
import warnings
# Notebook-wide pandas display settings: render up to 500 rows and 100 columns,
# and let a single column be up to 1000 characters wide before it is truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.options.display.max_colwidth = 1000
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [38]:
class DataProcessing():
    """Feature-engineering helpers for a train/test pair of pandas DataFrames.

    Every statistic (medians, modes, value frequencies, ...) is computed on the
    training frame and then applied to both frames, with the exception of
    ``coding_by_column`` (see its docstring).  The frames passed to the
    constructor are never modified.  Methods that return a DataFrame stack the
    train rows on top of the test rows and keep each frame's own index, so the
    outputs of different methods line up row by row.
    """

    def __init__(self, train, test):
        """Remember the training and test frames.

        Args:
            train: training DataFrame.
            test: test DataFrame; expected to contain the same feature columns.
        """
        self.train = train
        self.test = test

    def categorial_columns(self):
        """Return the labels of the train columns whose dtype is ``object``."""
        return self.train.columns[self.train.dtypes == 'object']

    def numeric_columns(self):
        """Return the non-object train columns that have more than two distinct values.

        Columns with at most two distinct values (NaN counts as a value) are
        treated as flags/targets and left out.  The result follows the column
        order of the training frame.
        """
        categorial = set(self.categorial_columns())
        # Build a new list instead of removing from the list being iterated,
        # which used to skip elements and made the result order-dependent.
        return [
            column for column in self.train.columns
            if column not in categorial
            and self.train[column].nunique(dropna=False) > 2
        ]

    def categorial_column_unique_values(self, part=0.01):
        """Map each categorical column to its values with a train share above ``part``.

        Args:
            part: minimal share (strictly greater than) among the non-missing
                train values for a value to be kept.

        Returns:
            dict ``{column: Index of frequent values}``.
        """
        frequent_values = {}
        for column in self.categorial_columns():
            shares = self.train[column].value_counts(normalize=True)
            frequent_values[column] = shares[shares > part].index
        return frequent_values

    def fillna(self, nan='Median', column_index=False):
        """Fill missing values in both frames using statistics of the training frame.

        Args:
            nan: strategy name.
                ``'Median'`` / ``'Mean'`` - numeric columns get the train
                median / mean, categorical columns get the most frequent
                non-missing train value.
                ``'Null'`` - numeric columns get ``0``, categorical columns get
                ``'No_data'``.
            column_index: currently unused; kept for interface compatibility.

        Returns:
            The train rows followed by the test rows, with an extra
            ``TRAIN_TEST`` column holding ``'TRAIN'`` or ``'TEST'``.  For an
            unknown ``nan`` value the string ``'Invaild parametr'`` is returned
            instead (kept for backward compatibility).
        """
        if nan not in ('Median', 'Mean', 'Null'):
            return 'Invaild parametr'

        train_ = self.train.copy()
        test_ = self.test.copy()

        for column in self.numeric_columns():
            if nan == 'Median':
                value = train_[column].median()
            elif nan == 'Mean':
                value = train_[column].mean()
            else:
                value = 0
            # Assign the result back: chained ``df[col].fillna(..., inplace=True)``
            # does not reliably update the frame under copy-on-write.
            train_[column] = train_[column].fillna(value)
            test_[column] = test_[column].fillna(value)

        for column in self.categorial_columns():
            if nan == 'Null':
                value = 'No_data'
            else:
                # ``mode`` ignores NaN, so a mostly-missing column is no longer
                # "filled" with NaN; ties resolve to the first mode returned.
                modes = train_[column].mode()
                if modes.empty:
                    continue  # column is entirely missing in train
                value = modes.iloc[0]
            train_[column] = train_[column].fillna(value)
            test_[column] = test_[column].fillna(value)

        train_['TRAIN_TEST'] = 'TRAIN'
        test_['TRAIN_TEST'] = 'TEST'
        return pd.concat([train_, test_])

    def coding_by_column(self, categorial_columns, numeric_columns, func='mean'):
        """Encode each categorical column by an aggregate of each numeric column.

        For every pair ``(cat, num)`` a column ``<cat>_<num>_<FUNC>`` is built
        that holds, for each row, ``func`` of ``num`` over all rows sharing the
        row's ``cat`` value.  Missing categorical values are grouped under ``0``.

        NOTE(review): as in the original implementation, the aggregates for the
        test rows are computed on the test frame itself, not on train.

        Args:
            categorial_columns: iterable of grouping column names.
            numeric_columns: iterable of column names to aggregate.
            func: one of ``'mean'``, ``'max'``, ``'min'``, ``'sum'``.

        Returns:
            DataFrame with only the new columns (train rows, then test rows), or
            the string ``'Argument func has invalid value'`` for an unsupported
            ``func`` (kept for backward compatibility).
        """
        if func not in ['mean', 'max', 'min', 'sum']:
            return 'Argument func has invalid value'

        encoded_parts = []
        for frame in (self.train, self.test):
            encoded = pd.DataFrame(index=frame.index)
            for cat in categorial_columns:
                keys = frame[cat].fillna(0)
                for num in numeric_columns:
                    # sort=False: the keys may mix 0 with strings.
                    stats = frame[num].groupby(keys.values, sort=False).agg(func)
                    encoded[cat + '_' + num + '_' + func.upper()] = keys.map(stats).values
            encoded_parts.append(encoded)
        return pd.concat(encoded_parts)

    def add_nan_index(self):
        """Build 0/1 missing-value indicators for every train column that has NaN.

        Returns:
            DataFrame with one ``<column>=NAN`` column per such column (train
            rows, then test rows); 1 marks a missing value in that column.
        """
        nan_columns = self.train.columns[self.train.isna().any()].tolist()
        flag_names = [column + '=NAN' for column in nan_columns]

        flag_parts = []
        for frame in (self.train, self.test):
            # Flag each column by its own missing values (previously every
            # indicator was computed from the hard-coded 'CITY' column).
            flags = frame[nan_columns].isna().astype(int)
            flags.columns = flag_names
            flag_parts.append(flags)
        return pd.concat(flag_parts)

    def get_data(self):
        """Return train rows followed by test rows with a ``TRAIN_TEST`` marker column."""
        train_ = self.train.copy()
        test_ = self.test.copy()
        train_['TRAIN_TEST'] = 'TRAIN'
        test_['TRAIN_TEST'] = 'TEST'
        return pd.concat([train_, test_])

    def value_counts(self):
        """Frequency-encode the categorical columns with their train shares.

        Missing values are treated as the category ``'NO_DATA'``.  Values that
        never occur in train get a share of 0.

        Returns:
            DataFrame with one ``<column>_PART`` column per categorical column
            (train rows, then test rows).
        """
        categorial = list(self.categorial_columns())
        train_cat = self.train[categorial].fillna('NO_DATA')
        test_cat = self.test[categorial].fillna('NO_DATA')

        train_out = pd.DataFrame(index=self.train.index)
        test_out = pd.DataFrame(index=self.test.index)
        for column in categorial:
            shares = train_cat[column].value_counts(normalize=True)
            # ``map`` avoids the reset_index/rename/merge round trip, whose
            # column names differ between pandas 1.x and 2.x.
            train_out[column + '_PART'] = train_cat[column].map(shares).values
            test_out[column + '_PART'] = test_cat[column].map(shares).values
        return pd.concat([train_out, test_out]).fillna(0)

    def value_counts_by_target(self):
        """Frequency-encode categorical columns separately per target class.

        For each categorical column two columns are produced,
        ``<column>_PART_TARGET=1`` and ``<column>_PART_TARGET=0``: the share of
        the row's value among the train rows with ``TARGET == 1`` /
        ``TARGET == 0``.  Missing values are treated as ``'NO_DATA'``; values
        unseen in the respective class get 0.  Requires a ``TARGET`` column in
        the training frame.

        Returns:
            DataFrame containing only the new columns, all ``TARGET=1`` columns
            first and then all ``TARGET=0`` columns (train rows, then test rows).
        """
        categorial = list(self.categorial_columns())
        train_cat = self.train[categorial].fillna('NO_DATA')
        test_cat = self.test[categorial].fillna('NO_DATA')
        target = self.train['TARGET']

        train_out = pd.DataFrame(index=self.train.index)
        test_out = pd.DataFrame(index=self.test.index)
        for target_value in (1, 0):
            in_class = (target == target_value).values
            for column in categorial:
                shares = train_cat[column][in_class].value_counts(normalize=True)
                name = column + '_PART_TARGET=' + str(target_value)
                train_out[name] = train_cat[column].map(shares).fillna(0).values
                test_out[name] = test_cat[column].map(shares).fillna(0).values
        # Only the new columns are returned; the old drop-list approach let
        # two-valued non-target columns leak into the result.
        return pd.concat([train_out, test_out])

    def getdummies(self, part=0.1):
        """One-hot encode the frequent values of every categorical column.

        Args:
            part: minimal train share (see ``categorial_column_unique_values``)
                for a value to receive its own indicator column.

        Returns:
            DataFrame of 0/1 columns named ``<column>=<value>`` (train rows,
            then test rows).  A cell is 1 when it equals ``str(value)``.
        """
        frequent_values = self.categorial_column_unique_values(part)

        train_out = pd.DataFrame(index=self.train.index)
        test_out = pd.DataFrame(index=self.test.index)
        for column in self.categorial_columns():
            for value in frequent_values[column]:
                name = column + '=' + str(value)
                train_out[name] = (self.train[column] == str(value)).astype(int).values
                test_out[name] = (self.test[column] == str(value)).astype(int).values
        return pd.concat([train_out, test_out])

In [39]:
# Toy training sample: five people, with SEX and CITY missing for the first row.
train = pd.DataFrame(
    data=[
        ['Sayan', np.nan, np.nan, 190, 90, 1],
        ['Batima', 'F', 'Kostanay', 162, 54, 0],
        ['Nadira', 'F', 'Astana', 165, 50, 0],
        ['Zhanbolat', 'M', 'Aqtobe', 179, 70, 1],
        ['Azamat', 'M', 'Aqtobe', 179, 75, 1],
    ],
    columns=['NAME', 'SEX', 'CITY', 'HEIGHT', 'WEIGHT', 'TARGET'],
)

# Toy test sample: TARGET is an empty string in every row, and a few
# CITY / HEIGHT / WEIGHT cells are missing.
test = pd.DataFrame(
    data=[
        ['Madiyar', 'M', 'Kostanay', 185, 72, ''],
        ['Marat', 'M', 'Pavlodar', 180, np.nan, ''],
        ['Dair', 'M', 'Karaganda', 184, 80, ''],
        ['Madina', 'F', np.nan, np.nan, 52, ''],
        ['Perizat', 'F', 'Pavlodar', 165, 50, ''],
    ],
    columns=['NAME', 'SEX', 'CITY', 'HEIGHT', 'WEIGHT', 'TARGET'],
)

In [40]:
# Display the training frame.
train

Unnamed: 0,NAME,SEX,CITY,HEIGHT,WEIGHT,TARGET
0,Sayan,,,190,90,1
1,Batima,F,Kostanay,162,54,0
2,Nadira,F,Astana,165,50,0
3,Zhanbolat,M,Aqtobe,179,70,1
4,Azamat,M,Aqtobe,179,75,1


In [30]:
# Display the test frame.
test

Unnamed: 0,NAME,SEX,CITY,HEIGHT,WEIGHT,TARGET
0,Madiyar,M,Kostanay,185.0,72.0,
1,Marat,M,Pavlodar,180.0,,
2,Dair,M,Karaganda,184.0,80.0,
3,Madina,F,,,52.0,
4,Perizat,F,Pavlodar,165.0,50.0,


In [36]:
# Build the processor from the toy frames defined in the cells above.
a = DataProcessing(train,test)
# a.fillna('Mean')
# a.get_data()
# The result of this call is discarded: only the last expression of a cell is displayed.
a.add_nan_index()
# Last expression of the cell, so its returned frame is what gets rendered below.
a.value_counts_by_target()
# NOTE(review): coding_by_column requires categorial_columns and numeric_columns
# arguments, so the commented-out call below would fail as written.
# a.coding_by_column()

Unnamed: 0,NAME_PART_TARGET=1,SEX_PART_TARGET=1,CITY_PART_TARGET=1,NAME_PART_TARGET=0,SEX_PART_TARGET=0,CITY_PART_TARGET=0
0,0.333333,0.333333,0.333333,0.0,0.0,0.0
1,0.0,0.0,0.0,0.5,1.0,0.5
2,0.0,0.0,0.0,0.5,1.0,0.5
3,0.333333,0.666667,0.666667,0.0,0.0,0.0
4,0.333333,0.666667,0.666667,0.0,0.0,0.0
0,0.0,0.666667,0.0,0.0,0.0,0.5
1,0.0,0.666667,0.0,0.0,0.0,0.0
2,0.0,0.666667,0.0,0.0,0.0,0.0
3,0.0,0.0,0.333333,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0


In [37]:
# Re-display the training frame after the processing calls; presumably a check
# that the input frame was left unmodified.
train

Unnamed: 0,NAME,SEX,CITY,HEIGHT,WEIGHT,TARGET
0,Sayan,,,190,90,1
1,Batima,F,Kostanay,162,54,0
2,Nadira,F,Astana,165,50,0
3,Zhanbolat,M,Aqtobe,179,70,1
4,Azamat,M,Aqtobe,179,75,1
