**using this there is no need to reload your packages every time when you will make any improvement to them**

In [185]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [186]:
autoreload 2

**import necessary libraries**

In [187]:
import numpy as np
import pandas as pd 
import sys
import warnings

In [188]:
from matplotlib import pyplot as plt

In [189]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

**import modelling helpers**

In [190]:
from sklearn.model_selection import train_test_split

**set pandas options**

In [7]:
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

**set warnings options**

In [8]:
warnings.filterwarnings('ignore')

#### to have ease in importing modules

In [9]:
sys.path.insert(0, '/Users/mjasiecz/PycharmProjects/new_offer_success_predictor/src')

**load data**

In [305]:
from data.data_manager import DataManager
from ml_preprocessing.categorical_encoders import LeaveOneOutEncoder

In [306]:
DM = DataManager()

In [307]:
df = DM.load_data()

In [308]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1289 entries, C7CBB5C5613449B to B8A3C4B5FDE74D2
Data columns (total 14 columns):
offer_class           1289 non-null object
accepted              1289 non-null object
gender                1289 non-null object
age                   877 non-null float64
phone_calls           1286 non-null float64
emails                1287 non-null float64
customer_code         1265 non-null object
salary                1281 non-null float64
offer_code            1287 non-null object
customer_type         1287 non-null object
number                1280 non-null object
offer_value           1277 non-null float64
estimated_expenses    1286 non-null float64
center                1286 non-null object
dtypes: float64(6), object(8)
memory usage: 151.1+ KB


In [309]:
DM.create_train_test()

Split is already done. Do not data snoop!


In [310]:
train_df, test_df = DM.load_train_test()

In [311]:
test_df.drop(columns=['accepted'], inplace=True)

#### dealing with missing values

how much missing values?

In [17]:
df.isnull().sum()

offer_class             0
accepted                0
gender                  0
age                   412
phone_calls             3
emails                  2
customer_code          24
salary                  8
offer_code              2
customer_type           2
number                  9
offer_value            12
estimated_expenses      3
center                  3
dtype: int64

percentage of missing values

In [18]:
print((df.isnull().sum()/df.shape[0]*100).round(2))

offer_class            0.00
accepted               0.00
gender                 0.00
age                   31.96
phone_calls            0.23
emails                 0.16
customer_code          1.86
salary                 0.62
offer_code             0.16
customer_type          0.16
number                 0.70
offer_value            0.93
estimated_expenses     0.23
center                 0.23
dtype: float64


# prepare something similar to automatize it on test set

#### emails and phone_calls columns - dealing with these columns / (without dividing process onto parts - missing, outliers etc)

#### emails

In [19]:
train_df['emails'].mean()

0.9271137026239067

In [20]:
round(train_df['emails'].mean())

1.0

In [21]:
train_df['cat_emails'] = train_df['emails']

In [22]:
train_df.loc[train_df['cat_emails'].isna(), 'cat_emails'] = round(train_df['emails'].mean())

In [23]:
train_df[train_df['emails']>4]['accepted'].value_counts()

no     11
yes     1
Name: accepted, dtype: int64

In [24]:
train_df.loc[train_df['cat_emails'] > 4, 'cat_emails'] = 5

In [25]:
train_df['cat_emails'] = train_df['cat_emails'].astype(object)

#### phone_calls

In [26]:
train_df['phone_calls'].mean()

0.5150631681243926

In [27]:
round(train_df['phone_calls'].mean())

1.0

In [28]:
train_df['cat_phone_calls'] = train_df['phone_calls']

In [29]:
train_df.loc[train_df['cat_phone_calls'].isna(), 'cat_phone_calls'] = round(train_df['phone_calls'].mean())

In [30]:
train_df[train_df['phone_calls']>3]['accepted'].value_counts()

no     30
yes     2
Name: accepted, dtype: int64

In [31]:
train_df.loc[train_df['cat_phone_calls'] > 3, 'cat_phone_calls'] = 4

In [32]:
train_df['cat_phone_calls'] = train_df['cat_phone_calls'].astype(object)

#### check after every iteration :)

In [33]:
train_df.isnull().sum()

accepted                0
offer_class             0
gender                  0
age                   343
phone_calls             2
emails                  2
customer_code          16
salary                  7
offer_code              2
customer_type           2
number                  6
offer_value            10
estimated_expenses      3
center                  2
cat_emails              0
cat_phone_calls         0
dtype: int64

In [34]:
cc = train_df[train_df['salary'].isnull()]['customer_code'].tolist()

In [35]:
# here i need to show that every person with the same salary has the same customer_code

In [36]:
train_df[train_df['customer_code'].isin(cc)]

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,cat_emails,cat_phone_calls
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
29C5493CE7EF447,yes,Medium,female,22.0,1.0,1.0,113776,6660.0,D78,S,7D77D,61.329828,2754.0,A,1,1
BFAEC8F911F841B,no,Premium,male,32.0,1.0,0.0,3101278,,9C0,S,52591,137.395918,5384.0,B,0,1
CA249ABE7D4E43C,yes,Medium,female,,1.0,0.0,PC 17611,13365.0,57E,S,E4391,101.924325,1534.0,B,0,1
2D7B8459CBD2414,yes,Medium,male,50.0,2.0,0.0,PC 17611,,2F3,S,8997E,90.384374,5152.0,B,0,2
A14D00E1FEBF433,no,Premium,male,60.5,0.0,0.0,3701,,05A,S,00D61,80.832678,7521.0,A,0,0
FFFDD0C3B34644E,no,Medium,male,42.0,1.0,1.0,113789,5200.0,CE9,S,4272B,66.355074,5090.0,A,1,1
487662632F6B484,yes,Medium,female,35.0,1.0,0.0,113789,,B04,S,4DBC4,60.800476,7570.0,A,0,1
EF319AAD8691472,no,Medium,male,29.0,1.0,1.0,113776,,88A,S,7DFE9,85.21346,4003.0,A,1,1
CCF5A463D3BC472,yes,Premium,female,,0.0,0.0,14313,,6F2,Q,D21EF,260.922393,2631.0,A,0,0
63193E3CA0F840B,yes,Premium,female,33.0,3.0,0.0,3101278,1585.0,46D,S,A3D8B,76.753846,2650.0,B,0,3


In [37]:
train_df[train_df['salary'] == 0]

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,cat_emails,cat_phone_calls
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
C9DE5A2E7BC24AB,no,Medium,male,40.0,0.0,1.0,112059,0.0,0.00E+00,S,29B9B,176.279975,5579.0,A,1,0
8816EB571C6B435,no,High,male,,0.0,0.0,239853,0.0,302,S,A5285,146.192831,2825.0,A,0,0
B9A7CAE408AC42E,no,High,male,,0.0,1.0,239855,0.0,F82,S,C1685,77.400738,3649.0,B,1,0
3515F26EEEFA40C,no,Medium,male,,0.0,0.0,,0.0,D04,S,91D26,111.165165,2077.0,B,0,0
59273C9523B94F0,no,Premium,male,19.0,0.0,0.0,LINE,0.0,EB6,S,FA978,178.460182,7885.0,A,0,0
4669F129273C41C,no,High,male,,0.0,0.0,239853,0.0,B02,S,58287,119.168887,7642.0,B,0,0
27AE9D32887D47D,no,High,male,,0.0,0.0,239853,0.0,DAC,S,4CA98,159.785647,6931.0,A,0,0
36EF30F44B644F4,no,Premium,male,49.0,0.0,1.0,LINE,0.0,624,S,688F8,150.333741,3976.0,A,1,0
27C908A4720E4AD,no,Medium,male,39.0,0.0,0.0,112050,0.0,9A1,S,90915,289.613239,2146.0,A,0,0
3FA638BEB4CD498,no,Medium,male,,0.0,0.0,112052,0.0,6.00E+09,S,26A16,86.234876,3164.0,B,0,0


In [38]:
cc1 = train_df[train_df['salary'] == 0]['customer_code'].tolist()

In [39]:
train_df[train_df['customer_code'].isin(cc1)]

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,cat_emails,cat_phone_calls
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
4CF2BD7C25314F0,no,Premium,male,38.5,0.0,1.0,,725.0,1.00E+00,S,120AA,98.242553,5182.0,A,1,0
C9DE5A2E7BC24AB,no,Medium,male,40.0,0.0,1.0,112059,0.0,0.00E+00,S,29B9B,176.279975,5579.0,A,1,0
8816EB571C6B435,no,High,male,,0.0,0.0,239853,0.0,302,S,A5285,146.192831,2825.0,A,0,0
B9A7CAE408AC42E,no,High,male,,0.0,1.0,239855,0.0,F82,S,C1685,77.400738,3649.0,B,1,0
6DF656E9901B476,no,Premium,male,21.0,0.0,0.0,,725.0,BE2,S,69D51,199.38107,2755.0,B,0,0
3515F26EEEFA40C,no,Medium,male,,0.0,0.0,,0.0,D04,S,91D26,111.165165,2077.0,B,0,0
E915D2F2863B474,no,Premium,male,20.0,0.0,1.0,,705.0,85,S,A167C,102.076908,3416.0,B,1,0
074B66A3B9F2426,yes,High,female,45.0,0.0,1.0,,1350.0,8.00E+06,S,DF9E0,155.645549,3849.0,B,1,0
59273C9523B94F0,no,Premium,male,19.0,0.0,0.0,LINE,0.0,EB6,S,FA978,178.460182,7885.0,A,0,0
4669F129273C41C,no,High,male,,0.0,0.0,239853,0.0,B02,S,58287,119.168887,7642.0,B,0,0


In [40]:
test_df.isnull().sum()

offer_class            0
gender                 0
age                   69
phone_calls            1
emails                 0
customer_code          8
salary                 1
offer_code             0
customer_type          0
number                 3
offer_value            2
estimated_expenses     0
center                 1
dtype: int64

In [41]:
train_df['salary_modif'] = train_df['salary']
train_df.loc[train_df['salary_modif'].isna(), 'salary_modif'] = 0.1

In [42]:
ccctsalary = train_df[['customer_type', 'customer_code', 'salary_modif']].groupby(['customer_code' , 'customer_type', 'salary_modif']).size().reset_index().rename(columns={0:'size'})# .duplicated(subset=['customer_code', 'customer_type'])

In [43]:
ccctsalary.shape

(757, 4)

In [44]:
#ccctsalary

In [45]:
slownik = ccctsalary[ccctsalary.duplicated(subset=['customer_code', 'customer_type'], keep=False)][['customer_code', 'salary_modif']]

In [46]:
slownik = slownik[slownik['salary_modif'] != 0.1]

In [47]:
slownik = slownik.set_index('customer_code')['salary_modif']

In [48]:
slownik = slownik.to_dict()

In [49]:
slownik

{'113776': 6660.0,
 '113789': 5200.0,
 '3101278': 1585.0,
 '7534': 984.58,
 'PC 17611': 13365.0}

In [50]:
train_df['salary'].isna().sum()

7

In [51]:
train_df['salary'].fillna(train_df['customer_code'].map(slownik)).isna().sum()

3

In [52]:
train_df[train_df['salary'].isna()].groupby('customer_type').size()

customer_type
C    1
Q    1
S    5
dtype: int64

In [53]:
#train_df[['customer_code', 'salary']].sort_values('salary')

In [54]:
#train_df['customer_code'].fillna('cc_missing')

In [55]:
#train_df['number'].fillna('number_missing')

In [56]:
#train_df['']

In [57]:
train_df.head()

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,cat_emails,cat_phone_calls,salary_modif
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
8550AB469CB2445,no,Premium,female,,1.0,2.0,2678,1524.58,8CA,C,0F2A2,303.973257,7207.0,B,2,1,1524.58
07355EE27DD1493,no,High,male,32.0,0.0,0.0,244360,1300.0,9DD,S,5F15A,64.075055,3675.0,A,0,0,1300.0
034E73A251554F0,yes,Premium,female,,1.0,1.0,370365,1550.0,517,Q,AC578,108.58175,4750.0,B,1,1,1550.0
0AF961B4AC7A439,no,Premium,male,,0.0,1.0,2652,722.92,9CB,C,8A8CC,155.495957,1401.0,A,1,0,722.92
8535BBCA690A4AE,yes,Premium,male,,0.0,1.0,1601,5649.58,6FF,S,42EFB,121.371033,2106.0,B,1,0,5649.58


In [68]:
train_df[['customer_type', 'offer_value']].groupby('customer_type').mean().reset_index()

Unnamed: 0,customer_type,offer_value
0,C,129.787996
1,Q,117.489225
2,S,129.230987


In [None]:
train_df['cc_startswith_a'] = train_df['customer_code'].str.startswith('A')
train_df['cc_startswith_p'] = train_df['customer_code'].str.startswith('P')
train_df['cc_startswith_c'] = train_df['customer_code'].str.startswith('C')
train_df['cc_len_5'] = train_df['customer_code'].str.len() == 5

In [438]:
import pandas as pd
import numpy as np
import itertools

from collections import defaultdict
from fancyimpute import KNN

from sklearn.preprocessing import StandardScaler

class DataProcessor:
    """

    """
    def __init__(self, train_df: pd.DataFrame) -> None:
        self.df = train_df.copy(deep=True)
    
    def missing_information(self, percentage: bool = False) -> pd.Series:
        df_shape = self.df.shape
        print(f'df.shape: {df_shape}')
        is_null_sum = self.df.isnull().sum()
        missing_information = is_null_sum if not percentage else (is_null_sum/df_shape[0]*100).round(2)
        return missing_information
    
    @staticmethod
    def _group_columns():
        groups = defaultdict()
        groups['age'] = ['age']
        groups['emails_and_phone_calls'] = ['emails', 'phone_calls']
        groups['low_cardinality'] = ['offer_class', 'gender', 'customer_type', 'center']
        groups['high_cardinality'] = ['customer_code', 'offer_code', 'number']
        groups['numerical'] = ['salary', 'offer_value', 'estimated_expenses']
        groups['target'] = ['accepted']
        return groups
    
    @staticmethod
    def _missing_salary(df: pd.Series):
        df = df.copy(deep=True)
        df['salary_temp'] = df['salary'].fillna(0.1)
        mask = ['customer_type', 'customer_code', 'salary_temp']
        temp = df[mask].groupby(mask).size().reset_index().rename(columns={0: 'size'})
        temp = temp[temp.duplicated(subset=['customer_code', 'customer_type'], keep=False)][['customer_code', 'salary_temp']]
        temp = temp[temp['salary_temp'] != 0.1]
        temp = temp.set_index('customer_code')['salary_temp']
        salary_map = temp.to_dict()
        df['salary'] = df['salary'].fillna(df['customer_code'].map(salary_map))
        temp = df[['customer_type', 'salary']].groupby('customer_type').median()
        temp = temp['salary']
        salary_map = temp.to_dict()
        df['salary'] = df['salary'].fillna(df['customer_type'].map(salary_map))
        # to be able to np.log('salary')
        df.loc[df['salary'] == 0, 'salary'] = 0.0001
        return df['salary']

    def _prepare_to_knn(self, df: pd.DataFrame):
        df = df.copy(deep=True)
        temp = pd.get_dummies(df['gender'])
        groups = self._group_columns()
        if 'accepted' in df.columns:
            temp = pd.concat([pd.get_dummies(df['accepted']), temp], axis=1)
        for col in itertools.chain(groups['emails_and_phone_calls'], groups['low_cardinality']):
            temp = pd.concat([pd.get_dummies(df[col]), temp], axis=1)
        knn_to_complete = pd.concat([temp, df[groups['numerical']+groups['age']]], axis=1)
        return knn_to_complete
                                 
    def deal_with_missing_values(self, n_neighbors: int = 5):
        df = self.df.copy(deep=True)
        groups = self._group_columns()
        for col in groups['high_cardinality']:
            df[col] = df[col].fillna('missing')
        
        def _prt_process_emails_and_phone_calls(df):
            df.loc[df['emails'] > 4, 'emails'] = 5
            df.loc[df['phone_calls'] > 3, 'phone_calls'] = 4
            return df
        
        for col in groups['emails_and_phone_calls']:
            df[col] = df[col].fillna(round(df[col].mean()))
        
        df = _prt_process_emails_and_phone_calls(df)
            
        for col in groups['low_cardinality']:
            df[col] = df[col].fillna(method='ffill')
        df['salary'] = self._missing_salary(df)
        temp = df[groups['emails_and_phone_calls']+groups['age']+groups['low_cardinality']+groups['numerical']]
        knn_unfilled_table = self._prepare_to_knn(temp)
        knn_filled = KNN(k=n_neighbors, print_interval=1032).fit_transform(knn_unfilled_table.to_numpy())
        knn_imputed_cols = ['age_knn', 'estimated_expenses_knn', 'offer_value_knn']
        for col in knn_imputed_cols:
            df[col] = knn_filled[:,-knn_imputed_cols.index(col)-1]
        df = df.drop(columns=['offer_value', 'estimated_expenses'])
        return df
    
    @staticmethod
    def _process_age(age_df: pd.DataFrame) -> pd.DataFrame:
        age_df = age_df.copy(deep=True)
        age_df['nan_age'] = age_df['age'].isna()
        age_df['not_nan_age'] = age_df['age'].notna()
        return age_df[['nan_age', 'not_nan_age']]
        
    @staticmethod
    def _process_target(target_df: pd.DataFrame) -> pd.DataFrame:
        target = target_df['accepted'] == 'yes'
        return target

    
    @staticmethod
    def _process_high_cardinality_categorical_cols(high_cardinality_df: pd.DataFrame) -> pd.DataFrame:
        pass
    
    @staticmethod
    def _process_numerical_cols(numerical_df: pd.DataFrame) -> pd.DataFrame:
        numeric_cols = [col for col in numerical_df.columns if numerical_df[col].dtype != object]
        numeric_cols.remove('age')
        temp = numerical_df[numeric_cols]
        scaler = StandardScaler()
        for col in numeric_cols:
            temp['log_'+col] = np.log(temp[col])
        log_cols = [col for col in temp.columns if col not in numerical_df.columns]
        log_subset = temp[log_cols]
        temp_subset = temp[numeric_cols]
        scaled = scaler.fit_transform(temp_subset)
        scaled = pd.DataFrame(scaled, columns='scaled_'+temp_subset.columns, index=temp.index)
        processed = pd.concat([log_subset, scaled], axis=1)
        return processed
        
        
        
        
    def perform_initial_features_engineering(self):
        groups = self._group_columns()
        df = self.deal_with_missing_values().copy(deep=True)
        for col in groups['emails_and_phone_calls']:
            df[col] = df[col].astype(object)        
        target = self._process_target(df)
        age = self._process_age(df)
        numerical = self._process_numerical_cols(df)
        df = pd.concat([target, df], axis=1)
        df = pd.concat([age, df], axis=1)
        df = pd.concat([numerical, df], axis=1)
        return df

In [579]:
import itertools
from collections import defaultdict

import numpy as np
import pandas as pd
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler


class DataProcessor:
    """
    """

    def __init__(self, train_df: pd.DataFrame) -> None:
        """

        :param train_df:
        """
        self.df = train_df.copy(deep=True)

    def missing_information(self, percentage: bool = False) -> pd.Series:
        """

        :param percentage:
        :return:
        """
        df_shape = self.df.shape
        print(f'df.shape: {df_shape}')
        is_null_sum = self.df.isnull().sum()
        missing_information = (is_null_sum
                               if not percentage
                               else (is_null_sum / df_shape[0] * 100).round(2))
        return missing_information

    @staticmethod
    def _group_columns():
        """

        :return:
        """
        groups = defaultdict()
        groups['age'] = ['age']
        groups['emails_and_phone_calls'] = ['emails', 'phone_calls']
        groups['low_cardinality'] = ['offer_class', 'gender', 'customer_type', 'center']
        groups['high_cardinality'] = ['customer_code', 'offer_code', 'number']
        groups['numerical'] = ['salary', 'offer_value', 'estimated_expenses']
        groups['target'] = ['accepted']
        return groups

    @staticmethod
    def _missing_salary(df: pd.DataFrame):
        """

        :param df:
        :return:
        """
        df = df.copy(deep=True)
        df['salary_temp'] = df['salary'].fillna(0.1)
        mask = ['customer_type', 'customer_code', 'salary_temp']
        temp = (df[mask]
                .groupby(mask)
                .size()
                .reset_index()
                .rename(columns={0: 'size'}))
        temp = (temp[temp
            .duplicated(subset=['customer_code', 'customer_type'],
                        keep=False)][['customer_code', 'salary_temp']])
        temp = temp[temp['salary_temp'] != 0.1]
        temp = temp.set_index('customer_code')['salary_temp']
        salary_map = temp.to_dict()
        df['salary'] = (df['salary']
                        .fillna(df['customer_code']
                                .map(salary_map)))
        temp = (df[['customer_type', 'salary']]
                .groupby('customer_type')
                .median())
        temp = temp['salary']
        salary_map = temp.to_dict()
        df['salary'] = (df['salary']
                        .fillna(df['customer_type']
                                .map(salary_map)))
        # to be able to np.log('salary')
        df.loc[df['salary'] == 0, 'salary'] = 0.0001
        return df['salary']

    def _prepare_to_knn(self, df: pd.DataFrame):
        """

        :param df:
        :return:
        """
        df = df.copy(deep=True)
        temp = pd.get_dummies(df['gender'])
        groups = self._group_columns()
        if 'accepted' in df.columns:
            temp = pd.concat([pd.get_dummies(df['accepted']), temp], axis=1)
        for col in itertools.chain(groups['emails_and_phone_calls'], groups['low_cardinality']):
            temp = pd.concat([pd.get_dummies(df[col]), temp], axis=1)
        knn_to_complete = pd.concat([temp, df[groups['numerical'] + groups['age']]], axis=1)
        return knn_to_complete

    def deal_with_missing_values(self, n_neighbors: int = 5):
        """

        :param n_neighbors:
        :return:
        """
        df = self.df.copy(deep=True)
        groups = self._group_columns()
        for col in groups['high_cardinality']:
            df[col] = df[col].fillna('missing')

        def _prt_process_emails_and_phone_calls(emails_phone_calls_df: pd.DataFrame):
            """

            :param emails_phone_calls_df:
            :return:
            """
            emails_phone_calls_df.loc[emails_phone_calls_df['emails'] > 4, 'emails'] = 5
            emails_phone_calls_df.loc[emails_phone_calls_df['phone_calls'] > 3, 'phone_calls'] = 4
            return emails_phone_calls_df

        for col in groups['emails_and_phone_calls']:
            df[col] = df[col].fillna(round(df[col].mean()))

        df = _prt_process_emails_and_phone_calls(df)

        for col in groups['low_cardinality']:
            df[col] = df[col].fillna(method='ffill')
        df['salary'] = self._missing_salary(df)
        temp = (df[groups['emails_and_phone_calls']
                   + groups['age']
                   + groups['low_cardinality']
                   + groups['numerical']])
        knn_unfilled_table = self._prepare_to_knn(temp)
        knn_filled = (KNN(k=n_neighbors,
                         print_interval=1032)
                      .fit_transform(knn_unfilled_table
                                     .to_numpy()))
        knn_imputed_cols = ['age_knn', 'estimated_expenses_knn', 'offer_value_knn']
        for col in knn_imputed_cols:
            df[col] = knn_filled[:, -knn_imputed_cols.index(col) - 1]
        df = df.drop(columns=['offer_value', 'estimated_expenses'])
        return df

    @staticmethod
    def _process_age(age_df: pd.DataFrame) -> pd.DataFrame:
        """

        :param age_df:
        :return:
        """
        age_df = age_df.copy(deep=True)
        age_df['nan_age'] = age_df['age'].isna()
        age_df['not_nan_age'] = age_df['age'].notna()
        return age_df[['nan_age', 'not_nan_age']]

    @staticmethod
    def _process_target(target_df: pd.DataFrame) -> pd.DataFrame:
        """

        :param target_df:
        :return:
        """
        target_df = target_df.copy(deep=True)
        target_df['target'] = target_df['accepted'] == 'yes'
        return target_df[['target']]

    @staticmethod
    def _process_high_cardinality_categorical_cols(high_cardinal_df: pd.DataFrame) -> pd.DataFrame:
        """

        :param high_cardinal_df:
        :return:
        """
        high_cardinal_df = high_cardinal_df.copy(deep=True)
        high_cardinal_df['cc_len'] = high_cardinal_df['customer_code'].str.len()
        high_cardinal_df.loc[high_cardinal_df['cc_len'].isin([5,8]), 'cc_len'] = '58'
        high_cardinal_df.loc[~high_cardinal_df['cc_len'].isin(['58']), 'cc_len'] = 'ELSE'
        high_cardinal_df['cc_startswith'] = high_cardinal_df['customer_code']
        a_p_c = ['A', 'P', 'C']
        for letter in a_p_c:
            high_cardinal_df.loc[high_cardinal_df['customer_code'].str.startswith(letter), 'cc_startswith'] = letter
        high_cardinal_df.loc[~high_cardinal_df['customer_code'].str.startswith(tuple(a_p_c)), 'cc_startswith'] = 'ELSE'
        return high_cardinal_df[['cc_len', 'cc_startswith']]
        

    @staticmethod
    def _process_numerical_cols(numerical_df: pd.DataFrame) -> pd.DataFrame:
        """

        :param numerical_df:
        :return:
        """
        numeric_cols = [col for col in numerical_df.columns if numerical_df[col].dtype != object]
        numeric_cols.remove('age')
        temp = numerical_df[numeric_cols]
        scaler = StandardScaler()
        for col in numeric_cols:
            temp['log_' + col] = np.log(temp[col])
        log_cols = [col for col in temp.columns if col not in numerical_df.columns]
        log_subset = temp[log_cols]
        temp_subset = temp[numeric_cols]
        scaled = scaler.fit_transform(temp_subset)
        scaled = pd.DataFrame(scaled, columns='scaled_' + temp_subset.columns, index=temp.index)
        processed = pd.concat([log_subset, scaled], axis=1)
        return processed

    def perform_initial_features_engineering(self):
        """

        :return:
        """
        groups = self._group_columns()
        df = self.deal_with_missing_values().copy(deep=True)
        for col in groups['emails_and_phone_calls']:
            df[col] = df[col].astype(object)
        target = self._process_target(df)
        age = self._process_age(df)
        numerical = self._process_numerical_cols(df)
        high_cardinal = self._process_high_cardinality_categorical_cols(df)
        df = pd.concat([target, df], axis=1)
        df = pd.concat([age, df], axis=1)
        df = pd.concat([numerical, df], axis=1)
        df = pd.concat([high_cardinal, df], axis=1)
        df.drop(columns=['accepted', 'customer_code', 'number', 'offer_code'], inplace=True)
        
        if False:
            df.to_csv('/data/processed/processed_train.csv')
            
        
        return df

class TestDataProcessor(DataProcessor):
    
class TrainDataProcessor(DataProcessor):
    

IndentationError: expected an indented block (<ipython-input-579-55fbe9ef5186>, line 235)

In [575]:
DP = DataProcessor(train_df)

In [576]:
a = DP.perform_initial_features_engineering()

Imputing row 1/1031 with 1 missing, elapsed time: 0.230


In [577]:
a.shape

(1031, 24)

In [578]:
a.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1031 entries, 8550AB469CB2445 to 358EEC4160A4478
Data columns (total 24 columns):
cc_len                           1031 non-null object
cc_startswith                    1031 non-null object
log_salary                       1031 non-null float64
log_age_knn                      1031 non-null float64
log_estimated_expenses_knn       1031 non-null float64
log_offer_value_knn              1031 non-null float64
scaled_salary                    1031 non-null float64
scaled_age_knn                   1031 non-null float64
scaled_estimated_expenses_knn    1031 non-null float64
scaled_offer_value_knn           1031 non-null float64
nan_age                          1031 non-null bool
not_nan_age                      1031 non-null bool
target                           1031 non-null bool
offer_class                      1031 non-null object
gender                           1031 non-null object
age                              688 non-null float64
phone_c

In [628]:
train_df[train_df['salary'].isna()]

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
BFAEC8F911F841B,no,Premium,male,32.0,1.0,0.0,3101278,,9C0,S,52591,137.395918,5384.0,B
2D7B8459CBD2414,yes,Medium,male,50.0,2.0,0.0,PC 17611,,2F3,S,8997E,90.384374,5152.0,B
A14D00E1FEBF433,no,Premium,male,60.5,0.0,0.0,3701,,05A,S,00D61,80.832678,7521.0,A
487662632F6B484,yes,Medium,female,35.0,1.0,0.0,113789,,B04,S,4DBC4,60.800476,7570.0,A
EF319AAD8691472,no,Medium,male,29.0,1.0,1.0,113776,,88A,S,7DFE9,85.21346,4003.0,A
CCF5A463D3BC472,yes,Premium,female,,0.0,0.0,14313,,6F2,Q,D21EF,260.922393,2631.0,A
DAF778DD00F8407,yes,Medium,male,32.0,0.0,1.0,13214,,F22,C,55CC4,54.717256,6754.0,A


In [632]:
a.head()

Unnamed: 0_level_0,cc_len,cc_startswith,log_salary,log_age_knn,log_estimated_expenses_knn,log_offer_value_knn,scaled_salary,scaled_age_knn,scaled_estimated_expenses_knn,scaled_offer_value_knn,nan_age,not_nan_age,target,offer_class,gender,age,phone_calls,emails,salary,customer_type,center,age_knn,estimated_expenses_knn,offer_value_knn
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
8550AB469CB2445,ELSE,ELSE,7.329474,3.401259,8.882808,5.71694,-0.34553,-0.297159,1.36225,3.035567,True,False,False,Premium,female,,1,2,1524.58,C,B,30.001848,7207.0,303.973257
07355EE27DD1493,ELSE,ELSE,7.17012,3.465736,8.209308,4.160055,-0.389281,-0.111655,-0.492006,-1.109564,False,True,False,High,male,32.0,0,0,1300.0,S,A,32.0,3675.0,64.075055
034E73A251554F0,ELSE,ELSE,7.34601,3.177471,8.4659,4.687503,-0.340578,-0.855654,0.072355,-0.340546,True,False,True,Premium,female,,1,1,1550.0,Q,B,23.986022,4750.0,108.58175
0AF961B4AC7A439,ELSE,ELSE,6.583299,3.323884,7.244942,5.04662,-0.501704,-0.504546,-1.685828,0.470071,True,False,False,Premium,male,,0,1,722.92,C,A,27.767979,1401.0,155.495957
8535BBCA690A4AE,ELSE,ELSE,8.639336,3.405772,7.652546,4.798852,0.458072,-0.284561,-1.315712,-0.119564,True,False,True,Premium,male,,0,1,5649.58,S,B,30.137541,2106.0,121.371033


In [638]:
train_df[train_df['customer_code'] == 'PC 17611']

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
CA249ABE7D4E43C,yes,Medium,female,,1.0,0.0,PC 17611,13365.0,57E,S,E4391,101.924325,1534.0,B
2D7B8459CBD2414,yes,Medium,male,50.0,2.0,0.0,PC 17611,,2F3,S,8997E,90.384374,5152.0,B


In [636]:
a[a.index == '2D7B8459CBD2414']['salary']

name
2D7B8459CBD2414    13365.0
Name: salary, dtype: float64

In [645]:
import itertools
from collections import defaultdict

import numpy as np
import pandas as pd
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler


class DataProcessor:
    """
    processes data, is well adapted for processing training DataFrame,
    - deals with missing data,
    - converts column types for further work (CategoricalEncoding of categorical columns),
    - performs initial features engineering
    """

    def __init__(self, train_df: pd.DataFrame) -> None:
        """
        :param train_df: training DataFrame
        """
        self.df = train_df.copy(deep=True)

    def missing_information(self, percentage: bool = False) -> pd.Series:
        """
        shows us information about missing data

        :param percentage: set to True, if you want to have percentage information on the output
        :return: Series with information on missing rows in each column
        """
        df_shape = self.df.shape
        print(f'df.shape: {df_shape}')
        is_null_sum = self.df.isnull().sum()
        missing_information = (is_null_sum
                               if not percentage
                               else (is_null_sum / df_shape[0] * 100).round(2))
        return missing_information

    @staticmethod
    def _group_columns() -> defaultdict:
        """
        returns dictionary with logically grouped columns
        """
        groups = defaultdict()
        groups['age'] = ['age']
        groups['emails_and_phone_calls'] = ['emails', 'phone_calls']
        groups['low_cardinality'] = ['offer_class', 'gender', 'customer_type', 'center']
        groups['high_cardinality'] = ['customer_code', 'offer_code', 'number']
        groups['numerical'] = ['salary', 'offer_value', 'estimated_expenses']
        groups['target'] = ['accepted']
        return groups

    @staticmethod
    def _missing_salary(df: pd.DataFrame) -> pd.Series:
        """
        fills missing data in salary column:
        for people with the same customer_code fills the same data as in other row,
        for other people
        """
        df = df.copy(deep=True)
        df['salary_temp'] = df['salary'].fillna(0.1)
        mask = ['customer_type', 'customer_code', 'salary_temp']
        temp = (df[mask]
                .groupby(mask)
                .size()
                .reset_index()
                .rename(columns={0: 'size'}))
        temp = (temp[temp
                .duplicated(subset=['customer_code', 'customer_type'],
                            keep=False)][['customer_code', 'salary_temp']])
        temp = temp[temp['salary_temp'] != 0.1]
        temp = temp.set_index('customer_code')['salary_temp']
        salary_map = temp.to_dict()
        df['salary'] = (df['salary']
                        .fillna(df['customer_code']
                                .map(salary_map)))
        temp = (df[['customer_type', 'salary']]
                .groupby('customer_type')
                .median())
        temp = temp['salary']
        salary_map = temp.to_dict()
        df['salary'] = (df['salary']
                        .fillna(df['customer_type']
                                .map(salary_map)))
        # to be able to np.log('salary')
        df.loc[df['salary'] == 0, 'salary'] = 0.0001
        return df['salary']

    def _prepare_to_knn(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        prepares data for knn imputing (of missing data)
        for training data uses also labels to give more information then to test set
        """
        df = df.copy(deep=True)
        temp = pd.get_dummies(df['gender'])
        groups = self._group_columns()
        if 'accepted' in df.columns:
            temp = pd.concat([pd.get_dummies(df['accepted']), temp], axis=1)
        for col in itertools.chain(groups['emails_and_phone_calls'], groups['low_cardinality']):
            temp = pd.concat([pd.get_dummies(df[col]), temp], axis=1)
        knn_to_complete = pd.concat([temp, df[groups['numerical'] + groups['age']]], axis=1)
        return knn_to_complete

    def deal_with_missing_values(self, n_neighbors: int = 5) -> pd.DataFrame:
        """
        :param n_neighbors: number of neighbours for knn algorithm
        :return: returns DataFrame with filled nans (also, leaves unchanged 'age' column ->
        to be used further for possible age binning)
        """
        df = self.df.copy(deep=True)
        groups = self._group_columns()
        for col in groups['high_cardinality']:
            df[col] = df[col].fillna('missing')

        def _prt_process_emails_and_phone_calls(
                emails_phone_calls_df: pd.DataFrame) -> pd.DataFrame:
            """
            initially processes emails and phone_calls columns:
            models some outliers with almost full no responses, to have constant value
            """
            emails_phone_calls_df.loc[emails_phone_calls_df['emails'] > 4, 'emails'] = 5
            emails_phone_calls_df.loc[emails_phone_calls_df['phone_calls'] > 3, 'phone_calls'] = 4
            return emails_phone_calls_df

        for col in groups['emails_and_phone_calls']:
            df[col] = df[col].fillna(round(df[col].mean()))

        df = _prt_process_emails_and_phone_calls(df)

        for col in groups['low_cardinality']:
            df[col] = df[col].fillna(method='ffill')
        df['salary'] = self._missing_salary(df)
        temp = (df[groups['emails_and_phone_calls']
                   + groups['age']
                   + groups['low_cardinality']
                   + groups['numerical']])
        knn_unfilled_table = self._prepare_to_knn(temp)
        knn_filled = (KNN(k=n_neighbors,
                          print_interval=1032)
                      .fit_transform(knn_unfilled_table
                                     .to_numpy()))
        knn_imputed_cols = ['age_knn', 'estimated_expenses_knn', 'offer_value_knn']
        for col in knn_imputed_cols:
            df[col] = knn_filled[:, -knn_imputed_cols.index(col) - 1]
        df = df.drop(columns=['offer_value', 'estimated_expenses'])
        return df

    @staticmethod
    def _process_age(age_df: pd.DataFrame) -> pd.DataFrame:
        """
        Divides age for 2 bins: where age is nan and opposite case
        (we consider situation, when it's possible that client does not gave us their age because
        he wasn't truly interested in cooperation with us)
        """
        age_df = age_df.copy(deep=True)
        age_df['nan_age'] = age_df['age'].isna()
        age_df['not_nan_age'] = age_df['age'].notna()
        return age_df[['nan_age', 'not_nan_age']]

    @staticmethod
    def _process_target(target_df: pd.DataFrame) -> pd.DataFrame:
        """
        Maps target:
        1 for accepted
        0 for not accepted
        (for ML algorithms purposes)
        """
        target_df = target_df.copy(deep=True)
        target_df['target'] = target_df['accepted'] == 'yes'
        return target_df[['target']]

    @staticmethod
    def _process_high_cardinality_categorical_cols(high_cardinal_df: pd.DataFrame) -> pd.DataFrame:
        """
        creates features for high cardinality categorical columns: (those which won't bring any
        additional value where encoded)
        at this moment we consider only customer_code column as profitable
        """
        high_cardinal_df = high_cardinal_df.copy(deep=True)
        high_cardinal_df['cc_len'] = high_cardinal_df['customer_code'].str.len()
        high_cardinal_df.loc[high_cardinal_df['cc_len'].isin([5, 8]), 'cc_len'] = '58'
        high_cardinal_df.loc[~high_cardinal_df['cc_len'].isin(['58']), 'cc_len'] = 'ELSE'
        high_cardinal_df['cc_startswith'] = high_cardinal_df['customer_code']
        a_p_c = ['A', 'P', 'C']
        for letter in a_p_c:
            (high_cardinal_df
                .loc[high_cardinal_df['customer_code']
                                      .str
                                      .startswith(letter), 'cc_startswith']) = letter
        (high_cardinal_df
            .loc[~high_cardinal_df['customer_code']
                                   .str
                                   .startswith(tuple(a_p_c)), 'cc_startswith']) = 'ELSE'
        return high_cardinal_df[['cc_len', 'cc_startswith']]

    @staticmethod
    def _process_numerical_cols(numerical_df: pd.DataFrame) -> pd.DataFrame:
        """
        creates features:
        - standard scales numerical columns,
        - log scales numerical columns,
        to consider three cases when modelling:
        only standard scaled, only log scaled, and mixed using variances
        """
        numeric_cols = [col for col in numerical_df.columns if numerical_df[col].dtype != object]
        numeric_cols.remove('age')
        temp = numerical_df[numeric_cols]
        scaler = StandardScaler()
        for col in numeric_cols:
            temp['log_' + col] = np.log(temp[col])
        log_cols = [col for col in temp.columns if col not in numerical_df.columns]
        log_subset = temp[log_cols]
        temp_subset = temp[numeric_cols]
        scaled = scaler.fit_transform(temp_subset)
        scaled = pd.DataFrame(scaled,
                              columns='scaled_' + temp_subset.columns,
                              index=temp.index)
        processed = pd.concat([log_subset, scaled], axis=1)
        return processed

    def perform_initial_features_engineering(self):
        """
        performs initial feature engineering (without encoding -> will be done as individual part
        due to some maintenance issues (how to cross validate target encoding?)
        """
        groups = self._group_columns()
        df = self.deal_with_missing_values().copy(deep=True)
        for col in groups['emails_and_phone_calls']:
            df[col] = df[col].astype(object)
        age = self._process_age(df)
        numerical = self._process_numerical_cols(df)
        high_cardinal = self._process_high_cardinality_categorical_cols(df)
        columns_to_drop = ['customer_code', 'number', 'offer_code']
        if 'accepted' in df.columns:
            columns_to_drop.append('accepted')
            target = self._process_target(df)
            df = pd.concat([target, df], axis=1)
        df = pd.concat([age, df], axis=1)
        df = pd.concat([numerical, df], axis=1)
        df = pd.concat([high_cardinal, df], axis=1)
        df.drop(columns=columns_to_drop, inplace=True)

        return df


class TestDataProcessor(DataProcessor):
    """
    DataProcessor adapted to test set needs
    """

    def __init__(self, not_processed_train_df, processed_train_df, test_df, sneaky_peaky=True):
        """
        :param not_processed_train_df: self explanatory
        :param processed_train_df: self explanatory
        :param test_df: self explanatory
        :param sneaky_peaky: set to True:
            uses some knn columns as 'original' to bring a little 'overfitting' to test set.
            As we have only lot of missing values in age, tries to sneak some correlation between
            age and responses
        """
        DataProcessor.__init__(self, not_processed_train_df)

        if sneaky_peaky:
            self.df['age'] = processed_train_df['age_knn']
            self.df['estimated_expenses'] = processed_train_df['estimated_expenses_knn']
            self.df['offer_value'] = processed_train_df['offer_value_knn']
        self.df = (pd
                   .concat([self
                           .df
                           .drop(columns=['salary', 'accepted']), processed_train_df['salary']],
                           axis=1))

        self.train_len = len(processed_train_df)
        self.df = pd.concat([self.df, test_df], axis=0).copy(deep=True)

    def perform_initial_features_engineering(self):
        """
        performs initial features engineering on test set
        """
        df = DataProcessor.perform_initial_features_engineering(self)
        df = df[self.train_len:]

        return df



In [646]:
a.columns

Index(['cc_len', 'cc_startswith', 'log_salary', 'log_age_knn',
       'log_estimated_expenses_knn', 'log_offer_value_knn', 'scaled_salary',
       'scaled_age_knn', 'scaled_estimated_expenses_knn',
       'scaled_offer_value_knn', 'nan_age', 'not_nan_age', 'target',
       'offer_class', 'gender', 'age', 'phone_calls', 'emails', 'salary',
       'customer_type', 'center', 'age_knn', 'estimated_expenses_knn',
       'offer_value_knn'],
      dtype='object')

In [651]:
#a

In [659]:
test_df['age'].isna().sum()

69

In [661]:
test_df.shape

(258, 13)

In [665]:
import itertools
from collections import defaultdict

import numpy as np
import pandas as pd
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler


class DataProcessor:
    """
    processes data, is well adapted for processing training DataFrame,
    - deals with missing data,
    - converts column types for further work (CategoricalEncoding of categorical columns),
    - performs initial features engineering
    """

    def __init__(self, train_df: pd.DataFrame) -> None:
        """
        :param train_df: training DataFrame
        """
        self.df = train_df.copy(deep=True)

    def missing_information(self, percentage: bool = False) -> pd.Series:
        """
        shows us information about missing data

        :param percentage: set to True, if you want to have percentage information on the output
        :return: Series with information on missing rows in each column
        """
        df_shape = self.df.shape
        print(f'df.shape: {df_shape}')
        is_null_sum = self.df.isnull().sum()
        missing_information = (is_null_sum
                               if not percentage
                               else (is_null_sum / df_shape[0] * 100).round(2))
        return missing_information

    @staticmethod
    def _group_columns() -> defaultdict:
        """
        returns dictionary with logically grouped columns
        """
        groups = defaultdict()
        groups['age'] = ['age']
        groups['emails_and_phone_calls'] = ['emails', 'phone_calls']
        groups['low_cardinality'] = ['offer_class', 'gender', 'customer_type', 'center']
        groups['high_cardinality'] = ['customer_code', 'offer_code', 'number']
        groups['numerical'] = ['salary', 'offer_value', 'estimated_expenses']
        groups['target'] = ['accepted']
        return groups

    @staticmethod
    def _missing_salary(df: pd.DataFrame) -> pd.Series:
        """
        fills missing data in salary column:
        for people with the same customer_code fills the same data as in other row,
        for other people
        """
        df = df.copy(deep=True)
        df['salary_temp'] = df['salary'].fillna(0.1)
        mask = ['customer_type', 'customer_code', 'salary_temp']
        temp = (df[mask]
                .groupby(mask)
                .size()
                .reset_index()
                .rename(columns={0: 'size'}))
        temp = (temp[temp
                .duplicated(subset=['customer_code', 'customer_type'],
                            keep=False)][['customer_code', 'salary_temp']])
        temp = temp[temp['salary_temp'] != 0.1]
        temp = temp.set_index('customer_code')['salary_temp']
        salary_map = temp.to_dict()
        df['salary'] = (df['salary']
                        .fillna(df['customer_code']
                                .map(salary_map)))
        temp = (df[['customer_type', 'salary']]
                .groupby('customer_type')
                .median())
        temp = temp['salary']
        salary_map = temp.to_dict()
        df['salary'] = (df['salary']
                        .fillna(df['customer_type']
                                .map(salary_map)))
        # to be able to np.log('salary')
        df.loc[df['salary'] == 0, 'salary'] = 0.0001
        return df['salary']

    def _prepare_to_knn(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        prepares data for knn imputing (of missing data)
        for training data uses also labels to give more information then to test set
        """
        df = df.copy(deep=True)
        temp = pd.get_dummies(df['gender'])
        groups = self._group_columns()
        if 'accepted' in df.columns:
            temp = pd.concat([pd.get_dummies(df['accepted']), temp], axis=1)
        for col in itertools.chain(groups['emails_and_phone_calls'], groups['low_cardinality']):
            temp = pd.concat([pd.get_dummies(df[col]), temp], axis=1)
        knn_to_complete = pd.concat([temp, df[groups['numerical'] + groups['age']]], axis=1)
        return knn_to_complete

    def deal_with_missing_values(self, n_neighbors: int = 5) -> pd.DataFrame:
        """
        :param n_neighbors: number of neighbours for knn algorithm
        :return: returns DataFrame with filled nans (also, leaves unchanged 'age' column ->
        to be used further for possible age binning)
        """
        df = self.df.copy(deep=True)
        groups = self._group_columns()
        for col in groups['high_cardinality']:
            df[col] = df[col].fillna('missing')

        def _prt_process_emails_and_phone_calls(
                emails_phone_calls_df: pd.DataFrame) -> pd.DataFrame:
            """
            initially processes emails and phone_calls columns:
            models some outliers with almost full no responses, to have constant value
            """
            emails_phone_calls_df.loc[emails_phone_calls_df['emails'] > 4, 'emails'] = 5
            emails_phone_calls_df.loc[emails_phone_calls_df['phone_calls'] > 3, 'phone_calls'] = 4
            return emails_phone_calls_df

        for col in groups['emails_and_phone_calls']:
            df[col] = df[col].fillna(round(df[col].mean()))

        df = _prt_process_emails_and_phone_calls(df)

        for col in groups['low_cardinality']:
            df[col] = df[col].fillna(method='ffill')
        df['salary'] = self._missing_salary(df)
        temp = (df[groups['emails_and_phone_calls']
                   + groups['age']
                   + groups['low_cardinality']
                   + groups['numerical']])
        knn_unfilled_table = self._prepare_to_knn(temp)
        knn_filled = (KNN(k=n_neighbors,
                          print_interval=1032)
                      .fit_transform(knn_unfilled_table
                                     .to_numpy()))
        knn_imputed_cols = ['age_knn', 'estimated_expenses_knn', 'offer_value_knn']
        for col in knn_imputed_cols:
            df[col] = knn_filled[:, -knn_imputed_cols.index(col) - 1]
        df = df.drop(columns=['offer_value', 'estimated_expenses'])
        return df

    @staticmethod
    def _process_age(age_df: pd.DataFrame) -> pd.DataFrame:
        """
        Divides age for 2 bins: where age is nan and opposite case
        (we consider situation, when it's possible that client does not gave us their age because
        he wasn't truly interested in cooperation with us)
        """
        age_df = age_df.copy(deep=True)
        age_df['nan_age'] = age_df['age'].isna()
        age_df['not_nan_age'] = age_df['age'].notna()
        return age_df[['nan_age', 'not_nan_age']]

    @staticmethod
    def _process_target(target_df: pd.DataFrame) -> pd.DataFrame:
        """
        Maps target:
        1 for accepted
        0 for not accepted
        (for ML algorithms purposes)
        """
        target_df = target_df.copy(deep=True)
        target_df['target'] = target_df['accepted'] == 'yes'
        return target_df[['target']]

    @staticmethod
    def _process_high_cardinality_categorical_cols(high_cardinal_df: pd.DataFrame) -> pd.DataFrame:
        """
        creates features for high cardinality categorical columns: (those which won't bring any
        additional value where encoded)
        at this moment we consider only customer_code column as profitable
        """
        high_cardinal_df = high_cardinal_df.copy(deep=True)
        high_cardinal_df['cc_len'] = high_cardinal_df['customer_code'].str.len()
        high_cardinal_df.loc[high_cardinal_df['cc_len'].isin([5, 8]), 'cc_len'] = '58'
        high_cardinal_df.loc[~high_cardinal_df['cc_len'].isin(['58']), 'cc_len'] = 'ELSE'
        high_cardinal_df['cc_startswith'] = high_cardinal_df['customer_code']
        a_p_c = ['A', 'P', 'C']
        for letter in a_p_c:
            (high_cardinal_df
                .loc[high_cardinal_df['customer_code']
                                      .str
                                      .startswith(letter), 'cc_startswith']) = letter
        (high_cardinal_df
            .loc[~high_cardinal_df['customer_code']
                                   .str
                                   .startswith(tuple(a_p_c)), 'cc_startswith']) = 'ELSE'
        return high_cardinal_df[['cc_len', 'cc_startswith']]

    @staticmethod
    def _process_numerical_cols(numerical_df: pd.DataFrame) -> pd.DataFrame:
        """
        creates features:
        - standard scales numerical columns,
        - log scales numerical columns,
        to consider three cases when modelling:
        only standard scaled, only log scaled, and mixed using variances
        """
        numeric_cols = [col for col in numerical_df.columns if numerical_df[col].dtype != object]
        numeric_cols.remove('age')
        temp = numerical_df[numeric_cols]
        scaler = StandardScaler()
        for col in numeric_cols:
            temp['log_' + col] = np.log(temp[col])
        log_cols = [col for col in temp.columns if col not in numerical_df.columns]
        log_subset = temp[log_cols]
        temp_subset = temp[numeric_cols]
        scaled = scaler.fit_transform(temp_subset)
        scaled = pd.DataFrame(scaled,
                              columns='scaled_' + temp_subset.columns,
                              index=temp.index)
        processed = pd.concat([log_subset, scaled], axis=1)
        return processed

    def perform_initial_features_engineering(self):
        """
        performs initial feature engineering (without encoding -> will be done as individual part
        due to some maintenance issues (how to cross validate target encoding?)
        """
        groups = self._group_columns()
        df = self.deal_with_missing_values().copy(deep=True)
        for col in groups['emails_and_phone_calls']:
            df[col] = df[col].astype(object)
        age = self._process_age(df)
        numerical = self._process_numerical_cols(df)
        high_cardinal = self._process_high_cardinality_categorical_cols(df)
        columns_to_drop = ['customer_code', 'number', 'offer_code']
        if 'accepted' in df.columns:
            columns_to_drop.append('accepted')
            target = self._process_target(df)
            df = pd.concat([target, df], axis=1)
        df = pd.concat([age, df], axis=1)
        df = pd.concat([numerical, df], axis=1)
        df = pd.concat([high_cardinal, df], axis=1)
        df.drop(columns=columns_to_drop, inplace=True)

        return df


class TestDataProcessor(DataProcessor):
    """
    DataProcessor adapted to test set needs
    """

    def __init__(self, not_processed_train_df, processed_train_df, test_df, sneaky_peaky=True):
        """
        :param not_processed_train_df: self explanatory
        :param processed_train_df: self explanatory
        :param test_df: self explanatory
        :param sneaky_peaky: set to True:
            uses some knn columns as 'original' to bring a little 'overfitting' to test set.
            As we have only lot of missing values in age, tries to sneak some correlation between
            age and responses
        """
        DataProcessor.__init__(self, not_processed_train_df)

        if sneaky_peaky:
            self.df['age'] = processed_train_df['age_knn']
            self.df['estimated_expenses'] = processed_train_df['estimated_expenses_knn']
            self.df['offer_value'] = processed_train_df['offer_value_knn']
        self.df = (pd
                   .concat([self
                           .df
                           .drop(columns=['salary', 'accepted']), processed_train_df['salary']],
                           axis=1))

        self.train_len = len(processed_train_df)
        self.df = pd.concat([self.df, test_df], axis=0).copy(deep=True)

    def perform_initial_features_engineering(self):
        """
        performs initial features engineering on test set
        """
        df = DataProcessor.perform_initial_features_engineering(self)
        df = df[self.train_len:]

        return df









In [668]:
TDP = TestDataProcessor(train_df, a, test_df, sneaky_peaky=True)

In [669]:
b = TDP.perform_initial_features_engineering()

Imputing row 1/1289 with 0 missing, elapsed time: 0.343
Imputing row 1033/1289 with 0 missing, elapsed time: 0.345


In [664]:
c = TDP.perform_initial_features_engineering()

Imputing row 1/1289 with 1 missing, elapsed time: 0.366
Imputing row 1033/1289 with 0 missing, elapsed time: 0.377


In [657]:
np.sum(c['age_knn'] != b['age_knn'])

59

In [654]:
b.head()

Unnamed: 0_level_0,cc_len,cc_startswith,log_salary,log_age_knn,log_estimated_expenses_knn,log_offer_value_knn,scaled_salary,scaled_age_knn,scaled_estimated_expenses_knn,scaled_offer_value_knn,nan_age,not_nan_age,age,center,customer_type,emails,gender,offer_class,phone_calls,salary,age_knn,estimated_expenses_knn,offer_value_knn
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
DA3BAEB8BF604EB,ELSE,ELSE,6.594755,3.489576,8.940891,4.9636,-0.501839,-0.035049,1.603528,0.246935,True,False,,A,S,1,male,Premium,0,731.25,32.772049,7638.0,143.108044
AEF3DE08DFED4E0,ELSE,ELSE,6.656084,3.433987,8.662851,4.153202,-0.492946,-0.197055,0.631794,-1.130746,False,True,31.0,A,S,0,male,Premium,0,777.5,31.0,5784.0,63.637416
B621BB29484E46D,ELSE,ELSE,6.671501,3.218876,8.434898,5.348081,-0.490623,-0.745594,0.013846,1.410101,False,True,25.0,B,S,1,male,Premium,0,789.58,25.0,4605.0,210.204552
2D0945802F92423,ELSE,ELSE,6.514713,2.890372,8.40268,4.876044,-0.512654,-1.385557,-0.062677,0.038956,False,True,18.0,B,Q,1,female,Premium,0,675.0,18.0,4459.0,131.110924
640ABFC7E49B403,58,ELSE,8.029296,4.025352,8.38845,5.193064,-0.052226,2.088525,-0.095697,0.886816,False,True,56.0,B,C,1,male,Medium,0,3069.58,56.0,4396.0,180.019199


In [625]:
b.info()

<class 'pandas.core.frame.DataFrame'>
Index: 258 entries, DA3BAEB8BF604EB to 2E27EC78F50943B
Data columns (total 23 columns):
cc_len                           258 non-null object
cc_startswith                    258 non-null object
log_salary                       258 non-null float64
log_age_knn                      258 non-null float64
log_estimated_expenses_knn       258 non-null float64
log_offer_value_knn              258 non-null float64
scaled_salary                    258 non-null float64
scaled_age_knn                   258 non-null float64
scaled_estimated_expenses_knn    258 non-null float64
scaled_offer_value_knn           258 non-null float64
nan_age                          258 non-null bool
not_nan_age                      258 non-null bool
age                              189 non-null float64
center                           258 non-null object
customer_type                    258 non-null object
emails                           258 non-null object
gender               

In [624]:
a.shape

(1031, 24)

In [623]:
b.shape

(258, 23)

In [617]:
a.shape

(1031, 24)

In [616]:
b.info()

<class 'pandas.core.frame.DataFrame'>
Index: 258 entries, DA3BAEB8BF604EB to 2E27EC78F50943B
Data columns (total 23 columns):
cc_len                           258 non-null object
cc_startswith                    258 non-null object
log_salary                       258 non-null float64
log_age_knn                      258 non-null float64
log_estimated_expenses_knn       258 non-null float64
log_offer_value_knn              258 non-null float64
scaled_salary                    258 non-null float64
scaled_age_knn                   258 non-null float64
scaled_estimated_expenses_knn    258 non-null float64
scaled_offer_value_knn           258 non-null float64
nan_age                          258 non-null bool
not_nan_age                      258 non-null bool
age                              189 non-null float64
center                           258 non-null object
customer_type                    258 non-null object
emails                           258 non-null object
gender               

In [601]:
test_df.shape

(258, 13)

In [591]:
train_df[0:4]

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
8550AB469CB2445,no,Premium,female,,1.0,2.0,2678,1524.58,8CA,C,0F2A2,303.973257,7207.0,B
07355EE27DD1493,no,High,male,32.0,0.0,0.0,244360,1300.0,9DD,S,5F15A,64.075055,3675.0,A
034E73A251554F0,yes,Premium,female,,1.0,1.0,370365,1550.0,517,Q,AC578,108.58175,4750.0,B
0AF961B4AC7A439,no,Premium,male,,0.0,1.0,2652,722.92,9CB,C,8A8CC,155.495957,1401.0,A


In [None]:
train_df[3]

In [589]:
train_df[3:5]

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0AF961B4AC7A439,no,Premium,male,,0.0,1.0,2652,722.92,9CB,C,8A8CC,155.495957,1401.0,A
8535BBCA690A4AE,yes,Premium,male,,0.0,1.0,1601,5649.58,6FF,S,42EFB,121.371033,2106.0,B


In [587]:
pd.concat([test_df, train_df.drop(columns=['accepted'])], axis=0)

Unnamed: 0_level_0,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
DA3BAEB8BF604EB,Premium,male,,0.0,1.0,Fa 265302,731.25,99F,S,D6706,143.108044,7638.0,A
AEF3DE08DFED4E0,Premium,male,31.0,0.0,0.0,347063,777.5,405,S,,63.637416,5784.0,A
B621BB29484E46D,Premium,male,25.0,0.0,1.0,349250,789.58,954,S,7E9CB,210.204552,4605.0,B
2D0945802F92423,Premium,female,18.0,0.0,1.0,365226,675.0,D7D,Q,6361F,131.110924,4459.0,B
640ABFC7E49B403,Medium,male,56.0,0.0,1.0,17764,3069.58,BAF,C,63C3B,180.019199,4396.0,B
9BF7EE347C8749E,High,female,25.0,0.0,1.0,230433,2600.0,82D,S,65010,186.082807,5714.0,A
E000C782D2EA42B,High,female,54.0,1.0,4.0,29105,2300.0,7C6,S,4C6B9,83.194275,1371.0,B
EB115804FC12450,Premium,male,36.0,0.0,0.0,A/5 21175,725.0,E96,S,AB5E2,101.359082,1512.0,B
D38B370D0F67436,Premium,male,28.0,1.0,0.0,STON/O2. 3101279,1585.0,CFD,S,C8511,66.86847,5303.0,A
51B08DE7D3014FF,High,female,42.0,0.0,0.0,236852,1300.0,52D,S,D7FC5,74.346069,5748.0,A


In [572]:
a[a['offer_code'].str.contains('E\+')].sort_values(['offer_code'])

Unnamed: 0_level_0,cc_len,cc_startswith,log_salary,log_age_knn,log_estimated_expenses_knn,log_offer_value_knn,scaled_salary,scaled_age_knn,scaled_estimated_expenses_knn,scaled_offer_value_knn,nan_age,not_nan_age,target,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,center,age_knn,estimated_expenses_knn,offer_value_knn
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
1004379959394F2,ELSE,ELSE,7.276155,2.890372,8.474494,4.864278,-0.360952,-1.411383,0.09388,0.022231,False,True,False,Premium,female,18.0,0,2,2691,1445.42,0.0,C,20148,B,18.0,4791.0,129.577331
C9DE5A2E7BC24AB,ELSE,ELSE,-9.21034,3.688879,8.626765,5.172074,-0.642538,0.631047,0.50757,0.829192,False,True,False,Medium,male,40.0,0,1,112059,0.0001,0.0,S,29B9B,A,40.0,5579.0,176.279975
1A6D8FB4CCE04B6,58,P,9.025916,3.663562,8.968014,4.218891,0.977492,0.538209,1.698767,-1.04247,False,True,True,Medium,female,39.0,1,2,PC 17756,8315.83,0.0,C,F5285,B,39.0,7848.0,67.958094
2570BCC8C4C3421,ELSE,C,6.626718,3.514452,8.167068,4.979573,-0.495454,0.036656,-0.571805,0.295837,True,False,False,Premium,male,,0,1,C.A. 49867,755.0,0.0,S,6801A,B,33.597526,3523.0,145.412245
309837BADC1C477,ELSE,ELSE,7.681523,3.44859,8.546169,4.910904,-0.220199,-0.162159,0.280775,0.129094,True,False,False,Premium,male,,2,1,2662,2167.92,0.0,C,DF50C,B,31.455998,5147.0,135.762026
4CF2BD7C25314F0,ELSE,ELSE,6.586172,3.650658,8.552946,4.587439,-0.501298,0.49179,0.29915,-0.519194,False,True,False,Premium,male,38.5,0,1,missing,725.0,1.0,S,120AA,A,38.5,5182.0,98.242553
F35EECD5042D405,ELSE,A,6.690842,3.385057,7.994632,4.577639,-0.485713,-0.341923,-0.864748,-0.53575,True,False,False,Premium,male,,0,0,A4. 54510,805.0,100.0,S,C6BA4,B,29.519668,2965.0,97.284387
DBA57BFE0FBB471,ELSE,ELSE,6.652863,3.359638,8.902047,4.368015,-0.491558,-0.410707,1.435748,-0.853635,True,False,False,Premium,male,,0,1,364851,775.0,10000000.0,Q,8AEB7,B,28.778763,7347.0,78.886864
467839227F4E49E,ELSE,ELSE,7.316548,3.261489,8.556029,5.267893,-0.349345,-0.660478,0.30755,1.135489,True,False,False,High,male,,0,0,SC/PARIS 2131,1505.0,1000000000.0,C,7C376,B,26.088362,5198.0,194.006804
B4AF8E48A3C04EF,ELSE,ELSE,7.863267,3.526361,8.908559,5.037529,-0.136025,0.07402,1.460948,0.445758,False,True,False,High,male,34.0,1,0,226875,2600.0,20.0,S,7229D,B,34.0,7395.0,154.088854


In [562]:
a.groupby(['cc_len', 'target']).size()

cc_len  target
58      False      96
        True      139
ELSE    False     544
        True      252
dtype: int64

In [553]:
a.groupby(['cc_startswith', 'target']).size()

cc_startswith  target
A              False      24
               True        2
C              False      40
               True       19
ELSE           False     553
               True      320
P              False      23
               True       50
dtype: int64

In [514]:
a[a['cc_len'] > 8]['target'].value_counts()

False    95
True     51
Name: target, dtype: int64

In [515]:
a[a['cc_len'] == 8]['target'].value_counts()

True     49
False    39
Name: target, dtype: int64

In [528]:
a[a['cc_len'].isin([5,8])]['target'].value_counts()

True     139
False     96
Name: target, dtype: int64

In [529]:
a[~a['cc_len'].isin([5,8])]['target'].value_counts()

False    544
True     252
Name: target, dtype: int64

In [492]:
a.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1031 entries, 8550AB469CB2445 to 358EEC4160A4478
Data columns (total 26 columns):
cc_len                           1031 non-null int64
log_salary                       1031 non-null float64
log_age_knn                      1031 non-null float64
log_estimated_expenses_knn       1031 non-null float64
log_offer_value_knn              1031 non-null float64
scaled_salary                    1031 non-null float64
scaled_age_knn                   1031 non-null float64
scaled_estimated_expenses_knn    1031 non-null float64
scaled_offer_value_knn           1031 non-null float64
nan_age                          1031 non-null bool
not_nan_age                      1031 non-null bool
target                           1031 non-null bool
offer_class                      1031 non-null object
gender                           1031 non-null object
age                              688 non-null float64
phone_calls                      1031 non-null object
emails  

In [473]:
a[a['cc_len']>9]['target']

Unnamed: 0_level_0,target,target
name,Unnamed: 1_level_1,Unnamed: 2_level_1
174003D902C3450,False,False
7D1020DC3B34422,False,False
F0FEC2493E11410,False,False
24D13EF5820C49E,True,True
282C859D670D45B,False,False
04779E8B7D83463,False,False
9D8E425268E8421,True,True
E4B445C848E4401,False,False
A5008A88F1BE45E,False,False
9DDF7D7DC39242A,False,False


In [450]:
train_df.shape

(1031, 14)

#### encode target

In [None]:
train_df['target'] = train_df['accepted'] == 'yes'

In [None]:
#train_df['target']

#### feature engineering

In [59]:
enc = LeaveOneOutEncoder(train_df=train_df, test_df=test_df, columns_to_encode=[], target_column=, random_state=42, mean=1, std=0.05)

SyntaxError: invalid syntax (<ipython-input-59-c8a4f36707a6>, line 1)