**using this there is no need to reload your packages every time when you will make any improvement to them**

In [None]:
%load_ext autoreload

In [None]:
autoreload 2

**import necessary libraries**

In [3]:
import numpy as np
import pandas as pd 
import warnings

from pathlib import Path 

In [4]:
from matplotlib import pyplot as plt

In [5]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

**import modelling helpers**

In [6]:
from sklearn.model_selection import train_test_split

**set pandas options**

In [7]:
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

**set warnings options**

In [8]:
warnings.filterwarnings("ignore")

**load data**

## you need to change data path manually here :)

In [9]:
LOCAL_PATH = Path('/Users/mjasiecz/PycharmProjects/') #here

PROJECT_PATH = Path('new_offer_success_predictor/data/raw/')
filename = 'client_database'
suffix = '.parquet'

In [10]:
data_path = LOCAL_PATH.joinpath(PROJECT_PATH).joinpath(Path(filename)).with_suffix(suffix)

In [11]:
df = pd.read_parquet(data_path, engine='pyarrow')

In [12]:
df.head()

Unnamed: 0,offer_class,accepted,name,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center
0,Medium,yes,C7CBB5C5613449B,female,29.0,0.0,0.0,24160,21133.75,4AB,S,9E9FA,57.426571,4692.0,A
1,Medium,yes,CFD09C0248BB417,male,,1.0,2.0,113781,15155.0,61A,S,1E53D,141.639912,3164.0,A
2,Medium,no,A2A0DC541977473,female,,1.0,3.0,113781,15155.0,DB4,S,1.36E+06,154.82113,1852.0,A
3,Medium,no,9068458EB70D427,male,30.0,1.0,2.0,113781,15155.0,9B6,S,F6529,106.256196,3753.0,B
4,Medium,no,46F0CD19CF71429,female,25.0,1.0,2.0,113781,15155.0,191,S,E2FDF,139.237147,2410.0,A


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 1308
Data columns (total 15 columns):
offer_class           1289 non-null object
accepted              1289 non-null object
name                  1289 non-null object
gender                1289 non-null object
age                   877 non-null float64
phone_calls           1286 non-null float64
emails                1287 non-null float64
customer_code         1265 non-null object
salary                1281 non-null float64
offer_code            1287 non-null object
customer_type         1287 non-null object
number                1280 non-null object
offer_value           1277 non-null float64
estimated_expenses    1286 non-null float64
center                1306 non-null object
dtypes: float64(6), object(9)
memory usage: 163.6+ KB


In [14]:
df.isnull().sum()

offer_class            20
accepted               20
name                   20
gender                 20
age                   432
phone_calls            23
emails                 22
customer_code          44
salary                 28
offer_code             22
customer_type          22
number                 29
offer_value            32
estimated_expenses     23
center                  3
dtype: int64

In [15]:
df[df['accepted'].isnull()].head(3)

Unnamed: 0,offer_class,accepted,name,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center
1289,,,,,,,,,,,,,,,A
1290,,,,,,,,,,,,,,,A
1291,,,,,,,,,,,,,,,A


In [16]:
df = df[df['accepted'].notna()]

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1289 entries, 0 to 1288
Data columns (total 15 columns):
offer_class           1289 non-null object
accepted              1289 non-null object
name                  1289 non-null object
gender                1289 non-null object
age                   877 non-null float64
phone_calls           1286 non-null float64
emails                1287 non-null float64
customer_code         1265 non-null object
salary                1281 non-null float64
offer_code            1287 non-null object
customer_type         1287 non-null object
number                1280 non-null object
offer_value           1277 non-null float64
estimated_expenses    1286 non-null float64
center                1286 non-null object
dtypes: float64(6), object(9)
memory usage: 161.1+ KB


In [18]:
df.isnull().sum()

offer_class             0
accepted                0
name                    0
gender                  0
age                   412
phone_calls             3
emails                  2
customer_code          24
salary                  8
offer_code              2
customer_type           2
number                  9
offer_value            12
estimated_expenses      3
center                  3
dtype: int64

In [19]:
df[df['age'].isna()]['accepted'].value_counts()

no     261
yes    151
Name: accepted, dtype: int64

In [20]:
df[df['age'].notna()]['accepted'].value_counts()

no     539
yes    338
Name: accepted, dtype: int64

In [21]:
#train_df['phone_calls']

In [22]:
print(df.isnull().sum()/df.shape[0]*100)

offer_class            0.000000
accepted               0.000000
name                   0.000000
gender                 0.000000
age                   31.962762
phone_calls            0.232739
emails                 0.155159
customer_code          1.861908
salary                 0.620636
offer_code             0.155159
customer_type          0.155159
number                 0.698216
offer_value            0.930954
estimated_expenses     0.232739
center                 0.232739
dtype: float64


In [23]:
csv_suffix = '.csv'

In [24]:
train_dataset = LOCAL_PATH.joinpath(PROJECT_PATH).joinpath('train').with_suffix(csv_suffix)

In [25]:
test_dataset = LOCAL_PATH.joinpath(PROJECT_PATH).joinpath('test').with_suffix(csv_suffix)

In [26]:
if train_dataset.exists() and test_dataset.exists():
    print('Split is done. Do not data snoop!')
else:
    print('Preparing train and test datasets.')
    df = df[df['accepted'].notna()].set_index('name')
    df_predictors = df.drop(columns=['accepted'])
    df_target = df['accepted']
    df_train, df_test, df_train_target, df_test_target = train_test_split(df_predictors, df_target, test_size=0.2, random_state=42, stratify=df_target) #moze popracowac nad tym stratify/random_state (wziac srednia z paru random_state'ow)
    df_train.insert(0, column='accepted', value=df_train_target)
    df_test.insert(0, column='accepted', value=df_test_target)
    df_train.to_csv(path_or_buf=train_dataset)
    df_test.to_csv(path_or_buf=test_dataset)
    print('Datasets are ready to use.')

Split is done. Do not data snoop!


In [27]:
train_df = pd.read_csv(train_dataset, index_col='name')
test_df = pd.read_csv(test_dataset, index_col='name')
test_df.drop(columns=['accepted'], inplace=True)

In [28]:
train_df.columns

Index(['accepted', 'offer_class', 'gender', 'age', 'phone_calls', 'emails',
       'customer_code', 'salary', 'offer_code', 'customer_type', 'number',
       'offer_value', 'estimated_expenses', 'center'],
      dtype='object')

In [29]:
train_df.head()

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
8550AB469CB2445,no,Premium,female,,1.0,2.0,2678,1524.58,8CA,C,0F2A2,303.973257,7207.0,B
07355EE27DD1493,no,High,male,32.0,0.0,0.0,244360,1300.0,9DD,S,5F15A,64.075055,3675.0,A
034E73A251554F0,yes,Premium,female,,1.0,1.0,370365,1550.0,517,Q,AC578,108.58175,4750.0,B
0AF961B4AC7A439,no,Premium,male,,0.0,1.0,2652,722.92,9CB,C,8A8CC,155.495957,1401.0,A
8535BBCA690A4AE,yes,Premium,male,,0.0,1.0,1601,5649.58,6FF,S,42EFB,121.371033,2106.0,B


In [30]:
train_df['accepted'].value_counts()

no     640
yes    391
Name: accepted, dtype: int64

In [31]:
yes = len(train_df[train_df['accepted'] == 'yes'])
no = len(train_df[train_df['accepted'] == 'no'])

In [32]:
proportion = yes/(yes+no)

In [33]:
print(proportion)

0.3792434529582929


In [34]:
train_df.isnull().sum()

accepted                0
offer_class             0
gender                  0
age                   343
phone_calls             2
emails                  2
customer_code          16
salary                  7
offer_code              2
customer_type           2
number                  6
offer_value            10
estimated_expenses      3
center                  2
dtype: int64

In [35]:
no_salary_cc = df[df['salary'].isna()]['customer_code'].tolist()

In [36]:
df[df['customer_code'].isin(no_salary_cc)]

Unnamed: 0,offer_class,accepted,name,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center
49,Medium,yes,B593546CCB654D6,male,36.0,0.0,2.0,PC 17755,51232.92,658,C,317D5,83.164504,5847.0,B
50,Medium,yes,8F6BF37DB124417,female,58.0,0.0,2.0,PC 17755,,1F1,C,2FCB7,96.104308,5988.0,B
118,Medium,yes,2D7B8459CBD2414,male,50.0,2.0,0.0,PC 17611,,2F3,S,8997E,90.384374,5152.0,B
120,Medium,yes,CA249ABE7D4E43C,female,,1.0,0.0,PC 17611,13365.0,57E,S,E4391,101.924325,1534.0,B
161,Medium,no,FFFDD0C3B34644E,male,42.0,1.0,1.0,113789,5200.0,CE9,S,4272B,66.355074,5090.0,A
162,Medium,yes,487662632F6B484,female,35.0,1.0,0.0,113789,,B04,S,4DBC4,60.800476,7570.0,A
182,Medium,yes,5CEF00A82AF34EF,male,35.0,0.0,1.0,PC 17755,51232.92,76F,C,AA030,168.97653,5052.0,B
225,Medium,no,EF319AAD8691472,male,29.0,1.0,1.0,113776,,88A,S,7DFE9,85.21346,4003.0,A
226,Medium,yes,29C5493CE7EF447,female,22.0,1.0,1.0,113776,6660.0,D78,S,7D77D,61.329828,2754.0,A
277,Medium,yes,DAF778DD00F8407,male,32.0,0.0,1.0,13214,,F22,C,55CC4,54.717256,6754.0,A


In [37]:
train_df[train_df['estimated_expenses'] > 6000]['accepted'].value_counts()

no     176
yes    116
Name: accepted, dtype: int64

In [38]:
train_df[train_df['estimated_expenses'] > 6000]['accepted'].value_counts()

no     176
yes    116
Name: accepted, dtype: int64

In [39]:
train_df['budget_status'] = (train_df['salary'] - train_df['estimated_expenses'])/train_df['offer_value']

In [40]:
train_df['budget_status'].describe()

count    1011.000000
mean      -12.225653
std        59.971629
min      -119.346163
25%       -36.658299
50%       -18.626248
75%        -3.381127
max       632.619387
Name: budget_status, dtype: float64

In [41]:
train_df[train_df['budget_status'] < 0]['accepted'].value_counts()

no     537
yes    259
Name: accepted, dtype: int64

In [42]:
train_df[train_df['budget_status'].between(-120,-20)]['accepted'].value_counts()

no     331
yes    146
Name: accepted, dtype: int64

In [43]:
train_df[train_df['budget_status'] > 0]['accepted'].value_counts()

yes    123
no      92
Name: accepted, dtype: int64

In [44]:
# uzupelnic jakos brakujace wartosci, potem sprawdzac, czy w zbiorze testowym nie ma czegos, przetrenowac model, sprawdzic czy jest lepszy na tren/cv, jesli tak, to uzyc tego modelu na testowym, jesli nie to starego

In [45]:
no_salary_cc = train_df[train_df['salary'].isna()]['customer_code'].tolist()

In [46]:
train_df[train_df['customer_code'].isin(no_salary_cc)]

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,budget_status
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
29C5493CE7EF447,yes,Medium,female,22.0,1.0,1.0,113776,6660.0,D78,S,7D77D,61.329828,2754.0,A,63.688423
BFAEC8F911F841B,no,Premium,male,32.0,1.0,0.0,3101278,,9C0,S,52591,137.395918,5384.0,B,
CA249ABE7D4E43C,yes,Medium,female,,1.0,0.0,PC 17611,13365.0,57E,S,E4391,101.924325,1534.0,B,116.076315
2D7B8459CBD2414,yes,Medium,male,50.0,2.0,0.0,PC 17611,,2F3,S,8997E,90.384374,5152.0,B,
A14D00E1FEBF433,no,Premium,male,60.5,0.0,0.0,3701,,05A,S,00D61,80.832678,7521.0,A,
FFFDD0C3B34644E,no,Medium,male,42.0,1.0,1.0,113789,5200.0,CE9,S,4272B,66.355074,5090.0,A,1.657748
487662632F6B484,yes,Medium,female,35.0,1.0,0.0,113789,,B04,S,4DBC4,60.800476,7570.0,A,
EF319AAD8691472,no,Medium,male,29.0,1.0,1.0,113776,,88A,S,7DFE9,85.21346,4003.0,A,
CCF5A463D3BC472,yes,Premium,female,,0.0,0.0,14313,,6F2,Q,D21EF,260.922393,2631.0,A,
63193E3CA0F840B,yes,Premium,female,33.0,3.0,0.0,3101278,1585.0,46D,S,A3D8B,76.753846,2650.0,B,-13.875526


In [47]:
train_df[train_df['customer_type'] == 'Q']['offer_value'].mean()

117.48922469395835

In [48]:
train_df[train_df['customer_type'] == 'S']['offer_value'].mean()

129.23098742105262

In [49]:
train_df[train_df['customer_type'] == 'C']['offer_value'].mean()

129.7879958125909

In [50]:
train_df[train_df['customer_type'] == 'Q']['salary'].mean()

1213.8326041666667

In [51]:
train_df[train_df['customer_type'] == 'S']['salary'].mean()

2638.4246033994336

In [52]:
train_df[train_df['customer_type'] == 'S']['accepted'].value_counts()

no     485
yes    226
Name: accepted, dtype: int64

In [53]:
train_df[train_df['customer_type'] == 'Q']['accepted'].value_counts()

no     61
yes    36
Name: accepted, dtype: int64

In [54]:
train_df[train_df['customer_type'] == 'C']['accepted'].value_counts()

yes    127
no      94
Name: accepted, dtype: int64

In [55]:
train_df[train_df['customer_type'] == 'C']['salary'].mean()

6243.123363636362

In [56]:
train_df.head()

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,budget_status
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
8550AB469CB2445,no,Premium,female,,1.0,2.0,2678,1524.58,8CA,C,0F2A2,303.973257,7207.0,B,-18.693816
07355EE27DD1493,no,High,male,32.0,0.0,0.0,244360,1300.0,9DD,S,5F15A,64.075055,3675.0,A,-37.065906
034E73A251554F0,yes,Premium,female,,1.0,1.0,370365,1550.0,517,Q,AC578,108.58175,4750.0,B,-29.470882
0AF961B4AC7A439,no,Premium,male,,0.0,1.0,2652,722.92,9CB,C,8A8CC,155.495957,1401.0,A,-4.360756
8535BBCA690A4AE,yes,Premium,male,,0.0,1.0,1601,5649.58,6FF,S,42EFB,121.371033,2106.0,B,29.196258


In [57]:
notna_number = train_df[train_df['offer_code'].notna()]

In [58]:
notna_number[notna_number['offer_code'].str.startswith('M')]

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,budget_status
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1


In [59]:
females = train_df[train_df['gender'] == 'female']

In [60]:
males = train_df[train_df['gender'] == 'male']

In [61]:
females[females['age'].isna()]['accepted'].value_counts()

yes    78
no     45
Name: accepted, dtype: int64

In [62]:
females[females['age'].isna()]['accepted'].value_counts()

yes    78
no     45
Name: accepted, dtype: int64

In [63]:
males[males['age'].isna()]['accepted'].value_counts()

no     171
yes     49
Name: accepted, dtype: int64

In [64]:
females[females['age'].isna()]['accepted'].value_counts()

yes    78
no     45
Name: accepted, dtype: int64

In [65]:
females[females['customer_type'] == 'C']['accepted'].value_counts()

yes    86
no      9
Name: accepted, dtype: int64

In [66]:
train_df[train_df['gender'] == 'male']['accepted'].value_counts()

no     540
yes    128
Name: accepted, dtype: int64

In [68]:
df_train = train_df.copy(deep=True)

In [69]:
df_train.columns

Index(['accepted', 'offer_class', 'gender', 'age', 'phone_calls', 'emails',
       'customer_code', 'salary', 'offer_code', 'customer_type', 'number',
       'offer_value', 'estimated_expenses', 'center', 'budget_status'],
      dtype='object')

#### missing data

In [70]:
df_train[df_train['phone_calls']==1]['accepted'].value_counts()

yes    130
no     122
Name: accepted, dtype: int64

In [71]:
df_train[df_train['phone_calls']==4]['accepted'].value_counts()

no     16
yes     2
Name: accepted, dtype: int64

#### outliers

In [72]:
females[females['age'].notna()]['accepted'].value_counts()

yes    185
no      55
Name: accepted, dtype: int64

In [73]:
males[males['age'].notna()]['accepted'].value_counts()

no     369
yes     79
Name: accepted, dtype: int64

#### balancing data

#### feature engineering

In [74]:
train_df['no_salary'] = train_df['salary'] == 0
train_df.loc[train_df['salary'] == 0, 'salary'] = 1.0

In [75]:
train_df['log_salary'] = np.log(train_df['salary'])
train_df['log_estimated_expenses'] = np.log(train_df['estimated_expenses'])
train_df['class_high'] = train_df['offer_class'] == 'High'
train_df['class_medium'] = train_df['offer_class'] == 'Medium'
train_df['center_a'] = train_df['center'] == 'A'
train_df['is_male'] = train_df['gender'] == 'male'
train_df['cc_startswith_a'] = train_df['customer_code'].str.startswith('A')
train_df['cc_startswith_p'] = train_df['customer_code'].str.startswith('P')
train_df['cc_startswith_c'] = train_df['customer_code'].str.startswith('C')
train_df['customer_type_c'] = train_df['customer_type'] == 'C'
train_df['customer_type_q'] = train_df['customer_type'] == 'Q'
train_df['big_salary'] = train_df['salary'] >= 7525
train_df['small_salary'] = train_df['salary'] <= 729 
train_df['cc_len_5'] = train_df['customer_code'].str.len() == 5


In [1]:
from sklearn.preprocessing import StandardScaler

In [2]:
train_subset = train_df[['phone_calls', 'emails', 'offer_value']]

NameError: name 'train_df' is not defined

In [78]:
ss = StandardScaler()

In [79]:
scaled_columns = ss.fit_transform(train_subset)

In [80]:
scaled = pd.DataFrame(scaled_columns, columns=['scaled_phone_calls', 'scaled_emails', 'scaled_offer_value'], index=train_df.index)

In [81]:
train_new = pd.merge(train_df, scaled, left_on=train_df.index, right_on=scaled.index)

#### modelling

In [82]:
# informedsearch using gridsearchcv and randomizedsearchcv

## basic model

In [83]:
# emails and phone calls outliers - how to deal?

In [84]:
base_cols = []

#### columns: emails and phone_calls

In [85]:
df_train['emails'].isnull().sum()

2

In [86]:
df_train['phone_calls'].isnull().sum()

2

In [87]:
df_train['cat_emails'] = df_train['emails']

In [88]:
df_train['cat_phone_calls'] = df_train['phone_calls']

In [89]:
round(df_train['emails'].mean())

1.0

In [90]:
round(df_train['phone_calls'].mean())

1.0

In [91]:
df_train.loc[df_train['cat_emails'].isna(), 'cat_emails'] = round(df_train['emails'].mean())

In [1]:
round(df_train['emails'].mean())

NameError: name 'df_train' is not defined

In [92]:
df_train.loc[df_train['cat_phone_calls'].isna(), 'cat_phone_calls'] = round(df_train['phone_calls'].mean())

In [93]:
df_train['cat_emails'].isnull().sum()

0

In [94]:
df_train['cat_phone_calls'].isnull().sum()

0

In [95]:
df_train[df_train['emails']>4]['accepted'].value_counts()

no     11
yes     1
Name: accepted, dtype: int64

In [96]:
df_train[df_train['phone_calls']>3]['accepted'].value_counts()

no     30
yes     2
Name: accepted, dtype: int64

In [97]:
df_train.loc[df_train['cat_emails'] > 4, 'cat_emails'] = 5

In [98]:
df_train.loc[df_train['cat_phone_calls'] > 3, 'cat_phone_calls'] = 4

In [125]:
df_train['cat_emails'] = df_train['cat_emails'].astype(object)
df_train['cat_phone_calls'] = df_train['cat_phone_calls'].astype(object)

In [101]:
from category_encoders import LeaveOneOutEncoder

In [103]:
# LeaveOneOutEncoder??

In [133]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1031 entries, 8550AB469CB2445 to 358EEC4160A4478
Data columns (total 17 columns):
accepted              1031 non-null object
offer_class           1031 non-null object
gender                1031 non-null object
age                   688 non-null float64
phone_calls           1029 non-null float64
emails                1029 non-null float64
customer_code         1015 non-null object
salary                1024 non-null float64
offer_code            1029 non-null object
customer_type         1029 non-null object
number                1025 non-null object
offer_value           1021 non-null float64
estimated_expenses    1028 non-null float64
center                1029 non-null object
budget_status         1011 non-null float64
cat_emails            1031 non-null object
cat_phone_calls       1031 non-null object
dtypes: float64(7), object(10)
memory usage: 185.0+ KB


In [105]:
df_train['cat_emails'].value_counts()

1.0    461
0.0    383
2.0    111
3.0     56
5.0     12
4.0      8
Name: cat_emails, dtype: int64

In [378]:
df_train[['target', 'cat_emails']].groupby('cat_emails').mean()

Unnamed: 0_level_0,target
cat_emails,Unnamed: 1_level_1
0.0,0.368146
1.0,0.347072
2.0,0.504505
3.0,0.517857
4.0,0.5
5.0,0.083333


In [375]:
df_train.head()

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,budget_status,cat_emails,cat_phone_calls,target,temp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
8550AB469CB2445,no,Premium,female,,1.0,2.0,2678,1524.58,8CA,C,0F2A2,303.973257,7207.0,B,-18.693816,2,1,0,403.973
07355EE27DD1493,no,High,male,32.0,0.0,0.0,244360,1300.0,9DD,S,5F15A,64.075055,3675.0,A,-37.065906,0,0,0,164.075
034E73A251554F0,yes,Premium,female,,1.0,1.0,370365,1550.0,517,Q,AC578,108.58175,4750.0,B,-29.470882,1,1,1,208.582
0AF961B4AC7A439,no,Premium,male,,0.0,1.0,2652,722.92,9CB,C,8A8CC,155.495957,1401.0,A,-4.360756,1,0,0,255.496
8535BBCA690A4AE,yes,Premium,male,,0.0,1.0,1601,5649.58,6FF,S,42EFB,121.371033,2106.0,B,29.196258,1,0,1,221.371


In [373]:
LeaveOneOutEncoder

category_encoders.leave_one_out.LeaveOneOutEncoder

In [107]:
df_train['cat_emails'].dtype

CategoricalDtype(categories=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0], ordered=False)

In [108]:
type(df_train['cat_emails'])

pandas.core.series.Series

In [372]:
LeaveOneOutEncoder?

[0;31mInit signature:[0m
[0mLeaveOneOutEncoder[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcols[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdrop_invariant[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreturn_df[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhandle_unknown[0m[0;34m=[0m[0;34m'value'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhandle_missing[0m[0;34m=[0m[0;34m'value'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msigma[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Leave one out coding for categorical features.

This is very similar to target encoding but excludes the current row's
target when calculating the mean target for a level 

In [134]:
X = df_train.drop(columns=['accepted'])
y = df_train['accepted'] == 'yes'
y = y.astype('int')

In [135]:
X['cat_emails'].unique()

array([2.0, 0.0, 1.0, 3.0, 5.0, 4.0], dtype=object)

In [136]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1031 entries, 8550AB469CB2445 to 358EEC4160A4478
Data columns (total 16 columns):
offer_class           1031 non-null object
gender                1031 non-null object
age                   688 non-null float64
phone_calls           1029 non-null float64
emails                1029 non-null float64
customer_code         1015 non-null object
salary                1024 non-null float64
offer_code            1029 non-null object
customer_type         1029 non-null object
number                1025 non-null object
offer_value           1021 non-null float64
estimated_expenses    1028 non-null float64
center                1029 non-null object
budget_status         1011 non-null float64
cat_emails            1031 non-null object
cat_phone_calls       1031 non-null object
dtypes: float64(7), object(9)
memory usage: 176.9+ KB


In [138]:
X['cat_emails'].value_counts()

1.0    461
0.0    383
2.0    111
3.0     56
5.0     12
4.0      8
Name: cat_emails, dtype: int64

In [None]:
def LeaveOneOut(row, col, response_col):
    

In [195]:
np.random.normal(loc=1, scale=0.05, size=1)

array([1.02483571])

In [175]:
np.random.seed

<function RandomState.seed>

In [148]:
np.random.normal(loc=1, scale=0.05, size=1)

array([1.01083141])

In [172]:
X_elements = list(range(X.shape[0]))

In [216]:
column = df.columns.to_list().index('offer_class')

In [232]:
df_train['target'] = df_train['accepted'] == 'yes'

In [236]:
df_train['target'] = df_train['target'].astype(int)

In [246]:
assert not df_train['offer_class'].isna().sum(), 'offer_class is empty, change your choice inside function'

In [268]:
play = df_train[['cat_emails', 'target','offer_class']].groupby(by=['cat_emails', 'target']).count().reset_index().rename(columns={'offer_class':'response'})

In [318]:
play[(play['cat_emails'] == 0) & (play['target'] == 0)]['response'][0]

242

In [275]:
play

Unnamed: 0,cat_emails,target,response
0,0.0,0,242
1,0.0,1,141
2,1.0,0,301
3,1.0,1,160
4,2.0,0,55
5,2.0,1,56
6,3.0,0,27
7,3.0,1,29
8,4.0,0,4
9,4.0,1,4


In [369]:
for i in df_train.index:
    df_train.at[i, 'target'], df_train.at[i, '']

In [364]:
df_train.at[0]

ValueError: At based indexing on an non-integer index can only have non-integer indexers

In [293]:
def add(row):
    return row.

In [348]:
#??LeaveOneOutEncoder

In [351]:
from sklearn.datasets import load_boston
bunch = load_boston()
y = bunch.target
X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
#enc = LeaveOneOutEncoder(cols=['CHAS', 'RAD']).fit(X, y)
#numeric_dataset = enc.transform(X)
#print(numeric_dataset.info())

In [356]:
#bunch.target

In [276]:
# LeaveOneOut class

In [274]:
#train
#test inne zachowania dla obu zbiorow

In [194]:
np.random.seed(seed=42)

#### LeaveOneOut

In [381]:
df_train.head(1)

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,budget_status,cat_emails,cat_phone_calls,target,temp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
8550AB469CB2445,no,Premium,female,,1.0,2.0,2678,1524.58,8CA,C,0F2A2,303.973257,7207.0,B,-18.693816,2,1,0,403.973


In [516]:
temp_df = df_train[['cat_emails', 'offer_class', 'target']]

In [517]:
groups = temp_df.groupby(['cat_emails', 'target']).count().reset_index().rename(columns={'offer_class': 'count'})

In [518]:
groups

Unnamed: 0,cat_emails,target,count
0,0.0,0,242
1,0.0,1,141
2,1.0,0,301
3,1.0,1,160
4,2.0,0,55
5,2.0,1,56
6,3.0,0,27
7,3.0,1,29
8,4.0,0,4
9,4.0,1,4


In [480]:
df_temp = df_train.tail(600)
temp_df = df_temp[['cat_emails', 'offer_class', 'target']]
groups = temp_df.groupby(['cat_emails', 'target']).count().reset_index().rename(columns={'offer_class': 'count'})

In [481]:
groups

Unnamed: 0,cat_emails,target,count
0,0.0,0,135
1,0.0,1,87
2,1.0,0,173
3,1.0,1,87
4,2.0,0,37
5,2.0,1,40
6,3.0,0,11
7,3.0,1,15
8,4.0,0,4
9,4.0,1,4


In [501]:
groups.loc[groups['cat_emails'] == 0]['count'].sum()

222

In [508]:
test_slownik = {'target':[0,1,1,0],
                'category' : ['a', 'a', 'a', 'a']}
df = pd.DataFrame(test_slownik)

In [509]:
df

Unnamed: 0,target,category
0,0,a
1,1,a
2,1,a
3,0,a


In [843]:
import pandas as pd
from collections import defaultdict
from typing import List 


class CategoricalEncoders:
    def __init__(self,
                 train_df: pd.DataFrame, 
                 test_df: pd.DataFrame,
                 random_state: int, 
                 columns_to_encode: List,
                 target_column: str):
        self.train_df = train_df
        self.test_df = test_df
        self.random_state = random_state
        self.columns_to_encode = columns_to_encode
        self.target = target_column

class LeaveOneOutEncoder(CategoricalEncoders):
    """
    Simple implementation of leave one out encoding of categorical columns.
    
    Warning! you must deal with missing values on your own, before using this class.
    """
    def __init__(self,
                 train_df: pd.DataFrame, 
                 test_df: pd.DataFrame,
                 random_state: int, 
                 groups_helper: str,
                 columns_to_encode: List,
                 target_column: str):
        CategoricalEncoders.__init__(self, train_df, test_dt, random_state, columns_to_encode_target_column)
        self.groups_helper = groups_helper

        
        cond1 = self.train_df[groups_helper].isna().sum()
        cond2 = self.test_df[groups_helper].isna().sum()
    
        if cond1 or cond2:
            raise ValueError('groups_helper column of df should not have any NaNs, deal with them and try again')
    
    @staticmethod
    def _make_groups_function(df,
                              groups_helper,
                              columns_to_encode,
                              target_column):
        """
        """
        groups = defaultdict()
        similarity_flag = False
        for col in columns_to_encode:  
            
            if col is groups_helper:
                temp = groups_helper
                groups_helper = groups_helper+'_'
                df[groups_helper] = df[temp]
                similarity_flag = True
                
            mask = [groups_helper, col, target_column] 
            temp_df = df[mask]
            groups[col] = (temp_df
                           .groupby([col, target_column])
                           .count()
                           .reset_index()
                           .rename(columns={groups_helper: 'count'}))
            
            if similarity_flag:
                similarity_flag = False
                df.drop(columns=[groups_helper], inplace=True)
                groups_helper = temp
                
            
        return groups
    
    def _make_groups(self):
        """
        """
        groups = self._make_groups_function(df = self.train_df, 
                           groups_helper = self.groups_helper, 
                           columns_to_encode = self.columns_to_encode, 
                           target_column = self.target)
        return groups
        
    
    def _loo_train(self):
        """
        """
        df = self.train_df.copy(deep=True)
        groups = self._make_groups()
        for col in self.columns_to_encode:
            transformed_rows = []
            np.random.seed(seed=self.random_state)
            group = groups[col]

            for row in df.index:
                row = df.loc[row]
                target = row[self.target]
                column_class = row[col]
                mean_numerator = group[(group[col] == column_class) & (group[self.target] == 1)]['count'].reset_index().loc[:, 'count'].values[0]
                if target:
                    mean_numerator -= 1
                mean_denominator = group.loc[group[col] == column_class]['count'].sum()-1
                random_number = np.random.normal(loc=1, scale=0.05, size=1)[0]
                mean_response = mean_numerator/mean_denominator
                transformed_row = mean_response*random_number
                transformed_rows.append(transformed_row)

            df['encoded_'+col] = transformed_rows
        return df
    
    def _loo_test(self):
        """
        """
        df = self.test_df.copy(deep=True)
        groups = self._make_groups()
        
        def test_encoding(row, groups, col):
            """
            """
            temp = groups[groups[col] == row]
            numerator = temp[temp['target'] == 1].reset_index().loc[:, 'count']
            denominator = temp['count'].sum()
            return numerator/denominator
        
        for col in self.columns_to_encode:
            df['encoded_'+col] = df[col].apply(lambda row: test_encoding(row, groups[col], col))
            
        return df  
    
    def fit(self):
        fitted_train = self._loo_train()
        fitted_test = self._loo_test()
        return fitted_train, fitted_test

In [786]:
group[(group['customer_type'] == 'C') & (group['target'] == 1)]['count'].reset_index().loc[0, 'count']

127

In [787]:
df_help = df_train.head(3)

In [788]:
for row in df_help.index:
    row = df_help.loc[row]
    print(row['customer_type'])

C
S
Q


In [789]:
groups.loc[groups['cat_emails'] == 0]

Unnamed: 0,cat_emails,target,count
0,0.0,0,242
1,0.0,1,141


In [790]:
df_t = df_train[['customer_type', 'target', 'offer_class']]

In [735]:
#train_df

In [736]:
group = (df_t
               .groupby(['customer_type', 'target'])
               .count()
               .reset_index()
               .rename(columns={'offer_class': 'count'}))

In [696]:
group

Unnamed: 0,customer_type,target,count
0,C,0,94
1,C,1,127
2,Q,0,61
3,Q,1,36
4,S,0,485
5,S,1,226


In [641]:
df_train.columns

Index(['accepted', 'offer_class', 'gender', 'age', 'phone_calls', 'emails',
       'customer_code', 'salary', 'offer_code', 'customer_type', 'number',
       'offer_value', 'estimated_expenses', 'center', 'budget_status',
       'cat_emails', 'cat_phone_calls', 'target', 'temp', 'new_col'],
      dtype='object')

In [652]:
test_df['cat_emails'] = test_df['emails']

In [632]:
df_train.loc['8550AB469CB2445']

accepted                    no
offer_class            Premium
gender                  female
age                        NaN
phone_calls                  1
emails                       2
customer_code             2678
salary                 1524.58
offer_code                 8CA
customer_type                C
number                   0F2A2
offer_value            303.973
estimated_expenses        7207
center                       B
budget_status         -18.6938
cat_emails                   2
cat_phone_calls              1
target                       0
temp                   403.973
new_col               0.144872
Name: 8550AB469CB2445, dtype: object

In [761]:
df_train = df_train.dropna(subset=['customer_type'])

In [784]:
test_df.isnull().sum()

offer_class            0
gender                 0
age                   69
phone_calls            1
emails                 0
customer_code          8
salary                 1
offer_code             0
customer_type          0
number                 3
offer_value            2
estimated_expenses     0
center                 1
cat_emails             0
dtype: int64

In [998]:
df_train.loc[:,['offer_class', 'cat_emails', 'customer_type']] = df_train.loc[:, ['offer_class', 'cat_emails', 'customer_type']].astype(str)
test_df.loc[:, ['offer_class', 'cat_emails', 'customer_type']] = test_df.loc[:, ['offer_class', 'cat_emails', 'customer_type']].astype(str)

In [999]:
enc = LeaveOneOutEncoder(train_df = df_train, test_df = test_df, columns_to_encode = ['offer_class', 'cat_emails', 'customer_type'], target_column = 'target', random_state = 42, mean=1, std=0.05)

In [1000]:
test_df.shape

(258, 14)

In [1001]:
df_train.shape

(1029, 20)

In [1002]:
train_df['customer_type'].isna().sum()

2

In [1003]:
train, test = enc.fit()

In [1008]:
#train[train['customer_type'] == 'S']['encoded_offer_class']

In [1009]:
#train

In [1010]:
#test

In [890]:
df_train[['cat_emails', 'target']].groupby(['cat_emails', 'target']).size().reset_index().rename(columns={0:'size'})

Unnamed: 0,cat_emails,target,size
0,0.0,0,242
1,0.0,1,139
2,1.0,0,301
3,1.0,1,160
4,2.0,0,55
5,2.0,1,56
6,3.0,0,27
7,3.0,1,29
8,4.0,0,4
9,4.0,1,4


In [901]:
import pandas as pd
from collections import defaultdict
from typing import List 

class CategoricalEncoders:
    def __init__(self,
                 train_df: pd.DataFrame, 
                 test_df: pd.DataFrame,
                 random_state: int, 
                 columns_to_encode: List,
                 target_column: str):
        self.train_df = train_df
        self.test_df = test_df
        self.random_state = random_state
        self.columns_to_encode = columns_to_encode
        self.target = target_column
        
class LeaveOneOutEncoder(CategoricalEncoders):
    """
    Simple implementation of leave one out encoding of categorical columns.    
    Warning! you must deal with missing values on your own, before using this class.
    """
   
    @staticmethod
    def _make_groups_function(df,
                              columns_to_encode,
                              target_column):
        """
        """
        groups = defaultdict()
        for col in columns_to_encode: 
            mask = [col, target_column] 
            temp_df = df[mask]
            groups[col] = (temp_df
                           .groupby(mask)
                           .size()
                           .reset_index()
                           .rename(columns={0: 'size'}))            
        return groups
    
    @staticmethod
    def _test_encoding(row, groups, col):
        """
        """
        temp = groups[groups[col] == row]
        numerator = temp[temp['target'] == 1].reset_index().loc[:, 'size']
        denominator = temp['size'].sum()
        return numerator/denominator
    
    def _groups(self):
        """
        """
        groups = self._make_groups_function(df = self.train_df, 
                           columns_to_encode = self.columns_to_encode, 
                           target_column = self.target)
        return groups
        
    
    def _loo_train(self):
        """
        """
        df = self.train_df.copy(deep=True)
        groups = self._groups()
        for col in self.columns_to_encode:
            transformed_rows = []
            np.random.seed(seed=self.random_state)
            group = groups[col]

            for row in df.index:
                row = df.loc[row]
                target = row[self.target]
                column_class = row[col]
                mean_numerator = group[(group[col] == column_class) & (group[self.target] == 1)]['size'].reset_index().loc[:, 'size'].values[0]
                if target:
                    mean_numerator -= 1
                mean_denominator = group.loc[group[col] == column_class]['size'].sum()-1
                random_number = np.random.normal(loc=1, scale=0.05, size=1)[0]
                mean_response = mean_numerator/mean_denominator
                transformed_row = mean_response*random_number
                transformed_rows.append(transformed_row)

            df['encoded_'+col] = transformed_rows
        return df
    
    def _loo_test(self):
        """
        """
        df = self.test_df.copy(deep=True)
        groups = self._groups()
        
        for col in self.columns_to_encode:
            df['encoded_'+col] = df[col].apply(lambda row: self._test_encoding(row, groups[col], col))
            
        return df  
    
    def fit(self):
        fitted_train = self._loo_train()
        fitted_test = self._loo_test()
        return fitted_train, fitted_test

In [923]:
from collections import defaultdict
from typing import List, Tuple

import numpy as np
import pandas as pd


class CategoricalEncoders:
    """
    Superclass for all categorical encoders possibly implemented here
    """

    def __init__(self,
                 train_df: pd.DataFrame,
                 test_df: pd.DataFrame,
                 columns_to_encode: List[str],
                 target_column: str) -> None:
        """
        :param train_df: training DataFrame
        :param test_df:  testing DataFrame
        :param columns_to_encode: labels of the categorical columns to encode
        :param target_column: label of the target column
        """
        self.train_df = train_df
        self.test_df = test_df
        self.columns_to_encode = columns_to_encode
        self.target = target_column


class LeaveOneOutEncoder(CategoricalEncoders):
    """
    Simple implementation of leave one out encoding of categorical columns.
    Warning! you must deal with missing values on your own, before using this class.

    how it works?



    """

    def __init__(self,
                 train_df: pd.DataFrame,
                 test_df: pd.DataFrame,
                 columns_to_encode: List[str],
                 target_column: str,
                 random_state: int,
                 mean: float,
                 std: float) -> None:
        """
        :param random_state: random state for normal distribution numbers generator
        :param mean: mean for normal distribution numbers generator
        :param std: std for normal distribution numbers generator
        """
        CategoricalEncoders.__init__(self,
                                     train_df=train_df,
                                     test_df=test_df,
                                     columns_to_encode=columns_to_encode,
                                     target_column=target_column)
        self.random_state = random_state
        self.mean = mean
        self.std = std

    @staticmethod
    def _make_groups_function(df: pd.DataFrame,
                              columns_to_encode: List[str],
                              target_column: str) -> pd.DataFrame:
        """

        """
        groups = defaultdict()
        for col in columns_to_encode:
            mask = [col, target_column]
            temp_df = df[mask]
            groups[col] = (temp_df
                           .groupby(mask)
                           .size()
                           .reset_index()
                           .rename(columns={0: 'size'}))
        return groups

    @staticmethod
    def _test_encoding(row: str, groups: pd.DataFrame, col: str) -> float:
        """
        """
        temp = groups[groups[col] == row]
        numerator = temp[temp['target'] == 1].reset_index().loc[:, 'size']
        denominator = temp['size'].sum()
        return numerator / denominator

    def _groups(self) -> pd.DataFrame:
        """
        """
        groups = self._make_groups_function(df=self.train_df,
                                            columns_to_encode=self.columns_to_encode,
                                            target_column=self.target)
        return groups

    def _loo_train(self):
        """
        """
        df = self.train_df.copy(deep=True)
        groups = self._groups()
        for col in self.columns_to_encode:
            transformed_rows = []
            np.random.seed(seed=self.random_state)
            group = groups[col]

            for row in df.index:
                row = df.loc[row]
                target = row[self.target]
                column_class = row[col]
                mean_numerator = (
                    group[(group[col] == column_class) & (group[self.target] == 1)]['size']
                    .reset_index()
                    .loc[:, 'size']
                    .values[0]
                )
                if target:
                    mean_numerator -= 1
                mean_denominator = group.loc[group[col] == column_class]['size'].sum() - 1
                random_number = np.random.normal(loc=self.mean,
                                                 scale=self.std,
                                                 size=1)[0]
                mean_response = mean_numerator / mean_denominator
                transformed_row = mean_response * random_number
                transformed_rows.append(transformed_row)

            df['encoded_' + col] = transformed_rows
        return df

    def _loo_test(self) -> pd.DataFrame:
        """
        """
        df = self.test_df.copy(deep=True)
        groups = self._groups()

        for col in self.columns_to_encode:
            df['encoded_' + col] = df[col].apply(
                lambda row: self._test_encoding(row, groups[col], col))

        return df

    def fit(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """

        :return:
        """
        fitted_train = self._loo_train()
        fitted_test = self._loo_test()
        return fitted_train, fitted_test


In [973]:
from collections import defaultdict
from typing import List, Tuple

import numpy as np
import pandas as pd


class CategoricalEncoders:
    """
    Superclass for all categorical encoders possibly implemented here
    """

    def __init__(self,
                 train_df: pd.DataFrame,
                 test_df: pd.DataFrame,
                 columns_to_encode: List[str],
                 target_column: str) -> None:
        """
        :param train_df: training DataFrame
        :param test_df:  testing DataFrame
        :param columns_to_encode: labels of the categorical columns to encode
        :param target_column: label of the target column
        """

        self.train_df = train_df
        self.test_df = test_df
        self.columns_to_encode = columns_to_encode
        self.target = target_column


class LeaveOneOutEncoder(CategoricalEncoders):
    """
    Simple implementation of leave one out encoding of categorical columns.
    Written for binary classification.
    Warning! you must deal with missing values on your own, before using this class.

    how it works?

    Example:

    TRAIN
        user	target	encoded_user
    0	a	    0	    0.683224
    1	a	    1	    0.331029
    2	a	    1	    0.344128
    3	a	    0	    0.717434

    TEST
        user	encoded_user
    0	a	    0.5
    1	a	    0.5

    look at wacax post for explanation, based on Owen Zhang idea.
    1) https://www.kaggle.com/c/caterpillar-tube-pricing/discussion/15748
    2) https://datascience.stackexchange.com/questions/10839/
    what-is-difference-between-one-hot-encoding-and-leave-one-out-encoding
    """

    def __init__(self,
                 train_df: pd.DataFrame,
                 test_df: pd.DataFrame,
                 columns_to_encode: List[str],
                 target_column: str,
                 random_state: int,
                 mean: float,
                 std: float) -> None:
        """
        :param random_state: random state for normal distribution numbers generator
        :param mean: mean for normal distribution numbers generator
        :param std: std for normal distribution numbers generator
        """
        CategoricalEncoders.__init__(self,
                                     train_df=train_df,
                                     test_df=test_df,
                                     columns_to_encode=columns_to_encode,
                                     target_column=target_column)
        self.random_state = random_state
        self.mean = mean
        self.std = std

    @staticmethod
    def _make_groups(df: pd.DataFrame,
                     columns_to_encode: List[str],
                     target_column: str) -> defaultdict:
        """
        internal method for making groups table (how many people responsed positively/negatively
        inside of categories
        """
        groups = defaultdict()
        for col in columns_to_encode:
            mask = [col, target_column]
            temp_df = df[mask]
            groups[col] = (temp_df
                           .groupby(mask)
                           .size()
                           .reset_index()
                           .rename(columns={0: 'size'}))
        return groups

    @staticmethod
    def _test_encoding(row: str, groups: pd.DataFrame, col: str) -> float:
        """
        internal method for encoding test set (function to further use in pd.apply)
        """
        temp = groups[groups[col] == row]
        numerator = temp[temp['target'] == 1].reset_index().loc[:, 'size']
        denominator = temp['size'].sum()
        return numerator / denominator

    def _loo_train(self):
        """
        internal method
        performs loo on train set (we have response (target) column)
        """
        df = self.train_df.copy(deep=True)
        groups = self._make_groups(df=self.train_df,
                                   columns_to_encode=self.columns_to_encode,
                                   target_column=self.target)
        for col in self.columns_to_encode:
            transformed_rows = []
            np.random.seed(seed=self.random_state)
            group = groups[col]

            for row in df.index:
                row = df.loc[row]
                target = row[self.target]
                column_class = row[col]
                mean_numerator = (
                    group[(group[col] == column_class) & (group[self.target] == 1)]['size']
                    .reset_index()
                    .loc[:, 'size']
                    .values[0]
                )
                if target:
                    mean_numerator -= 1
                mean_denominator = group.loc[group[col] == column_class]['size'].sum() - 1
                random_number = np.random.normal(loc=self.mean,
                                                 scale=self.std,
                                                 size=1)[0]
                mean_response = mean_numerator / mean_denominator
                transformed_row = mean_response * random_number
                transformed_rows.append(transformed_row)

            df['encoded_' + col] = transformed_rows
        return df

    def _loo_test(self) -> pd.DataFrame:
        """
        internal method
        performs loo on train set (we don't have response (target) column)
        """
        df = self.test_df.copy(deep=True)
        groups = self._make_groups(df=self.train_df,
                                   columns_to_encode=self.columns_to_encode,
                                   target_column=self.target)

        for col in self.columns_to_encode:
            df['encoded_' + col] = (df[col]
                                    .apply(lambda row: self._test_encoding(row, groups[col], col)))

        return df

    def fit(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        :return: Returns DataFrames with encoded chosen categorical columns inside both train
        and test sets
        """
        fitted_train = self._loo_train()
        fitted_test = self._loo_test()
        return fitted_train, fitted_test


In [1013]:
debug_df_train = pd.DataFrame({'user':['a','a','a','a'],
                               'target':[0,1,1,0]})
debug_df_test = pd.DataFrame({'user':['a','a']})

In [975]:
debug_df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
user      4 non-null object
target    4 non-null int64
dtypes: int64(1), object(1)
memory usage: 192.0+ bytes


In [1014]:
enc = LeaveOneOutEncoder(train_df=debug_df_train,
                         test_df=debug_df_test,
                         columns_to_encode=['user'],
                         target_column='target',
                         random_state=42, 
                         mean=1,
                         std=0.05)

In [1015]:
X_1, X_t = enc.fit()

In [1016]:
X_1

Unnamed: 0,user,target,encoded_user
0,a,0,0.683224
1,a,1,0.331029
2,a,1,0.344128
3,a,0,0.717434


In [1017]:
X_t

Unnamed: 0,user,encoded_user
0,a,0.5
1,a,0.5


In [796]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1029 entries, 8550AB469CB2445 to 358EEC4160A4478
Data columns (total 23 columns):
accepted                 1029 non-null object
offer_class              1029 non-null object
gender                   1029 non-null object
age                      686 non-null float64
phone_calls              1027 non-null float64
emails                   1027 non-null float64
customer_code            1013 non-null object
salary                   1022 non-null float64
offer_code               1027 non-null object
customer_type            1029 non-null object
number                   1023 non-null object
offer_value              1019 non-null float64
estimated_expenses       1026 non-null float64
center                   1027 non-null object
budget_status            1009 non-null float64
cat_emails               1029 non-null object
cat_phone_calls          1029 non-null object
target                   1029 non-null int64
temp                     1019 non-null o

In [797]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 258 entries, DA3BAEB8BF604EB to 2E27EC78F50943B
Data columns (total 17 columns):
offer_class              258 non-null object
gender                   258 non-null object
age                      189 non-null float64
phone_calls              257 non-null float64
emails                   258 non-null float64
customer_code            250 non-null object
salary                   257 non-null float64
offer_code               258 non-null object
customer_type            258 non-null object
number                   255 non-null object
offer_value              256 non-null float64
estimated_expenses       258 non-null float64
center                   257 non-null object
cat_emails               258 non-null float64
encoded_offer_class      258 non-null float64
encoded_cat_emails       257 non-null float64
encoded_customer_type    258 non-null float64
dtypes: float64(10), object(7)
memory usage: 36.3+ KB


In [4]:
for i in range(3):
    print(i)

TypeError: unsupported operand type(s) for +: 'range' and 'int'

In [801]:
test[test['cat_emails']>4]

Unnamed: 0_level_0,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,cat_emails,encoded_offer_class,encoded_cat_emails,encoded_customer_type
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
96F665BA58B24F8,Premium,female,,1.0,9.0,CA. 2343,6955.0,960,S,,215.335395,7038.0,A,9.0,0.256637,,0.317862


In [771]:
test.head()

Unnamed: 0_level_0,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,cat_emails,encoded_offer_class,encoded_cat_emails,encoded_customer_type
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
DA3BAEB8BF604EB,Premium,male,,0.0,1.0,Fa 265302,731.25,99F,S,D6706,143.108044,7638.0,A,1.0,0.256637,0.347072,0.317862
AEF3DE08DFED4E0,Premium,male,31.0,0.0,0.0,347063,777.5,405,S,,63.637416,5784.0,A,0.0,0.256637,0.364829,0.317862
B621BB29484E46D,Premium,male,25.0,0.0,1.0,349250,789.58,954,S,7E9CB,210.204552,4605.0,B,1.0,0.256637,0.347072,0.317862
2D0945802F92423,Premium,female,18.0,0.0,1.0,365226,675.0,D7D,Q,6361F,131.110924,4459.0,B,1.0,0.256637,0.347072,0.371134
640ABFC7E49B403,Medium,male,56.0,0.0,1.0,17764,3069.58,BAF,C,63C3B,180.019199,4396.0,B,1.0,0.615385,0.347072,0.574661


In [782]:
test[test['cat_emails'].isna()]

Unnamed: 0_level_0,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,cat_emails,encoded_offer_class,encoded_cat_emails,encoded_customer_type
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1


In [772]:
train.head()

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,budget_status,cat_emails,cat_phone_calls,target,temp,new_col,encoded_offer_class,encoded_cat_emails,encoded_customer_type
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
8550AB469CB2445,no,Premium,female,,1.0,2.0,2678,1524.58,8CA,C,0F2A2,303.973257,7207.0,B,-18.693816,2,1,0,403.973,0.144872,0.761358,0.503101,0.433226
07355EE27DD1493,no,High,male,32.0,0.0,0.0,244360,1300.0,9DD,S,5F15A,64.075055,3675.0,A,-37.065906,0,0,0,164.075,0.626529,0.570105,0.629826,0.676977
034E73A251554F0,yes,Premium,female,,1.0,1.0,370365,1550.0,517,Q,AC578,108.58175,4750.0,B,-29.470882,1,1,1,208.582,0.42971,0.263588,0.356846,0.37639
0AF961B4AC7A439,no,Premium,male,,0.0,1.0,2652,722.92,9CB,C,8A8CC,155.495957,1401.0,A,-4.360756,1,0,0,255.496,0.845145,0.799481,0.701838,0.454919
8535BBCA690A4AE,yes,Premium,male,,0.0,1.0,1601,5649.58,6FF,S,42EFB,121.371033,2106.0,B,29.196258,1,0,1,221.371,0.411357,0.25233,0.341605,0.313191


In [14]:
from pathlib import Path
from typing import Tuple
from collections import defaultdict

import pandas as pd
from sklearn.model_selection import train_test_split

In [28]:
class DataManager:
    """
    helps us with managing our raw/train/test DataFrames:
    facilitates us:
    - load raw data,
    - create train and test DataFrames,
    - read train and test DataFrames
    """

    def __init__(self,
                 local_path: Path = Path('/Users/mjasiecz/PycharmProjects/'),
                 project_path: Path = Path('new_offer_success_predictor/data/raw/'),
                 filename: str = 'client_database',
                 suffix: str = '.parquet',
                 csv_suffix: str = '.csv') -> None:
        """
        :param local_path: path to the local folder, you probably will need to change at least
         mjasiecz part
        :param project_path: path to project raw data
        :param filename: name of the file with raw data
        :param suffix: suffix of the file with raw data
        :param csv_suffix: .csv suffix
        """

        self.local_path = local_path
        self.project_path = project_path
        self.filename = filename
        self.suffix = suffix
        self.csv_suffix = csv_suffix

    def load_data(self) -> pd.DataFrame:

        data_path = (self.local_path
                     .joinpath(self.project_path)
                     .joinpath(Path(self.filename))
                     .with_suffix(self.suffix))

        df = pd.read_parquet(data_path, engine='pyarrow')
        # deletes empty (and not useful) rows from DataFrame
        df = df[df['accepted'].notna()].set_index('name')

        return df

    def _train_test_paths(self) -> Tuple[Path, Path]:
        """
        internal method for creating paths for train and test DataFrames
        """
        train_test_paths = {item:self.local_path
                         .joinpath(self.project_path)
                         .joinpath(item)
                         .with_suffix(self.csv_suffix) for item in ['train', 'test']}
        
        return train_test_paths['train'], train_test_paths['test']

In [26]:
DM = DataManager()

In [27]:
DM._train_test_paths()

(PosixPath('/Users/mjasiecz/PycharmProjects/new_offer_success_predictor/data/raw/train.csv'),
 PosixPath('/Users/mjasiecz/PycharmProjects/new_offer_success_predictor/data/raw/test.csv'))

In [17]:
_train_test_paths

NameError: name '_train_test_paths' is not defined

In [588]:
def test_encoding(row):
    temp = groups[groups['cat_emails'] == row]
    numerator = temp[temp['target'] == 1].reset_index().loc[:, 'count']
    denominator = temp['count'].sum()
    return numerator/denominator

In [587]:
test_df['emails'].apply(lambda row: test_encoding(row))[0].value_counts()

0.368146    120
0.347072    110
0.504505     21
0.517857      4
0.500000      2
Name: 0, dtype: int64

In [533]:
'a ' is 'a '

True

In [528]:
df_train.columns

Index(['accepted', 'offer_class', 'gender', 'age', 'phone_calls', 'emails',
       'customer_code', 'salary', 'offer_code', 'customer_type', 'number',
       'offer_value', 'estimated_expenses', 'center', 'budget_status',
       'cat_emails', 'cat_phone_calls', 'target', 'temp', 'new_col'],
      dtype='object')

In [532]:
df_train['offer_class'].isna().sum()

0

In [540]:
groups[groups['cat_emails'] == 1]['count'].sum()

461

In [541]:
a = groups[groups['cat_emails'] == 1]

In [545]:
a[a['target'] == 1]['target']

3    1
Name: target, dtype: int64

In [546]:
a[a['target'] == 1]['count']/a['count'].sum()

3    0.347072
Name: count, dtype: float64

In [None]:
groups['']

In [550]:
test_df.loc['E000C782D2EA42B']

offer_class              High
gender                 female
age                        54
phone_calls                 1
emails                      4
customer_code           29105
salary                   2300
offer_code                7C6
customer_type               S
number                  4C6B9
offer_value           83.1943
estimated_expenses       1371
center                      B
Name: E000C782D2EA42B, dtype: object

In [572]:
temp = groups[groups['cat_emails'] == 0]
numerator = temp[temp['target'] == 1].reset_index().loc[0, 'count']
denominator = temp['count'].sum()

In [573]:
numerator

141

In [567]:
numerator

141

In [581]:
def test_encoding(row):
    temp = groups[groups['cat_emails'] == row]
    numerator = temp[temp['target'] == 1].reset_index().loc[:, 'count']
    denominator = temp['count'].sum()
    return numerator/denominator

In [582]:
test_df['emails'].isna().sum()

0

In [586]:
groups

Unnamed: 0,cat_emails,target,count
0,0.0,0,242
1,0.0,1,141
2,1.0,0,301
3,1.0,1,160
4,2.0,0,55
5,2.0,1,56
6,3.0,0,27
7,3.0,1,29
8,4.0,0,4
9,4.0,1,4


In [585]:
test_df['emails'].apply(lambda row: test_encoding(row))[0].value_counts()

0.368146    120
0.347072    110
0.504505     21
0.517857      4
0.500000      2
Name: 0, dtype: int64

In [None]:
def leave_one_out(df, columns, target_col, random_state=random_state) -> pd.Series:
    for col in columns:
        np.random.seed(seed=random_state)
        subset = [col, target_col, ]
        df_temp = df[columns]
    

In [669]:
transformed_rows = []
np.random.seed(seed=42)
for row_index in df_train.index:
    row = df_train.loc[row_index]
    target = row['target']
    column_class = row['cat_emails']
    mean_numerator = groups[(groups['cat_emails'] == column_class) & (groups['target'] == target)]['count'].reset_index().loc[0, 'count']-1
    mean_denominator = groups.loc[groups['cat_emails'] == 0]['count'].sum()-1
    random_number = np.random.normal(loc=1, scale=0.05, size=1)[0]
    mean_response = mean_numerator/mean_denominator
    transformed_row = mean_response*random_number
    transformed_rows.append(transformed_row)
df_train['new_col'] = transformed_rows

In [673]:
groups[(groups['cat_emails'] == column_class) & (groups['target'] == target)]['count'].reset_index()

Unnamed: 0,index,count
0,2,301


In [671]:
groups.loc[groups['cat_emails'] == 0]

Unnamed: 0,cat_emails,target,count
0,0.0,0,242
1,0.0,1,141


In [671]:
groups.loc[groups['cat_emails'] == 0]

Unnamed: 0,cat_emails,target,count
0,0.0,0,242
1,0.0,1,141


In [524]:
#col_series

In [670]:
#df_train

In [490]:
leave_one_out

Unnamed: 0,count
0,173


In [459]:
column_class

1.0

In [462]:
groups[(groups['cat_emails'] == 0) & (groups['target'] == target)]['count'][0]-1

241

In [346]:
def leave_one_out(row, groups): #wrocic do pisania jej
    groups = groups.copy(deep=True)
    flag = groups[(groups['cat_emails'] == row['cat_emails']) & (groups['target'] == row['target'])]['response'][0]-1
    if row['target']:
        response = groups[(groups['cat_emails'] == row['cat_emails']) & (groups['target'] == 0)]
        mean_response = response/(flag+response)
    else:
        no_response = groups[(groups['cat_emails'] == row['cat_emails']) & (groups['target'] == 1)]
        mean_response = flag/(no_response+flag)
    #mean_response = response/(no_response+response)
    random_number = np.random.normal(loc=1, scale=0.05, size=1)
    transformed_row = mean_response*random_number
    #return transformed_row
    return flag

In [431]:
#??LeaveOneOutEncoder

In [441]:
y = bunch.target
X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
enc = LeaveOneOutEncoder(cols=['CHAS', 'RAD']).fit(X, y)
numeric_dataset = enc.transform(X)

In [442]:
numeric_dataset['RAD'].value_counts()

16.403788    132
25.706957    115
21.387273    110
27.928947     38
20.976923     26
26.833333     24
30.358333     24
24.365000     20
27.105882     17
Name: RAD, dtype: int64

In [443]:
X = df_train.drop(columns=['accepted', 'target'])
y = df_train[['target']]

In [447]:
enc = LeaveOneOutEncoder(cols=['cat_emails', 'cat_phone_calls'], random_state=42, sigma=0.03).fit(X, y) #doesn't add noise, check the library

In [448]:
# 0.504505
# 0.368146

In [449]:
enc.transform(X)

Unnamed: 0_level_0,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,budget_status,cat_emails,cat_phone_calls,temp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
8550AB469CB2445,Premium,female,,1.0,2.0,2678,1524.58,8CA,C,0F2A2,303.973257,7207.0,B,-18.693816,0.504505,0.515748,403.973
07355EE27DD1493,High,male,32.0,0.0,0.0,244360,1300.0,9DD,S,5F15A,64.075055,3675.0,A,-37.065906,0.368146,0.341429,164.075
034E73A251554F0,Premium,female,,1.0,1.0,370365,1550.0,517,Q,AC578,108.58175,4750.0,B,-29.470882,0.347072,0.515748,208.582
0AF961B4AC7A439,Premium,male,,0.0,1.0,2652,722.92,9CB,C,8A8CC,155.495957,1401.0,A,-4.360756,0.347072,0.341429,255.496
8535BBCA690A4AE,Premium,male,,0.0,1.0,1601,5649.58,6FF,S,42EFB,121.371033,2106.0,B,29.196258,0.347072,0.341429,221.371
B8A3C4B5FDE74D2,Premium,male,29.0,0.0,0.0,315082,787.5,DDF,S,5D73B,165.075347,2985.0,B,-13.312103,0.368146,0.341429,265.075
2C797DCB48744EA,Premium,female,,1.0,3.0,C.A. 2315,2057.5,4EB,S,D096A,138.735912,7547.0,B,-39.567982,0.517857,0.515748,238.736
1004379959394F2,Premium,female,18.0,0.0,2.0,2691,1445.42,0.00E+00,C,20148,129.577331,4791.0,B,-25.819177,0.504505,0.341429,229.577
46AACF6692654AF,Premium,male,,0.0,1.0,36865,773.75,69F,Q,8E607,96.275798,5709.0,B,-51.261585,0.347072,0.341429,196.276
E1ED4839E59F4FE,Medium,male,51.0,0.0,1.0,PC 17597,6137.92,2F9,C,AE78D,88.325662,6503.0,A,-4.13334,0.347072,0.341429,188.326


In [304]:
play['cat_emails']

0     0.0
1     0.0
2     1.0
3     1.0
4     2.0
5     2.0
6     3.0
7     3.0
8     4.0
9     4.0
10    5.0
11    5.0
Name: cat_emails, dtype: float64

In [337]:
def add(row):
    return [row['salary'],row['offer_value']]

In [324]:
def add1(row):
    return row+100

In [398]:
#df[['emails', 'phone_calls']].apply(lambda value1, value2: [value1+value2, value3])

In [362]:
lambda x,y: x+y

<function __main__.<lambda>(x, y)>

In [338]:
temp = df_train.apply(lambda row: add(row), axis=1)

In [345]:
type(temp)

pandas.core.series.Series

In [340]:
temp['sec'] = temp.apply(lambda row: row[0]+row[1])

In [344]:
temp.shape

(1032,)

In [309]:
df_train['temp'] = df_train.apply(lambda row: add(row), axis=1)

In [396]:
#df_train[['offer_value', 'cat_emails']].apply(add1)

In [313]:
df_train['temp'] = df_train.apply(lambda row: row['phone_calls']*2, axis=1)

In [330]:
df_train.head()

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,budget_status,cat_emails,cat_phone_calls,target,temp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
8550AB469CB2445,no,Premium,female,,1.0,2.0,2678,1524.58,8CA,C,0F2A2,303.973257,7207.0,B,-18.693816,2,1,0,403.973
07355EE27DD1493,no,High,male,32.0,0.0,0.0,244360,1300.0,9DD,S,5F15A,64.075055,3675.0,A,-37.065906,0,0,0,164.075
034E73A251554F0,yes,Premium,female,,1.0,1.0,370365,1550.0,517,Q,AC578,108.58175,4750.0,B,-29.470882,1,1,1,208.582
0AF961B4AC7A439,no,Premium,male,,0.0,1.0,2652,722.92,9CB,C,8A8CC,155.495957,1401.0,A,-4.360756,1,0,0,255.496
8535BBCA690A4AE,yes,Premium,male,,0.0,1.0,1601,5649.58,6FF,S,42EFB,121.371033,2106.0,B,29.196258,1,0,1,221.371


In [397]:
#df_train['temp'] = df_train.apply(lambda row: leave_one_out(row, play), axis=1)

In [298]:
df_train.head()

Unnamed: 0_level_0,accepted,offer_class,gender,age,phone_calls,emails,customer_code,salary,offer_code,customer_type,number,offer_value,estimated_expenses,center,budget_status,cat_emails,cat_phone_calls,target,temp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
8550AB469CB2445,no,Premium,female,,1.0,2.0,2678,1524.58,8CA,C,0F2A2,303.973257,7207.0,B,-18.693816,2,1,0,[2.0647688538100692]
07355EE27DD1493,no,High,male,32.0,0.0,0.0,244360,1300.0,9DD,S,5F15A,64.075055,3675.0,A,-37.065906,0,0,0,[0.0]
034E73A251554F0,yes,Premium,female,,1.0,1.0,370365,1550.0,517,Q,AC578,108.58175,4750.0,B,-29.470882,1,1,1,[0.9882923312638332]
0AF961B4AC7A439,no,Premium,male,,0.0,1.0,2652,722.92,9CB,C,8A8CC,155.495957,1401.0,A,-4.360756,1,0,0,[0.988293152152541]
8535BBCA690A4AE,yes,Premium,male,,0.0,1.0,1601,5649.58,6FF,S,42EFB,121.371033,2106.0,B,29.196258,1,0,1,[1.0789606407753696]


In [218]:
#X.iloc[X_elements, column]

In [165]:
a = list(range(X.shape[0]))

In [167]:
len(a)

1031

In [168]:
a.remove(0)

In [169]:
len(a)

1030

In [170]:
a.append(0)
a.remove(1)

In [171]:
len(a)

1030