In [5]:
import pandas as pd
import numpy as np
from itertools import product
import os
from sklearn.model_selection import KFold


In [6]:
# Save and Store Macro
%macro -q __importLab2 1
%store __importLab2

Stored '__importLab2' (Macro)


In [7]:
# Load and Execute Macro
%store -r __importLab2
__importLab2

IPython.macro.Macro('import pandas as pd\nimport numpy as np\nfrom itertools import product\nimport os\nfrom sklearn.model_selection import KFold\n')

## Load Dataframe

Assuming data is clean

Dataset 1: https://www.kaggle.com/mashlyn/online-retail-ii-uci

In [8]:
data_folder_path = '../data/external/'

df = pd.read_csv(os.path.join(data_folder_path, 'online_retail_II.csv'))

## Task 1: Aggregate Data by Month

Encode StockCode

Target: Price

Create month

Drop: country, customer ID, invoice, description, quantity

In [9]:
df = df.drop(columns=['Invoice', 'Description','Quantity','Customer ID'])

In [10]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate']).dt.date
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%Y-%m-%d')
df['InvoiceDate'] = df['InvoiceDate'].dt.to_period('M').astype(str)

In [11]:
MonthsSeries = pd.Series(df['InvoiceDate'].unique())
MonthCode = list(MonthsSeries.index)
Months = list(MonthsSeries)

In [12]:
df = df.replace(Months, MonthCode)

In [13]:
df = df.rename(columns = {'InvoiceDate':'MonthCode'})

In [14]:
index_cols = ['Country', 'StockCode', 'MonthCode']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for month_code in df['MonthCode'].unique():
    cur_country = df[df['MonthCode']==month_code]['Country'].unique()
    cur_items = df[df['MonthCode']==month_code]['StockCode'].unique()
    grid.append(np.array(list(product(*[cur_country, cur_items, [month_code]])),dtype='str'))

In [15]:
#turn the grid into pandas dataframe
grid_df = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=str)

In [16]:
gb = df.groupby(index_cols,as_index=False).agg({'Price': 'sum'}).rename(columns = {'Price':'target'}).astype(str)

In [17]:
#join aggregated data to the grid
all_data = pd.merge(grid_df,gb,how='left',on=index_cols).fillna(0)
#sort the data
all_data.sort_values(['MonthCode','Country','StockCode'],inplace=True)

In [18]:
all_data["target"] = pd.to_numeric(all_data["target"])

## Task 2: Mean Encoding Without Regularization

In [19]:
mean_target = np.mean(all_data["target"])

### Task 2a: Method 1

In [20]:
%%time
# Calculate a mapping: {item_id: target_mean}
item_id_target_mean = all_data.groupby('StockCode').target.mean()

# In our non-regularized case we just *map* the computed means to the `item_id`'s
all_data['item_target_enc'] = all_data['StockCode'].map(item_id_target_mean)

# Fill NaNs
all_data['item_target_enc'].fillna(mean_target, inplace=True) 

# Print correlation
encoded_feature = all_data['item_target_enc'].values
print(np.corrcoef(all_data['target'].values, encoded_feature)[0][1])

0.1574018383578094
CPU times: user 1.31 s, sys: 224 ms, total: 1.54 s
Wall time: 4.22 s


### Task 2b: Method 2

In [21]:
%%time
'''
     Differently to `.target.mean()` function `transform` 
   will return a dataframe with an index like in `all_data`.
   Basically this single line of code is equivalent to the first two lines from of Method 1.
'''
all_data['item_target_enc'] = all_data.groupby('StockCode')['target'].transform('mean')

# Fill NaNs
all_data['item_target_enc'].fillna(mean_target, inplace=True) 

# Print correlation
encoded_feature = all_data['item_target_enc'].values
print(np.corrcoef(all_data['target'].values, encoded_feature)[0][1])

0.1574018383578094
CPU times: user 859 ms, sys: 118 ms, total: 977 ms
Wall time: 2.9 s


## Task 3: Mean Encoding With Regularization

### Task 3a: KFold Scheme

In [22]:
%%time
kf = KFold(n_splits=5, shuffle=False)

for index_train, index_valid in kf.split(all_data):
    X_tr, X_val = all_data.iloc[index_train], all_data.iloc[index_valid]
    
    # target coding of valid dataset depends on train dataset
    X_tr_group = X_tr.groupby('StockCode')['target']
    X_val['item_target_enc'] = X_val['StockCode'].map(X_tr_group.mean()) #SettingWithCopyWarning
    
    # copy target encoding back to all_data
    all_data.iloc[index_valid] = X_val
    

all_data['item_target_enc'].fillna(0.3343, inplace=True) 
encoded_feature = all_data['item_target_enc'].values
    

# You will need to compute correlation like that
corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1]
print(corr)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0.13828381120363506
CPU times: user 10.5 s, sys: 1.41 s, total: 11.9 s
Wall time: 30.7 s


## Task3b: Leave-one-out scheme

In [23]:
%%time
# Calculate sum of the target values using all the objects.
target_sum = all_data.groupby('StockCode')['target'].transform('sum')

# Then subtract the target of the given object and divide the resulting value by n_objects - 1.
n_objects = all_data.groupby('StockCode')['target'].transform('count')

all_data['item_target_enc'] = (target_sum - all_data['target']) / (n_objects - 1)
all_data['item_target_enc'].fillna(0.3343, inplace=True)
encoded_feature = all_data['item_target_enc'].values


corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1]
print(corr)

0.1388082978909071
CPU times: user 1.36 s, sys: 129 ms, total: 1.49 s
Wall time: 4.29 s


## Task 3c: Smoothing

In [24]:
%%time
alpha = 100

item_id_target_mean = all_data.groupby('StockCode')['target'].transform('mean')
n_objects = all_data.groupby('StockCode')['target'].transform('count')

all_data['item_target_enc'] = (item_id_target_mean * n_objects + 0.3343*alpha) / (n_objects + alpha)

all_data['item_target_enc'].fillna(0.3343, inplace=True) 
encoded_feature = all_data['item_target_enc'].values


corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1]
print(corr)

0.1554708546055076
CPU times: user 1.48 s, sys: 239 ms, total: 1.72 s
Wall time: 3.79 s


## Task 3d: Expanding Mean Scheme

In [25]:
%%time
cumsum = all_data.groupby('StockCode')['target'].cumsum() - all_data['target']
cumcnt = all_data.groupby('StockCode').cumcount()

all_data['item_target_enc'] = cumsum / cumcnt
all_data['item_target_enc'].fillna(0.3343, inplace=True) 
encoded_feature = all_data['item_target_enc'].values

corr = np.corrcoef(all_data['target'].values, encoded_feature)[0][1]
print(corr)

0.1222981010431047
CPU times: user 1.8 s, sys: 302 ms, total: 2.1 s
Wall time: 5.79 s
