In [2]:
import gc
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import numpy as np
import os
import lightgbm as lgb
import json
import sklearn.metrics
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from scipy.sparse import dok_matrix, coo_matrix
from sklearn.utils.multiclass import  type_of_target
path = "data"

In [3]:
%%time
aisles = pd.read_csv(os.path.join(path, "aisles.csv"), dtype={'aisle_id': np.uint8, 'aisle': 'category'})
departments = pd.read_csv(os.path.join(path, "departments.csv"),
                          dtype={'department_id': np.uint8, 'department': 'category'})
order_prior = pd.read_csv(os.path.join(path, "order_products__prior.csv"), dtype={'order_id': np.uint32,
                                                                                  'product_id': np.uint16,
                                                                                  'add_to_cart_order': np.uint8,
                                                                                  'reordered': bool})

order_train = pd.read_csv(os.path.join(path, "order_products__train.csv"), dtype={'order_id': np.uint32,
                                                                                  'product_id': np.uint16,
                                                                                  'add_to_cart_order': np.uint8,
                                                                                  'reordered': bool})
orders = pd.read_csv(os.path.join(path, "orders.csv"), dtype={'order_id': np.uint32,
                                                              'user_id': np.uint32,
                                                              'eval_set': 'category',
                                                              'order_number': np.uint8,
                                                              'order_dow': np.uint8,
                                                              'order_hour_of_day': np.uint8
                                                              })

product_embeddings = pd.read_pickle('data/product_embeddings.pkl')

CPU times: user 10.4 s, sys: 580 ms, total: 11 s
Wall time: 11 s


In [6]:
product_embeddings.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,0,1,2,3,4,5,...,22,23,24,25,26,27,28,29,30,31
0,1,Chocolate Sandwich Cookies,61,19,-0.274106,-0.346766,-0.492804,0.040195,1.39333,0.857459,...,-0.734671,0.084122,1.307883,1.036936,1.053968,-0.436293,0.449946,-0.599207,-0.566169,0.386086
1,2,All-Seasons Salt,104,13,0.430817,-0.094257,0.289575,0.386656,-0.716073,-0.98302,...,-0.555524,0.555755,0.176689,0.276771,0.67112,0.720933,-0.506837,-0.355288,0.010037,0.199384
2,3,Robust Golden Unsweetened Oolong Tea,94,7,-0.684641,0.156538,0.747337,0.635513,-0.289291,-0.394251,...,0.327167,-0.025734,-0.038228,0.443475,-0.345621,-0.862611,0.881672,0.059684,0.900507,-0.671068
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,0.370838,0.339059,0.911805,-0.451075,0.206791,0.78234,...,-0.158184,0.232721,-0.765808,-0.65554,0.961942,0.795764,-0.670161,-0.714329,-0.42282,0.499329
4,5,Green Chile Anytime Sauce,5,13,-0.602406,-0.448103,0.91245,-0.440221,0.403223,0.052592,...,-0.548407,-0.049812,-0.573244,0.125205,-0.426396,-0.43555,0.808476,-0.183062,-0.743405,0.21561


In [7]:
## item2vec을 임베딩 하기 위한 리스트 : 0 - 31 개니까 32개 리스트만듬
embedings = list(range(32))

In [13]:
product_embeddings[embedings + ['product_id']].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,product_id
0,-0.274106,-0.346766,-0.492804,0.040195,1.39333,0.857459,-0.64134,-0.140168,-0.220764,-0.303222,...,0.084122,1.307883,1.036936,1.053968,-0.436293,0.449946,-0.599207,-0.566169,0.386086,1
1,0.430817,-0.094257,0.289575,0.386656,-0.716073,-0.98302,0.427547,0.626686,-0.706547,0.502924,...,0.555755,0.176689,0.276771,0.67112,0.720933,-0.506837,-0.355288,0.010037,0.199384,2
2,-0.684641,0.156538,0.747337,0.635513,-0.289291,-0.394251,0.872738,-0.55707,-0.402544,0.67799,...,-0.025734,-0.038228,0.443475,-0.345621,-0.862611,0.881672,0.059684,0.900507,-0.671068,3
3,0.370838,0.339059,0.911805,-0.451075,0.206791,0.78234,0.641243,-0.597517,-0.773465,-0.70046,...,0.232721,-0.765808,-0.65554,0.961942,0.795764,-0.670161,-0.714329,-0.42282,0.499329,4
4,-0.602406,-0.448103,0.91245,-0.440221,0.403223,0.052592,-0.482172,0.348995,0.605238,-0.356287,...,-0.049812,-0.573244,0.125205,-0.426396,-0.43555,0.808476,-0.183062,-0.743405,0.21561,5


In [11]:
product_embeddings[embedings + ['product_id']].shape

(49688, 33)

In [12]:
product_embeddings.shape

(49688, 36)

In [14]:
## product_embeddings에서 product_name,aisle_id,department_id 열을 제거한것
product_embeddings = product_embeddings[embedings + ['product_id']]

In [15]:
order_train.head(1)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,True


In [16]:
orders.head(1)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,


In [17]:
%%time
order_prev = pd.merge(order_train, orders, on='order_id')

CPU times: user 976 ms, sys: 248 ms, total: 1.22 s
Wall time: 1.22 s


In [18]:
order_prev.head(1)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1,49302,1,True,112108,train,4,4,10,9.0


In [20]:
order_prev.order_number.head()

0    4
1    4
2    4
3    4
4    4
Name: order_number, dtype: uint8

In [21]:
%%time
## order_number에서 -1을 전부 제거한다.
order_prev.order_number -= 1

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.94 ms


In [24]:
order_prev.order_number.head()

0    3
1    3
2    3
3    3
4    3
Name: order_number, dtype: uint8

In [25]:
%%time
## eval_set, days_since_prior_order 제외하고 orders와 병합한다 
order_prev = pd.merge(order_prev[['user_id', 'order_number', 'product_id', 
                                  'reordered', 'add_to_cart_order', 'order_dow','order_hour_of_day']], 
                      orders[['user_id', 'order_number', 'order_id']],
                          on=['user_id', 'order_number'])



CPU times: user 1.06 s, sys: 224 ms, total: 1.29 s
Wall time: 1.28 s


In [26]:
order_prev.head(2)

Unnamed: 0,user_id,order_number,product_id,reordered,add_to_cart_order,order_dow,order_hour_of_day,order_id
0,112108,3,49302,True,1,4,10,186706
1,112108,3,11109,True,2,4,10,186706


In [27]:
order_prev.drop(['order_number', 'user_id'], axis=1, inplace=True)

In [28]:
order_prev.head(1)

Unnamed: 0,product_id,reordered,add_to_cart_order,order_dow,order_hour_of_day,order_id
0,49302,True,1,4,10,186706


In [29]:
order_prev.rename(columns={'reordered': 'reordered_prev',
                           'add_to_cart_order': 'add_to_cart_order_prev',
                           'order_dow': 'order_dow_prev',
                           'order_hour_of_day': 'order_hour_of_day_prev'}, 
                  inplace=True)

In [30]:
order_prev.head(1)

Unnamed: 0,product_id,reordered_prev,add_to_cart_order_prev,order_dow_prev,order_hour_of_day_prev,order_id
0,49302,True,1,4,10,186706


In [32]:
products = pd.read_csv(os.path.join(path, "products.csv"), dtype={'product_id': np.uint16,
                                                                  'aisle_id': np.uint8,
                                                                  'department_id': np.uint8})
order_train = pd.read_pickle(os.path.join(path, 'chunk_0.pkl'))

In [33]:
## eval_set == train 인 애들 중에 'order_id',  'product_id',  'reordered'열만 갖고 와라 
order_train = order_train.loc[order_train.eval_set == "train", ['order_id',  'product_id',  'reordered']]

In [34]:
## 유저가 상품과 주문을 얼마 주기로 하는지 전처리 해놓은 것 로드
## preprocessing#4
product_periods = pd.read_pickle(os.path.join(path, 'product_periods_stat.pkl')).fillna(9999)

In [35]:
product_periods.head(1)

Unnamed: 0,user_id,product_id,last,prev1,prev2,median,mean
0,1,196,14.0,30.0,0.0,20.0,19.555556


In [None]:
# product_periods.prev1 = product_periods['last'] / product_periods.prev1
# product_periods.prev2 = product_periods['last'] / product_periods.prev2
# product_periods['mean'] = product_periods['last'] / product_periods['mean']
# product_periods['median'] = product_periods['last'] / product_periods['median']

In [37]:
%%time

weights = order_train.groupby('order_id')['reordered'].sum().to_frame('weights')
weights.reset_index(inplace=True)

CPU times: user 216 ms, sys: 180 ms, total: 396 ms
Wall time: 396 ms


In [44]:
order_train.head(5)

Unnamed: 0,order_id,product_id,reordered
0,1,49302,True
1,1,11109,True
2,1,43633,True
3,1,22035,True
4,36,19660,True


In [46]:
## order_id==1 은 4개 있는데 모두 재구매된 상품이기 때문에 4가 된다.
order_train.groupby('order_id')['reordered'].sum().head(3)

order_id
1     4.0
36    6.0
38    1.0
Name: reordered, dtype: float64

In [47]:
## 즉 weights란 order_id에서 재구매 비율이 얼마나 되는지 False는 합에 추가되지 
## 않기 때문에 True인 갯수를 가중치로 설정
weights.head(2)

Unnamed: 0,order_id,weights
0,1,4.0
1,36,6.0


In [60]:
## 같은 작업을 prior 기록에도 적용
prob = pd.merge(order_prior, orders, on='order_id')

In [61]:
prob.head(1)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,True,202279,prior,3,5,9,8.0


In [62]:
%%time
## prob을'product_id', 'user_id'로 group_by 후 유저 당 reordered를 얼마나 하는지 계산
prob = prob.groupby(['product_id', 'user_id']).agg({'reordered':'sum', 'user_id': 'size'})

CPU times: user 18.6 s, sys: 2.97 s, total: 21.6 s
Wall time: 21.6 s


In [63]:
prob.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reordered,user_id
product_id,user_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,138,1.0,2
1,709,0.0,1
1,764,1.0,2
1,777,0.0,1
1,825,0.0,1


In [64]:
prob.rename(columns={'sum': 'reordered','user_id': 'total'}, inplace=True)

In [65]:
prob.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reordered,total
product_id,user_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,138,1.0,2
1,709,0.0,1
1,764,1.0,2
1,777,0.0,1
1,825,0.0,1


In [66]:
prob.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 13307953 entries, (1, 138) to (49688, 205926)
Data columns (total 2 columns):
reordered    float64
total        int64
dtypes: float64(1), int64(1)
memory usage: 306.5 MB


In [67]:
## 메모리 관리 + 속도 때문에 타입을 바꿔준다.
## 또한 재구매가 된 경우 True : 1.0 한번도 안된 경우 False: 0.0 으로 더미코딩 한다.
prob.reordered = (prob.reordered > 0).astype(np.float32)

In [71]:
(prob.reordered > 0).head()

product_id  user_id
1           138         True
            709        False
            764         True
            777        False
            825        False
Name: reordered, dtype: bool

In [72]:
## 50MB 정도 감소함
prob.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 13307953 entries, (1, 138) to (49688, 205926)
Data columns (total 2 columns):
reordered    float32
total        int64
dtypes: float32(1), int64(1)
memory usage: 257.0 MB


In [73]:
prob.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reordered,total
product_id,user_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,138,1.0,2
1,709,0.0,1
1,764,1.0,2
1,777,0.0,1
1,825,0.0,1


In [74]:
prob.total = (prob.total > 0).astype(np.float32)

In [75]:
prob.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reordered,total
product_id,user_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,138,1.0,1.0
1,709,0.0,1.0
1,764,1.0,1.0
1,777,0.0,1.0
1,825,0.0,1.0


In [76]:
prob['reorder_prob'] = prob.reordered / prob.total

In [77]:
prob.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reordered,total,reorder_prob
product_id,user_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,138,1.0,1.0,1.0
1,709,0.0,1.0,0.0
1,764,1.0,1.0,1.0
1,777,0.0,1.0,0.0
1,825,0.0,1.0,0.0


In [78]:
## 상품당 유저들이 얼마나 재구매했는지 확률로 계산
prob = prob.groupby('product_id').agg({'reorder_prob': 'mean'}).rename(columns={'mean': 'reorder_prob'}).reset_index()

In [79]:
prob.head()

Unnamed: 0,product_id,reorder_prob
0,1,0.385475
1,2,0.102564
2,3,0.486486
3,4,0.351648
4,5,0.666667


In [80]:
order_prior.head(1)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,True


In [81]:
%%time
## sum : reordered된 횟수의 총 합
## size : 1개의 상품당 몇 변의 오더에 속하는가? 즉 얼마나 많이 팔렸나?
## add_to_cart_order의 평균 : 평균 몇 번째로 장바구니에 담기는가? -> EDA에서 재구매일 수록 먼저담김을 확인했기 때문에 
prod_stat = order_prior.groupby('product_id').agg({'reordered': ['sum', 'size'],'add_to_cart_order':'mean'})

CPU times: user 5.43 s, sys: 1.66 s, total: 7.08 s
Wall time: 7.08 s


In [83]:
prod_stat.head()

Unnamed: 0_level_0,reordered,reordered,add_to_cart_order
Unnamed: 0_level_1,sum,size,mean
product_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,1136.0,1852,5.801836
2,12.0,90,9.888889
3,203.0,277,6.415162
4,147.0,329,9.507599
5,9.0,15,6.466667


In [92]:
prod_stat.columns = prod_stat.columns.levels[1]

In [94]:
prod_stat.head()

Unnamed: 0_level_0,mean,size,sum
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1136.0,1852,5.801836
2,12.0,90,9.888889
3,203.0,277,6.415162
4,147.0,329,9.507599
5,9.0,15,6.466667


In [95]:
prod_stat.rename(columns={'sum':'prod_reorders',
                              'size':'prod_orders',
                              'mean': 'prod_add_to_card_mean'}, inplace=True)

In [96]:
prod_stat.head()

Unnamed: 0_level_0,prod_add_to_card_mean,prod_orders,prod_reorders
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1136.0,1852,5.801836
2,12.0,90,9.888889
3,203.0,277,6.415162
4,147.0,329,9.507599
5,9.0,15,6.466667


In [97]:
prod_stat.reset_index(inplace=True)

In [99]:
prod_stat.head(1)

Unnamed: 0,product_id,prod_add_to_card_mean,prod_orders,prod_reorders
0,1,1136.0,1852,5.801836


In [84]:
## size 설명 : 1개의 상품당 몇 변의 오더에 속하는가? 즉 얼마나 많이 팔렸나?
gb = order_prior.groupby('product_id')

In [91]:
gb.get_group(1).head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
10708,1107,1,7,False
50379,5319,1,3,True
70912,7540,1,4,True
86636,9228,1,2,False
87149,9273,1,30,False


In [90]:
gb.get_group(1).shape

(1852, 4)

In [100]:
prod_stat['reorder_ration'] = prod_stat['prod_reorders'] / prod_stat['prod_orders']

In [101]:
prod_stat.head(1)

Unnamed: 0,product_id,prod_add_to_card_mean,prod_orders,prod_reorders,reorder_ration
0,1,1136.0,1852,5.801836,0.003133


In [102]:
prod_stat = pd.merge(prod_stat, prob, on='product_id')

In [103]:
prod_stat.head(1)

Unnamed: 0,product_id,prod_add_to_card_mean,prod_orders,prod_reorders,reorder_ration,reorder_prob
0,1,1136.0,1852,5.801836,0.003133,0.385475


In [None]:
## prod_stat.drop(['prod_reorders'], axis=1, inplace=True)

In [104]:
%%time
## sum : 얼마나 오래 서비스를 이용했는지
## mean : 평균 얼마의 주기로 구매하는가
## median : 구매주기의 중앙값
## order_number'max : 지금까지 얼마나 주문했는가
user_stat = orders.loc[orders.eval_set == 'prior', :].groupby('user_id').agg({'order_number': 'max',
                                                                              'days_since_prior_order': ['sum',
                                                                                                         'mean',
                                                                                                         'median']})

CPU times: user 512 ms, sys: 96 ms, total: 608 ms
Wall time: 606 ms


In [107]:
user_stat.head()

Unnamed: 0_level_0,days_since_prior_order,days_since_prior_order,days_since_prior_order,order_number
Unnamed: 0_level_1,sum,mean,median,max
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,176.0,19.555556,20.0,10
2,198.0,15.230769,13.0,14
3,133.0,12.090909,11.0,12
4,55.0,13.75,17.0,5
5,40.0,13.333333,11.0,4


In [108]:
user_stat.columns = user_stat.columns.droplevel(0)
user_stat.rename(columns={'max': 'user_orders',
                          'sum': 'user_order_starts_at',
                          'mean': 'user_mean_days_since_prior',
                          'median': 'user_median_days_since_prior'}, inplace=True)
user_stat.reset_index(inplace=True)

In [111]:
user_stat.head()

Unnamed: 0,user_id,user_order_starts_at,user_mean_days_since_prior,user_median_days_since_prior,user_orders
0,1,176.0,19.555556,20.0,10
1,2,198.0,15.230769,13.0,14
2,3,133.0,12.090909,11.0,12
3,4,55.0,13.75,17.0,5
4,5,40.0,13.333333,11.0,4


In [112]:
## orders 정보와 order_prior 합칩 : 계속 설명했기 때문에 생략
orders_products = pd.merge(orders, order_prior, on="order_id")

In [114]:
orders_products.head(1)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,prior,1,2,8,,196,1,False


In [115]:
## reordered : 유저당 얼마나 재구매하는가?
## product_id : nunique() 로 유저가 얼마나 다양한 상품을 구매하는가?
## user_id : 지금까지 몇개의 상품을 구매했는가?
user_order_stat = orders_products.groupby('user_id').agg({'user_id': 'size',
                                                          'reordered': 'sum',
                                                          'product_id': lambda x: x.nunique()})

In [116]:
user_order_stat.head()

Unnamed: 0_level_0,reordered,product_id,user_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,41.0,18,59
2,93.0,102,195
3,55.0,33,88
4,1.0,17,18
5,14.0,23,37


In [117]:
user_order_stat.rename(columns={'user_id': 'user_total_products',
                                'product_id': 'user_distinct_products',
                                'reordered': 'user_reorder_ratio'}, inplace=True)
user_order_stat.reset_index(inplace=True)

In [120]:
user_order_stat.head()

Unnamed: 0,user_id,user_reorder_ratio,user_distinct_products,user_total_products
0,1,41.0,18,59
1,2,93.0,102,195
2,3,55.0,33,88
3,4,1.0,17,18
4,5,14.0,23,37


In [121]:
## user_reorder_ratio 유저의 전체 구매수 에서 reorder_ratio는 얼마인가?
user_order_stat.user_reorder_ratio = user_order_stat.user_reorder_ratio / user_order_stat.user_total_products

In [122]:
user_order_stat.head(1)

Unnamed: 0,user_id,user_reorder_ratio,user_distinct_products,user_total_products
0,1,0.694915,18,59


In [123]:
user_stat.head(1)

Unnamed: 0,user_id,user_order_starts_at,user_mean_days_since_prior,user_median_days_since_prior,user_orders
0,1,176.0,19.555556,20.0,10


In [124]:
user_stat = pd.merge(user_stat, user_order_stat, on='user_id')

In [125]:
user_stat.head(1)

Unnamed: 0,user_id,user_order_starts_at,user_mean_days_since_prior,user_median_days_since_prior,user_orders,user_reorder_ratio,user_distinct_products,user_total_products
0,1,176.0,19.555556,20.0,10,0.694915,18,59


In [126]:
## user_average_basket : 유저의 전체 상품 구매량에서 전체 오더수를 나눠 한번 구매할 때마다 얼마나 사는지 평균 계산
user_stat['user_average_basket'] = user_stat.user_total_products / user_stat.user_orders

In [127]:
user_stat.head(1)

Unnamed: 0,user_id,user_order_starts_at,user_mean_days_since_prior,user_median_days_since_prior,user_orders,user_reorder_ratio,user_distinct_products,user_total_products,user_average_basket
0,1,176.0,19.555556,20.0,10,0.694915,18,59,5.9


In [128]:
%%time
## 상품당 얼마나 많은 유저가 구매했는가?
prod_usr = orders_products.groupby(['product_id']).agg({'user_id': lambda x: x.nunique()})

CPU times: user 8.05 s, sys: 904 ms, total: 8.95 s
Wall time: 8.95 s


In [129]:
prod_usr.head()

Unnamed: 0_level_0,user_id
product_id,Unnamed: 1_level_1
1,716
2,78
3,74
4,182
5,6


In [130]:
prod_usr.rename(columns={'user_id':'prod_users_unq'}, inplace=True)
prod_usr.reset_index(inplace=True)

In [131]:
prod_usr.head()

Unnamed: 0,product_id,prod_users_unq
0,1,716
1,2,78
2,3,74
3,4,182
4,5,6


In [132]:
%%time
## 상품당 재구매 한 유저의 수
prod_usr_reordered = orders_products.loc[orders_products.reordered, :].groupby(['product_id']).agg({'user_id': lambda x: x.nunique()})

CPU times: user 6.8 s, sys: 940 ms, total: 7.74 s
Wall time: 7.74 s


In [133]:
prod_usr_reordered.head()

Unnamed: 0_level_0,user_id
product_id,Unnamed: 1_level_1
1,276
2,8
3,36
4,64
5,4


In [134]:
prod_usr_reordered.rename(columns={'user_id': 'prod_users_unq_reordered'}, inplace=True)
prod_usr_reordered.reset_index(inplace=True)

In [135]:
prod_usr_reordered.head()

Unnamed: 0,product_id,prod_users_unq_reordered
0,1,276
1,2,8
2,3,36
3,4,64
4,5,4


In [136]:
%%time
#order_size : 각 주문의 상품수
order_stat = orders_products.groupby('order_id').agg({'order_id': 'size'}).rename(columns = {'order_id': 'order_size'})\
.reset_index()

CPU times: user 2.35 s, sys: 524 ms, total: 2.87 s
Wall time: 2.87 s


In [137]:
order_stat.head()

Unnamed: 0,order_id,order_size
0,2,9
1,3,8
2,4,13
3,5,26
4,6,3


In [139]:
orders_products.head(1)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,prior,1,2,8,,196,1,False


In [140]:
%%time
orders_products = pd.merge(orders_products, order_stat, on='order_id')

CPU times: user 6.19 s, sys: 2.34 s, total: 8.53 s
Wall time: 8.53 s


In [143]:
orders_products.head(1)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,order_size,add_to_cart_order_inverted,add_to_cart_order_relative
0,2539329,1,prior,1,2,8,,196,1,False,5,4,0.2


In [142]:
## add_to_cart_order_inverted : 각 오더의 전체 상품의 사이즈에서 장바구니 순서를 차감
## add_to_cart_order_relative : add_to_cart_order를 각 오더의 전체 상품수로 나눔
orders_products['add_to_cart_order_inverted'] = orders_products.order_size - orders_products.add_to_cart_order
orders_products['add_to_cart_order_relative'] = orders_products.add_to_cart_order / orders_products.order_size

In [144]:
%%time
## 유저가 구매한 상품에서 재구매가 일어난 요일의 sum과 size
data_dow = orders_products.groupby(['user_id', 'product_id', 'order_dow']).agg({'reordered': ['sum', 'size']})

CPU times: user 18.2 s, sys: 5.76 s, total: 23.9 s
Wall time: 23.9 s


In [145]:
data_dow.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,reordered,reordered
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,size
user_id,product_id,order_dow,Unnamed: 3_level_2,Unnamed: 4_level_2
1,196,1,3.0,3
1,196,2,1.0,2
1,196,3,2.0,2
1,196,4,3.0,3
1,10258,1,3.0,3


In [146]:
data_dow.columns = data_dow.columns.droplevel(0)
data_dow.columns = ['reordered_dow', 'reordered_dow_size']

In [147]:
data_dow.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,reordered_dow,reordered_dow_size
user_id,product_id,order_dow,Unnamed: 3_level_1,Unnamed: 4_level_1
1,196,1,3.0,3
1,196,2,1.0,2
1,196,3,2.0,2
1,196,4,3.0,3
1,10258,1,3.0,3


In [148]:
## reordered_dow_ration : 유저의 상품 - 요일별 재구매 비율
data_dow['reordered_dow_ration'] = data_dow.reordered_dow / data_dow.reordered_dow_size

In [149]:
data_dow.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,reordered_dow,reordered_dow_size,reordered_dow_ration
user_id,product_id,order_dow,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,196,1,3.0,3,1.0
1,196,2,1.0,2,0.5
1,196,3,2.0,2,1.0
1,196,4,3.0,3,1.0
1,10258,1,3.0,3,1.0


In [150]:
data_dow.reset_index(inplace=True)

In [151]:
data_dow.head()

Unnamed: 0,user_id,product_id,order_dow,reordered_dow,reordered_dow_size,reordered_dow_ration
0,1,196,1,3.0,3,1.0
1,1,196,2,1.0,2,0.5
2,1,196,3,2.0,2,1.0
3,1,196,4,3.0,3,1.0
4,1,10258,1,3.0,3,1.0


In [155]:
orders_products.head(1)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,order_size,add_to_cart_order_inverted,add_to_cart_order_relative
0,2539329,1,prior,1,2,8,,196,1,False,5,4,0.2


In [156]:
%%time
## 
data = orders_products.groupby(['user_id', 'product_id']).agg({'user_id': 'size',
                                                               'order_number': ['min', 'max'],
                                                               'add_to_cart_order': ['mean', 'median'],
                                                               'days_since_prior_order': ['mean', 'median'],
                                                               'order_dow': ['mean', 'median'],
                                                               'order_hour_of_day': ['mean', 'median'],
                                                               'add_to_cart_order_inverted': ['mean', 'median'],
                                                               'add_to_cart_order_relative': ['mean', 'median'],
                                                               'reordered':['sum']})

CPU times: user 20.8 s, sys: 9.7 s, total: 30.4 s
Wall time: 30.4 s


In [157]:
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,add_to_cart_order,add_to_cart_order,days_since_prior_order,days_since_prior_order,order_number,order_number,order_dow,order_dow,add_to_cart_order_inverted,add_to_cart_order_inverted,order_hour_of_day,order_hour_of_day,reordered,user_id,add_to_cart_order_relative,add_to_cart_order_relative
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median,min,max,mean,median,mean,median,mean,median,sum,size,mean,median
user_id,product_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
1,196,1.4,1.0,19.555556,20.0,1,10,2.5,2.5,4.5,4.0,10.3,8.5,9.0,10,0.245278,0.2
1,10258,3.333333,3.0,19.555556,20.0,2,10,2.555556,3.0,2.666667,3.0,10.555556,9.0,8.0,9,0.562037,0.6
1,10326,5.0,5.0,28.0,28.0,5,5,4.0,4.0,3.0,3.0,15.0,15.0,0.0,1,0.625,0.625
1,12427,3.3,2.5,19.555556,20.0,1,10,2.5,2.5,2.6,2.5,10.3,8.5,9.0,10,0.541667,0.5
1,13032,6.333333,6.0,21.666667,20.0,2,10,2.666667,3.0,0.333333,0.0,8.0,8.0,2.0,3,0.962963,1.0


In [158]:
## up_orders(size) : 유저의 총 구매 수
## up_first_order(order_number-min) : 유저-상품의 order_num의 시작 -> 해당 상품이 유저의 order_num에 어디에 처음 들어가는가
## up_last_order(order_number-max) : 유저-상품의 order_num의 끝 -> 해당 상품이 유저의 order_num에 어디에 마지막으로 들어가는가
## up_mean_cart_position : 유저-상품의 평균 장바구니 순서 -> 해당 상품이 유저의 카트에 평균 몇번째로 들어가는가?
## up_median_cart_position : 유저-상품의 평균 장바구니 순서 -> 해당 상품이 유저의 카트에 중앙값 몇번째로 들어가는가?
## days_since_prior_order_mean : 유저-상품의 평균 구매 기간 -> 유저가 해당상품을 재구매하는데 소요되는 평균 기간
## days_since_prior_order_median : 유저-상품의 중앙값 구매 기간 -> 유저가 해당상품을 재구매하는데 소요되는 중앙값 기간
## order_dow_mean : 유저-상품의 평균 요일
## order_dow_median : 유저-상품의 중앙값 요일
## order_hour_of_day_mean : 유저-상품의 평균 구매 시간
## order_hour_of_day_median : 유저-상품의 중앙값 구매 시간
## add_to_cart_order_inverted_mean : 유저-상품의 평균 add_to_cart_order_inverted_ -> 유저가 해당 상품을 사고 얼마나 더 쇼핑했는지
## add_to_cart_order_inverted_median : 유저-상품의 중앙값 add_to_cart_order_inverted
## add_to_cart_order_relative_mean : 유저-상품의 평균 add_to_cart_order_relative 해당 상품이 얼마나 유저에게 중요했는지
## add_to_cart_order_relative_median : 유저-상품의 중앙값 add_to_cart_order_relative
## reordered_sum : 유저-상품 당 재구매 총 횟수

data.columns = data.columns.droplevel(0)
data.columns = ['up_orders', 'up_first_order', 'up_last_order', 'up_mean_cart_position', 'up_median_cart_position',
                'days_since_prior_order_mean', 'days_since_prior_order_median', 'order_dow_mean', 'order_dow_median',
                'order_hour_of_day_mean', 'order_hour_of_day_median','add_to_cart_order_inverted_mean', 
                'add_to_cart_order_inverted_median','add_to_cart_order_relative_mean', 
                'add_to_cart_order_relative_median','reordered_sum']

In [159]:
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,up_orders,up_first_order,up_last_order,up_mean_cart_position,up_median_cart_position,days_since_prior_order_mean,days_since_prior_order_median,order_dow_mean,order_dow_median,order_hour_of_day_mean,order_hour_of_day_median,add_to_cart_order_inverted_mean,add_to_cart_order_inverted_median,add_to_cart_order_relative_mean,add_to_cart_order_relative_median,reordered_sum
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,196,1.4,1.0,19.555556,20.0,1,10,2.5,2.5,4.5,4.0,10.3,8.5,9.0,10,0.245278,0.2
1,10258,3.333333,3.0,19.555556,20.0,2,10,2.555556,3.0,2.666667,3.0,10.555556,9.0,8.0,9,0.562037,0.6
1,10326,5.0,5.0,28.0,28.0,5,5,4.0,4.0,3.0,3.0,15.0,15.0,0.0,1,0.625,0.625
1,12427,3.3,2.5,19.555556,20.0,1,10,2.5,2.5,2.6,2.5,10.3,8.5,9.0,10,0.541667,0.5
1,13032,6.333333,6.0,21.666667,20.0,2,10,2.666667,3.0,0.333333,0.0,8.0,8.0,2.0,3,0.962963,1.0


In [160]:
## user_product_reordered_ratio : reordered_sum에서 유저의 총구매수를 나눈 유저의 재구매 비율 -> 유저가 해당상품을 재구매한 비율
data['user_product_reordered_ratio'] = (data.reordered_sum + 1.0) / data.up_orders

In [161]:
# data['first_order'] = data['up_orders'] > 0
# data['second_order'] = data['up_orders'] > 1

# data.groupby('product_id')['']
data.reset_index(inplace=True)


In [162]:
## 파생변수 전부 합한다.
data = pd.merge(data, prod_stat, on='product_id')
data = pd.merge(data, user_stat, on='user_id')

In [163]:
data['up_order_rate'] = data.up_orders / data.user_orders
data['up_orders_since_last_order'] = data.user_orders - data.up_last_order
data['up_order_rate_since_first_order'] = data.user_orders / (data.user_orders - data.up_first_order + 1)

In [165]:
data[['up_order_rate','up_orders_since_last_order','up_order_rate_since_first_order']].head()

Unnamed: 0,up_order_rate,up_orders_since_last_order,up_order_rate_since_first_order
0,0.14,-9.555556,1.0
1,0.333333,-9.555556,1.25
2,0.5,-18.0,1.666667
3,0.33,-9.555556,1.176471
4,0.633333,-11.666667,2.0


In [166]:
%%time
user_dep_stat = pd.read_pickle('data/user_department_products.pkl')
user_aisle_stat = pd.read_pickle('data/user_aisle_products.pkl')

order_train = pd.merge(order_train, products, on='product_id')
order_train = pd.merge(order_train, orders, on='order_id')
order_train = pd.merge(order_train, user_dep_stat, on=['user_id', 'department_id'])
order_train = pd.merge(order_train, user_aisle_stat, on=['user_id', 'aisle_id'])

order_train = pd.merge(order_train, prod_usr, on='product_id')
order_train = pd.merge(order_train, prod_usr_reordered, on='product_id', how='left')
order_train.prod_users_unq_reordered.fillna(0, inplace=True)

order_train = pd.merge(order_train, data, on=['product_id', 'user_id'])
order_train = pd.merge(order_train, data_dow, on=['product_id', 'user_id', 'order_dow'], how='left')

order_train['aisle_reordered_ratio'] = order_train.aisle_reordered / order_train.user_orders
order_train['dep_reordered_ratio'] = order_train.dep_reordered / order_train.user_orders

order_train = pd.merge(order_train, product_periods, on=['user_id',  'product_id'])
order_train = pd.merge(order_train, product_embeddings, on=['product_id'])

CPU times: user 58.6 s, sys: 19 s, total: 1min 17s
Wall time: 1min 17s


In [168]:
# order_train = pd.merge(order_train, weights, on='order_id')

# order_train = pd.merge(order_train, order_prev, on=['order_id', 'product_id'], how='left')
# order_train.reordered_prev = order_train.reordered_prev.astype(np.float32) + 1.
# order_train['reordered_prev'].fillna(0, inplace=True)
# order_train[['add_to_cart_order_prev', 'order_dow_prev', 'order_hour_of_day_prev']].fillna(255, inplace=True)

# order_train.days_since_prior_order_mean -= order_train.days_since_prior_order
# order_train.days_since_prior_order_median -= order_train.days_since_prior_order
#
# order_train.order_dow_mean -= order_train.order_dow
# order_train.order_dow_median -= order_train.order_dow
#
# order_train.order_hour_of_day_mean -= order_train.order_hour_of_day
# order_train.order_hour_of_day_median -= order_train.order_hour_of_day

In [169]:
unique_orders = np.unique(order_train.order_id)
orders_train, orders_test = train_test_split(unique_orders, test_size=0.25, random_state=2017)

order_test = order_train.loc[np.in1d(order_train.order_id, orders_test)]
order_train = order_train.loc[np.in1d(order_train.order_id, orders_train)]

features = [
        # 'reordered_dow_ration', 'reordered_dow', 'reordered_dow_size',
        # 'reordered_prev', 'add_to_cart_order_prev', 'order_dow_prev', 'order_hour_of_day_prev',
        'user_product_reordered_ratio', 'reordered_sum',
        'add_to_cart_order_inverted_mean', 'add_to_cart_order_relative_mean',
        'reorder_prob',
        'last', 'prev1', 'prev2', 'median', 'mean',
        'dep_reordered_ratio', 'aisle_reordered_ratio',
        'aisle_products',
        'aisle_reordered',
        'dep_products',
        'dep_reordered',
        'prod_users_unq', 'prod_users_unq_reordered',
        'order_number', 'prod_add_to_card_mean',
                'days_since_prior_order',
        'order_dow', 'order_hour_of_day',
                'reorder_ration',
                        'user_orders', 'user_order_starts_at', 'user_mean_days_since_prior',
        # 'user_median_days_since_prior',
                        'user_average_basket', 'user_distinct_products', 'user_reorder_ratio', 'user_total_products',
                        'prod_orders', 'prod_reorders',
                        'up_order_rate', 'up_orders_since_last_order', 'up_order_rate_since_first_order',
                        'up_orders', 'up_first_order', 'up_last_order', 'up_mean_cart_position',
        # 'up_median_cart_position',
                             'days_since_prior_order_mean',
        # 'days_since_prior_order_median',
        'order_dow_mean',
        # 'order_dow_median',
        #                      'order_hour_of_day_mean',
        # 'order_hour_of_day_median'
]

In [170]:
features.extend(embedings)
categories = ['product_id', 'aisle_id', 'department_id']
features.extend(embedings)
cat_features = [x + len(features) for x in range(len(categories))]
#cat_features = ','.join(map(lambda x: str(x + len(features)), range(len(categories))))
features.extend(categories)

In [172]:
%%time
data = order_train[features]
## order_train[['reordered']]이 True False의 bool이라서 float32로 바꿔 더미코딩(리코딩)한다.
labels = order_train[['reordered']].values.astype(np.float32).flatten()

CPU times: user 1.16 s, sys: 4.15 s, total: 5.31 s
Wall time: 5.26 s


In [173]:
%%time
data_val = order_test[features]
labels_val = order_test[['reordered']].values.astype(np.float32).flatten()

CPU times: user 404 ms, sys: 264 ms, total: 668 ms
Wall time: 662 ms


In [176]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6357933 entries, 0 to 8474657
Columns: 109 entries, user_product_reordered_ratio to department_id
dtypes: float32(65), float64(29), int64(3), uint16(3), uint32(1), uint64(1), uint8(7)
memory usage: 3.3 GB


In [177]:
data_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2116728 entries, 1 to 8474660
Columns: 109 entries, user_product_reordered_ratio to department_id
dtypes: float32(65), float64(29), int64(3), uint16(3), uint32(1), uint64(1), uint8(7)
memory usage: 1.1 GB


In [178]:
lgb_train = lgb.Dataset(data, labels, categorical_feature=cat_features)
lgb_eval = lgb.Dataset(data_val, labels_val, reference=lgb_train, categorical_feature=cat_features)

In [179]:
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss', 'auc'},
        'num_leaves': 256,
        'min_sum_hessian_in_leaf':20,
        'max_depth': -12,
        'learning_rate': 0.05,
        'feature_fraction': 0.6,
        # 'bagging_fraction': 0.9,
        # 'bagging_freq': 3,
        'verbose': 1
    }

In [181]:
%%time
print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=2000,    
                valid_sets=lgb_eval,
                early_stopping_rounds=30)

print('Feature names:', gbm.feature_name())
print('Calculate feature importances...')
# feature importances
print('Feature importances:', list(gbm.feature_importance()))

df = pd.DataFrame({'feature':gbm.feature_name(), 'importances': gbm.feature_importance()})
print(df.sort_values('importances'))

Start training...




[1]	valid_0's auc: 0.811561	valid_0's binary_logloss: 0.658484
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 0.816788	valid_0's binary_logloss: 0.627227
[3]	valid_0's auc: 0.8178	valid_0's binary_logloss: 0.598813
[4]	valid_0's auc: 0.822285	valid_0's binary_logloss: 0.572757
[5]	valid_0's auc: 0.823901	valid_0's binary_logloss: 0.548949
[6]	valid_0's auc: 0.825109	valid_0's binary_logloss: 0.527158
[7]	valid_0's auc: 0.82485	valid_0's binary_logloss: 0.507218
[8]	valid_0's auc: 0.824849	valid_0's binary_logloss: 0.488952
[9]	valid_0's auc: 0.826913	valid_0's binary_logloss: 0.472246
[10]	valid_0's auc: 0.826894	valid_0's binary_logloss: 0.456644
[11]	valid_0's auc: 0.827421	valid_0's binary_logloss: 0.442415
[12]	valid_0's auc: 0.827252	valid_0's binary_logloss: 0.429045
[13]	valid_0's auc: 0.827484	valid_0's binary_logloss: 0.416584
[14]	valid_0's auc: 0.828075	valid_0's binary_logloss: 0.404924
[15]	valid_0's auc: 0.828494	valid_0's binary_logloss:

In [183]:
gbm.best_score

defaultdict(dict,
            {'valid_0': {'auc': 0.83577523380819063,
              'binary_logloss': 0.24408596539104024}})

In [186]:
gbm.best_iteration

268

In [189]:
gbm.save_model('cv_lgbm.txt')

In [199]:
prediction = gbm.predict(data_val)

In [200]:
orders = order_test.order_id.values
products = order_test.product_id.values

result = pd.DataFrame({'product_id': products, 'order_id': orders, 'prediction': prediction})
result.to_pickle('data/prediction_lgbm_cv_ver.pkl')

In [207]:
result = pd.read_pickle('data/prediction_lgbm_cv_ver.pkl')

In [208]:
%%time
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
import multiprocessing

from utils import fast_search

none_product = 50000

def applyParallel(dfGrouped, func):
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in dfGrouped)
    return pd.concat(retLst)

def create_products(df):
    # print(df.product_id.values.shape)
    products = df.product_id.values
    prob = df.prediction.values

    sort_index = np.argsort(prob)[::-1]

    values = fast_search(prob[sort_index][0:80], dtype=np.float64)

    index = np.argmax(values)

    #print('iteration', df.shape[0], 'optimal value', index)

    best = ' '.join(map(lambda x: str(x) if x != none_product else 'None', products[sort_index][0:index]))
    df = df[0:1]
    df.loc[:, 'products'] = best
    return df

if __name__ == '__main__':
    result['not_a_product'] = 1. - result.prediction

    gp = result.groupby('order_id')['not_a_product'].apply(lambda x: np.multiply.reduce(x.values)).reset_index()
    gp.rename(columns={'not_a_product': 'prediction'}, inplace=True)
    gp['product_id'] = none_product

    result = pd.concat([result, gp], axis=0)
    result.product_id = result.product_id.astype(np.uint32)

    result = result.loc[result.prediction > 0.01, ['order_id', 'prediction', 'product_id']]

    result = applyParallel(result.groupby(result.order_id), create_products).reset_index()

    result[['order_id', 'products']].to_csv('data/sub_1_lgbm.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

CPU times: user 1min 37s, sys: 3.9 s, total: 1min 41s
Wall time: 16min 4s


In [210]:
result[['order_id', 'products']].shape

(32803, 2)