In [2]:
import gc
import pandas as pd
import numpy as np
import os
import json
import sklearn.metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from scipy.sparse import dok_matrix, coo_matrix
from sklearn.utils.multiclass import  type_of_target
path = "data"

## 데이터 로딩

In [3]:
%%time
aisles = pd.read_csv(os.path.join(path, "aisles.csv"), dtype={'aisle_id': np.uint8, 'aisle': 'category'})
departments = pd.read_csv(os.path.join(path, "departments.csv"),
                          dtype={'department_id': np.uint8, 'department': 'category'})
order_prior = pd.read_csv(os.path.join(path, "order_products__prior.csv"), dtype={'order_id': np.uint32,
                                                                                  'product_id': np.uint16,
                                                                                  'add_to_cart_order': np.uint8,
                                                                                  'reordered': bool})
order_train = pd.read_csv(os.path.join(path, "order_products__train.csv"), dtype={'order_id': np.uint32,
                                                                                  'product_id': np.uint16,
                                                                                  'add_to_cart_order': np.uint8,
                                                                                  'reordered': bool})
orders = pd.read_csv(os.path.join(path, "orders.csv"), dtype={'order_id': np.uint32,
                                                              'user_id': np.uint32,
                                                              'eval_set': 'category',
                                                              'order_number': np.uint8,
                                                              'order_dow': np.uint8,
                                                              'order_hour_of_day': np.uint8
                                                              })

products = pd.read_csv(os.path.join(path, "products.csv"), dtype={'product_id': np.uint16,
                                                                  'aisle_id': np.uint8,
                                                                  'department_id': np.uint8})

order_train = pd.read_pickle(os.path.join(path, 'chunk_0.pkl'))

In [4]:
orders.head(1)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,


In [5]:
order_prior.head(1)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,True


In [6]:
%%time
## 지금까지 분산됐던 order 정보들을 모두 합친다.
orders_products = pd.merge(orders, order_prior, on="order_id")

CPU times: user 7.1 s, sys: 2.21 s, total: 9.31 s
Wall time: 9.31 s


In [7]:
orders_products.head(1)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,prior,1,2,8,,196,1,False


In [8]:
%%time
## 구매 정보와 상품정보를 합친다.(이름은 필요없기 때문에 합치지 않는다.)
orders_products_products = pd.merge(orders_products, products[['product_id', 'department_id', 'aisle_id']],
                                    on='product_id')

CPU times: user 10.1 s, sys: 2 s, total: 12.1 s
Wall time: 12.1 s


In [9]:
orders_products_products.head(1)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,aisle_id
0,2539329,1,prior,1,2,8,,196,1,False,7,77


In [10]:
%%time
## 상품정보와 구매정보가 합쳐진 orders_products_products를 'user_id', 'department_id'
## 로 group_by한다. 즉 user_id와 departments_id 로 묶이고 정렬된다
## group_by 객체에 'product_id'에는 lambda x: x.nunique() 를 적용해 각각 유니크 한 값이
## 몇개인지 반환한다. reordered 에는 sum을 적용해 몇 번 재구매됐는지 본다.

## 결론 파생변수 만드는데 department_id당 얼마나 팔리고 얼마나 재구매되는지 계산
user_dep_stat = orders_products_products.groupby(['user_id', 'department_id']).agg(
        {'product_id': lambda x: x.nunique(),
         'reordered': 'sum'
         })

CPU times: user 2min 45s, sys: 2.05 s, total: 2min 47s
Wall time: 2min 47s


In [11]:
user_dep_stat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reordered,product_id
user_id,department_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4,1.0,4
1,7,11.0,2
1,13,0.0,1
1,14,2.0,1
1,16,8.0,5


In [12]:
%%time
user_dep_stat.rename(columns={'product_id': 'dep_products','reordered': 'dep_reordered'}, inplace=True)

CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 7.32 ms


In [13]:
user_dep_stat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,dep_reordered,dep_products
user_id,department_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4,1.0,4
1,7,11.0,2
1,13,0.0,1
1,14,2.0,1
1,16,8.0,5


In [14]:
user_dep_stat.reset_index(inplace=True)

In [15]:
user_dep_stat.head()

Unnamed: 0,user_id,department_id,dep_reordered,dep_products
0,1,4,1.0,4
1,1,7,11.0,2
2,1,13,0.0,1
3,1,14,2.0,1
4,1,16,8.0,5


In [16]:
%%time
## 똑같은 작업을 aisle_id에도 적용한다.
user_aisle_stat = orders_products_products.groupby(['user_id', 'aisle_id']).agg(
        {'product_id': lambda x: x.nunique(),
         'reordered': 'sum'
         })

CPU times: user 6min 56s, sys: 3.97 s, total: 7min
Wall time: 6min 58s


In [17]:
user_aisle_stat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reordered,product_id
user_id,aisle_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,21,7.0,1
1,23,10.0,2
1,24,1.0,4
1,45,0.0,1
1,53,1.0,1


In [18]:
user_aisle_stat.rename(columns={'product_id': 'aisle_products','reordered': 'aisle_reordered'}, inplace=True)

In [19]:
user_aisle_stat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,aisle_reordered,aisle_products
user_id,aisle_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,21,7.0,1
1,23,10.0,2
1,24,1.0,4
1,45,0.0,1
1,53,1.0,1


In [20]:
user_aisle_stat.reset_index(inplace=True)

In [21]:
user_aisle_stat.head()

Unnamed: 0,user_id,aisle_id,aisle_reordered,aisle_products
0,1,21,7.0,1
1,1,23,10.0,2
2,1,24,1.0,4
3,1,45,0.0,1
4,1,53,1.0,1
