In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(rc={"figure.figsize": (20, 10)})
plt.rc('figure', figsize=(20, 10))
import numpy as np
from utils import *
import datetime
from sklearn.model_selection import train_test_split
import pdb
import random

# Load Data

In [None]:
# Root folder for the raw JData files and everything derived from them.
# Later cells build paths with plain string concatenation, so the trailing slash matters.
data_dir = 'data_new/'

In [None]:
# Read both monthly action logs; the `time` column is parsed into datetimes.
action03 = pd.read_csv(data_dir + "JData_Action_201603.csv", parse_dates=["time"])
action04 = pd.read_csv(data_dir + "JData_Action_201604.csv", parse_dates=["time"])

# Stack April on top of March, then keep only the rows whose `cate` equals 8.
action = pd.concat([action04, action03])
action_cate8 = action[action.cate == 8]

# Cache the category-8 subset so later cells can reload it directly.
action_cate8.to_csv(data_dir + "train/action0304_cate8.csv", index=False)

In [None]:
# Load the product and user tables. The user file is decoded as GBK.
products = pd.read_csv(data_dir+"JData_Product.csv")
users = pd.read_csv(data_dir+"JData_User.csv", encoding='gbk')

In [123]:
# Display users ordered by registration date, newest first.
# NOTE(review): the saved output lists registration dates later than the April 2016
# action logs loaded above - confirm whether those rows are expected.
users.sort_values('user_reg_tm', ascending=False)

Unnamed: 0,user_id,age,sex,user_lv_cd,user_reg_tm
16746,216747,16-25岁,2.0,1,2016-11-25
14542,214543,-1,2.0,1,2016-10-05
13850,213851,26-35岁,2.0,3,2016-09-11
12014,212015,36-45岁,2.0,2,2016-07-05
11019,211020,36-45岁,2.0,3,2016-06-06
10367,210368,-1,2.0,1,2016-05-24
10362,210363,56岁以上,2.0,2,2016-05-24
9394,209395,16-25岁,1.0,2,2016-05-11
8545,208546,16-25岁,0.0,2,2016-04-29
7482,207483,26-35岁,2.0,3,2016-04-15


## 用户基本信息

In [None]:
def convert_age(age_str):
    """Translate a raw age-bracket label into an integer code.

    The known labels are numbered 0..6 in the order listed below
    (u'-1' -> 0, u'15岁以下' -> 1, ..., u'56岁以上' -> 6).
    Any other value yields -1.
    """
    known_labels = (
        u'-1',
        u'15岁以下',
        u'16-25岁',
        u'26-35岁',
        u'36-45岁',
        u'46-55岁',
        u'56岁以上',
    )
    # The position of a label in the tuple is its code.
    for code, label in enumerate(known_labels):
        if age_str == label:
            return code
    return -1

def get_basic_user_feat():
    """Return one-hot encoded basic user features, cached on disk as a pickle.

    On the first call the user CSV under `data_dir` is read, `age` is mapped
    through `convert_age`, and `age`, `sex` and `user_lv_cd` are expanded into
    dummy columns next to `user_id`. The result is pickled to
    `data_dir + 'basic_user.pkl'`; later calls load that file instead.

    Returns:
        pandas.DataFrame with `user_id` plus the `age_*`, `sex_*` and
        `user_lv_cd_*` dummy columns.
    """
    # Imported locally so the function does not depend on `from utils import *`
    # happening to expose these modules.
    import os
    import pickle

    dump_path = data_dir+'basic_user.pkl'
    if os.path.exists(dump_path):
        # Pickle data is binary: open in 'rb' and close the handle deterministically.
        # NOTE: unpickling can execute arbitrary code; only load cache files written by this notebook.
        with open(dump_path, 'rb') as cache_file:
            user = pickle.load(cache_file)
    else:
        user = pd.read_csv(data_dir+"JData_User.csv", encoding='gbk')
        user['age'] = user['age'].map(convert_age)
        age_df = pd.get_dummies(user["age"], prefix="age")
        sex_df = pd.get_dummies(user["sex"], prefix="sex")
        user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
        user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)
        with open(dump_path, 'wb') as cache_file:
            pickle.dump(user, cache_file)
    return user

In [None]:
# Build (or load from the pickle cache) the one-hot user feature table and display it.
user_basic = get_basic_user_feat()
user_basic

In [None]:
# Reload the category-8 action subset from its CSV cache, parsing `time` as datetimes.
action_cate8 = pd.read_csv(data_dir+"train/action0304_cate8.csv", parse_dates=["time"])

In [None]:
# Load the action-type table, parsing `date` as datetimes.
# NOTE(review): no cell in this notebook writes train/action_type.csv - confirm where it is produced.
action_type = pd.read_csv(data_dir+'train/action_type.csv', parse_dates=['date'])

In [None]:
# Load the enriched action-type table and display it.
# NOTE(review): a later cell writes a file named "action_type_rich.csv", but to the working
# directory rather than data_dir+'train/' - confirm that this is the same file.
action_type_rich = pd.read_csv(data_dir+'train/action_type_rich.csv', parse_dates=['date'])
action_type_rich

## 有效用户--有加购或关注行为的用户

In [None]:
# Distinct user_ids that have at least one row with action_type_2 > 0 or action_type_5 > 0,
# wrapped in a one-column frame so it can be merged on `user_id`.
valid_users = pd.DataFrame(action_type_rich[(action_type_rich.action_type_2 > 0)|(action_type_rich.action_type_5>0)].user_id.unique(), columns=['user_id'])

In [None]:
# Keep only the rows of action_type_rich that belong to a valid user (inner join on user_id)
# and cache the result as CSV.
action_type_valid = pd.merge(action_type_rich, valid_users, on='user_id')
action_type_valid.to_csv(data_dir+'train/action_type_valid.csv', index=False)

In [None]:
# Display the filtered table.
action_type_valid

## 每种商品每天有多少人有过交互

In [None]:
# Spot-check: iterate over the number of action_type rows per (sku_id, date) and print them.
# Each iteration ends the loop when random.random() exceeds 0.9, so only some rows are printed.
for (sku_id, date), g in action_type.groupby(['sku_id', 'date'], as_index=False).apply(len).iteritems():
    print sku_id, date, g
    if random.random() > 0.9:
        break

In [None]:
# Attach two counts to every action_type row:
#   1. how many action_type rows share its (sku_id, date), and
#   2. how many action_type rows share its date.
# Both are row counts from apply(len); they equal user counts only if action_type holds
# one row per user for each key - TODO confirm.
result = pd.merge(action_type, pd.DataFrame(action_type.groupby(['sku_id', 'date'], as_index=False).apply(len)), 
         left_on=['sku_id', 'date'], right_index=True)
result = pd.merge(result, pd.DataFrame(action_type.groupby('date').apply(len)),
        left_on=['date'], right_index=True)

In [None]:
# Give the two count columns readable names and write the table out.
# NOTE(review): this writes "action_type_rich.csv" to the working directory, while the loading
# cell above reads data_dir+'train/action_type_rich.csv' - confirm the intended location.
result.rename(index=str, columns={'0_x':'user_count_for_sku', '0_y':'active_users'}).to_csv("action_type_rich.csv", index=False)

In [None]:
# One [start, end) pair of 'YYYYMMDD' strings per calendar day from 2016-03-11 through
# 2016-04-11, where `end` is the following day. (An earlier version of this cell used
# three hand-picked multi-day ranges instead.)
target_dates = [
    [day.strftime("%Y%m%d"), (day + datetime.timedelta(days=1)).strftime("%Y%m%d")]
    for day in pd.date_range('20160311', '20160411', freq='D')
]

## Positive Set

In [None]:
# Build the positive samples: for every target day, pair each purchase row
# (action_type_4 > 0 on that day) with all of the same user's rows from the ten days before it.
pos_dfs = []
for target_date in target_dates:
    # Purchases whose date falls inside [target_date[0], target_date[1]).
    buy_action = action_type_valid[(action_type_valid.action_type_4>0)&\
                              (action_type_valid.date >= target_date[0])&(action_type_valid.date < target_date[1])]
    # History window: the ten days before the target day, end exclusive.
    window_start = (datetime.datetime.strptime(target_date[0], '%Y%m%d') - datetime.timedelta(days=10)).strftime("%Y%m%d")
    window_end = target_date[0]
    window_action = action_type_valid[(action_type_valid.date >= window_start)&(action_type_valid.date < window_end)]
    # Joining on user_id only: columns present on both sides get the _x (purchase) / _y (window) suffixes.
    pos = pd.merge(buy_action[['user_id', 'sku_id', 'date']], window_action, on='user_id', how='inner')
    pos['window_start'] = window_start
    pos['window_end'] = window_end
    pos_dfs.append(pos)

all_pos = pd.concat(pos_dfs)

In [None]:
# Write the filtered positive set to train/positive_valid.csv.
# NOTE(review): create_valid is defined in a later cell, so this call fails on a fresh
# kernel run from top to bottom.
create_valid(all_pos, 'positive_valid')

In [None]:
# Write the unfiltered positive set as a GBK-encoded CSV.
all_pos\
.to_csv(data_dir+"train/positive.csv",
                                index=False, encoding='gbk')

In [None]:
def create_valid(df, fname):
    """Keep the (user_id, date_x) groups with cart/follow activity and write them to CSV.

    A (user_id, date_x) group is kept when its summed `action_type_2` or summed
    `action_type_5` is positive. All rows of `df` belonging to a kept group are
    written to `data_dir + "train/" + fname + ".csv"` (GBK encoded, no index).
    The number of kept groups and the number of written rows are printed.

    Args:
        df: frame with at least `user_id`, `date_x`, `action_type_2` and
            `action_type_5` columns. Its `date_x` column is converted to
            datetimes in place, as in the original implementation.
        fname: output file name without directory or extension.
    """
    # Convert the key column before grouping so that both sides of the merge below
    # carry the same dtype; converting afterwards can leave the keys unable to match.
    df['date_x'] = pd.to_datetime(df['date_x'])

    # Only these two columns decide validity, so only they are summed.
    sum_by_date = df.groupby(['user_id', 'date_x'], as_index=False)[['action_type_2', 'action_type_5']].sum()
    valid = sum_by_date[(sum_by_date.action_type_2 > 0) | (sum_by_date.action_type_5 > 0)]
    print(len(valid))

    merged = pd.merge(df, valid[['user_id', 'date_x']], on=['user_id', 'date_x'])
    print(len(merged))
    merged.to_csv(data_dir+"train/"+fname+".csv",
                  index=False, encoding='gbk')

## Negative Set

In [None]:
# Build the negative samples: for every target day, take the users who were active that day
# but did not buy, split them into two parts, and join each part with the same user's rows
# from the ten days before the target day.

# Fixed seed so the split below is reproducible across kernel restarts.
NEG_SPLIT_SEED = 0
neg_columns = ['user_id', 'sku_id', 'date', 'window_start', 'window_end']

neg_dfs = []
cv_neg_dfs = []
for target_date in target_dates:
    day_start = datetime.datetime.strptime(target_date[0], "%Y%m%d")

    buy_action = action_type_valid[(action_type_valid.action_type_4 > 0) &
                                   (action_type_valid.date == target_date[0])]
    buyer_id = buy_action.user_id.unique()
    all_user_id = action_type_valid[(action_type_valid.date >= target_date[0]) &
                                    (action_type_valid.date < target_date[1])].user_id.unique()

    # train_test_split returns (train part, test part). With test_size=0.02, cv_neg_id
    # receives about 98% of the non-buyers and not_buyer_id about 2%.
    # NOTE(review): confirm this assignment order is the intended one.
    # sorted() gives the split a stable input order.
    cv_neg_id, not_buyer_id = train_test_split(sorted(set(all_user_id) - set(buyer_id)),
                                               test_size=0.02, random_state=NEG_SPLIT_SEED)

    # Placeholder timestamp for the negative rows: 10:00 on the target day.
    date = day_start + datetime.timedelta(hours=10)
    window_start = (day_start - datetime.timedelta(days=10)).strftime("%Y%m%d")
    window_end = target_date[0]
    window_action = action_type_valid[(action_type_valid.date >= window_start) &
                                      (action_type_valid.date < window_end)]

    # sku_id 0 marks "bought nothing". The frame is built straight from the row list so each
    # column keeps its own dtype; going through np.array would turn every column into
    # `object` and would fail outright on an empty list.
    arr = [[user_id, 0, date, window_start, window_end] for user_id in not_buyer_id]
    not_buyer_action = pd.DataFrame(arr, columns=neg_columns)
    neg = pd.merge(not_buyer_action, window_action, on='user_id', how='inner')
    neg_dfs.append(neg)

    # Cross-validation negatives are only collected for target days after 2016-04-10.
    if target_date[0] > '20160410':
        cv_neg_arr = [[user_id, 0, date, window_start, window_end] for user_id in cv_neg_id]
        cv_neg_action = pd.DataFrame(cv_neg_arr, columns=neg_columns)
        cv_neg = pd.merge(cv_neg_action, window_action, on='user_id', how='inner')
        cv_neg_dfs.append(cv_neg)

all_cv_neg = pd.concat(cv_neg_dfs)
all_neg = pd.concat(neg_dfs)

In [None]:
# Write the filtered negative sets to train/negative_valid.csv and train/cv_negative_valid.csv.
create_valid(all_neg, 'negative_valid')
create_valid(all_cv_neg, 'cv_negative_valid')

In [None]:
# Write the unfiltered negative set as a GBK-encoded CSV.
all_neg.to_csv(data_dir+"train/negative.csv",
                                index=False, encoding='gbk')

In [None]:
# Write the unfiltered cross-validation negative set as a GBK-encoded CSV.
all_cv_neg.to_csv(data_dir+"train/cv_negative.csv",
                                index=False, encoding='gbk')

## 测试集

In [None]:
import datetime
# Build the prediction set: one placeholder row per user seen on or after 2016-04-06,
# with the target date 2016-04-16 and sku_id 0.
test_df = pd.DataFrame(action_type_valid[action_type_valid.date >= '20160406'].user_id.unique(), 
                       columns=['user_id'])
test_df['date'] = datetime.datetime(2016, 4, 16)
test_df['sku_id'] = 0
test_df['window_start'] = '20160406'
test_df['window_end'] = '20160416'

# Join each placeholder with that user's rows from 2016-04-06 onwards, then filter and
# write the result to train/test.csv through create_valid.
test_df = pd.merge(test_df, action_type_valid[action_type_valid.date >= '20160406'], on = 'user_id')
create_valid(test_df, 'test')

In [None]:
# Number of action_cate8 rows for every (sku_id, type) pair, as a dict keyed by that tuple.
# groupby().size() counts the rows of each group directly, without materialising a
# sub-frame per group just to take its length.
type_count = action_cate8.groupby(['sku_id', 'type']).size().to_dict()

In [None]:
# Distinct user ids, and distinct SKU ids with the sentinel id 0 placed in front.
# Note that `users` is rebound here: it previously held the user table read from CSV.
users = action_cate8.user_id.unique()
items = np.hstack((np.array([0]), action_cate8.sku_id.unique()))

# Map every id to its position in the corresponding array.
userid2idx = dict((uid, pos) for pos, uid in enumerate(users))
itemid2idx = dict((sid, pos) for pos, sid in enumerate(items))

n_factors = 50
n_users = len(users)
n_items = len(items)
n_users, n_items

In [None]:
def create_2dcnn_dataset(df):
    """Turn merged user/window rows into per-day top-10 SKU feature grids.

    Rows are grouped by (user_id, sku_id_x, window_start). For each group and
    each day produced by the date range starting at `window_start`, the ten
    SKUs with the most rows that day are described by 14 numbers each:

        [userid2idx[user], itemid2idx[sku],
         six counts of that user's rows for the SKU that day, one per type 1..6,
         six overall counts for the SKU from `type_count`, one per type 1..6]

    Days with fewer than ten SKUs are padded with slots for the sentinel SKU 0.

    Args:
        df: frame with `user_id`, `sku_id_x`, `window_start`, `sku_id_y`,
            `type_y` and a datetime `time_y` column.

    Returns:
        (feature, label) numpy arrays. `feature` has one entry per group,
        nested as day -> slot -> 14 values. `label` has one
        [itemid2idx[sku_id_x], flag] row per group, with flag 0 when
        sku_id_x is 0 and 1 otherwise.

    Relies on the notebook globals `userid2idx`, `itemid2idx` and `type_count`.
    """
    feature = []
    label = []
    for (uid, sid_buy, window_start), g in df.groupby(['user_id', 'sku_id_x', 'window_start']):
        window = []
        # NOTE(review): newer pandas releases renamed `closed` to `inclusive` for date_range;
        # revisit this call when upgrading.
        date_range = pd.date_range(start=str(window_start), periods=10, freq='D',closed='left')
        for d in date_range:
            day_top10 = []
            day = d.strftime("%Y%m%d")
            # Rows of this group that fall on day `d`, and how often each SKU occurs in them.
            sku_day = g[g.time_y.dt.strftime("%Y%m%d") == day]
            sku_action_count = sku_day.sku_id_y.value_counts()

            if len(sku_action_count) == 0:
                max_sku_ids = []
            else:
                max_sku_ids = sku_action_count.index[np.argsort(sku_action_count)[::-1].values[:10]]

            for i in range(10):
                # Explicit bounds check instead of a bare `except:`. Padding is the only
                # intended fallback; any other error should surface rather than silently
                # produce a zero slot.
                if i < len(max_sku_ids):
                    max_sku_id = max_sku_ids[i]
                    max_sku_type = sku_day[sku_day.sku_id_y==max_sku_id]['type_y'].value_counts().to_dict()
                else:
                    max_sku_id = 0
                    max_sku_type = {j:0 for j in range(1,7)}

                # Slots 0-5: per-type counts for this user/SKU/day. Slots 6-11: overall per-type counts.
                c_arr = [0] * 12
                for t, c in max_sku_type.items():
                    t = int(t)
                    c_arr[t-1] = c
                    c_arr[t-1+6] = type_count.get((max_sku_id, t), 0)

                day_top10.append([userid2idx[uid], itemid2idx[max_sku_id]] + c_arr)

            window.append(day_top10)
        feature.append(window)
        label.append([itemid2idx[sid_buy], 0 if sid_buy == 0 else 1])
    return np.array(feature), np.array(label)

In [None]:
import datetime
# Build a small sample of the prediction set: 10 randomly drawn users (no seed is set, so the
# draw differs between runs) among those with actions after 2016-04-06.
test_df = pd.DataFrame(action_cate8[action_cate8.time > '20160406'].user_id.unique(), 
                       columns=['user_id']).sample(10)
test_df['time'] = datetime.datetime(2016, 4, 16)
test_df['sku_id'] = 0
test_df['window_start'] = '20160406'
test_df['window_end'] = '20160416'
# Columns present on both sides get the _x / _y suffixes in the merge.
test_df = pd.merge(test_df, action_cate8[action_cate8.time > '20160406'], on = 'user_id')

# `type` exists only on the right-hand side, so it is renamed to the `type_y` name the builder expects.
test, _ = create_2dcnn_dataset(test_df.rename(index=str, columns={"type": "type_y"}))

# NOTE(review): `from utils import *` does not bind the name `utils` by itself - confirm it is in scope.
# NOTE(review): the path is hard-coded instead of using data_dir.
utils.save_array('data_new/'+'sample/test_cnn2d', test)

In [None]:
# Load a previously saved feature array.
# NOTE(review): this reads 'data_new/test_cnn2d', while the cell above saves to
# 'data_new/sample/test_cnn2d' - confirm which file is intended.
test = utils.load_array('data_new/test_cnn2d')

In [None]:
# One heatmap per feature index 2..7: the feature summed over the first axis of `test`
# and divided by len(test).
# Presumably these are the six per-type count slots that follow the user and item
# indices in each feature row - TODO confirm.
for t in range(2,8):
    plt.figure()
    sns.heatmap(np.sum(test[:, :, :, t], 0)\
                       /float(len(test)), annot=True)

In [None]:
# SKUs that have a type-4 row in action_cate8 but never appear as sku_id_x in all_pos.
set_buy = action_cate8[action_cate8.type==4]['sku_id'].unique()
set_positive = all_pos.sku_id_x.unique()
set(set_buy) - set(set_positive)

In [None]:
# Ad-hoc inspection: type-4 rows for the hard-coded SKU 26796.
action_cate8[(action_cate8.sku_id==26796)&(action_cate8.type==4)]

In [None]:
# Ad-hoc inspection: every row of the hard-coded user 13636, in time order.
action_cate8[(action_cate8.user_id==13636)].sort_values('time')

In [None]:
# Split the positives by time at 2016-04-11.
# Both comparisons are strict, so rows whose time_x equals '20160411' exactly land in neither part.
# NOTE(review): create_valid reads a `date_x` column from all_pos, not `time_x` -
# confirm this cell matches the current shape of all_pos.
train = all_pos[all_pos.time_x < '20160411']
cv = all_pos[all_pos.time_x > '20160411']

In [None]:
# Rebuild the id -> position maps from the distinct user and SKU ids in action_cate8.
# NOTE(review): unlike the other two copies of this cell, `items` is built here without the
# leading sentinel id 0, so itemid2idx[0] is only defined if 0 is a real sku_id.
items = action_cate8.sku_id.unique()

users = action_cate8.user_id.unique()

userid2idx = {o:i for i,o in enumerate(users)}
itemid2idx = {o:i for i,o in enumerate(items)}

n_factors = 50
n_users = len(users)
n_items = len(items)
n_users, n_items



In [None]:
# Number of distinct (user_id, sku_id_x) pairs in the positive set.
pos_user = len(all_pos.groupby(['user_id', 'sku_id_x']))

In [None]:
# Number of action_cate8 rows for every (sku_id, type) pair, as a dict keyed by that tuple.
# groupby().size() counts the rows of each group directly, without materialising a
# sub-frame per group just to take its length.
type_count = action_cate8.groupby(['sku_id', 'type']).size().to_dict()

In [None]:
def create_dataset(df):
    """Build one flat feature row per (user, bought SKU, other SKU) combination.

    Rows are grouped by (user_id, sku_id_x) and then by sku_id_y. Each inner
    group yields:

        [userid2idx[user], itemid2idx[sku_id_y],
         six counts of the inner group's rows, one per type 1..6,
         six overall counts for sku_id_y from `type_count`, one per type 1..6]

    Args:
        df: frame with `user_id`, `sku_id_x`, `sku_id_y` and `type_y` columns.

    Returns:
        (features, labels) numpy arrays, where labels[i] is
        itemid2idx[sku_id_x] for the group that produced features[i].

    Relies on the notebook globals `userid2idx`, `itemid2idx` and `type_count`.
    """
    trn = []
    trn_label = []
    for (uid, sid_buy), g in df.groupby(['user_id', 'sku_id_x']):
        for oth_sid, og in g.groupby('sku_id_y'):
            c_arr = [0] * 12
            # Count types within this SKU's own rows (`og`). Counting over the whole
            # user/purchase group would give every SKU identical per-type counts.
            for t, c in og.type_y.value_counts().to_dict().items():
                t = int(t)
                c_arr[t-1] = c
                # .get mirrors the CNN dataset builders: a missing pair counts as 0.
                c_arr[t-1+6] = type_count.get((oth_sid, t), 0)
            # The loop variable is `oth_sid`; the previous `sid_oth` raised NameError.
            trn.append([userid2idx[uid], itemid2idx[oth_sid]] + c_arr)
            trn_label.append(itemid2idx[sid_buy])
    return np.array(trn), np.array(trn_label)

## Negative Dataset

In [None]:
# Users that appear in action_cate8 but never in the positive set, as a plain list.
neg_user = list(set(action_cate8.user_id) - set(all_pos.user_id))

In [None]:
# Sample negative (user, day) placeholders and attach each sampled user's category-8 actions.
neg_arr = []
# A user is drawn on a given day when a uniform random number exceeds `ratio`.
# neg_user is a list, so its length (not the list itself) goes into the ratio.
ratio = (1 - pos_user / float(len(neg_user)) / 30)
print(ratio)

# Boolean-mask indexing needs an array; a plain list cannot be indexed that way.
neg_user_arr = np.array(neg_user)
for d in pd.date_range(start='20160315', end='20160416', freq='D'):
    nu = neg_user_arr[np.random.rand(len(neg_user_arr)) > ratio]
    for u in nu:
        # sku_id 0 marks "bought nothing" on day d.
        neg_arr.append([u, 0, d])

# Built straight from the row list so each column keeps its own dtype
# (np.array would coerce every column to `object`).
neg_df = pd.DataFrame(neg_arr, columns=['user_id', 'sku_id', 'time'])
neg = pd.merge(neg_df, action_cate8, on='user_id', how='left')
#     action_cate8[(action_cate8.user_id.isin(neg_user))\
#                  &(action_cate8.time.between(window_start, window_end))].to_csv("data/train/negtive.csv", index=False)

In [None]:
import pdb
# For every target day, sample negative users among those active in the preceding window
# and keep their window rows, tagged with the target day and the sentinel sku_id_x 0.
neg_dfs = []
ratio = (1 - pos_user / float(len(neg_user)) / 30)
neg_df = None

# Membership in neg_user does not depend on the day, so compute it once.
is_neg_user = action_cate8.user_id.isin(neg_user)

for d in pd.date_range(start='20160315', end='20160416', freq='D'):
    window_start = d - datetime.timedelta(days=10)
    window_end = d
    # `between` includes both end points.
    window_data = action_cate8[action_cate8.time.between(window_start, window_end) & is_neg_user]

    neg_users_inside_window = window_data.user_id.unique()
    neg_users_sample = neg_users_inside_window[np.random.rand(len(neg_users_inside_window)) > ratio]

    # .copy() so the assignments below write to an independent frame, not a slice of window_data.
    neg_data = window_data[window_data.user_id.isin(neg_users_sample)].copy()
    neg_data['time_x'] = d
    neg_data['sku_id_x'] = 0

    neg_dfs.append(neg_data)

# Show how many daily frames were collected instead of dumping them all.
len(neg_dfs)
#     action_cate8[(action_cate8.user_id.isin(neg_user))\
#                  &(action_cate8.time.between(window_start, window_end))].to_csv("data/train/negtive.csv", index=False)

In [None]:
# Stack the per-day negative frames into one table.
neg_df = pd.concat(neg_dfs)

In [None]:
# NOTE(review): this writes under "data/", not data_dir ('data_new/'), and a later cell
# writes to this same path - confirm which output is wanted.
neg_df.to_csv("data/train/negtive.csv", index=False)

In [None]:
# Number of distinct (user_id, sku_id_x, sku_id_y) combinations in `neg`.
len(neg.groupby(['user_id', 'sku_id_x', 'sku_id_y']))

In [None]:
# window_end is midnight of the target day; window_start is ten days before that.
# Vectorised replacement for a row-wise apply that round-tripped every timestamp through
# strftime/strptime. pd.to_datetime also covers a time_x column stored with object dtype.
neg['window_end'] = pd.to_datetime(neg['time_x']).dt.normalize()
neg['window_start'] = neg['window_end'] - datetime.timedelta(days=10)

# Keep only the actions that fall inside each row's window (both ends inclusive) and export them.
in_window = neg.time_y.between(neg.window_start, neg.window_end)
export_columns = ['user_id', 'sku_id_x', 'sku_id_y', 'time_x', 'time_y',
                  'model_id', 'type', 'cate', 'brand']
neg.loc[in_window, export_columns].to_csv("data/train/negtive.csv",
                                          index=False, encoding='gbk')

In [None]:
# Ad-hoc inspection. Only the last expression of a cell is displayed, so the argmax
# result on the first line is computed and then discarded.
np.argmax(neg.sku_id_y.value_counts())
# Number of `neg` rows for the hard-coded SKU 12564.
len(neg[neg.sku_id_y==12564])

In [None]:
# Rebuild the id -> position maps, this time with the sentinel SKU id 0 placed first in `items`.
items = np.hstack((np.array([0]), action_cate8.sku_id.unique()))

users = action_cate8.user_id.unique()

userid2idx = {o:i for i,o in enumerate(users)}
itemid2idx = {o:i for i,o in enumerate(items)}

n_users = len(users)
n_items = len(items)
n_factors = 50
n_users, n_items

In [None]:
# Label vocabulary: every SKU that appears as sku_id_x in the positives, plus the sentinel 0
# appended at the end, and the map from each id to its position.
targets = np.hstack((all_pos['sku_id_x'].unique(), np.array([0])))
targetid2idx = {o:i for i,o in enumerate(targets)}
n_targets = len(targets)
n_targets

In [None]:
import pdb
def create_cnn_dataset(df):
    """Turn merged user/purchase rows into per-day top-10 SKU feature grids.

    Rows are grouped by (user_id, sku_id_x, time_x). For each group and each
    day in the date range that ends at the date of `time_x`, the ten SKUs with
    the most rows that day are described by 14 numbers each:

        [userid2idx[user], itemid2idx[sku],
         six counts of that user's rows for the SKU that day, one per type 1..6,
         six overall counts for the SKU from `type_count`, one per type 1..6]

    Days with fewer than ten SKUs are padded with slots for the sentinel SKU 0.

    Args:
        df: frame with `user_id`, `sku_id_x`, `sku_id_y`, `type_y`, and
            datetime `time_x` / `time_y` columns.

    Returns:
        (feature, label) numpy arrays. `feature` has one entry per group,
        nested as day -> slot -> 14 values. `label` has one
        [targetid2idx[sku_id_x], flag] row per group, with flag 0 when
        sku_id_x is 0 and 1 otherwise.

    Relies on the notebook globals `userid2idx`, `itemid2idx`, `targetid2idx`
    and `type_count`.
    """
    feature = []
    label = []
    for (uid, sid_buy, time_buy), g in df.groupby(['user_id', 'sku_id_x', 'time_x']):
        window = []
        # NOTE(review): newer pandas releases renamed `closed` to `inclusive` for date_range;
        # revisit this call when upgrading.
        date_range = pd.date_range(end=time_buy.strftime("%Y%m%d"),
                                   periods=11, freq='D', closed='left')
        for d in date_range:
            day_top10 = []
            day = d.strftime("%Y%m%d")
            # Rows of this group that fall on day `d`.
            sku_day = g[g.time_y.dt.strftime("%Y%m%d") == day]
            # value_counts() is ordered by descending count, so its first ten labels are the
            # day's most frequent SKUs. The earlier argsort(...)[:10] followed by `.index[i]`
            # read exactly these labels, but only by accident.
            sku_action_count = sku_day.sku_id_y.value_counts()
            max_sku_ids = list(sku_action_count.index[:10])

            for i in range(10):
                # Explicit bounds check instead of a bare `except:`. Padding is the only
                # intended fallback; any other error should surface.
                if i < len(max_sku_ids):
                    max_sku_id = max_sku_ids[i]
                    max_sku_type = sku_day[sku_day.sku_id_y==max_sku_id]['type_y'].value_counts().to_dict()
                else:
                    max_sku_id = 0
                    max_sku_type = {j:0 for j in range(1,7)}

                # Slots 0-5: per-type counts for this user/SKU/day. Slots 6-11: overall per-type counts.
                c_arr = [0] * 12
                for t, c in max_sku_type.items():
                    t = int(t)
                    c_arr[t-1] = c
                    c_arr[t-1+6] = type_count.get((max_sku_id, t), 0)

                day_top10.append([userid2idx[uid], itemid2idx[max_sku_id]] + c_arr)

            window.append(day_top10)
        feature.append(window)
        label.append([targetid2idx[sid_buy], 0 if sid_buy == 0 else 1])
    return np.array(feature), np.array(label)

In [None]:
# Smoke-test the builder on the first 100 rows and look at the resulting array shape.
# NOTE(review): `pos04` is not defined anywhere in the visible notebook - confirm where it comes from.
sample, label_sample = create_cnn_dataset(pos04.head(100))
sample.shape

In [None]:
import datetime
# Prediction set for every user with actions after 2016-04-06: one placeholder row per user
# (target time 2016-04-16, sku_id 0) joined with that user's actions from the same period.
test_df = pd.DataFrame(action_cate8[action_cate8.time > '20160406'].user_id.unique(), 
                       columns=['user_id'])
test_df['time'] = datetime.datetime(2016, 4, 16)
test_df['sku_id'] = 0
test_df = pd.merge(test_df, action_cate8[action_cate8.time > '20160406'], on = 'user_id')

In [None]:
# Build the feature array for the prediction set; the label array is discarded.
# `type` is renamed to the `type_y` column name the builder reads.
test2d, _ = create_cnn_dataset(test_df.rename(index=str, columns={"type": "type_y"}))

In [None]:
import bcolz
# Persist an array to the on-disk bcolz directory 'test' (mode='w') and flush it.
# NOTE(review): this saves `test`, not the `test2d` array built in the cell above - confirm which is intended.
c=bcolz.carray(test, rootdir='test', mode='w')
c.flush()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(rc={"figure.figsize": (20, 10)})
plt.rc('figure', figsize=(20, 10))
import numpy as np
import datetime
from pandas import Timedelta
# Count plot of type-4 rows by their `time` value.
# NOTE(review): newer seaborn releases renamed factorplot to catplot.
sns.factorplot(x='time', kind='count', data=action_cate8[action_cate8.type==4])

In [None]:
from IPython.display import FileLink
# Render a link to result.csv in the notebook output.
# NOTE(review): no visible cell writes a file with this name.
FileLink('result.csv')

In [None]:
# Scratch cell: displays an array of six zeros; nothing else reads the result.
np.zeros(6)