In [2]:
import pandas as pd
# Load the raw interaction log (one row per user/item event).
user_goods = pd.read_csv('user_goods.csv')
# print(x) with a single argument gives the same output on Python 2 and 3.
print(user_goods.shape)
user_goods.head()

(56256752, 6)


Unnamed: 0,us_id,spu_id,action_type,date,brand_id,cat_id
0,522945.0,338312,0.0,80,10005188,1012
1,44676.0,338312,0.0,80,10005188,1012
2,320812.0,338312,0.0,38,10005188,1012
3,452731.0,338312,0.0,74,10005188,1012
4,259493.0,338312,0.0,52,10005188,1012


In [3]:
# Drop every row that has a NaN in any column (per the original note these are
# rows for items that were never clicked); expected shape afterwards: (56172886, 6).
user_goods = user_goods.dropna(how='any')
user_goods.shape

(56172886, 6)

In [4]:
# Collapse repeated (user, item, date, brand, category) rows into a single row,
# summing the remaining column(s).
# NOTE(review): after summing, action_type can exceed 1 for duplicated purchase
# rows, while later cells test `action_type == 1` / `label == 1` -- confirm this
# is intended.
event_keys = ['us_id', 'spu_id', 'date', 'brand_id', 'cat_id']
user_goods = user_goods.groupby(event_keys, as_index=False).sum()
user_goods.shape

(56140033, 6)

In [5]:
# One-hot encode the distinct category ids; the raw cat_id is kept as the last
# column so the table can be joined back onto other frames by category.
cat = user_goods['cat_id'].drop_duplicates()
cat_dummy = pd.get_dummies(cat, prefix='cat_id').assign(cat_id=cat.values)
cat_dummy

Unnamed: 0,cat_id_271,cat_id_297,cat_id_311,cat_id_337,cat_id_356,cat_id_368,cat_id_403,cat_id_517,cat_id_609,cat_id_680,...,cat_id_48337,cat_id_50513,cat_id_50514,cat_id_50515,cat_id_50516,cat_id_72269,cat_id_73741,cat_id_75798,cat_id_75799,cat_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1012
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,311
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,356
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,297
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1056
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,73741
15,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,680
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7469
21,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,517
35,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,271


In [75]:
def get_user_feature(df, start_date, end_date):
    """Build per-user behaviour features for one observation window.

    Parameters
    ----------
    df : DataFrame
        Needs the columns us_id, spu_id, action_type and date. Every row counts
        as one click and action_type is summed as the purchase count. The
        "first action of a pair" step takes the first row in frame order, so
        rows must already be sorted by date ascending (feature_extraction
        sorts before calling).
    start_date : unused; kept so all window extractors share one signature.
    end_date : reference date; recency features are ``end_date - date``.

    Returns
    -------
    DataFrame with one row per us_id and, in this order, the columns us_id,
    us_buy, us_click, us_buy_click_ratio, us_first_action_date,
    us_last_action_date, us_buy_getian, us_buy_getian_ratio. NaN (including
    0/0 ratios) is replaced by 0.
    """
    by_user = df.groupby('us_id')
    # Purchases and clicks are aggregated on the same us_id index, so they stay
    # aligned by label. The earlier `groupby(..., as_index=False).size()` is a
    # DataFrame on pandas >= 1.1 and cannot be assigned to a single column.
    user_feature = pd.DataFrame(
        {'us_buy': by_user['action_type'].sum(), 'us_click': by_user.size()},
        columns=['us_buy', 'us_click'])
    user_feature['us_buy_click_ratio'] = user_feature['us_buy'] / user_feature['us_click']
    # Days between the reference date and the user's earliest / latest action.
    user_feature['us_first_action_date'] = end_date - by_user['date'].min()
    user_feature['us_last_action_date'] = end_date - by_user['date'].max()
    user_feature = user_feature.reset_index()

    # For every (user, item) pair with at least one purchase row, look up the
    # action_type of the pair's first row: the purchases made on first contact.
    pair_keys = ['us_id', 'spu_id']
    bought_pairs = df.loc[df['action_type'] == 1, pair_keys].drop_duplicates()
    first_action = df[pair_keys + ['action_type']].drop_duplicates(pair_keys)
    first_of_bought = pd.merge(bought_pairs, first_action, how='left', on=pair_keys)
    immediate = first_of_bought[['us_id', 'action_type']].groupby('us_id', as_index=False).sum()
    immediate = immediate.rename(columns={'action_type': 'immediate_buy'})

    user_feature = pd.merge(user_feature, immediate, how='left', on='us_id').fillna(0)
    # "getian" purchases: everything the user bought minus first-contact purchases.
    user_feature['us_buy_getian'] = user_feature['us_buy'] - user_feature['immediate_buy']
    user_feature['us_buy_getian_ratio'] = user_feature['us_buy_getian'] / user_feature['us_buy']
    user_feature = user_feature.drop('immediate_buy', axis=1)
    return user_feature.fillna(0)

In [76]:
# Item (spu) level features
def get_spu_feature(df, start_date, end_date):
    """Build per-item behaviour features for one observation window.

    Parameters
    ----------
    df : DataFrame
        Needs the columns us_id, spu_id, action_type and date. Every row counts
        as one click and action_type is summed as the purchase count. The
        "first action of a pair" step takes the first row in frame order, so
        rows must already be sorted by date ascending (feature_extraction
        sorts before calling).
    start_date : unused; kept so all window extractors share one signature.
    end_date : reference date; recency features are ``end_date - date``.

    Returns
    -------
    DataFrame with one row per spu_id and, in this order, the columns spu_id,
    spu_buy, spu_click, spu_buy_click_ratio, spu_first_action_date,
    spu_last_action_date, spu_buy_getian, spu_buy_getian_ratio. NaN (including
    0/0 ratios) is replaced by 0. No category one-hot columns are attached here.
    """
    by_spu = df.groupby('spu_id')
    # Aggregate on the shared spu_id index so purchases and clicks align by
    # label; `groupby(..., as_index=False).size()` is a DataFrame on
    # pandas >= 1.1 and cannot be assigned to a single column.
    spu_feature = pd.DataFrame(
        {'spu_buy': by_spu['action_type'].sum(), 'spu_click': by_spu.size()},
        columns=['spu_buy', 'spu_click'])
    spu_feature['spu_buy_click_ratio'] = spu_feature['spu_buy'] / spu_feature['spu_click']
    # Days between the reference date and the item's earliest / latest action.
    spu_feature['spu_first_action_date'] = end_date - by_spu['date'].min()
    spu_feature['spu_last_action_date'] = end_date - by_spu['date'].max()
    spu_feature = spu_feature.reset_index()

    # For every (user, item) pair with at least one purchase row, look up the
    # action_type of the pair's first row: the purchases made on first contact.
    pair_keys = ['us_id', 'spu_id']
    bought_pairs = df.loc[df['action_type'] == 1, pair_keys].drop_duplicates()
    first_action = df[pair_keys + ['action_type']].drop_duplicates(pair_keys)
    first_of_bought = pd.merge(bought_pairs, first_action, how='left', on=pair_keys)
    immediate = first_of_bought[['spu_id', 'action_type']].groupby('spu_id', as_index=False).sum()
    immediate = immediate.rename(columns={'action_type': 'immediate_buy'})

    spu_feature = pd.merge(spu_feature, immediate, how='left', on='spu_id').fillna(0)
    # "getian" sales: all purchases of the item minus first-contact purchases.
    spu_feature['spu_buy_getian'] = spu_feature['spu_buy'] - spu_feature['immediate_buy']
    spu_feature['spu_buy_getian_ratio'] = spu_feature['spu_buy_getian'] / spu_feature['spu_buy']
    spu_feature = spu_feature.drop('immediate_buy', axis=1)
    return spu_feature.fillna(0)

In [77]:
def get_cat_feature(df, start_date, end_date, cat_onehot=None):
    """Build per-category behaviour features for one observation window.

    Parameters
    ----------
    df : DataFrame
        Needs the columns us_id, spu_id, cat_id, action_type and date. Every
        row counts as one click and action_type is summed as the purchase
        count. The "first action of a pair" step takes the first row in frame
        order, so rows must already be sorted by date ascending
        (feature_extraction sorts before calling).
    start_date : unused; kept so all window extractors share one signature.
    end_date : reference date; recency features are ``end_date - date``.
    cat_onehot : DataFrame, optional
        One-hot table keyed by a ``cat_id`` column, left-joined onto the
        result. Defaults to the notebook-level ``cat_dummy`` table.

    Returns
    -------
    DataFrame with one row per cat_id: cat_id, cat_buy, cat_click,
    cat_buy_click_ratio, cat_first_action_date, cat_last_action_date,
    cat_buy_getian, cat_buy_getian_ratio, followed by the one-hot columns.
    NaN (including 0/0 ratios) is replaced by 0.
    """
    if cat_onehot is None:
        # Fall back to the global one-hot table built earlier in the notebook.
        cat_onehot = cat_dummy

    by_cat = df.groupby('cat_id')
    # Aggregate on the shared cat_id index so purchases and clicks align by
    # label; `groupby(..., as_index=False).size()` is a DataFrame on
    # pandas >= 1.1 and cannot be assigned to a single column.
    cat_feature = pd.DataFrame(
        {'cat_buy': by_cat['action_type'].sum(), 'cat_click': by_cat.size()},
        columns=['cat_buy', 'cat_click'])
    cat_feature['cat_buy_click_ratio'] = cat_feature['cat_buy'] / cat_feature['cat_click']
    # Days between the reference date and the category's earliest / latest action.
    cat_feature['cat_first_action_date'] = end_date - by_cat['date'].min()
    cat_feature['cat_last_action_date'] = end_date - by_cat['date'].max()
    cat_feature = cat_feature.reset_index()

    # For every (user, item) pair with at least one purchase row, look up the
    # action_type (and category) of the pair's first row.
    pair_keys = ['us_id', 'spu_id']
    bought_pairs = df.loc[df['action_type'] == 1, pair_keys].drop_duplicates()
    first_action = df[pair_keys + ['cat_id', 'action_type']].drop_duplicates(pair_keys)
    first_of_bought = pd.merge(bought_pairs, first_action, how='left', on=pair_keys)
    immediate = first_of_bought[['cat_id', 'action_type']].groupby('cat_id', as_index=False).sum()
    immediate = immediate.rename(columns={'action_type': 'immediate_buy'})

    cat_feature = pd.merge(cat_feature, immediate, how='left', on='cat_id').fillna(0)
    # "getian" sales: all purchases in the category minus first-contact purchases.
    cat_feature['cat_buy_getian'] = cat_feature['cat_buy'] - cat_feature['immediate_buy']
    cat_feature['cat_buy_getian_ratio'] = cat_feature['cat_buy_getian'] / cat_feature['cat_buy']
    cat_feature = cat_feature.drop('immediate_buy', axis=1)
    cat_feature = pd.merge(cat_feature, cat_onehot, how='left', on='cat_id')
    return cat_feature.fillna(0)

In [78]:
# User-item features
def get_us_spu_feature(df):
    """Build features for every (user, item) pair in ``df``.

    Parameters
    ----------
    df : DataFrame with the columns us_id, spu_id and action_type. Every row
        counts as one click; action_type is summed as the purchase count.

    Returns
    -------
    DataFrame with one row per (us_id, spu_id) and the columns
    us_id, spu_id,
    us_spu_buy          - purchases of this item by this user,
    us_spu_click        - clicks on this item by this user,
    us_spu_buy_ratio    - us_spu_buy / us_spu_click,
    us_spu_buy_prefer   - us_spu_buy / all purchases of the user,
    us_spu_click_prefer - us_spu_click / all clicks of the user.
    NaN (0/0 ratios) is replaced by 0.
    """
    by_pair = df.groupby(['us_id', 'spu_id'])['action_type']
    # Sum and size share the same group index, so they align by label. The
    # earlier `groupby(..., as_index=False).size().values` is a 2-D array on
    # pandas >= 1.1 and cannot be stored in one column.
    us_spu_feature = pd.DataFrame(
        {'us_spu_buy': by_pair.sum(), 'us_spu_click': by_pair.size()},
        columns=['us_spu_buy', 'us_spu_click']).reset_index()
    us_spu_feature['us_spu_buy_ratio'] = us_spu_feature['us_spu_buy'] / us_spu_feature['us_spu_click']

    # Per-user totals, used only as denominators for the preference shares.
    by_user = df.groupby('us_id')['action_type']
    user_totals = pd.DataFrame(
        {'us_buy': by_user.sum(), 'us_click': by_user.size()},
        columns=['us_buy', 'us_click']).reset_index()

    us_spu_feature = pd.merge(us_spu_feature, user_totals, how='left', on='us_id')
    us_spu_feature['us_spu_buy_prefer'] = us_spu_feature['us_spu_buy'] / us_spu_feature['us_buy']
    us_spu_feature['us_spu_click_prefer'] = us_spu_feature['us_spu_click'] / us_spu_feature['us_click']
    us_spu_feature = us_spu_feature.drop(['us_buy', 'us_click'], axis=1)
    return us_spu_feature.fillna(0)

In [79]:
# User-brand features
def get_us_brand_feature(df):
    """Build features for every (user, brand) pair in ``df``.

    Parameters
    ----------
    df : DataFrame with the columns us_id, brand_id and action_type. Every row
        counts as one click; action_type is summed as the purchase count.

    Returns
    -------
    DataFrame with one row per (us_id, brand_id) and the columns
    us_id, brand_id,
    us_brand_buy          - purchases of this brand by this user,
    us_brand_click        - clicks on this brand by this user,
    us_brand_buy_ratio    - us_brand_buy / us_brand_click,
    us_brand_buy_prefer   - us_brand_buy / all purchases of the user,
    us_brand_click_prefer - us_brand_click / all clicks of the user.
    NaN (0/0 ratios) is replaced by 0.
    """
    by_pair = df.groupby(['us_id', 'brand_id'])['action_type']
    # Sum and size share the same group index, so they align by label. The
    # earlier `groupby(..., as_index=False).size().values` is a 2-D array on
    # pandas >= 1.1 and cannot be stored in one column.
    us_brand_feature = pd.DataFrame(
        {'us_brand_buy': by_pair.sum(), 'us_brand_click': by_pair.size()},
        columns=['us_brand_buy', 'us_brand_click']).reset_index()
    us_brand_feature['us_brand_buy_ratio'] = us_brand_feature['us_brand_buy'] / us_brand_feature['us_brand_click']

    # Per-user totals, used only as denominators for the preference shares.
    by_user = df.groupby('us_id')['action_type']
    user_totals = pd.DataFrame(
        {'us_buy': by_user.sum(), 'us_click': by_user.size()},
        columns=['us_buy', 'us_click']).reset_index()

    us_brand_feature = pd.merge(us_brand_feature, user_totals, how='left', on='us_id')
    us_brand_feature['us_brand_buy_prefer'] = us_brand_feature['us_brand_buy'] / us_brand_feature['us_buy']
    us_brand_feature['us_brand_click_prefer'] = us_brand_feature['us_brand_click'] / us_brand_feature['us_click']
    us_brand_feature = us_brand_feature.drop(['us_buy', 'us_click'], axis=1)
    return us_brand_feature.fillna(0)

In [64]:
# User-category features
def get_us_cat_feature(df):
    """Build user-category interaction features.

    Reads ``df_cat.csv`` from the working directory on every call and
    left-joins it onto the events by ``['cat_id', 'action_type']``.

    NOTE(review): the schema of ``df_cat.csv`` is not visible here. The
    positional indexing below presumes it contributes 98 numeric columns
    (49 "browse" columns followed by 49 "buy" columns) whose names start with
    an 11-character prefix -- confirm against the file.

    Parameters
    ----------
    df : DataFrame containing the columns us_id, cat_id and action_type.

    Returns
    -------
    DataFrame with one row per (us_id, cat_id) present in ``df``: us_id,
    cat_id, us_cat_buy, us_cat_click, followed by the per-user columns
    produced in the loop below. NaN is replaced by 0.
    """
    df_cat = pd.read_csv('df_cat.csv')
    df = df[['us_id', 'cat_id', 'action_type']]
    df = pd.merge(df, df_cat, how='left', on=['cat_id', 'action_type'])
    # NOTE(review): the column names 'action_type_0.0' / 'action_type_1.0' used
    # below exist only if action_type is float and both values occur in df.
    df = pd.get_dummies(df, prefix = ['action_type'], columns = ['action_type'])
    # Per (user, category): sum the joined df_cat columns and count the rows
    # with action_type 0.0 and with action_type 1.0.
    df = df.groupby(['us_id', 'cat_id'], as_index = False).sum()
    df['us_cat_click'] = df['action_type_0.0']+df['action_type_1.0']
    del df['action_type_0.0'] 
    df.rename(columns={'action_type_1.0':'us_cat_buy'}, inplace=True)
    # Per user: totals over all of that user's categories.
    df_us = df.drop('cat_id', axis=1).groupby(['us_id'], as_index = False).sum()
    df_us.rename(columns={'us_cat_buy':'us_buy', 'us_cat_click':'us_click'}, inplace=True) # user totals across all categories
    # Column i is paired with column i+49. New columns are appended at the end,
    # so positions 1..98 keep their meaning while the loop runs.
    for i in range(1,50):
        # Per-category purchase rate: column i+49 / (column i + column i+49)
        df_us['us_cat_ratio'+df_us.columns[i][11:]] = df_us[df_us.columns[i+49]]/(df_us[df_us.columns[i]]+df_us[df_us.columns[i+49]])
        # print df_us.columns[i]
        # Preference 2: (column i + column i+49) as a share of the user's clicks
        df_us[df_us.columns[i]] = (df_us[df_us.columns[i]]+df_us[df_us.columns[i+49]])/df_us['us_click']
        # print df_us.columns[i+49]
        # Preference 1: column i+49 as a share of the user's purchases
        df_us[df_us.columns[i+49]] = df_us[df_us.columns[i+49]]/df_us['us_buy']
    del df_us['us_buy']
    del df_us['us_click']
    df_us_cat = pd.merge(df[['us_id','cat_id','us_cat_buy','us_cat_click']], df_us, how='left', on='us_id').fillna(0)
    return df_us_cat
# Output columns: us_id, cat_id, us_cat_buy - purchases of the user in the category,
# us_cat_click - clicks of the user in the category,
# then 49 + 49 per-category preference columns (overwritten in place by the loop)
# and 49 us_cat_ratio* per-category purchase-rate columns.




In [65]:
# Item-brand features
def get_spu_brand_feature(df):
    """Relate each item's purchase/click volume to that of its brand.

    Parameters
    ----------
    df : DataFrame with the columns spu_id, brand_id and action_type. Every
        row counts as one click; action_type is summed as the purchase count.

    Returns
    -------
    DataFrame with one row per (spu_id, brand_id) and the columns
    spu_id, brand_id,
    spu_brand_buy       - item purchases / brand purchases,
    spu_brand_click     - item clicks / brand clicks,
    spu_brand_buy_ratio - item purchase rate / brand purchase rate.
    NaN from 0/0 divisions is replaced by 0.
    """
    by_pair = df.groupby(['spu_id', 'brand_id'])['action_type']
    # Sum and size share the same group index, so they align by label;
    # `groupby(..., as_index=False).size()` is a DataFrame on pandas >= 1.1
    # and cannot be assigned to a single column.
    spu_brand_feature = pd.DataFrame(
        {'spu_buy': by_pair.sum(), 'spu_click': by_pair.size()},
        columns=['spu_buy', 'spu_click']).reset_index()
    spu_brand_feature['spu_buy_ratio'] = spu_brand_feature['spu_buy'] / spu_brand_feature['spu_click']

    # Brand-level totals. rename() returns a new frame here; the earlier
    # inplace='True' passed a string, which pandas rejects as a non-bool.
    df_brand = spu_brand_feature[['brand_id', 'spu_buy', 'spu_click']].groupby('brand_id', as_index=False).sum()
    df_brand = df_brand.rename(columns={'spu_buy': 'brand_buy', 'spu_click': 'brand_click'})
    df_brand['brand_buy_ratio'] = df_brand['brand_buy'] / df_brand['brand_click']

    spu_brand_feature = pd.merge(spu_brand_feature, df_brand, how='left', on='brand_id')
    spu_brand_feature['spu_brand_buy'] = spu_brand_feature['spu_buy'] / spu_brand_feature['brand_buy']
    spu_brand_feature['spu_brand_click'] = spu_brand_feature['spu_click'] / spu_brand_feature['brand_click']
    spu_brand_feature['spu_brand_buy_ratio'] = spu_brand_feature['spu_buy_ratio'] / spu_brand_feature['brand_buy_ratio']
    return spu_brand_feature[['spu_id', 'brand_id', 'spu_brand_buy', 'spu_brand_click', 'spu_brand_buy_ratio']].fillna(0)

In [66]:
# Item-category features
def get_spu_cat_feature(df):
    """Relate each item's purchase/click volume to that of its category.

    Parameters
    ----------
    df : DataFrame with the columns spu_id, cat_id and action_type. Every row
        counts as one click; action_type is summed as the purchase count.

    Returns
    -------
    DataFrame with one row per (spu_id, cat_id) and the columns
    spu_id, cat_id,
    spu_cat_buy       - item purchases / category purchases,
    spu_cat_click     - item clicks / category clicks,
    spu_cat_buy_ratio - item purchase rate / category purchase rate.
    NaN from 0/0 divisions is replaced by 0.
    """
    by_pair = df.groupby(['spu_id', 'cat_id'])['action_type']
    # Sum and size share the same group index, so they align by label;
    # `groupby(..., as_index=False).size()` is a DataFrame on pandas >= 1.1
    # and cannot be assigned to a single column.
    spu_cat_feature = pd.DataFrame(
        {'spu_buy': by_pair.sum(), 'spu_click': by_pair.size()},
        columns=['spu_buy', 'spu_click']).reset_index()
    spu_cat_feature['spu_buy_ratio'] = spu_cat_feature['spu_buy'] / spu_cat_feature['spu_click']

    # Category-level totals. rename() returns a new frame here; the earlier
    # inplace='True' passed a string, which pandas rejects as a non-bool.
    df_cat = spu_cat_feature[['cat_id', 'spu_buy', 'spu_click']].groupby('cat_id', as_index=False).sum()
    df_cat = df_cat.rename(columns={'spu_buy': 'cat_buy', 'spu_click': 'cat_click'})
    df_cat['cat_buy_ratio'] = df_cat['cat_buy'] / df_cat['cat_click']

    spu_cat_feature = pd.merge(spu_cat_feature, df_cat, how='left', on='cat_id')
    spu_cat_feature['spu_cat_buy'] = spu_cat_feature['spu_buy'] / spu_cat_feature['cat_buy']
    spu_cat_feature['spu_cat_click'] = spu_cat_feature['spu_click'] / spu_cat_feature['cat_click']
    spu_cat_feature['spu_cat_buy_ratio'] = spu_cat_feature['spu_buy_ratio'] / spu_cat_feature['cat_buy_ratio']
    return spu_cat_feature[['spu_id', 'cat_id', 'spu_cat_buy', 'spu_cat_click', 'spu_cat_buy_ratio']].fillna(0)

In [67]:
# Brand-category features
def get_brand_cat_feature(df):
    """Relate each brand's purchase/click volume to that of its category.

    Parameters
    ----------
    df : DataFrame with the columns brand_id, cat_id and action_type. Every
        row counts as one click; action_type is summed as the purchase count.

    Returns
    -------
    DataFrame with one row per (brand_id, cat_id) and the columns
    brand_id, cat_id,
    brand_cat_buy       - brand purchases / category purchases,
    brand_cat_click     - brand clicks / category clicks,
    brand_cat_buy_ratio - brand purchase rate / category purchase rate.
    NaN from 0/0 divisions is replaced by 0.
    """
    by_pair = df.groupby(['brand_id', 'cat_id'])['action_type']
    # Sum and size share the same group index, so they align by label;
    # `groupby(..., as_index=False).size()` is a DataFrame on pandas >= 1.1
    # and cannot be assigned to a single column.
    brand_cat_feature = pd.DataFrame(
        {'brand_buy': by_pair.sum(), 'brand_click': by_pair.size()},
        columns=['brand_buy', 'brand_click']).reset_index()
    brand_cat_feature['brand_buy_ratio'] = brand_cat_feature['brand_buy'] / brand_cat_feature['brand_click']

    # Category-level totals. rename() returns a new frame here; the earlier
    # inplace='True' passed a string, which pandas rejects as a non-bool.
    df_cat = brand_cat_feature[['cat_id', 'brand_buy', 'brand_click']].groupby('cat_id', as_index=False).sum()
    df_cat = df_cat.rename(columns={'brand_buy': 'cat_buy', 'brand_click': 'cat_click'})
    df_cat['cat_buy_ratio'] = df_cat['cat_buy'] / df_cat['cat_click']

    brand_cat_feature = pd.merge(brand_cat_feature, df_cat, how='left', on='cat_id')
    brand_cat_feature['brand_cat_buy'] = brand_cat_feature['brand_buy'] / brand_cat_feature['cat_buy']
    brand_cat_feature['brand_cat_click'] = brand_cat_feature['brand_click'] / brand_cat_feature['cat_click']
    brand_cat_feature['brand_cat_buy_ratio'] = brand_cat_feature['brand_buy_ratio'] / brand_cat_feature['cat_buy_ratio']
    return brand_cat_feature[['brand_id', 'cat_id', 'brand_cat_buy', 'brand_cat_click', 'brand_cat_buy_ratio']].fillna(0)

In [68]:
def feature_extraction(df, start_date, end_date):
    """Assemble the full feature table for one observation window.

    Parameters
    ----------
    df : DataFrame with the columns us_id, spu_id, brand_id, cat_id,
        action_type and date.
    start_date, end_date : the window is ``start_date <= date < end_date``.

    Returns
    -------
    DataFrame with one row per distinct (us_id, spu_id) found in ``df``
    (including pairs whose rows all fall outside the window) with every
    feature group left-joined on; pairs or keys without activity inside the
    window get NaN for that group.

    Feature groups, joined in this order:
      user, item, category            (keyed by us_id / spu_id / cat_id)
      user-item, user-brand, user-category
      item-brand, item-category, brand-category
    """
    # One row per (user, item) pair, taken BEFORE the date filter so that pairs
    # present only outside the window are still kept as rows.
    data_features = df[['us_id', 'spu_id', 'brand_id', 'cat_id']].drop_duplicates(['us_id', 'spu_id'])
    print(data_features.shape)
    # Restrict to the window and order by date; the single-key extractors take
    # the first row of a (user, item) pair as its earliest action.
    in_window = (df['date'] >= start_date) & (df['date'] < end_date)
    df = df[in_window].sort_values(by='date')

    # Extractors that need the window bounds (recency features).
    window_extractors = [
        (get_user_feature, 'us_id'),
        (get_spu_feature, 'spu_id'),
        (get_cat_feature, 'cat_id'),
    ]
    for extractor, key in window_extractors:
        data_features = pd.merge(data_features, extractor(df, start_date, end_date),
                                 how='left', on=key)

    # Pairwise extractors; each is merged on its pair of keys.
    pair_extractors = [
        (get_us_spu_feature, ['us_id', 'spu_id']),
        (get_us_brand_feature, ['us_id', 'brand_id']),
        (get_us_cat_feature, ['us_id', 'cat_id']),
        (get_spu_brand_feature, ['spu_id', 'brand_id']),
        (get_spu_cat_feature, ['spu_id', 'cat_id']),
        (get_brand_cat_feature, ['brand_id', 'cat_id']),
    ]
    for extractor, keys in pair_extractors:
        data_features = pd.merge(data_features, extractor(df), how='left', on=keys)

    return data_features

In [69]:
def cut_off(df, n=1, random_state=None):
    """Down-sample negatives so the result holds about ``n`` negatives per positive.

    Parameters
    ----------
    df : DataFrame with a ``label`` column; rows with label 1 are positives,
        rows with label 0 are negatives, any other label is dropped.
    n : ratio of negatives to positives to keep (default 1).
    random_state : optional seed forwarded to ``DataFrame.sample`` so the
        selection can be reproduced.

    Returns
    -------
    DataFrame with the sampled negatives first, followed by all positives.
    Prints the number of positives.
    """
    positives = df[df['label'] == 1]
    sample_num = len(positives)
    print(sample_num)
    negatives = df[df['label'] == 0]
    # sample() without replacement raises when asked for more rows than exist,
    # so never request more negatives than are available.
    wanted = min(int(sample_num * n), len(negatives))
    sampled = negatives.sample(wanted, random_state=random_state)
    return pd.concat([sampled, positives])

In [70]:
def get_X_Y(df, start_date, end_date, label_days=7):
    """Build one balanced, labelled training frame for a single window.

    Features come from ``start_date <= date < end_date``; the label of a
    (user, item) pair is the largest action_type seen for it in
    ``end_date <= date < end_date + label_days`` (0 if it has no row there).

    Parameters
    ----------
    df : DataFrame with the columns us_id, spu_id, action_type, date,
        brand_id and cat_id.
    start_date, end_date : bounds of the observation window.
    label_days : length of the label window in days (default 7).

    Returns
    -------
    DataFrame of features plus a ``label`` column, down-sampled by cut_off.
    Prints the intermediate shapes.
    """
    df_X = df[(df['date'] >= start_date) & (df['date'] < end_date)]
    df_Y = df[(df['date'] >= end_date) & (df['date'] < end_date + label_days)]

    # X: window events plus one placeholder row for every pair that appears in
    # the label window, so those pairs also get a feature row.
    label_pairs = df_Y[['us_id', 'spu_id', 'brand_id', 'cat_id']].drop_duplicates(['us_id', 'spu_id'])
    X = pd.concat([df_X[['us_id', 'spu_id', 'action_type', 'date', 'brand_id', 'cat_id']],
                   label_pairs])
    # Placeholder rows get a date just before the window (so feature_extraction
    # filters them out of the aggregates) and no purchase. Assign the result
    # back: a chained `X['date'].fillna(..., inplace=True)` does not reliably
    # write through to X on newer pandas.
    X['date'] = X['date'].fillna(start_date - 1)
    X['action_type'] = X['action_type'].fillna(0)
    X = feature_extraction(X, start_date, end_date)

    # Missing recency features mean "no action in the window": fill them with
    # the window length + 1, and everything else with 0.
    d = end_date - start_date + 1
    X = X.fillna({'us_first_action_date': d, 'us_last_action_date': d,
                  'spu_first_action_date': d, 'spu_last_action_date': d,
                  'cat_last_action_date': d, 'cat_first_action_date': d})
    X = X.fillna(0)
    print(X.shape)

    # Y: sorting by action_type descending before de-duplicating keeps the
    # largest action_type of each pair as its label.
    Y = pd.concat([df_X[['us_id', 'spu_id']], df_Y[['us_id', 'spu_id', 'action_type']]], axis=0).fillna(0)
    Y = Y.sort_values(by='action_type', ascending=False).drop_duplicates(['us_id', 'spu_id'])
    Y = Y.rename(columns={'action_type': 'label'})
    print(Y.shape)

    # Attach labels, then balance positives and negatives.
    X_train = pd.merge(X, Y, how='left', on=['us_id', 'spu_id'])
    print(X_train.shape)
    X_train = cut_off(X_train)
    print(X_train.shape)
    return X_train

In [71]:
def get_train_data(df, interval, stride, max_date=83):
    """Slide an observation window over the log and stack the training frames.

    Parameters
    ----------
    df : interaction log passed on to get_X_Y.
    interval : length of each observation window in days.
    stride : number of days between consecutive window starts.
    max_date : upper bound for window starts (exclusive); the loop also stops
        after the first window whose label period ends beyond it (default 83).

    Returns
    -------
    (X, y) : feature DataFrame without the id and label columns, and the label
        Series.

    NOTE(review): the stop check runs AFTER a window has been processed, so the
    last window's label period may extend past ``max_date`` -- confirm that the
    data covers it.
    NOTE(review): frames are concatenated once, so columns keep the order
    produced by get_X_Y. Some older pandas releases sorted the columns when the
    previous loop concatenated onto an empty frame -- confirm that nothing
    downstream depends on that order.
    """
    windows = []
    for start_date in range(1, max_date, stride):
        end_date = start_date + interval
        predict_end_date = end_date + 6
        # Same text as the Python 2 `print a, b, c`, valid on Python 3 too.
        print('%s %s %s' % (start_date, end_date, predict_end_date))
        windows.append(get_X_Y(df, start_date, end_date))
        if predict_end_date > max_date:
            break
    if not windows:
        raise ValueError('no training window fits: check stride and max_date')
    # Concatenate once instead of growing a DataFrame inside the loop.
    train_data = pd.concat(windows)
    train_data = train_data.drop(['us_id', 'spu_id', 'brand_id', 'cat_id'], axis=1)
    X = train_data.drop(['label'], axis=1)
    y = train_data['label']
    return X, y

In [72]:
# Build the training set: 7-day observation windows, started every 4 days.
# `data` is the (X, y) tuple returned by get_train_data.
data = get_train_data(user_goods, interval=7, stride=4)

1 8 14
(9872129, 4)
(9872129, 242)
(9872129, 3)
(9872129, 243)
52179
(104358, 243)
8 15 21
(8451198, 4)


KeyboardInterrupt: 

In [74]:
# Preview the windows get_train_data will visit:
# start date, end of the observation window, last day of the label window.
interval = 7
stride = 4
for i in range(1, 83, stride):
    # Same text as the Python 2 `print a, b, c`, valid on Python 3 too.
    print('%s %s %s' % (i, i + interval, i + interval + 6))
    if i + interval + 6 > 83:
        break

1 8 14
5 12 18
9 16 22
13 20 26
17 24 30
21 28 34
25 32 38
29 36 42
33 40 46
37 44 50
41 48 54
45 52 58
49 56 62
53 60 66
57 64 70
61 68 74
65 72 78
69 76 82
73 80 86


In [27]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Reshape, normalization, AveragePooling2D
from keras.optimizers import Adam, RMSprop
from keras import regularizers
from keras import initializers

Using TensorFlow backend.


In [28]:
import os
# Expose only the GPU with index 2 to this process.
# NOTE(review): this runs after the keras import in the cell above; presumably it
# must take effect before the backend first initialises CUDA -- confirm, or move
# it ahead of the import.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [43]:
# Small fully-connected binary classifier: batch-norm on the raw features,
# two ReLU hidden layers (100 and 20 units) and a sigmoid output, all Dense
# kernels L2-regularised. The input width 238 must equal the number of feature
# columns returned by get_train_data.
model = Sequential([
    normalization.BatchNormalization(input_shape=(238,)),
    Dense(100, kernel_regularizer=regularizers.l2(0.0001)),
    Activation('relu'),
    Dense(20, kernel_regularizer=regularizers.l2(0.0001)),
    Activation('relu'),
    Dense(1, kernel_regularizer=regularizers.l2(0.0001)),
    Activation('sigmoid'),
])

In [44]:
# NOTE(review): the output is a sigmoid over 0/1 labels but the loss is mean
# squared error -- confirm this is intended rather than binary cross-entropy.
optimizer = Adam(lr=1e-3)
model.compile(optimizer=optimizer, loss='mean_squared_error')
# Unused learning-rate schedule kept for reference:
# callbacks = [ LearningRateScheduler(lambda x: 1e-3 * 0.9 ** x)] # EarlyStopping(monitor='val_loss',patience= 100,verbose=0),

In [1]:
# Train for one epoch, holding out 30% of the rows for validation.
# Needs `model` and `data` from the cells above: run the notebook top to bottom.
# NOTE(review): presumably validation_split takes the trailing rows without
# shuffling, i.e. the latest windows here -- confirm that is the intended hold-out.
X_train, y_train = data
hist = model.fit(X_train.values, y_train.values,
                 validation_split=0.3, epochs=1, batch_size=512)

NameError: name 'model' is not defined