# 数据探索 处理用户行为数据


以商品推荐为例，首先构建三类基本特征。

1、用户对商品的行为特征。最近3天（7天/14天/30天/总的）用户点击（收藏/加入购物车/购买）商品的次数；最后一次点击时间；点击（收藏/加入购物车/购买）的天数……

2、商品自身的特征。最近7天（30天/总的）被点击（收藏/加入购物车/购买）次数，最近7天（30天/总的）点击（收藏/加入购物车/购买）该商品的用户数目，回头客的数目……

3、用户自身的特征。购买的商品件数；第一次（最后一次）访问时间（购买时间）……

由基本特征衍生出一些特征，其中包含很多强相关特征。例如，转化率，回头率，最近一个月用户点击（购买）该商品的次数除以用户对所有商品的点击（购买）次数……

特征的扩充通常是将基本特征两两相除、相乘、求交、求并等等，获得新的特征。将单特征的属性值用0-1编码扩充成多个特征也是常用的技术之一，俗称“哑变量”。也可以将多个特征的属性值按一定的权值相加，构成新的特征。

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import *
%matplotlib inline

### 1.读入2月数据

In [None]:
# File = '../action02/000000_0'

# #colInfo =['user_id','sku_id','time','model_id','type','cate','brand']
# colInfo =['user_id','sku_id','cate','brand', \
#           'sum_type1','sum_type2','sum_type3','sum_type4','sum_type5','sum_type6']

# data = pd.read_table(File,sep='\001',names=colInfo)
# data.head() #显示数据的前5行

In [2]:
# Read the JData action log (file named for 2016-02), decoding it as GBK.
File = '../JData_Action_201602.csv'
df = pd.read_csv(filepath_or_buffer=File, encoding='gbk')

In [3]:
df.head()

Unnamed: 0,user_id,sku_id,time,model_id,type,cate,brand
0,266079.0,138778,2016-01-31 23:59:02,,1,8,403
1,266079.0,138778,2016-01-31 23:59:03,0.0,6,8,403
2,200719.0,61226,2016-01-31 23:59:07,,1,8,30
3,200719.0,61226,2016-01-31 23:59:08,0.0,6,8,30
4,263587.0,72348,2016-01-31 23:59:08,,1,5,159


### 2.行为数据探索

In [None]:
# df.describe()

In [None]:
# Number of records in the action log.
df.shape[0]

### 3.数据清洗

In [None]:
# Inspect the Python type of the first user_id value. `.ix` was deprecated in
# pandas 0.20 and removed in 1.0; `.loc` is the label-based replacement and
# selects the same cell here (row label 0, column 'user_id').
type(df.loc[0, 'user_id'])

In [None]:
# Convert one user id from float to str (through int, so there is no trailing ".0").
str(int(df['user_id'].iloc[0]))

In [3]:
# Store every user id as a string: cast float -> int -> str so that 266079.0
# becomes '266079'. The casts are vectorised and keep df's own index, so the
# result lines up row-for-row without building an intermediate Python list.
df['userid'] = df['user_id'].astype('int64').astype(str)

In [4]:
# Discard the original float-typed id column.
df = df.drop(labels='user_id', axis='columns')

In [5]:
# Rename the string-typed column back to 'user_id'.
df.rename(columns=dict(userid='user_id'), inplace=True)

In [6]:
df.head()

Unnamed: 0,sku_id,time,model_id,type,cate,brand,user_id
0,138778,2016-01-31 23:59:02,,1,8,403,266079
1,138778,2016-01-31 23:59:03,0.0,6,8,403,266079
2,61226,2016-01-31 23:59:07,,1,8,30,200719
3,61226,2016-01-31 23:59:08,0.0,6,8,30,200719
4,72348,2016-01-31 23:59:08,,1,5,159,263587


In [None]:
#df['user_id']=pd.Series([str(x) for x in df['user'].values])

In [None]:
#df.head()

In [None]:
#df = df.drop('user', axis=1)

In [None]:
# 处理数据里面的Nan

In [7]:
# Parse the string timestamps in 'time' into datetimes, then discard the raw column.
df['Date'] = pd.to_datetime(df['time'])
df = df.drop(labels='time', axis='columns')

In [8]:
# df['Year'] = df['Date'].dt.year
# Extract calendar fields through the vectorised .dt accessor rather than
# calling a Python lambda once per row.
df['Month'] = df['Date'].dt.month
# Day of the week as an integer, Monday=0 ... Sunday=6.
df['weekday'] = df['Date'].dt.weekday

In [10]:
# The datetime column is no longer needed once Month and weekday exist.
df = df.drop(labels='Date', axis='columns')

### 4.查看属性的取值分布情况 

In [None]:
# Number of distinct user_id values.
df['user_id'].unique().size

In [None]:
# Number of distinct sku_id values.
df['sku_id'].unique().size

In [None]:
# Number of distinct model_id values; unique() keeps NaN, so missing ids
# count as one value.
df['model_id'].unique().size

In [None]:
# Number of distinct cate values.
df['cate'].unique().size

In [None]:
# Number of distinct brand values.
df['brand'].unique().size

### 5.特征处理 

In [11]:
df.head(10)

Unnamed: 0,sku_id,model_id,type,cate,brand,user_id,Month,weekday
0,138778,,1,8,403,266079,1,6
1,138778,0.0,6,8,403,266079,1,6
2,61226,,1,8,30,200719,1,6
3,61226,0.0,6,8,30,200719,1,6
4,72348,,1,5,159,263587,1,6
5,103126,,1,4,174,296130,1,6
6,103126,0.0,6,4,174,296130,1,6
7,103126,0.0,6,4,174,296130,1,6
8,137328,111.0,6,6,159,217892,1,6
9,4732,,1,6,375,283139,1,6


In [12]:
# One-hot encode the behaviour code in 'type' (one indicator column per value).
type_dummy = pd.get_dummies(data=df['type'])
type_dummy.head(n=5)

Unnamed: 0,1,2,3,4,5,6
0,1,0,0,0,0,0
1,0,0,0,0,0,1
2,1,0,0,0,0,0
3,0,0,0,0,0,1
4,1,0,0,0,0,0


In [3]:
# Column labels for the behaviour indicators, in the form 'type_<code>'.
type_list = ['type_%d' % int(code) for code in type_dummy.columns]
type_list

NameError: name 'type_dummy' is not defined

In [14]:
# One-hot encode the day of the week (one indicator column per weekday value).
weekday_dummy = pd.get_dummies(data=df['weekday'])
weekday_dummy.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6
0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1


In [15]:
# Column labels for the weekday indicators, in the form 'weekday_<n>'.
weekday_list = ['weekday_%d' % int(day) for day in weekday_dummy.columns]
weekday_list

['weekday_0',
 'weekday_1',
 'weekday_2',
 'weekday_3',
 'weekday_4',
 'weekday_5',
 'weekday_6']

In [15]:
# One-hot encode the clicked module id. Built from the full frame, so it has
# one row per row of df.
model_dummy = pd.get_dummies(data=df['model_id'])
model_dummy.head(n=5)

Unnamed: 0,0.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,...,339.0,340.0,341.0,342.0,343.0,344.0,345.0,346.0,347.0,348.0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Column labels for the module indicators; the float labels are cast to int
# so the names read 'mod_0', 'mod_11', ...
mod_list = ['mod_%d' % int(module) for module in model_dummy.columns]
mod_list

['mod_0',
 'mod_11',
 'mod_12',
 'mod_13',
 'mod_14',
 'mod_15',
 'mod_16',
 'mod_17',
 'mod_18',
 'mod_19',
 'mod_21',
 'mod_22',
 'mod_23',
 'mod_24',
 'mod_25',
 'mod_26',
 'mod_27',
 'mod_28',
 'mod_29',
 'mod_31',
 'mod_32',
 'mod_33',
 'mod_34',
 'mod_36',
 'mod_39',
 'mod_110',
 'mod_111',
 'mod_112',
 'mod_113',
 'mod_114',
 'mod_115',
 'mod_116',
 'mod_119',
 'mod_120',
 'mod_121',
 'mod_122',
 'mod_124',
 'mod_125',
 'mod_210',
 'mod_211',
 'mod_216',
 'mod_217',
 'mod_218',
 'mod_219',
 'mod_220',
 'mod_221',
 'mod_222',
 'mod_223',
 'mod_224',
 'mod_311',
 'mod_312',
 'mod_313',
 'mod_315',
 'mod_316',
 'mod_318',
 'mod_319',
 'mod_320',
 'mod_321',
 'mod_322',
 'mod_323',
 'mod_325',
 'mod_326',
 'mod_328',
 'mod_329',
 'mod_331',
 'mod_333',
 'mod_334',
 'mod_335',
 'mod_336',
 'mod_337',
 'mod_339',
 'mod_340',
 'mod_341',
 'mod_342',
 'mod_343',
 'mod_344',
 'mod_345',
 'mod_346',
 'mod_347',
 'mod_348']

In [16]:
# One-hot encode the category code (one indicator column per cate value).
cate_dummy = pd.get_dummies(data=df['cate'])
cate_dummy.head(n=5)

Unnamed: 0,4,5,6,7,8,9,10,11
0,0,0,0,0,1,0,0,0
1,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0
3,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0


In [27]:
# Column labels for the category indicators, in the form 'cate_<code>'.
cate_list = ['cate_%d' % int(code) for code in cate_dummy.columns]
cate_list

['cate_4',
 'cate_5',
 'cate_6',
 'cate_7',
 'cate_8',
 'cate_9',
 'cate_10',
 'cate_11']

In [17]:
# 对brand做离散化
# brand_dummy = pd.get_dummies(df['brand'])
# brand_dummy.head()

Unnamed: 0,4,6,8,18,22,24,29,30,36,38,...,909,910,911,912,914,916,917,918,922,923
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# del brand_dummy

In [16]:
# Join the id columns of the raw data with the one-hot encoded columns
# (type_dummy and weekday_dummy), side by side.
df_tmp = pd.concat(
    [df[['user_id', 'sku_id']], type_dummy, weekday_dummy],
    axis='columns',
)

In [17]:
# Replace the numeric dummy labels with the readable names for easier handling.
df_tmp.columns = ['user_id', 'sku_id', *type_list, *weekday_list]

In [18]:
df_tmp.head()

Unnamed: 0,user_id,sku_id,type_1,type_2,type_3,type_4,type_5,type_6,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,266079,138778,1,0,0,0,0,0,0,0,0,0,0,0,1
1,266079,138778,0,0,0,0,0,1,0,0,0,0,0,0,1
2,200719,61226,1,0,0,0,0,0,0,0,0,0,0,0,1
3,200719,61226,0,0,0,0,0,1,0,0,0,0,0,0,1
4,263587,72348,1,0,0,0,0,0,0,0,0,0,0,0,1


In [32]:
# Persist the row-level one-hot table without the row index.
df_tmp.to_csv(path_or_buf='../action02_df_tmp.csv', index=False)

### 根据user_id,sku_id,对用户行为做计数

In [19]:
# Count behaviours over the whole of February.
# For every <user, sku> pair: how often each behaviour type occurred, and how
# many actions (of any type) fell on each day of the week.
grouped = df_tmp.groupby(['user_id', 'sku_id'])[type_list + weekday_list].sum()
df_type = grouped.copy()

In [20]:
df_type.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,type_1,type_2,type_3,type_4,type_5,type_6,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
user_id,sku_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
200002,3752,1.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
200002,5505,2.0,0.0,0.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
200002,5757,6.0,0.0,0.0,0.0,0.0,7.0,6.0,0.0,0.0,0.0,0.0,0.0,7.0
200002,7199,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
200002,8792,4.0,0.0,0.0,0.0,0.0,13.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0
200002,18103,1.0,0.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
200002,18412,1.0,0.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
200002,19253,4.0,0.0,0.0,0.0,0.0,7.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0
200002,20308,1.0,0.0,0.0,0.0,0.0,9.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
200002,24369,4.0,0.0,0.0,0.0,0.0,3.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Persist the per-(user, sku) counts; the (user_id, sku_id) index is written too.
df_type.to_csv(path_or_buf='../action02_df_type_weekday.csv', index=True)

### 用户自身的特征 每一种行为次数

In [2]:
# Reload the per-(user, sku) counts from disk, decoding the file as GBK.
File2 = '../action02_df_type_weekday.csv'
df_grp = pd.read_csv(filepath_or_buffer=File2, encoding='gbk')

In [4]:
df_grp.columns

Index(['user_id', 'sku_id', 'type_1', 'type_2', 'type_3', 'type_4', 'type_5',
       'type_6', 'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3',
       'weekday_4', 'weekday_5', 'weekday_6'],
      dtype='object')

In [5]:
# Names of the behaviour-count columns (those beginning with 'type').
[name for name in df_grp.columns if name.startswith('type')]

['type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'type_6']

In [6]:
# Per user: total count of each behaviour type across all skus.
grouped_user = df_grp.groupby('user_id')[
    [name for name in df_grp.columns if name.startswith('type')]
].sum()
df_user = grouped_user.copy()

In [7]:
df_user.head()

Unnamed: 0_level_0,type_1,type_2,type_3,type_4,type_5,type_6
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
200002,139.0,1.0,0.0,0.0,0.0,228.0
200003,20.0,0.0,0.0,0.0,0.0,31.0
200005,70.0,2.0,3.0,1.0,2.0,105.0
200007,26.0,0.0,0.0,0.0,0.0,25.0
200008,35.0,0.0,0.0,0.0,0.0,69.0


In [8]:
# Persist the per-user behaviour totals, keeping user_id as the index column.
df_user.to_csv(path_or_buf='../action02_df_user.csv', index=True)

### 用户自身的特征 每周的一天,所有行为次数的合计

In [10]:
# Names of the per-weekday count columns (those beginning with 'weekday').
[name for name in df_grp.columns if name.startswith('weekday')]

['weekday_0',
 'weekday_1',
 'weekday_2',
 'weekday_3',
 'weekday_4',
 'weekday_5',
 'weekday_6']

In [11]:
# Per user: total number of actions on each day of the week.
grouped_wd = df_grp.groupby('user_id')[
    [name for name in df_grp.columns if name.startswith('weekday')]
].sum()
df_wd = grouped_wd.copy()

In [12]:
df_wd.head()

Unnamed: 0_level_0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
200002,164.0,82.0,79.0,10.0,0.0,0.0,33.0
200003,0.0,31.0,20.0,0.0,0.0,0.0,0.0
200005,0.0,63.0,3.0,0.0,5.0,16.0,96.0
200007,4.0,0.0,25.0,0.0,0.0,22.0,0.0
200008,49.0,0.0,0.0,0.0,0.0,44.0,11.0


In [13]:
# Persist the per-user weekday totals, keeping user_id as the index column.
df_wd.to_csv(path_or_buf='../action02_df_wd.csv', index=True)

### 商品的特征 每一个商品发生的行为的次数合计

In [14]:
# Per sku: total count of each behaviour type across all users.
grouped_sku = df_grp.groupby('sku_id')[
    [name for name in df_grp.columns if name.startswith('type')]
].sum()
df_sku = grouped_sku.copy()

In [15]:
df_sku.head()

Unnamed: 0_level_0,type_1,type_2,type_3,type_4,type_5,type_6
sku_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,18.0,0.0,0.0,0.0,0.0,24.0
36,12.0,0.0,0.0,0.0,0.0,40.0
37,5.0,0.0,0.0,0.0,0.0,10.0
40,21.0,0.0,0.0,0.0,0.0,49.0
43,156.0,0.0,0.0,0.0,1.0,229.0


In [8]:
# Persist the per-sku behaviour totals, keeping sku_id as the index column.
df_sku.to_csv(path_or_buf='../action02_df_sku.csv', index=True)

### 商品的特征 每周的一天,所有行为次数的合计

In [16]:
# Per sku: total number of actions on each day of the week.
grouped_sku_wd = df_grp.groupby('sku_id')[
    [name for name in df_grp.columns if name.startswith('weekday')]
].sum()
df_sku_wd = grouped_sku_wd.copy()

In [17]:
df_sku_wd.head()

Unnamed: 0_level_0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
sku_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,11.0,5.0,0.0,4.0,4.0,7.0,11.0
36,15.0,0.0,0.0,17.0,14.0,6.0,0.0
37,0.0,8.0,5.0,0.0,0.0,0.0,2.0
40,14.0,6.0,14.0,10.0,10.0,16.0,0.0
43,104.0,32.0,74.0,26.0,34.0,73.0,43.0


In [18]:
# Persist the per-sku weekday totals, keeping sku_id as the index column.
df_sku_wd.to_csv(path_or_buf='../action02_df_sku_wd.csv', index=True)

### 对点击的模块做计数

In [None]:
# Keep only the rows that carry a model_id (the original note calls these the
# click records), as an independent copy, and show how many there are.
df_type6 = df.loc[df['model_id'].notnull()].copy()
df_type6.shape[0]

In [None]:
# Attach the one-hot module columns to the id columns of the filtered rows.
# model_dummy was built from the full frame, so it has more rows than
# df_type6. join='inner' keeps only the index labels present in both; with the
# default outer join the extra rows would come back with NaN ids, which also
# upcasts sku_id to float.
df_tmp2 = pd.concat([df_type6[['user_id','sku_id']],model_dummy],axis=1,join='inner')

In [None]:
# Rename the columns for easier handling. The module ids are float labels
# (0.0, 11.0, ...), so cast to int first: this yields 'mod_0', 'mod_11', ...
# rather than 'mod_0.0', 'mod_11.0', ..., matching the naming used for the
# other dummy columns (type_1, weekday_0, cate_4).
mod_list = ['mod_'+str(int(x)) for x in df_tmp2.columns[2:].values]
df_tmp2.columns = ['user_id', 'sku_id'] + mod_list

In [None]:
# Count module clicks per sku and persist the result, keeping sku_id as the
# index column.
grouped_mod = df_tmp2.groupby('sku_id')[mod_list].sum()
grouped_mod.to_csv(path_or_buf='../action02_df_mod.csv', index=True)