# 数据探索 处理用户行为数据


以商品推荐为例，首先构建三类基本特征。

1、用户对商品的行为特征。最近3天（7天/14天/30天/总的）用户点击（收藏/加入购物车/购买）商品的次数；最后一次点击时间；点击（收藏/加入购物车/购买）的天数……

2、商品自身的特征。最近7天（30天/总的）被点击（收藏/加入购物车/购买）次数，最近7天（30天/总的）点击（收藏/加入购物车/购买）该商品的用户数目，回头客的数目……

3、用户自身的特征。购买的商品件数；第一次（最后一次）访问时间（购买时间）……

由基本特征衍生出一些特征，其中包含很多强相关特征。例如，转化率，回头率，最近一个月用户点击（购买）该商品的次数除以用户对所有商品的点击（购买）次数……

特征的扩充通常是将基本特征两两相除、相乘、求交、求并等等，获得新的特征。将单特征的属性值用0-1编码扩充成多个特征也是常用的技术之一，俗称“哑变量”。也可以将多个特征的属性值按一定的权值相加，构成新的特征。

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import *
%matplotlib inline

### 1.读入数据

In [None]:
# File = '../action02/000000_0'

# #colInfo =['user_id','sku_id','time','model_id','type','cate','brand']
# colInfo =['user_id','sku_id','cate','brand', \
#           'sum_type1','sum_type2','sum_type3','sum_type4','sum_type5','sum_type6']

# data = pd.read_table(File,sep='\001',names=colInfo)
# data.head() #显示数据的前5行

In [14]:
# Load the JData action log into a DataFrame, decoding the file as GBK.
File = '../JData_Action_201602.csv'
df = pd.read_csv(filepath_or_buffer=File, encoding='gbk')

In [15]:
df.head()

Unnamed: 0,user_id,sku_id,time,model_id,type,cate,brand
0,266079.0,138778,2016-01-31 23:59:02,,1,8,403
1,266079.0,138778,2016-01-31 23:59:03,0.0,6,8,403
2,200719.0,61226,2016-01-31 23:59:07,,1,8,30
3,200719.0,61226,2016-01-31 23:59:08,0.0,6,8,30
4,263587.0,72348,2016-01-31 23:59:08,,1,5,159


### 2.行为数据探索

In [None]:
# df.describe()

In [None]:
# Number of rows in the action log.
df.shape[0]

### 3.数据清洗

In [16]:
# Inspect the type of the first user_id value.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` is the
# label-based replacement and selects the same cell on the integer index.
type(df.loc[0, 'user_id'])

numpy.float64

In [23]:
# Preview the first user_id (read in as a float) converted to an integer.
int(df['user_id'].iloc[0])

266079

In [28]:
# Integer copy of user_id.
# Casting the column directly keeps df's own index, so rows cannot be
# misaligned (a freshly built pd.Series carries a new 0..n-1 index), and it
# avoids a Python-level loop over every row. A NaN user_id still raises.
df['user'] = df['user_id'].astype('int64')

KeyError: 'user_id'

In [29]:
df.head()

Unnamed: 0,sku_id,time,model_id,type,cate,brand,user
0,138778,2016-01-31 23:59:02,,1,8,403,266079
1,138778,2016-01-31 23:59:03,0.0,6,8,403,266079
2,61226,2016-01-31 23:59:07,,1,8,30,200719
3,61226,2016-01-31 23:59:08,0.0,6,8,30,200719
4,72348,2016-01-31 23:59:08,,1,5,159,263587


In [26]:
# Discard the float-typed user_id column.
df = df.drop(labels=['user_id'], axis='columns')

In [31]:
# Rebuild user_id as a string column from the integer `user` column.
# astype(str) keeps df's index (no realignment against a new 0..n-1 Series)
# and replaces the per-row Python loop.
df['user_id'] = df['user'].astype(str)

In [32]:
df.head()

Unnamed: 0,sku_id,time,model_id,type,cate,brand,user,user_id
0,138778,2016-01-31 23:59:02,,1,8,403,266079,266079
1,138778,2016-01-31 23:59:03,0.0,6,8,403,266079,266079
2,61226,2016-01-31 23:59:07,,1,8,30,200719,200719
3,61226,2016-01-31 23:59:08,0.0,6,8,30,200719,200719
4,72348,2016-01-31 23:59:08,,1,5,159,263587,263587


In [33]:
# The temporary integer column is no longer needed.
df = df.drop(labels=['user'], axis='columns')

In [10]:
# 处理数据里面的NaN

### 4.查看属性的取值分布情况 

In [None]:
# Number of distinct user_id values.
df['user_id'].unique().size

In [None]:
# Number of distinct sku_id values.
df['sku_id'].unique().size

In [None]:
# Number of distinct model_id values (a missing value counts as one entry).
df['model_id'].unique().size

In [None]:
# Number of distinct cate values.
df['cate'].unique().size

In [None]:
# Number of distinct brand values.
df['brand'].unique().size

### 5.特征处理 

In [None]:
df.head(10)

In [3]:
# One-hot encode the behaviour type: one indicator column per distinct
# value found in df['type'].
behaviour_codes = df['type']
type_dummy = pd.get_dummies(behaviour_codes)
type_dummy.head()

Unnamed: 0,1,2,3,4,5,6
0,1,0,0,0,0,0
1,0,0,0,0,0,1
2,1,0,0,0,0,0
3,0,0,0,0,0,1
4,1,0,0,0,0,0


In [None]:
# Put the identifying columns of the raw data side by side with the
# one-hot behaviour indicators.
id_columns = ['user_id', 'sku_id', 'cate', 'brand', 'model_id']
df_tmp = pd.concat([df[id_columns], type_dummy], axis=1)

In [None]:
# Rename the indicator columns to type_1 .. type_6 so they are easier to
# refer to; the five identifying columns keep their names.
df_tmp.columns = (
    ['user_id', 'sku_id', 'cate', 'brand', 'model_id']
    + ['type_' + str(code) for code in range(1, 7)]
)

In [None]:
df_tmp.columns

In [None]:
df_tmp.head()

### 每一个用户对每一个商品的各种行为的次数合计

In [None]:
# Sum the six behaviour indicators for every (user_id, sku_id) pair, i.e.
# how many times each user performed each behaviour on each product.
# The source note describes the input as the full month of February.
type_columns = ['type_' + str(code) for code in range(1, 7)]
grouped_type = df_tmp.groupby(['user_id', 'sku_id'])[type_columns].sum()
df_type = grouped_type.copy()

In [None]:
df_type.head(20)

In [None]:
# Save the per-user, per-product behaviour counts, keeping the index
# columns in the output file.
df_type.to_csv(path_or_buf='../action02_df_type.csv', index=True)

### 每一个用户每一种行为的天数

In [4]:
df.head()

Unnamed: 0,user_id,sku_id,time,model_id,type,cate,brand
0,266079.0,138778,2016-01-31 23:59:02,,1,8,403
1,266079.0,138778,2016-01-31 23:59:03,0.0,6,8,403
2,200719.0,61226,2016-01-31 23:59:07,,1,8,30
3,200719.0,61226,2016-01-31 23:59:08,0.0,6,8,30
4,263587.0,72348,2016-01-31 23:59:08,,1,5,159


In [5]:
# Parse the string `time` column into datetimes stored as `Date`, then
# drop the original string column.
raw_time = df['time']
df['Date'] = pd.to_datetime(raw_time)
df = df.drop(labels=['time'], axis='columns')

In [7]:
# Split the timestamp into calendar parts.
# The vectorized .dt accessor replaces a per-row Python lambda, which is
# slow on a frame with millions of rows. weekday is 0 for Monday through
# 6 for Sunday, matching Timestamp.weekday().
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['weekday'] = df['Date'].dt.weekday

In [None]:
df.head()

### 对点击的模块做计数

In [3]:
# Keep only the rows whose model_id is present (the source note describes
# these as the click records) and work on an independent copy.
has_module = df['model_id'].notnull()
df_type6 = df.loc[has_module].copy()
df_type6.shape[0]

6525807

In [4]:
# One-hot encode model_id: one indicator column per distinct module id.
module_ids = df_type6['model_id']
model_dummy = pd.get_dummies(module_ids)
model_dummy.head()

Unnamed: 0,0.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,...,339.0,340.0,341.0,342.0,343.0,344.0,345.0,346.0,347.0,348.0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Put the user and product ids side by side with the one-hot module
# indicators.
key_columns = ['user_id', 'sku_id']
df_tmp2 = pd.concat([df_type6[key_columns], model_dummy], axis=1)

In [6]:
# Prefix every module indicator column with "mod_" so the names are easier
# to work with; the first two columns stay user_id and sku_id.
mod_list = ['mod_{}'.format(label) for label in df_tmp2.columns[2:].values]
df_tmp2.columns = ['user_id', 'sku_id'] + mod_list

In [7]:
df_tmp2.head()

Unnamed: 0,user_id,sku_id,mod_0.0,mod_11.0,mod_12.0,mod_13.0,mod_14.0,mod_15.0,mod_16.0,mod_17.0,...,mod_339.0,mod_340.0,mod_341.0,mod_342.0,mod_343.0,mod_344.0,mod_345.0,mod_346.0,mod_347.0,mod_348.0
1,266079.0,138778,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,200719.0,61226,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,296130.0,103126,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,296130.0,103126,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,217892.0,137328,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Total the module indicators per product (sku_id) to get click counts by
# module, then save the table with sku_id kept as the index column.
grouped_mod = df_tmp2.groupby('sku_id')[mod_list].sum()
grouped_mod.to_csv(path_or_buf='../action02_df_mod.csv', index=True)