# 数据探索

In [129]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import *
%matplotlib inline

### 1.读入用户数据

In [130]:
userFile = './data/JData_User.csv'

In [131]:
user_df = pd.read_csv(userFile,encoding='gbk')
print(type(user_df))
print(user_df.head())

<class 'pandas.core.frame.DataFrame'>
   user_id     age  sex  user_lv_cd user_reg_tm
0   200001   56岁以上  2.0           5  2016-01-26
1   200002      -1  0.0           1  2016-01-26
2   200003  36-45岁  1.0           4  2016-01-26
3   200004      -1  2.0           1  2016-01-26
4   200005  16-25岁  0.0           4  2016-01-26


### 2.用户数据探索

In [132]:
user_df.describe()

Unnamed: 0,user_id,sex,user_lv_cd
count,105321.0,105318.0,105321.0
mean,252661.0,1.112887,3.85003
std,30403.698188,0.95593,1.071505
min,200001.0,0.0,1.0
25%,226331.0,0.0,3.0
50%,252661.0,2.0,4.0
75%,278991.0,2.0,5.0
max,305321.0,2.0,5.0


In [133]:
# 查看数据条数
len(user_df)

105321

In [134]:
# 查看属性非空条数
user_df['age'].count()

105318

In [135]:
# 查看属性非空条数
user_df['sex'].count()

105318

In [136]:
# 查看属性非空条数
user_df['user_reg_tm'].count()

105318

### 3.数据清洗

In [137]:
# 通过数据条数,判断发现有空数据,空值的项目有 age,sex,user_reg_tm
#查看age=nan的数据
user_df[user_df['age'].isnull()]

Unnamed: 0,user_id,age,sex,user_lv_cd,user_reg_tm
34072,234073,,,1,
38905,238906,,,1,
67704,267705,,,1,


### 数据清洗:去掉上面三条空数据

In [138]:
# 去除空数据
user_df=user_df[user_df['age'].notnull()]

### 4.查看属性的取值分布情况 

In [139]:
#查看属性的值
user_df['age'].unique()

array(['56岁以上', '-1', '36-45岁', '16-25岁', '26-35岁', '46-55岁', '15岁以下'], dtype=object)

In [140]:
#查看每一个属性取值的数据条数
user_df['age'].value_counts()

26-35岁    46570
36-45岁    30336
-1        14412
16-25岁     8797
46-55岁     3325
56岁以上      1871
15岁以下         7
Name: age, dtype: int64

In [141]:
#查看属性的值
user_df['sex'].unique()

array([ 2.,  0.,  1.])

In [142]:
#查看每一个属性取值的数据条数
user_df['sex'].value_counts()

2.0    54735
0.0    42846
1.0     7737
Name: sex, dtype: int64

In [143]:
user_df['user_lv_cd'].unique()

array([5, 1, 4, 2, 3])

In [144]:
#查看每一个属性取值的数据条数
user_df['user_lv_cd'].value_counts()

5    36088
4    32343
3    24563
2     9661
1     2663
Name: user_lv_cd, dtype: int64

### 5.特征处理 

In [145]:
# 对sex做离散化,把三个取值,展开成三列,对应的位置为1,其余为0
# 男(0) -> sex_male
# 女(1) -> sex_female
# 保密(2) -> sex_unknow
# 初始化
user_df['sex_male'] = 0.0
user_df['sex_female'] = 0.0
user_df['sex_unknow'] = 0.0

In [146]:
user_df.head()

Unnamed: 0,user_id,age,sex,user_lv_cd,user_reg_tm,sex_male,sex_female,sex_unknow
0,200001,56岁以上,2.0,5,2016-01-26,0.0,0.0,0.0
1,200002,-1,0.0,1,2016-01-26,0.0,0.0,0.0
2,200003,36-45岁,1.0,4,2016-01-26,0.0,0.0,0.0
3,200004,-1,2.0,1,2016-01-26,0.0,0.0,0.0
4,200005,16-25岁,0.0,4,2016-01-26,0.0,0.0,0.0


In [147]:
# 离散化
user_df.loc[user_df['sex'] == 0.0,'sex_male']=1
user_df.loc[user_df['sex'] == 1.0,'sex_female']=1
user_df.loc[user_df['sex'] == 2.0,'sex_unknow']=1

In [148]:
user_df.head()

Unnamed: 0,user_id,age,sex,user_lv_cd,user_reg_tm,sex_male,sex_female,sex_unknow
0,200001,56岁以上,2.0,5,2016-01-26,0.0,0.0,1.0
1,200002,-1,0.0,1,2016-01-26,1.0,0.0,0.0
2,200003,36-45岁,1.0,4,2016-01-26,0.0,1.0,0.0
3,200004,-1,2.0,1,2016-01-26,0.0,0.0,1.0
4,200005,16-25岁,0.0,4,2016-01-26,1.0,0.0,0.0


In [149]:
# 对age做变换
# 初始化
user_df['age_15'] = 0.0
user_df['age_25'] = 0.0
user_df['age_35'] = 0.0
user_df['age_45'] = 0.0
user_df['age_55'] = 0.0
user_df['age_56'] = 0.0
user_df['age_unknow'] = 0.0

# 数据变换
user_df.loc[user_df['age'] == '15岁以下','age_15']=1
user_df.loc[user_df['age'] == '16-25岁','age_25']=1
user_df.loc[user_df['age'] == '26-35岁','age_35']=1
user_df.loc[user_df['age'] == '36-45岁','age_45']=1
user_df.loc[user_df['age'] == '46-55岁','age_55']=1
user_df.loc[user_df['age'] == '56岁以上','age_56']=1
user_df.loc[user_df['age'] == '-1','age_unknow']=1

In [150]:
user_df.head()

Unnamed: 0,user_id,age,sex,user_lv_cd,user_reg_tm,sex_male,sex_female,sex_unknow,age_15,age_25,age_35,age_45,age_55,age_56,age_unknow
0,200001,56岁以上,2.0,5,2016-01-26,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,200002,-1,0.0,1,2016-01-26,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,200003,36-45岁,1.0,4,2016-01-26,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,200004,-1,2.0,1,2016-01-26,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,200005,16-25岁,0.0,4,2016-01-26,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [151]:
# 计算用户注册时间,同时做离散化处理
# 使用datetime
# from datetime import datetime
# datetime.strptime(user_df['user_reg_tm'][0],'%Y-%m-%d')

In [152]:
# 使用parse
# from dateutil.parser import parse
# parse(user_df['user_reg_tm'][0])

In [153]:
# 使用.to_datetime
user_df['user_reg_tm']=pd.to_datetime(user_df['user_reg_tm'])

In [154]:
# 计算注册时间
user_df['time_delta']= pd.Series([x.days for x in (user_df['user_reg_tm'] - datetime(2016,4,15))])

In [155]:
# 查看注册日期在2016/4/15之后的用户的数量
len(user_df[user_df['time_delta']>0])

9

In [156]:
# 题目要求预测 2016-04-16到2016-04-20用户是否下单P中的商品
# 因此注册日期在2016/4/15的用户数据不符合要求,丢弃这部分数据
user_df=user_df[user_df['time_delta']<0]

In [157]:
# 查看注册日期在2016/4/15之后的用户的数量
len(user_df[user_df['time_delta']>0])

0

In [158]:
# 注册日期是连续变量,做归一化处理
from sklearn import preprocessing

user_df['time_delta_z']=pd.Series(preprocessing.scale(user_df['time_delta'].apply(abs).values))

In [159]:
user_df.head()

Unnamed: 0,user_id,age,sex,user_lv_cd,user_reg_tm,sex_male,sex_female,sex_unknow,age_15,age_25,age_35,age_45,age_55,age_56,age_unknow,time_delta,time_delta_z
0,200001,56岁以上,2.0,5,2016-01-26,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-80.0,-1.342287
1,200002,-1,0.0,1,2016-01-26,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-80.0,-1.342287
2,200003,36-45岁,1.0,4,2016-01-26,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-80.0,-1.342287
3,200004,-1,2.0,1,2016-01-26,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-80.0,-1.342287
4,200005,16-25岁,0.0,4,2016-01-26,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-80.0,-1.342287


In [160]:
# 处理好的数据写入到文件
# 丢弃处理之前的属性列
colDrop = ['age','sex', 'user_reg_tm']
user_df_sel =user_df.drop(colDrop,axis=1).copy()

In [161]:
user_df_sel.head()

Unnamed: 0,user_id,user_lv_cd,sex_male,sex_female,sex_unknow,age_15,age_25,age_35,age_45,age_55,age_56,age_unknow,time_delta,time_delta_z
0,200001,5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-80.0,-1.342287
1,200002,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-80.0,-1.342287
2,200003,4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-80.0,-1.342287
3,200004,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-80.0,-1.342287
4,200005,4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-80.0,-1.342287


In [162]:
user_df_sel.to_csv('../user_df_sel.csv',index=False)