# 数据探索

In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import *
%matplotlib inline

### 1.读入用户数据

In [72]:
userFile = './data/JData_User.csv'

In [73]:
user_df = pd.read_csv(userFile,encoding='gbk')
print(type(user_df))
print(user_df.head())

<class 'pandas.core.frame.DataFrame'>
   user_id     age  sex  user_lv_cd user_reg_tm
0   200001   56岁以上  2.0           5  2016-01-26
1   200002      -1  0.0           1  2016-01-26
2   200003  36-45岁  1.0           4  2016-01-26
3   200004      -1  2.0           1  2016-01-26
4   200005  16-25岁  0.0           4  2016-01-26


### 2.用户数据探索

In [74]:
user_df.describe()

Unnamed: 0,user_id,sex,user_lv_cd
count,105321.0,105318.0,105321.0
mean,252661.0,1.112887,3.85003
std,30403.698188,0.95593,1.071505
min,200001.0,0.0,1.0
25%,226331.0,0.0,3.0
50%,252661.0,2.0,4.0
75%,278991.0,2.0,5.0
max,305321.0,2.0,5.0


In [75]:
# 查看数据条数
len(user_df)

105321

In [76]:
# 查看属性非空条数
user_df['age'].count()

105318

In [77]:
# 查看属性非空条数
user_df['sex'].count()

105318

In [78]:
# 查看属性非空条数
user_df['user_reg_tm'].count()

105318

### 3.数据清洗

In [79]:
# 通过数据条数,判断发现有空数据,空值的项目有 age,sex,user_reg_tm
#查看age=nan的数据
user_df[user_df['age'].isnull()]

Unnamed: 0,user_id,age,sex,user_lv_cd,user_reg_tm
34072,234073,,,1,
38905,238906,,,1,
67704,267705,,,1,


### 数据清洗:去掉上面三条空数据

In [80]:
# 去除空数据
user_df=user_df[user_df['age'].notnull()]

### 4.查看属性的取值分布情况 

In [81]:
#查看属性的值
user_df['age'].unique()

array(['56岁以上', '-1', '36-45岁', '16-25岁', '26-35岁', '46-55岁', '15岁以下'], dtype=object)

In [82]:
#查看每一个属性取值的数据条数
user_df['age'].value_counts()

26-35岁    46570
36-45岁    30336
-1        14412
16-25岁     8797
46-55岁     3325
56岁以上      1871
15岁以下         7
Name: age, dtype: int64

In [83]:
#查看属性的值
user_df['sex'].unique()

array([ 2.,  0.,  1.])

In [84]:
#查看每一个属性取值的数据条数
user_df['sex'].value_counts()

2.0    54735
0.0    42846
1.0     7737
Name: sex, dtype: int64

In [85]:
user_df['user_lv_cd'].unique()

array([5, 1, 4, 2, 3])

In [86]:
#查看每一个属性取值的数据条数
user_df['user_lv_cd'].value_counts()

5    36088
4    32343
3    24563
2     9661
1     2663
Name: user_lv_cd, dtype: int64

### 5.特征处理 

In [87]:
# 对sex做离散化,把三个取值,展开成三列,对应的位置为1,其余为0
# 男(0) -> sex_male
# 女(1) -> sex_female
# 保密(2) -> sex_unknow
# 初始化
user_df['sex_male'] = 0.0
user_df['sex_female'] = 0.0
user_df['sex_unknow'] = 0.0

In [90]:
user_df.head()

Unnamed: 0,user_id,age,sex,user_lv_cd,user_reg_tm,sex_male,sex_female,sex_unknow
0,200001,56岁以上,2.0,5,2016-01-26,0.0,0.0,0.0
1,200002,-1,0.0,1,2016-01-26,0.0,0.0,0.0
2,200003,36-45岁,1.0,4,2016-01-26,0.0,0.0,0.0
3,200004,-1,2.0,1,2016-01-26,0.0,0.0,0.0
4,200005,16-25岁,0.0,4,2016-01-26,0.0,0.0,0.0


In [92]:
# 离散化
tmp = user_df['sex_male']
tmp[user_df['sex'] == 0.0] = 1
user_df['sex_male'] = tmp

# user_df.loc[user_df['sex'] == 1]['sex_male'] = 1
# user_df[user_df['sex'] == 1].sex_female = 1
# user_df[user_df['sex'] == 2].sex_unknow = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [94]:
tmp = user_df['sex_female']
tmp[user_df['sex'] == 1.0] = 1
user_df['sex_female'] = tmp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [95]:
tmp = user_df['sex_unknow']
tmp[user_df['sex'] == 2.0] = 1
user_df['sex_unknow'] = tmp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [96]:
user_df.head()

Unnamed: 0,user_id,age,sex,user_lv_cd,user_reg_tm,sex_male,sex_female,sex_unknow
0,200001,56岁以上,2.0,5,2016-01-26,0.0,0.0,1.0
1,200002,-1,0.0,1,2016-01-26,1.0,0.0,0.0
2,200003,36-45岁,1.0,4,2016-01-26,0.0,1.0,0.0
3,200004,-1,2.0,1,2016-01-26,0.0,0.0,1.0
4,200005,16-25岁,0.0,4,2016-01-26,1.0,0.0,0.0


### 商品数据探索

In [None]:
productFile= './data/JData_Product.csv'

In [None]:
prd_df = pd.read_csv(productFile,encoding='gbk')
print(type(prd_df))
print(prd_df.head())