# 数据探索

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import *
%matplotlib inline

### 1.读入评论数据

In [30]:
# Relative path of the raw comment-data CSV that the next cell loads.
File = './data/JData_Comment.csv'

In [31]:
# Load the comment table (the file is read as GBK-encoded text), then print
# the object's type followed by its first five rows, one item per line.
df = pd.read_csv(File, encoding='gbk')
print(type(df), df.head(), sep='\n')

<class 'pandas.core.frame.DataFrame'>
           dt  sku_id  comment_num  has_bad_comment  bad_comment_rate
0  2016-02-01    1000            3                1            0.0417
1  2016-02-01   10000            2                0            0.0000
2  2016-02-01  100011            4                1            0.0376
3  2016-02-01  100018            3                0            0.0000
4  2016-02-01  100020            3                0            0.0000


### 2.评论数据探索

In [32]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,sku_id,comment_num,has_bad_comment,bad_comment_rate
count,558552.0,558552.0,558552.0,558552.0
mean,85830.17226,2.57972,0.475469,0.049993
std,49359.214063,1.163774,0.499398,0.13058
min,8.0,0.0,0.0,0.0
25%,43224.0,2.0,0.0,0.0
50%,85892.0,3.0,0.0,0.0
75%,128624.0,4.0,1.0,0.0465
max,171225.0,4.0,1.0,1.0


In [33]:
# Total number of rows in the comment table
df.shape[0]

558552

### 3.数据清洗

### 4.查看属性的取值分布情况 

In [34]:
# Number of distinct snapshot dates (a missing value, if any, counts as one entry)
df['dt'].nunique(dropna=False)

12

In [35]:
# List the time windows (distinct dt values) for which comment data was collected
pd.unique(df['dt'])

array(['2016-02-01', '2016-02-08', '2016-02-15', '2016-02-22',
       '2016-02-29', '2016-03-07', '2016-03-14', '2016-03-21',
       '2016-03-28', '2016-04-04', '2016-04-11', '2016-04-15'], dtype=object)

In [36]:
# Number of rows for each cut-off date, in order of first appearance.
# Counting the True entries of the mask avoids building a filtered frame per date.
[int((df['dt'] == day).sum()) for day in df['dt'].unique()]

[46546,
 46546,
 46546,
 46546,
 46546,
 46546,
 46546,
 46546,
 46546,
 46546,
 46546,
 46546]

In [37]:
# Number of distinct product ids (sku_id) within each time window,
# in order of first appearance of the window.
[df.loc[df['dt'] == day, 'sku_id'].nunique(dropna=False) for day in df['dt'].unique()]

[46546,
 46546,
 46546,
 46546,
 46546,
 46546,
 46546,
 46546,
 46546,
 46546,
 46546,
 46546]

In [38]:
# Number of distinct product ids across the whole table
df['sku_id'].nunique(dropna=False)

46546

In [39]:
# Inference from the exploration above: there are 46546 distinct products in total,
# and comment data was collected for every product in each of the 12 time windows.
# The comment data can therefore be treated as a time series with 3 dimensions per window.

In [40]:
# Number of rows for each distinct value of comment_num
df['comment_num'].value_counts()

2    168698
4    164789
3    119642
1     85430
0     19993
Name: comment_num, dtype: int64

In [41]:
# One-hot encode comment_num: add one indicator column per level (0 to 4),
# named comment_num_0 ... comment_num_4 and stored as floats
# (1.0 where the row has that level, 0.0 otherwise).
# A single loop replaces the ten copy-pasted initialise/assign statements;
# the resulting columns, their order and their values are unchanged.
for level in range(5):
    df['comment_num_%d' % level] = (df['comment_num'] == level).astype(float)
# Inspect the result
df.head()

Unnamed: 0,dt,sku_id,comment_num,has_bad_comment,bad_comment_rate,comment_num_0,comment_num_1,comment_num_2,comment_num_3,comment_num_4
0,2016-02-01,1000,3,1,0.0417,0.0,0.0,0.0,1.0,0.0
1,2016-02-01,10000,2,0,0.0,0.0,0.0,1.0,0.0,0.0
2,2016-02-01,100011,4,1,0.0376,0.0,0.0,0.0,0.0,1.0
3,2016-02-01,100018,3,0,0.0,0.0,0.0,0.0,1.0,0.0
4,2016-02-01,100020,3,0,0.0,0.0,0.0,0.0,1.0,0.0


### 5.特征处理 

In [42]:
# Build the working frame for aggregation: remove the raw comment_num column
# (the comment_num_0 ... comment_num_4 indicator columns stay) and keep 'dt'.
colDrop = ['comment_num']
df_sel = df.drop(labels=colDrop, axis=1).copy()

In [43]:
# Preview the first rows of the column-filtered frame.
df_sel.head()

Unnamed: 0,dt,sku_id,has_bad_comment,bad_comment_rate,comment_num_0,comment_num_1,comment_num_2,comment_num_3,comment_num_4
0,2016-02-01,1000,1,0.0417,0.0,0.0,0.0,1.0,0.0
1,2016-02-01,10000,0,0.0,0.0,0.0,1.0,0.0,0.0
2,2016-02-01,100011,1,0.0376,0.0,0.0,0.0,0.0,1.0
3,2016-02-01,100018,0,0.0,0.0,0.0,0.0,1.0,0.0
4,2016-02-01,100020,0,0.0,0.0,0.0,0.0,1.0,0.0


In [28]:
# Aggregate the column-filtered data:
# - has_bad_comment: sum, giving the number of time windows in which a bad comment occurred
# - bad_comment_rate: mean and variance
# - comment_num_0 ~ comment_num_4: sum, giving the number of time windows with each value

In [63]:
# Per-sku_id totals of the five comment_num indicator columns, i.e. how many
# rows of each product fall into each comment_num level.
indicator_cols = ['comment_num_0', 'comment_num_1', 'comment_num_2',
                  'comment_num_3', 'comment_num_4']
grouped_sum = df_sel.groupby('sku_id')[indicator_cols].sum()

In [64]:
# Show the per-sku_id sums of the comment_num indicator columns.
grouped_sum

Unnamed: 0_level_0,comment_num_0,comment_num_1,comment_num_2,comment_num_3,comment_num_4
sku_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,0.0,12.0,0.0,0.0,0.0
11,0.0,0.0,0.0,12.0,0.0
12,0.0,0.0,0.0,0.0,12.0
14,0.0,0.0,0.0,12.0,0.0
15,0.0,0.0,12.0,0.0,0.0
18,0.0,0.0,0.0,0.0,12.0
24,0.0,0.0,12.0,0.0,0.0
30,0.0,0.0,12.0,0.0,0.0
36,0.0,0.0,0.0,12.0,0.0
38,0.0,0.0,0.0,0.0,12.0


In [69]:
# Mean bad-comment rate of each sku_id
grouped_mean = df_sel.groupby('sku_id')['bad_comment_rate'].mean()
grouped_mean

sku_id
8         1.000000
11        0.087000
12        0.026300
14        0.080000
15        0.250000
18        0.014400
24        0.250000
30        0.000000
36        0.000000
38        0.244400
40        0.087033
41        0.039800
43        0.000000
44        0.000000
45        0.000000
49        0.056900
50        0.003975
51        0.029900
52        0.087092
61        0.000000
64        0.109400
66        0.000000
71        0.031592
87        0.098625
95        0.000000
101       0.000000
107       0.000000
109       0.333300
114       0.000000
116       0.018500
            ...   
171093    0.000000
171097    0.065200
171099    0.000000
171102    0.142900
171111    0.000000
171114    0.000000
171126    0.000000
171127    1.000000
171128    0.000000
171134    0.125000
171140    0.000000
171142    0.000000
171147    0.111100
171151    0.000000
171162    0.018800
171163    0.000000
171165    0.016200
171168    0.000000
171170    1.000000
171171    0.000000
171174    0.041100
17118

In [68]:
# Variance of the bad-comment rate of each sku_id
grouped_var = df_sel.groupby('sku_id')['bad_comment_rate'].var()
grouped_var

sku_id
8         0.000000e+00
11        0.000000e+00
12        0.000000e+00
14        0.000000e+00
15        0.000000e+00
18        0.000000e+00
24        0.000000e+00
30        0.000000e+00
36        0.000000e+00
38        0.000000e+00
40        3.235152e-05
41        0.000000e+00
43        0.000000e+00
44        0.000000e+00
45        0.000000e+00
49        0.000000e+00
50        1.165682e-06
51        0.000000e+00
52        2.792811e-05
61        0.000000e+00
64        0.000000e+00
66        0.000000e+00
71        5.719697e-08
87        1.298548e-04
95        0.000000e+00
101       0.000000e+00
107       0.000000e+00
109       0.000000e+00
114       0.000000e+00
116       0.000000e+00
              ...     
171093    0.000000e+00
171097    0.000000e+00
171099    0.000000e+00
171102    0.000000e+00
171111    0.000000e+00
171114    0.000000e+00
171126    0.000000e+00
171127    0.000000e+00
171128    0.000000e+00
171134    0.000000e+00
171140    0.000000e+00
171142    0.000000e+00
1711

In [88]:
# Assemble the output table, indexed by sku_id, starting from a copy of the
# indicator totals so that grouped_sum itself is left untouched.
df_out = grouped_sum.copy()
# Sum of has_bad_comment per sku_id (number of rows flagged with a bad comment).
bad_comment_times = df_sel.groupby('sku_id')['has_bad_comment'].sum()
# Columns are attached in this order; pandas aligns each Series on sku_id.
df_out['bad_comment_rate_mean'] = grouped_mean
df_out['bad_comment_rate_var'] = grouped_var
df_out['has_bad_comment_times'] = bad_comment_times

In [89]:
df_out.head()

Unnamed: 0_level_0,comment_num_0,comment_num_1,comment_num_2,comment_num_3,comment_num_4,bad_comment_rate_mean,bad_comment_rate_var,has_bad_comment_times
sku_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8,0.0,12.0,0.0,0.0,0.0,1.0,0.0,12
11,0.0,0.0,0.0,12.0,0.0,0.087,0.0,12
12,0.0,0.0,0.0,0.0,12.0,0.0263,0.0,12
14,0.0,0.0,0.0,12.0,0.0,0.08,0.0,12
15,0.0,0.0,12.0,0.0,0.0,0.25,0.0,12


In [93]:
# Distinct values taken by one feature: the per-sku_id sum of has_bad_comment
df_sel.groupby('sku_id')['has_bad_comment'].sum().unique()

array([12,  0, 10,  2,  5,  4,  9,  3,  1, 11,  6,  8,  7])

In [95]:
# Write the aggregated table to CSV; index=True also writes the row index as a column.
df_out.to_csv('../comment_df_sel.csv',index=True)