In [4]:
import time
import numpy as np
import pandas as pd
import pymysql
from sklearn.ensemble import RandomForestClassifier
from pyecharts import Bar3D

In [5]:
# Load the data: four yearly order sheets plus the member-level sheet.
# Passing the whole list as sheet_name lets pandas open and parse the workbook
# once and return a dict keyed by sheet name, instead of re-reading the file
# for every single sheet.
sheet_names = ['2015','2016','2017','2018','会员等级']
sheets_by_name = pd.read_excel('./data/sales.xlsx', sheet_name=sheet_names)
# Keep a list in the same order as sheet_names, as the rest of the notebook expects.
sheet_datas = [sheets_by_name[name] for name in sheet_names]

In [6]:
# Print the raw list of loaded DataFrames (one per sheet, in sheet_names order) for a first look.
print(sheet_datas)

[              会员ID         订单号       提交日期     订单金额
0      15278002468  3000304681 2015-01-01    499.0
1      39236378972  3000305791 2015-01-01   2588.0
2      38722039578  3000641787 2015-01-01    498.0
3      11049640063  3000798913 2015-01-01   1572.0
4      35038752292  3000821546 2015-01-01     10.1
5          2731292  3001358088 2015-01-01     98.0
6      14393019386  3001653829 2015-01-01     69.0
7      16891947460  3002248094 2015-01-01   1187.0
8      38895062781  3002781462 2015-01-01    119.0
9      38867901669  3003061989 2015-01-01    149.0
10     39110628969  3003685705 2015-01-01    867.0
11     39460064727  3003858731 2015-01-01   1299.0
12     38907755660  3004633576 2015-01-01   2299.0
13     17447369827  3005478350 2015-01-01    196.0
14     25646490579  3005787853 2015-01-01   7549.0
15     25668695551  3005968593 2015-01-01     99.0
16         9432965  3006228687 2015-01-01    999.0
17     39058678007  3006626321 2015-01-01     48.5
18     14673468350  3006702170

In [23]:
# Data review: print a short profile of every loaded sheet
for each_name, each_data in zip(sheet_names, sheet_datas):
    banner = '[data summary for ============={}===============]'.format(each_name)
    print(banner)

    # First four rows of the sheet
    preview = each_data.head(4)
    print("overview:\n", preview)

    # Descriptive statistics
    stats = each_data.describe()
    print("DESC:\n", stats)

    # Number of rows that contain at least one missing value
    rows_with_na = each_data.isnull().any(axis=1).sum()
    print('NA records', rows_with_na)

    # Column data types
    column_types = each_data.dtypes
    print('Dtypes', column_types)

overview:
           会员ID         订单号       提交日期    订单金额 max_year_data max_year_date
0  15278002468  3000304681 2015-01-01   499.0    2015-12-31    2015-12-31
1  39236378972  3000305791 2015-01-01  2588.0    2015-12-31    2015-12-31
2  38722039578  3000641787 2015-01-01   498.0    2015-12-31    2015-12-31
3  11049640063  3000798913 2015-01-01  1572.0    2015-12-31    2015-12-31
DESC:
                会员ID           订单号           订单金额
count  3.057400e+04  3.057400e+04   30574.000000
mean   2.921327e+10  4.020442e+09     967.270965
std    1.384598e+10  2.630518e+08    2073.397861
min    2.670000e+02  3.000305e+09       1.500000
25%    1.961657e+10  3.885746e+09      59.700000
50%    3.754532e+10  4.117491e+09     142.000000
75%    3.923630e+10  4.234853e+09     899.000000
max    3.954613e+10  4.282025e+09  111750.000000
NA records 0
Dtypes 会员ID                      int64
订单号                       int64
提交日期             datetime64[ns]
订单金额                    float64
max_year_data    dateti

In [24]:
# Data preprocessing
# Remove missing and abnormal records from every yearly order sheet
# (the last sheet is skipped: it is not an order sheet).
for ind, each_data in enumerate(sheet_datas[:-1]):
    # Drop rows with missing values first ...
    cleaned = each_data.dropna()
    # ... then filter the *already cleaned* frame, so the dropna result is not
    # thrown away. Orders with an amount <= 1 are discarded. The explicit copy
    # makes the column assignment below act on an independent frame rather than
    # on a slice of the original sheet.
    cleaned = cleaned[cleaned['订单金额'] > 1].copy()
    # Add the latest submission date of the sheet as a constant column; it is
    # taken from the unfiltered sheet, exactly as before.
    cleaned['max_year_date'] = each_data['提交日期'].max()
    sheet_datas[ind] = cleaned
    

In [25]:
# Stack all yearly order sheets into a single frame
data_merge = pd.concat(sheet_datas[:-1], axis=0)
# Whole days between each order and the latest order date of its sheet,
# read through the vectorised .dt accessor instead of a per-row apply.
data_merge['date_interval'] = (data_merge['max_year_date'] - data_merge['提交日期']).dt.days
# Calendar year of each order
data_merge['year'] = data_merge['提交日期'].dt.year

In [27]:
# Aggregate per (year, member ID)
rfm_aggregations = {
    'date_interval': 'min',  # smallest interval = most recent order
    '提交日期': 'count',      # number of orders
    '订单金额': 'sum',        # total order amount
}
member_year_groups = data_merge.groupby(['year', '会员ID'], as_index=False)
rfm_gb = member_year_groups.agg(rfm_aggregations)
# Rename the columns positionally
rfm_gb.columns = ['year', '会员ID', 'r', 'f', 'm']
rfm_gb.head()

Unnamed: 0,year,会员ID,r,f,m
0,2015,267,197,2,105.0
1,2015,282,251,1,29.7
2,2015,283,340,1,5398.0
3,2015,343,300,1,118.0
4,2015,525,37,3,213.0


In [29]:
# Inspect the distribution of every column from the third one onwards
rfm_summary = rfm_gb.iloc[:, 2:].describe()
desc_pd = rfm_summary.transpose()
print(desc_pd)

# Bin edges used for scoring
r_bins = [-1, 79, 255, 365]  # note: the first edge must be below the minimum value
f_bins = [0, 2, 5, 130]
m_bins = [0, 69, 1199, 206252]

      count         mean          std  min   25%    50%     75%       max
r  148591.0   165.524043   101.988472  0.0  79.0  156.0   255.0     365.0
f  148591.0     1.365002     2.626953  1.0   1.0    1.0     1.0     130.0
m  148591.0  1323.741329  3753.906883  1.5  69.0  189.0  1199.0  206251.8


In [32]:
# Attach the member level to the per-year RFM rows; the inner join keeps only
# member IDs that appear in both frames.
rfm_merge = pd.merge(rfm_gb, sheet_datas[-1], on='会员ID', how="inner")
# Fit a random forest that predicts the member level from r/f/m and use its
# feature importances as the weights of the three RFM factors.
# random_state is pinned so the weights, and every score derived from them,
# are reproducible from one run to the next.
clf = RandomForestClassifier(random_state=0)
clf = clf.fit(rfm_merge[['r','f','m']], rfm_merge["会员等级"])
weights = clf.feature_importances_
print('feature importance', weights)

feature importance [0.42412991 0.00625133 0.56961876]


In [33]:
# Discretise r/f/m into scores with pd.cut
# R labels count down, so the bin holding the smallest r values gets the highest score.
r_labels = list(range(len(r_bins) - 1, 0, -1))
# F and M labels count up from 1.
f_labels = list(range(1, len(f_bins)))
m_labels = list(range(1, len(m_bins)))

rfm_gb['r_score'] = pd.cut(rfm_gb['r'], r_bins, labels=r_labels)  # R score
rfm_gb['f_score'] = pd.cut(rfm_gb['f'], f_bins, labels=f_labels)  # F score
rfm_gb['m_score'] = pd.cut(rfm_gb['m'], m_bins, labels=m_labels)  # M score

In [34]:
# Total RFM score
# Method 1: weighted score
# Convert only the three categorical score columns to integers. Casting the
# whole frame to int32 would push 会员ID outside the int32 range (corrupting
# the IDs) and would cut the decimals off the monetary totals in m.
score_columns = ['r_score', 'f_score', 'm_score']
rfm_gb = rfm_gb.astype({col: np.int32 for col in score_columns})
# Weighted sum of the three scores; weights are ordered r, f, m.
rfm_gb['rfm_score'] = rfm_gb['r_score'] * weights[0] + rfm_gb['f_score'] * weights[1] + rfm_gb['m_score'] * weights[2]

In [35]:
# Method 2: RFM group label, built by concatenating the three scores as text
# np.str was only an alias of the builtin str and no longer exists in current
# NumPy releases, so the builtin is used directly.
for score_col in ('r_score', 'f_score', 'm_score'):
    rfm_gb[score_col] = rfm_gb[score_col].astype(str)
# Concatenate in r, f, m order.
rfm_gb['rfm_group'] = rfm_gb['r_score'].str.cat(rfm_gb['f_score']).str.cat(
    rfm_gb['m_score'])

In [37]:
# Save the scored table as an Excel workbook
rfm_output_path = 'sales_rfm_score1.xlsx'
rfm_gb.to_excel(rfm_output_path)

KeyboardInterrupt: 

In [38]:
# Visualisation
# Chart data: number of rows per (rfm_group, year) combination
members_by_group_year = rfm_gb.groupby(['rfm_group', 'year'], as_index=False)
display_data = members_by_group_year['会员ID'].count()
display_data.columns = ['rfm_group', 'year', 'number']
# The group label is text at this point; turn it back into an integer.
group_as_int = display_data['rfm_group'].astype(np.int32)
display_data['rfm_group'] = group_as_int
display_data.head()

Unnamed: 0,rfm_group,year,number
0,111,2015,2180
1,111,2016,1498
2,111,2017,3169
3,111,2018,2271
4,112,2015,3811


In [39]:
# Render the chart
bar3d = Bar3D("", width=900, height=600)
# Colour ramp for the visual map
range_color = [
    '#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf',
    '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026',
]
# One [rfm_group, year, number] row per bar
chart_rows = display_data.values.tolist()
# Upper end of the visual map range is the largest count
largest_count = display_data['number'].max()
grid_options = dict(grid3d_width=200, grid3d_height=80, grid3d_depth=80)
bar3d.add(
    "rfm分组结果",
    "",
    "",
    chart_rows,
    is_visualmap=True,
    visual_range=[0, largest_count],
    visual_range_color=range_color,
    **grid_options
)
bar3d