In [1]:
import time  # 时间库

import numpy as np  # numpy库
import pandas as pd  # pandas库
import pymysql  # mysql连接库
from sklearn.ensemble import RandomForestClassifier # RF库

from pyecharts import Bar3D # 3D柱形图

### 读取数据

In [2]:
sheet_names = ['2015','2016','2017','2018','会员等级']
sheet_datas = [pd.read_excel('sales.xlsx',sheet_name=i) for i in sheet_names]

### 查看数据基本情况

In [3]:
for each_name,each_data in zip(sheet_names,sheet_datas):
    print('[data summary for ============={}===============]'.format(each_name))
    print('Overview:','\n',each_data.head(4))# 展示数据前4条
    print('DESC:','\n',each_data.describe())# 数据描述性信息
    print('NA records',each_data.isnull().any(axis=1).sum()) # 缺失值记录数    
    print('Dtypes',each_data.dtypes) # 数据类型

Overview: 
           会员ID         订单号       提交日期    订单金额
0  15278002468  3000304681 2015-01-01   499.0
1  39236378972  3000305791 2015-01-01  2588.0
2  38722039578  3000641787 2015-01-01   498.0
3  11049640063  3000798913 2015-01-01  1572.0
DESC: 
                会员ID           订单号           订单金额
count  3.077400e+04  3.077400e+04   30774.000000
mean   2.918779e+10  4.020414e+09     960.991161
std    1.385333e+10  2.630510e+08    2068.107231
min    2.670000e+02  3.000305e+09       0.500000
25%    1.944122e+10  3.885510e+09      59.000000
50%    3.746545e+10  4.117491e+09     139.000000
75%    3.923593e+10  4.234882e+09     899.000000
max    3.954613e+10  4.282025e+09  111750.000000
NA records 0
Dtypes 会员ID             int64
订单号              int64
提交日期    datetime64[ns]
订单金额           float64
dtype: object
Overview: 
           会员ID         订单号       提交日期    订单金额
0  39288120141  4282025766 2016-01-01    76.0
1  39293812118  4282037929 2016-01-01  7599.0
2  27596340905  4282038740 2016-0

### 数据预处理

In [4]:
for ind,each_data in enumerate(sheet_datas[:-1]):
    sheet_datas[ind] = each_data.dropna() #丢弃缺失值
    sheet_datas[ind] = each_data[each_data['订单金额']>1] # 丢弃订单金额<=1的订单
    sheet_datas[ind]['max_year_date'] = each_data['提交日期'].max()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [5]:
sheet_datas[1].head()

Unnamed: 0,会员ID,订单号,提交日期,订单金额,max_year_date
0,39288120141,4282025766,2016-01-01,76.0,2016-12-31
1,39293812118,4282037929,2016-01-01,7599.0,2016-12-31
2,27596340905,4282038740,2016-01-01,802.0,2016-12-31
3,15111475509,4282043819,2016-01-01,65.0,2016-12-31
4,38896594001,4282051044,2016-01-01,95.0,2016-12-31


In [6]:
sheet_datas[1]['订单金额'].min()

1.5

### 汇总数据

In [7]:
data_merge = pd.concat(sheet_datas[:-1],axis=0)

In [8]:
data_merge.count()

会员ID             202827
订单号              202827
提交日期             202827
订单金额             202827
max_year_date    202827
dtype: int64

In [9]:
#计算订单距离年底的时间间隔 计算每个订单所在年份
data_merge['date_interval'] = data_merge['max_year_date']-data_merge['提交日期']

In [10]:
data_merge.head()

Unnamed: 0,会员ID,订单号,提交日期,订单金额,max_year_date,date_interval
0,15278002468,3000304681,2015-01-01,499.0,2015-12-31,364 days
1,39236378972,3000305791,2015-01-01,2588.0,2015-12-31,364 days
2,38722039578,3000641787,2015-01-01,498.0,2015-12-31,364 days
3,11049640063,3000798913,2015-01-01,1572.0,2015-12-31,364 days
4,35038752292,3000821546,2015-01-01,10.1,2015-12-31,364 days


In [11]:
data_merge['year'] = data_merge['提交日期'].dt.year # 提取每一个订单所在的年份

In [13]:
data_merge.head()

Unnamed: 0,会员ID,订单号,提交日期,订单金额,max_year_date,date_interval,year
0,15278002468,3000304681,2015-01-01,499.0,2015-12-31,364 days,2015
1,39236378972,3000305791,2015-01-01,2588.0,2015-12-31,364 days,2015
2,38722039578,3000641787,2015-01-01,498.0,2015-12-31,364 days,2015
3,11049640063,3000798913,2015-01-01,1572.0,2015-12-31,364 days,2015
4,35038752292,3000821546,2015-01-01,10.1,2015-12-31,364 days,2015


In [14]:
#把date_interval 从datetime类型转换成数字
data_merge['date_interval'] = data_merge['date_interval'].apply(lambda x:x.days)

In [16]:
data_merge.head()

Unnamed: 0,会员ID,订单号,提交日期,订单金额,max_year_date,date_interval,year
0,15278002468,3000304681,2015-01-01,499.0,2015-12-31,364,2015
1,39236378972,3000305791,2015-01-01,2588.0,2015-12-31,364,2015
2,38722039578,3000641787,2015-01-01,498.0,2015-12-31,364,2015
3,11049640063,3000798913,2015-01-01,1572.0,2015-12-31,364,2015
4,35038752292,3000821546,2015-01-01,10.1,2015-12-31,364,2015


In [17]:
# 按照年份 以及会员的ID 分组计算 RFM的最终数据
rfm_data = data_merge.groupby(['year','会员ID'],as_index=False).agg({'date_interval':'min','提交日期':'count','订单金额':'sum'})

In [19]:
rfm_data.columns = ['year','会员ID','r','f','m']

In [21]:
rfm_data.head()

Unnamed: 0,year,会员ID,r,f,m
0,2015,267,197,2,105.0
1,2015,282,251,1,29.7
2,2015,283,340,1,5398.0
3,2015,343,300,1,118.0
4,2015,525,37,3,213.0


### 定义边界

In [22]:
# 查看数据分布
desc_pd = rfm_data.iloc[:,2:].describe().T
print(desc_pd)
# 定义区间边界
r_bins = [-1,79,255,365] # 注意起始边界小于最小值
f_bins = [0,2,5,130] 
m_bins = [0,69,1199,206252]


      count         mean          std  min   25%    50%     75%       max
r  148591.0   165.524043   101.988472  0.0  79.0  156.0   255.0     365.0
f  148591.0     1.365002     2.626953  1.0   1.0    1.0     1.0     130.0
m  148591.0  1323.741329  3753.906883  1.5  69.0  189.0  1199.0  206251.8


#### 计算RFM因子权重

In [23]:
# 将会员等级连接进来
rfm_merge = pd.merge(rfm_data,sheet_datas[-1],on='会员ID',how='inner')

In [25]:
rfm_merge.head()

Unnamed: 0,year,会员ID,r,f,m,会员等级
0,2015,267,197,2,105.0,1
1,2015,282,251,1,29.7,5
2,2017,282,314,2,12992.0,5
3,2018,282,19,5,30027.0,5
4,2015,283,340,1,5398.0,4


In [26]:
#rfm计算权重，利用随机森林进行建模
clf = RandomForestClassifier()
clf = clf.fit(rfm_merge[['r','f','m']],rfm_merge['会员等级'])



In [27]:
weights = clf.feature_importances_

In [28]:
weights

array([0.38612078, 0.00521915, 0.60866007])

### RFM计算过程

In [30]:
# pd.cut 做数据的离散化  参数1 需要离散化的数据  r_bins 离散化的时候 分组的节点 ，labels 分组之后，每一组会带上一个标签 标签的值
rfm_data['r_score'] = pd.cut(rfm_data['r'],r_bins,labels=[i for i in range(len(r_bins)-1,0,-1)])
rfm_data['f_score'] = pd.cut(rfm_data['f'],f_bins,labels=[i+1 for i in range(len(f_bins)-1)])
rfm_data['m_score'] = pd.cut(rfm_data['m'],m_bins,labels=[i+1 for i in range(len(m_bins)-1)])

In [32]:
rfm_data.head()

Unnamed: 0,year,会员ID,r,f,m,r_score,f_score,m_score
0,2015,267,197,2,105.0,2,1,2
1,2015,282,251,1,29.7,2,1,1
2,2015,283,340,1,5398.0,1,1,3
3,2015,343,300,1,118.0,1,1,2
4,2015,525,37,3,213.0,3,2,2


### 计算RFM的得分

In [33]:
# 加权的RFM的单一值
rfm_data = rfm_data.apply(np.int32)

In [34]:
rfm_data['rfm_score'] = rfm_data['r_score']*weights[0]+rfm_data['f_score']*weights[1]+rfm_data['m_score']*weights[2]

In [35]:
rfm_data.head()

Unnamed: 0,year,会员ID,r,f,m,r_score,f_score,m_score,rfm_score
0,2015,267,197,2,105,2,1,2,1.994781
1,2015,282,251,1,29,2,1,1,1.386121
2,2015,283,340,1,5398,1,1,3,2.21732
3,2015,343,300,1,118,1,1,2,1.60866
4,2015,525,37,3,213,3,2,2,2.386121


In [37]:
#计算rfm的组合 实际上就是把r f m的得分拼接起来
rfm_data['r_score'] = rfm_data['r_score'].astype(np.str)
rfm_data['f_score'] = rfm_data['f_score'].astype(np.str)
rfm_data['m_score'] = rfm_data['m_score'].astype(np.str)

In [39]:
rfm_data['rfm_group'] = rfm_data['r_score'].str.cat(rfm_data['f_score']).str.cat(rfm_data['m_score'])

In [40]:
rfm_data.head()

Unnamed: 0,year,会员ID,r,f,m,r_score,f_score,m_score,rfm_score,rfm_group
0,2015,267,197,2,105,2,1,2,1.994781,212
1,2015,282,251,1,29,2,1,1,1.386121,211
2,2015,283,340,1,5398,1,1,3,2.21732,113
3,2015,343,300,1,118,1,1,2,1.60866,112
4,2015,525,37,3,213,3,2,2,2.386121,322


### 保存结果到excel

In [41]:
rfm_data.to_excel('sales_rfm_group_score.xlsx')

### 把结果写入到mysql数据库

In [42]:
# 数据库信息
config = {'host': '127.0.0.1',  # 默认127.0.0.1
          'user': 'root',  # 用户名
          'password': 'MyNewPass',  # 密码
          'port': 3306,  # 端口，默认为3306
          'database': 'test',  # 数据库名称
          'charset': 'utf8'  # 字符编码
          }

In [43]:
# 建表操作
con = pymysql.connect(**config)  # 建立mysql连接
cursor = con.cursor()  # 获得游标
cursor.execute("show tables")  # 查询表
table_list = [t[0] for t in cursor.fetchall()]  # 读出所有库
# 查找数据库是否存在目标表，如果没有则新建
table_name = 'sales_rfm_score'  # 要写库的表名
if not table_name in table_list:  # 如果目标表没有创建
    cursor.execute('''
    CREATE TABLE %s (
    userid               VARCHAR(20),
    r_score               int(2),
    f_score              int(2),
    m_score              int(2),
    rfm_score              DECIMAL(10,2),
    rfm_group              VARCHAR(10),
    insert_date              VARCHAR(20)
    )ENGINE=InnoDB DEFAULT CHARSET=utf8
    ''' % table_name)  # 创建新表

In [45]:
# 梳理数据
write_db_data = rfm_data[['会员ID','r_score','f_score','m_score','rfm_score','rfm_group']] # 主要数据
timestamp = time.strftime('%Y-%m-%d', time.localtime(time.time()))  # 日期

In [46]:
# 写库
for each_value in write_db_data.values:
    insert_sql = "INSERT INTO `%s` VALUES ('%s',%s,%s,%s,%s,'%s','%s')" % \
                 (table_name, each_value[0], each_value[1], each_value[2], \
                  each_value[3],each_value[4],each_value[5],
                  timestamp)  # 写库SQL依据
    cursor.execute(insert_sql)  # 执行SQL语句，execute函数里面要用双引号
    con.commit()  # 提交命令
cursor.close()  # 关闭游标
con.close()  # 关闭数据库连接

  result = self._query(query)


KeyboardInterrupt: 

In [47]:
#图形数据准备
display_data = rfm_data.groupby(['rfm_group','year'],as_index=False)['会员ID'].count()

In [49]:
display_data.columns =['rfm_group','year','number']
display_data['rfm_group'] = display_data['rfm_group'].astype(np.int32)
display_data.head()

Unnamed: 0,rfm_group,year,number
0,111,2015,2180
1,111,2016,1498
2,111,2017,3169
3,111,2018,2271
4,112,2015,3811


In [50]:
# 显示图形
bar3d = Bar3D("", width=900, height=600)
range_color = ['#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf',
               '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026']
bar3d.add(
    "rfm分组结果",
    "",
    "",
    [d.tolist() for d in display_data.values],
    is_visualmap=True,
    visual_range=[0, display_data['number'].max()],
    visual_range_color=range_color,
    grid3d_width=200,
    grid3d_height=80,
    grid3d_depth=80
)
bar3d

In [51]:
!pip list

Package                           Version 
--------------------------------- --------
appdirs                           1.4.3   
attrs                             19.1.0  
backcall                          0.1.0   
bleach                            3.1.0   
boto                              2.49.0  
boto3                             1.9.125 
botocore                          1.12.125
bz2file                           0.98    
cachetools                        3.1.0   
certifi                           2019.3.9
chardet                           3.0.4   
Click                             7.0     
colorama                          0.4.1   
cycler                            0.10.0  
decorator                         4.4.0   
defusedxml                        0.5.0   
docutils                          0.14    
dukpy                             0.2.2   
entrypoints                       0.3     
et-xmlfile                        1.0.1   
Flask                             1.0.2   
future     