# 基于RFM的用户价值分析

In [1]:
import pandas as pd  # needed here: this is the first cell and nothing imports pandas before it

# Load the raw sales orders from CSV (UTF-8 encoded).
data = pd.read_csv('./datacode_for_book/chapter5/sales.csv', encoding='utf-8')

# Store the two ID columns as generic objects so they are treated as labels,
# not as numeric quantities, in later summaries.
data = data.astype({'ORDERID': 'object', 'USERID': 'object'})
data[:5]

Unnamed: 0,USERID,ORDERDATE,ORDERID,AMOUNTINFO
0,142074,2016-01-01,4196439032,9399.0
1,56927,2016-01-01,4198324983,8799.0
2,87058,2016-01-01,4191287379,6899.0
3,136104,2016-01-01,4198508313,5999.0
4,117831,2016-01-01,4202238313,5399.0


In [2]:
# Summary statistics for every column, numeric and non-numeric alike
overview = data.describe(include='all')
overview

Unnamed: 0,USERID,ORDERDATE,ORDERID,AMOUNTINFO
count,86135.0,86133,86135.0,86127.0
unique,60503.0,347,76050.0,
top,74270.0,2016-04-18,3945060000.0,
freq,14.0,300,13.0,
mean,,,,744.705249
std,,,,1425.211176
min,,,,0.5
25%,,,,13.0
50%,,,,59.0
75%,,,,629.0


In [3]:
# Flag the columns that contain at least one missing value
column_has_na = data.isna().any()
column_has_na

USERID        False
ORDERDATE      True
ORDERID       False
AMOUNTINFO     True
dtype: bool

In [4]:
# List the rows that contain at least one missing value
incomplete_mask = data.isna().any(axis=1)
data[incomplete_mask]

Unnamed: 0,USERID,ORDERDATE,ORDERID,AMOUNTINFO
20,75849,2016-01-01,4197103430,
72,103714,,4136159682,189.0
114,155209,2016-01-01,4177940815,
229,139877,,4111956196,6.3
233,54599,2016-01-01,4119525205,
260,65456,2016-01-02,4195643356,
62134,122134,2016-09-21,3826649773,
70342,116995,2016-10-24,3981569421,
80110,98888,2016-12-06,3814398698,
86069,145951,2016-12-29,4139830098,


In [5]:
# Outlier handling: drop rows with any missing value, then keep only
# orders whose amount is greater than 1
data = (
    data
    .dropna()
    .loc[lambda frame: frame['AMOUNTINFO'] > 1]
)
data.head(3)

Unnamed: 0,USERID,ORDERDATE,ORDERID,AMOUNTINFO
0,142074,2016-01-01,4196439032,9399.0
1,56927,2016-01-01,4198324983,8799.0
2,87058,2016-01-01,4191287379,6899.0


In [6]:
# Parse the ORDERDATE strings (YYYY-MM-DD) into datetime64 values.
# assign() returns a new frame instead of writing a column into the
# boolean-filtered frame from the cleaning cell, which avoids pandas'
# SettingWithCopyWarning.
data = data.assign(
    ORDERDATE=pd.to_datetime(data['ORDERDATE'], format='%Y-%m-%d')
)
data.dtypes
data[:3]

USERID                object
ORDERDATE     datetime64[ns]
ORDERID               object
AMOUNTINFO           float64
dtype: object

Unnamed: 0,USERID,ORDERDATE,ORDERID,AMOUNTINFO
0,142074,2016-01-01,4196439032,9399.0
1,56927,2016-01-01,4198324983,8799.0
2,87058,2016-01-01,4191287379,6899.0


In [7]:
# Collapse orders to one row per user: latest order date (recency),
# number of orders (frequency) and total amount (monetary)
rfm_agg = {'ORDERDATE': 'max', 'AMOUNTINFO': ['count', 'sum']}
data = data.groupby('USERID').agg(rfm_agg)
# Flatten the two-level column index produced by the multi-function aggregation
data.columns = ['recency', 'frequency', 'monetary']
data = data.reset_index()
data.head(3)

Unnamed: 0,USERID,recency,frequency,monetary
0,51220,2016-09-20,2,151.9
1,51221,2016-05-16,1,29.9
2,51224,2016-07-24,2,12.9


In [42]:
# Compute the R, F and M scores (each an integer from 1 to 5).
# pd.Timestamp is used because the pd.datetime alias was deprecated in
# pandas 1.0 and removed in pandas 2.0.
deadline_date = pd.Timestamp(2017, 1, 1)
r_interval = (deadline_date - data['recency']).dt.days  # days between last order and the deadline

# Equal-width binning into five buckets. The recency labels are reversed so a
# smaller interval (a more recent order) receives the higher score.
data['r_score'] = pd.cut(r_interval, bins=5, labels=[5, 4, 3, 2, 1])
data['r_score'] = data['r_score'].astype('int64')
# Frequency and monetary: larger values fall into higher-numbered buckets.
data['f_score'] = pd.cut(data['frequency'], bins=5, labels=[1, 2, 3, 4, 5])
data['f_score'] = data['f_score'].astype('int64')
data['m_score'] = pd.cut(data['monetary'], bins=5, labels=[1, 2, 3, 4, 5])
data['m_score'] = data['m_score'].astype('int64')
data[:3]

Unnamed: 0,USERID,recency,frequency,monetary,r_score,f_score,m_score
0,51220,2016-09-20,2,151.9,4,1,1
1,51221,2016-05-16,1,29.9,2,1,1
2,51224,2016-07-24,2,12.9,3,1,1


In [49]:
# Weighted RFM score: recency 0.6, frequency 0.3, monetary 0.1.
# NOTE(review): 'rfm_wsocre' looks like a typo for 'rfm_wscore'; the name is kept
# unchanged because other cells may reference it.
r_part = data['r_score'] * 0.6
f_part = data['f_score'] * 0.3
m_part = data['m_score'] * 0.1
data['rfm_wsocre'] = r_part + f_part + m_part

# Three-character combination code: the R, F and M scores concatenated as text
score_digits = data[['r_score', 'f_score', 'm_score']].astype('str')
data['rfm_comb'] = score_digits['r_score'] + score_digits['f_score'] + score_digits['m_score']
data.head(3)

Unnamed: 0,USERID,recency,frequency,monetary,r_score,f_score,m_score,rfm_wsocre,rfm_comb
0,51220,2016-09-20,2,151.9,4,1,1,2.8,411
1,51221,2016-05-16,1,29.9,2,1,1,1.6,211
2,51224,2016-07-24,2,12.9,3,1,1,2.2,311


In [56]:
# Number of users per RFM combination code, most common first
comb_counts = data.groupby(['rfm_comb'])['rfm_comb'].count()
comb_counts.sort_values(ascending=False)

rfm_comb
511    15456
411    13493
311    11020
211    10136
111     8769
521      478
421      227
321       78
221       12
412        2
121        2
555        1
322        1
212        1
Name: rfm_comb, dtype: int64

# 基于AdaBoost的营销响应预测

In [57]:
import time  # standard-library time module

import numpy as np  # array operations (np.hstack is used when assembling the feature matrix)
import pandas as pd  # DataFrame loading and manipulation
from sklearn.ensemble import AdaBoostClassifier  # ensemble classifier
from sklearn.feature_selection import SelectPercentile, f_classif  # feature selection
from sklearn.metrics import accuracy_score  # accuracy metric
from sklearn.model_selection import StratifiedKFold, cross_val_score  # cross-validation
from sklearn.pipeline import Pipeline  # Pipeline
from sklearn.preprocessing import OneHotEncoder  # OneHotEncoder

In [64]:
# Load the marketing-response dataset from Excel and preview the first rows
order_path = './datacode_for_book/chapter5/order.xlsx'
data = pd.read_excel(order_path)
data.head(3)

Unnamed: 0,age,total_pageviews,edu,edu_ages,user_level,industry,value_level,act_level,sex,blue_money,red_money,work_hours,region,response
0,39.0,77516.0,1.0,13.0,1.0,1.0,1,1.0,1.0,2174,0.0,40,1.0,0
1,50.0,83311.0,1.0,13.0,2.0,2.0,2,1.0,1.0,0,0.0,13,1.0,0
2,38.0,215646.0,2.0,9.0,3.0,3.0,1,1.0,1.0,0,0.0,40,1.0,0


In [None]:
# Show every row that has at least one missing value
has_missing = data.isna().any(axis='columns')
data.loc[has_missing]

In [None]:
# Number of rows for each value of the response column
response_groups = data.groupby('response')
response_groups['response'].count()

In [None]:
# Dictionary of the columns to convert and the dtype each should be cast to
var_list = {
    'edu': 'int32',
    'user_level': 'int32',
    'industry': 'int32',
    'value_level': 'int32',
    'act_level': 'int32',
    'sex': 'int32',
    'region': 'int32',
}
# NOTE(review): casting to an integer dtype raises if a column still contains NaN,
# and this cell sits before the fillna cell — confirm the intended execution order.
# The loop variable is named col_dtype (not `type`) so the builtin `type` is not
# shadowed for the rest of the notebook session.
for var, col_dtype in var_list.items():  # iterate over column name and target dtype
    data[var] = data[var].astype(col_dtype)  # cast the column

data.dtypes

In [66]:
# Per-column fill values for missing data: the column mean for age,
# total_pageviews and red_money; the column median for the others listed here.
na_rules = {
    'age': data['age'].mean(),
    'total_pageviews': data['total_pageviews'].mean(),
    'edu': data['edu'].median(),
    'edu_ages': data['edu_ages'].median(),
    'user_level': data['user_level'].median(),
    # Fixed: this entry previously used the median of user_level (copy-paste slip),
    # so industry gaps were filled with a statistic from a different column.
    'industry': data['industry'].median(),
    'act_level': data['act_level'].median(),
    'sex': data['sex'].median(),
    'red_money': data['red_money'].mean(),
    'region': data['region'].median(),
}
data = data.fillna(na_rules)  # fill each listed column with its own rule
data[:3]

Unnamed: 0,age,total_pageviews,edu,edu_ages,user_level,industry,value_level,act_level,sex,blue_money,red_money,work_hours,region,response
0,39.0,77516.0,1.0,13.0,1.0,1.0,1,1.0,1.0,2174,0.0,40,1.0,0
1,50.0,83311.0,1.0,13.0,2.0,2.0,2,1.0,1.0,0,0.0,13,1.0,0
2,38.0,215646.0,2.0,9.0,3.0,3.0,1,1.0,1.0,0,0.0,40,1.0,0


In [71]:
# Columns to be one-hot encoded versus columns passed through unchanged
con_cols = ['edu', 'user_level', 'industry', 'value_level', 'act_level', 'sex', 'region']
org_cols = ['age', 'total_pageviews', 'edu_ages', 'blue_money', 'red_money', 'work_hours']
data_con = data[con_cols]
data_org = data[org_cols].values

# Fit the encoder on the selected columns, then expand them into a dense 0/1 array
enc = OneHotEncoder(categories='auto')
enc.fit(data_con)
data_con_new = enc.transform(data_con).toarray()

# Place the untouched columns and the encoded columns side by side
new_matrix = np.concatenate([data_org, data_con_new], axis=1)
new_matrix

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

array([[3.90000e+01, 7.75160e+04, 1.30000e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [5.00000e+01, 8.33110e+04, 1.30000e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.80000e+01, 2.15646e+05, 9.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [6.00000e+01, 1.02310e+05, 1.20000e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [4.80000e+01, 2.40175e+05, 7.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [4.10000e+01, 1.45441e+05, 9.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])