In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import confusion_matrix
inputfile0 = 'user_loss.csv'  # 客户信息数据文件
inputfile1 = 'info_new.csv'  # 订单表数据文件
outputfile = 'info_user.csv'  # 合并客户信息和订单表后的数据文件

# 合并两个表
# 读取数据
user_loss = pd.read_csv(inputfile0, encoding='gbk')
info_new = pd.read_csv(inputfile1) 
# 将时间转为时间格式
user_loss['CREATED'] = pd.to_datetime(user_loss['CREATED'])
info_new['use_start_time'] = pd.to_datetime(info_new['use_start_time'])
info_new['lock_time'] = pd.to_datetime(info_new['lock_time'])

# 匹配用户的最后一次用餐时间
for i in range(len(user_loss)):
    info1 = info_new.iloc[info_new[info_new['name']==user_loss.iloc[i,2]].index.tolist(),:]
    if sum(info_new['name']==user_loss.iloc[i,2]) != 0:
        user_loss.iloc[i,14]= max(info1['use_start_time'])

# 筛选特征
user = user_loss.iloc[:,[0,2,14,37]]
# 提取有效订单
info = info_new.loc[info_new['order_status'] == 1,['emp_id', 'number_consumers', 'expenditure']]  

info = info.rename(columns = {'emp_id':'USER_ID'})  # 修改列名
# 合并两个表
info_user = pd.merge(user, info,left_on = 'USER_ID',right_on = 'USER_ID', how='left')
info_user.to_csv(outputfile , index=False,sep=',')

In [2]:
inputfile = 'info_user.csv'  # 合并客户信息和订单表后的数据文件
outputfile = 'info_user_clear.csv'  # 客户流失特征数据文件
# 构建特征
info_user = pd.read_csv(inputfile , encoding='utf-8')

# 提取info表的用户名和用餐时间，并按人名对用餐人数和金额进行分组求和
info_user1 = info_user['USER_ID'].value_counts()  # 统计每个人的用餐次数
info_user1 = info_user1.reset_index()
info_user1.columns = ['USER_ID', 'frequence']  # 修改列名

# 求出每个人的消费总金额
info_user2 = info_user[['number_consumers', 'expenditure']].groupby(info_user['USER_ID']).sum()  # 分组求和
info_user2 = info_user2.reset_index()
info_user2.columns = ['USER_ID', 'numbers', 'amount']
info_user_new = pd.merge(info_user1,info_user2,left_on='USER_ID', right_on='USER_ID', how='left')  # 合并两个表

# 对合并后的数据进行处理
info_user = info_user.iloc[:,:4]
info_user = info_user.groupby(['USER_ID']).last()
info_user = info_user.reset_index()
info_user_new = pd.merge(info_user_new,info_user,left_on='USER_ID', right_on='USER_ID', how='left')  # 合并两个表

# 去除空值
info_user_new.isnull().sum().sum()  # 查看空值数目
info_user_new = info_user_new.dropna(axis=0)
info_user_new = info_user_new[info_user_new["numbers"] != 0]  # 删除numbers为0的客户

# 求平均消费金额，并保留2为小数
info_user_new['average'] = info_user_new['amount']/info_user_new['numbers']
info_user_new['average'] = info_user_new['average'].apply(lambda x: '%.2f'% x)

# 计算每个客户最近一次点餐的时间距离观测窗口结束的天数
# 修改时间列，改为日期

info_user_new['LAST_VISITS'] = pd.to_datetime(info_user_new['LAST_VISITS'])
datefinally = pd.to_datetime('2016-7-31')  # 观测窗口结束时间
time = datefinally - info_user_new['LAST_VISITS']
info_user_new['recently'] = time.apply(lambda x: x.days)   # 计算时间差

info_user_new = info_user_new.loc[:,['USER_ID', 'ACCOUNT', 'frequence', 'amount', 'average','recently', 'type']]  # 筛选属性
info_user_new.to_csv(outputfile , index=False, encoding='gbk')

In [3]:
# 划分测试集、训练集
info_user = pd.read_csv('info_user_clear.csv', encoding='gbk')
from sklearn.preprocessing import LabelEncoder

# 创建LabelEncoder实例
le = LabelEncoder()

# 对'流失状态'列进行编码，将"流失"映射为1，"非流失"映射为2，准流失为0
info_user['type'] = le.fit_transform(info_user['type'])
# 删除流失用户
info_user = info_user[info_user['type'] != '1']

model_data = info_user.iloc[:,[2,3,4,5,6]]

x_tr, x_te, y_tr, y_te = train_test_split(model_data.iloc[:,:-1], model_data['type'], test_size=0.2, random_state=12345)

dtc = DTC()  # 初始化决策树对象，基于信息熵
dtc.fit(x_tr,y_tr)  # 训练模型
pre = dtc.predict(x_te)
sum(pre == y_te)/len(pre)

0.8343815513626834

In [4]:
from sklearn.ensemble import RandomForestRegressor

random_forest_model = RandomForestRegressor(n_estimators=3, random_state=42)
random_forest_model.fit(x_tr,y_tr)
pre2 = random_forest_model.predict(x_te)
sum(pre2 == y_te)/len(pre)

0.7190775681341719

In [5]:
from sklearn.svm import SVC

svc=SVC(random_state=42)
svc.fit(x_tr,y_tr)
pre3 = svc.predict(x_te)
sum(pre3 == y_te)/len(pre)

0.7358490566037735

In [6]:
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto')
knn.fit(x_tr,y_tr)
pre4 = knn.predict(x_te)
sum(pre4 == y_te)/len(pre)

0.8448637316561844