In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
file_path = './tmp/data_cleaned.csv'
data = pd.read_csv(file_path)

data.head(), data.columns

(   Unnamed: 0  MEMBER_NO   FFP_DATE FIRST_FLIGHT_DATE GENDER  FFP_TIER  \
 0           0      54993  2006/11/2        2008/12/24      男         6   
 1           1      28065  2007/2/19          2007/8/3      男         6   
 2           2      55106   2007/2/1         2007/8/30      男         6   
 3           3      21189  2008/8/22         2008/8/23      男         5   
 4           4      39546  2009/4/10         2009/4/15      男         6   
 
      WORK_CITY WORK_PROVINCE WORK_COUNTRY   AGE  ... ADD_Point_SUM  \
 0            .            北京           CN  31.0  ...         39992   
 1          NaN            北京           CN  42.0  ...         12000   
 2            .            北京           CN  40.0  ...         15491   
 3  Los Angeles            CA           US  64.0  ...             0   
 4           贵阳            贵州           CN  48.0  ...         22704   
 
    Eli_Add_Point_Sum  L1Y_ELi_Add_Points  Points_Sum  L1Y_Points_Sum  \
 0             114452              111100      

In [8]:
# 客户分类
def categorize_customer(ratio):
    if ratio < 0.5:
        return '已流失'
    elif 0.5 <= ratio < 0.9:
        return '准流失'
    else:
        return '未流失'

# 根据“Ration_L1Y_Flight_Count”对客户进行分类
data['Customer_Status'] = data['Ration_L1Y_Flight_Count'].apply(categorize_customer)

# 关键属性
selected_features = ['FFP_TIER', 'AVG_INTERVAL', 'avg_discount', 'EXCHANGE_COUNT', 
                     'Point_NotFlight', 'Ration_L1Y_Flight_Count', 
                     'L1Y_BP_SUM', 'P1Y_BP_SUM', 'Customer_Status']

old_customers_data = data[data['FLIGHT_COUNT'] > 6]
old_customers_selected = old_customers_data[selected_features]
old_customers_selected.to_csv("./tmp/classified.csv")
old_customers_selected.head()

Unnamed: 0,FFP_TIER,AVG_INTERVAL,avg_discount,EXCHANGE_COUNT,Point_NotFlight,Ration_L1Y_Flight_Count,L1Y_BP_SUM,P1Y_BP_SUM,Customer_Status
0,6,3.483254,0.961639,34,50,0.509524,259111,246197,准流失
1,6,5.194245,1.252314,29,33,0.514286,185122,177358,准流失
2,6,5.298507,1.254676,20,26,0.518519,182087,169072,准流失
3,5,27.863636,1.09087,11,12,0.434783,151210,186104,已流失
4,6,4.788079,0.970658,27,39,0.532895,145396,128448,准流失


In [9]:
X = data_selected.drop('Customer_Status', axis=1)
y = data_selected['Customer_Status']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

In [11]:
clf = DecisionTreeClassifier(random_state=32)
# 训练模型
clf.fit(X_train, y_train)

In [16]:
# 预测测试集
te_pred = clf.predict(X_test)
te_pred

array(['准流失', '未流失', '已流失', ..., '未流失', '已流失', '准流失'], dtype=object)

In [18]:
cm_te = confusion_matrix(y_test, te_pred)
cm_te

array([[4982,    0,    0],
       [   0, 5635,    0],
       [   0,    0, 1792]])

In [19]:
print("测试集准确率：", accuracy_score(y_test, te_pred))

测试集准确率： 1.0


In [20]:
# 预测训练集
tr_pred = clf.predict(X_train)
tr_pred

array(['已流失', '已流失', '已流失', ..., '准流失', '已流失', '已流失'], dtype=object)

In [21]:
cm_tr = confusion_matrix(y_train, tr_pred)
cm_tr

array([[19859,     0,     0],
       [    0, 22696,     0],
       [    0,     0,  7079]])

In [22]:
print("训练集准确率：", accuracy_score(y_train, tr_pred))

训练集准确率： 1.0


In [23]:
classification_report_output = classification_report(y_test, te_pred)
classification_report_output

'              precision    recall  f1-score   support\n\n         准流失       1.00      1.00      1.00      4982\n         已流失       1.00      1.00      1.00      5635\n         未流失       1.00      1.00      1.00      1792\n\n    accuracy                           1.00     12409\n   macro avg       1.00      1.00      1.00     12409\nweighted avg       1.00      1.00      1.00     12409\n'