In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # this is used for the plot the graph 
import seaborn as sns # used for plot interactive graph.
import warnings
warnings.filterwarnings("ignore")

from pylab import rcParams

import collections
from sklearn.model_selection import train_test_split
import xgboost
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Imputer, LabelBinarizer, Normalizer, OneHotEncoder

%matplotlib inline
pd.options.display.max_columns = None

plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False

In [2]:
# 类别分析(分析分类的占比的分布)
def check_classifier(df, col_name_list=None):
    for col_name in col_name_list:
        print("===========================")
        result = df[col_name].value_counts()
        cols = result.index
        result_dict = {k: v for k, v in zip(cols, list(result))}

        items = []
        header = ["类别名称", "数量"]
        items.append(header)
        for k, v in result_dict.items():
            item = dict()
            item["类别名称"] = k
            item["数量"] = v
            print(col_name, item)

    return None

# 转换为哑变量(如果没有手动指定，则自动识别object进行转换)
def dumb_columns(df, columns=None, inverse=False):

    col_name_list = list(df.columns.values)
    # 没有选择的情况
    if not columns:
        print(col_name_list)
        obj_list = []
        for index, d in enumerate(df.dtypes):
            if d == "object":
                obj_list.append(col_name_list[index])

        columns = obj_list
        print("columns {}".format(columns))
    else:
        if inverse:
            for t in columns:
                col_name_list.remove(t)
            columns = col_name_list
        else:
            columns = columns

    # df = df.fillna("00000000")
    items = []
    for column_name in columns:
        df[column_name] = df[column_name].fillna("00000000")

        # 获得训练集和测试集的所有分类并排序，保持每次运行程序时哑变量数字代表的类型一致
        all_class = list(set(df[column_name]))
#         print("{} - all_class {}".format(column_name, all_class))

        try:
            if "00000000" in all_class:
                all_class.remove("00000000")
                all_class.sort()
                all_class = ["00000000"] + all_class
                class_mapping = {label: idx - 1 for idx, label in enumerate(all_class)}
            else:
                all_class.sort()
                class_mapping = {label: idx for idx, label in enumerate(all_class)}

        except TypeError:
            type_list = set()
            for a in all_class:
                type_list.add(str(type(a)))
            print("type_list {}".format(type_list))
            raise Exception("<{}>列中存在多种类型的数据{}，或者存在没有填充的数据，请预处理后再转哑变量".format(column_name, type_list))

        # 数字映射到每一个类型
        df[column_name] = df[column_name].map(class_mapping).astype(int)

        dict_list = sorted(class_mapping.items(), key=lambda x: x[1], reverse=True)

        page = []
        page.append({"header": ["类别", "编码"]})
        for k, v in dict_list:
            item = collections.OrderedDict()
            item["类别"] = k
            item["编码"] = v
            page.append(item)
            
            print("{}=> {}: {}".format(column_name, k, v))

        items.append({
            "type": "table_1",
            "name": column_name,
            "data": page
        })

    # 转nan
    for col in columns:
        df.loc[df[col] == -1, col] = np.nan
    return df

# 划分训练集和测试集
def split_train_test(df, random_state=33, test_size=0.25):
    random_state = int(random_state)
    test_size = float(test_size)
    train, test = train_test_split(df, random_state=random_state, test_size=test_size)
    return train, test

def split_column(df, y="y"):
    try:
        X = df.drop(y, axis=1)
    except KeyError:
        raise KeyError("请在拆分列的参数中选择数据中有的字段")
    y = pd.DataFrame(df[y], columns=[y])
    return X, y

# xgboost分类
def xgboost_classifier(x, y, max_depth=3, learning_rate=0.1, n_estimators=100, min_child_weight=1, gamma=0,
                                  subsample=1, colsample_bytree=1, scale_pos_weight=1, random_state=27, reg_alpha=0, reg_lambda=1):

    if max_depth:
        max_depth = int(max_depth)
        if max_depth < 0:
            max_depth = None
    else:
        max_depth = None

    learning_rate = float(learning_rate)
    n_estimators = int(n_estimators)
    min_child_weight = int(min_child_weight)
    gamma = float(gamma)
    subsample = float(subsample)
    colsample_bytree = float(colsample_bytree)
    scale_pos_weight = int(scale_pos_weight)

    random_state = int(random_state)

    # 拟合XGBoost模型
    model = xgboost.XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators,
                                  min_child_weight=min_child_weight, gamma=gamma,
                                  subsample=subsample, colsample_bytree=colsample_bytree,
                                  scale_pos_weight=scale_pos_weight, random_state=random_state, reg_alpha=reg_alpha, reg_lambda=reg_lambda)

    model.fit(x, y)
    return model

# 转换为one-hot编码
def OnehotEncoding(df, columns=None):
    if not columns:
        col_name_list = df.columns.values

        obj_list = []
        for index, d in enumerate(df.dtypes):
            if d == "object":
                obj_list.append(col_name_list[index])

        columns = obj_list

    data = df[columns]
    # 实例化OnehotEncoder
    enc = OneHotEncoder(categories="auto")
    # 生成目标特征列One_hot编码
    data_encoded = enc.fit_transform(data).toarray()
    # 生成新的列名
    new_columns = list(enc.get_feature_names())
    for i, column_name in enumerate(new_columns):
        df[column_name] = data_encoded[:, i]
    return df

In [3]:
df = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(df.shape)
df.head()

(7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), obj

In [5]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [6]:
df.columns.values

array(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges', 'Churn'], dtype=object)

In [7]:
df.rename(
    columns={
        'customerID': "客户ID",
        'gender': "性别",
        'SeniorCitizen': "是否老年人客户",
        'Partner': "是否有合作伙伴",
        'Dependents': "是否有被抚养人",
        'tenure': "在当前公司工作多少个月",
        'PhoneService': "电话服务",
        'MultipleLines': "使用多个电话服务",
        'InternetService': "互联网供应商",
        'OnlineSecurity': "在线安全服务",
        'OnlineBackup': "在线备份服务",
        'DeviceProtection': "设备保护",
        'TechSupport': "技术支持",
        'StreamingTV': "流媒体电视",
        'StreamingMovies': "流媒体电影",
        'Contract': "合同期限",
        'PaperlessBilling': "无纸化计费",
        'PaymentMethod': "付款方式",
        'MonthlyCharges': "每月付费",
        'TotalCharges': "总计付费",
        'Churn': "是否流失",
    },
    inplace=True)
df.head()

Unnamed: 0,客户ID,性别,是否老年人客户,是否有合作伙伴,是否有被抚养人,在当前公司工作多少个月,电话服务,使用多个电话服务,互联网供应商,在线安全服务,在线备份服务,设备保护,技术支持,流媒体电视,流媒体电影,合同期限,无纸化计费,付款方式,每月付费,总计付费,是否流失
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [8]:
check_class_list = ["性别", "是否老年人客户", "是否有合作伙伴", "是否有被抚养人", "电话服务", "使用多个电话服务", "互联网供应商",
                    "在线安全服务", "在线备份服务", "设备保护", "技术支持", "流媒体电视", "流媒体电影", "合同期限", "无纸化计费",
                   "付款方式", "是否流失"]
_ = check_classifier(df, col_name_list=check_class_list)

性别 {'类别名称': 'Male', '数量': 3555}
性别 {'类别名称': 'Female', '数量': 3488}
是否老年人客户 {'类别名称': 0, '数量': 5901}
是否老年人客户 {'类别名称': 1, '数量': 1142}
是否有合作伙伴 {'类别名称': 'No', '数量': 3641}
是否有合作伙伴 {'类别名称': 'Yes', '数量': 3402}
是否有被抚养人 {'类别名称': 'No', '数量': 4933}
是否有被抚养人 {'类别名称': 'Yes', '数量': 2110}
电话服务 {'类别名称': 'No', '数量': 682}
电话服务 {'类别名称': 'Yes', '数量': 6361}
使用多个电话服务 {'类别名称': 'No', '数量': 3390}
使用多个电话服务 {'类别名称': 'No phone service', '数量': 682}
使用多个电话服务 {'类别名称': 'Yes', '数量': 2971}
互联网供应商 {'类别名称': 'DSL', '数量': 2421}
互联网供应商 {'类别名称': 'Fiber optic', '数量': 3096}
互联网供应商 {'类别名称': 'No', '数量': 1526}
在线安全服务 {'类别名称': 'No internet service', '数量': 1526}
在线安全服务 {'类别名称': 'No', '数量': 3498}
在线安全服务 {'类别名称': 'Yes', '数量': 2019}
在线备份服务 {'类别名称': 'No internet service', '数量': 1526}
在线备份服务 {'类别名称': 'No', '数量': 3088}
在线备份服务 {'类别名称': 'Yes', '数量': 2429}
设备保护 {'类别名称': 'No internet service', '数量': 1526}
设备保护 {'类别名称': 'No', '数量': 3095}
设备保护 {'类别名称': 'Yes', '数量': 2422}
技术支持 {'类别名称': 'No internet service', '数量': 1526}
技术支持 {'类别名称': 'No', '数量': 3

In [9]:
# 缺失值
df['总计付费'] = df["总计付费"].replace(" ",np.nan)
df = df[df["总计付费"].notnull()]
df = df.reset_index()[df.columns]

df[["总计付费"]] = df[["总计付费"]].astype(float)

df["总计付费_每月付费_比值"] = df["每月付费"] / df["总计付费"]

In [10]:
replace_cols = [ '在线安全服务', '在线备份服务', '设备保护',
                '技术支持','流媒体电视', '流媒体电影']
for i in replace_cols : 
    df[i]  = df[i].replace({'No internet service' : 'No'})

In [11]:
# dumb_list = ["性别", "是否有合作伙伴", "是否有被抚养人", "电话服务", "使用多个电话服务", "互联网供应商",
#                     "在线安全服务", "在线备份服务", "设备保护", "技术支持", "流媒体电视", "流媒体电影", "合同期限", "无纸化计费",
#                    "付款方式", "是否流失"]
# df = dumb_columns(df, columns=check_class_list, inverse=False)

df["SeniorCitizen"] = df["是否老年人客户"].replace({1:"Yes",0:"No"})

KeyError: 'SeniorCitizen'

In [None]:
drop_list = ['在当前公司工作多少个月']
df = df.drop(drop_list, axis=1)

# dl_list = ["使用多个电话服务", "互联网供应商",
#                     "在线安全服务", "在线备份服务", "技术支持", "流媒体电影", "合同期限", "无纸化计费",
#                    "付款方式"]
# df = OnehotEncoding(df, columns=dl_list)
# df = df.drop(dl_list, axis=1)

In [None]:
df.head()

In [None]:
# 计算变量之间的相关系数
# 如果变量之间相关系数大于0.6，说明两个变量有较高的正相关性，
# 这种情况训练的模型会使模型失真，可以选择去掉其中一个变量
corr = df.corr()
xticks = list(corr.index) # x轴标签
yticks = list(corr.index) # y轴标签
fig = plt.figure(figsize=(15,10))
ax1 = fig.add_subplot(1, 1, 1)
sns.heatmap(corr, annot=True, cmap="rainbow",ax=ax1, linewidths=.5, annot_kws={'size': 9, 'weight': 'bold', 'color': 'blue'})

ax1.set_xticklabels(xticks, rotation=35, fontsize=8)
ax1.set_yticklabels(yticks, rotation=0, fontsize=8)
plt.show()
# 本例中没有相关度较高的变量

In [None]:
df_train_train, df_train_valid = split_train_test(df, random_state=15, test_size=0.25)

df_train_train, train_ID = split_column(df_train_train, y="客户ID")
df_train_valid, valid_ID = split_column(df_train_valid, y="客户ID")

df_train_train_x, df_train_train_y = split_column(df_train_train, y="是否流失")
df_train_valid_x, df_train_valid_y = split_column(df_train_valid, y="是否流失")

In [None]:
df_train_train_x.head()

In [None]:
def train_and_valid(train_x, train_y, valid_x, valid_y, n_estimators=10, max_depth=5, gamma=0, learning_rate=0.1, reg_alpha=0, reg_lambda=1, random_state=27):
    # xgboost分类
    xg_model = xgboost_classifier(train_x, train_y, max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, min_child_weight=1, gamma=gamma, subsample=1, colsample_bytree=1, scale_pos_weight=1, random_state=random_state, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
    # 逻辑回归
#     xg_model = logistic_regression(train_x, train_y, penalty="l2", class_weight=None, C=1)
    # 支持向量机
#     xg_model = svm_classifier(train_x, train_y, kernel='rbf', C=1, gamma=0.1, degree=3, class_weight=None, coef0=0)
    # 随机森林
#     xg_model = random_forest_classifier(train_x, train_y, criterion="gini", n_estimators=50, max_depth=7, min_samples_split=2, min_samples_leaf=2, min_weight_fraction_leaf=0., max_features="log2", random_state=33, max_leaf_nodes=None, min_impurity_decrease=0., min_impurity_split=1e-7)
    # lgbm分类
#     xg_model = lgbm_classifier(train_x, train_y, max_depth=8, learning_rate=0.1, n_estimators=100, min_child_weight=1, gamma=0, subsample=1, colsample_bytree=1, scale_pos_weight=1, random_state=27)
    
    y_pred = xg_model.predict(valid_x)

    accuracy_score_result = metrics.accuracy_score(valid_y, y_pred)
    precision_score_result = metrics.precision_score(valid_y, y_pred, average='macro')
    recall_score_result = metrics.recall_score(valid_y, y_pred, average='macro')
    f1_score_result = metrics.f1_score(valid_y, y_pred, average='macro')

    print("accuracy_score_result: {}".format(accuracy_score_result))
    print("precision_score_result: {}".format(precision_score_result))
    print("recall_score_result: {}".format(recall_score_result))
    print("f1_score_result: {}".format(f1_score_result))
    
    return xg_model

In [None]:
model = train_and_valid(df_train_train_x, df_train_train_y, df_train_valid_x, df_train_valid_y)
# 计算y_prob, y_pred
y_prob = model.predict_proba(df_train_valid_x)
# 计算auc, ks, gini
fpr, tpr, threshold = metrics.roc_curve(df_train_valid_y, y_prob[:, 1])
auc_value = metrics.auc(fpr, tpr)  # 计算auc
print("=======", "** auc_value: 【{}】 **".format(auc_value))

plt.plot(fpr, tpr, color='darkorange',label='ROC curve (area = %0.2f)' % auc_value)
plt.plot([0, 1], [0, 1], color='navy',  linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC_curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# 计算KS值
fig, ax = plt.subplots()
ax.plot(1 - threshold, tpr, label='tpr') # ks曲线要按照预测概率降序排列，所以需要1-threshold镜像
ax.plot(1 - threshold, fpr, label='fpr')
ax.plot(1 - threshold, tpr-fpr,label='KS')
plt.xlabel('score')
plt.title('KS Curve')
plt.ylim([0.0, 1.0])
plt.figure(figsize=(20,20))
legend = ax.legend(loc='upper left')
plt.show()

# 计算KS值
print(max(tpr-fpr))

In [None]:
result_dict = {k: v for k, v in zip(df_train_train_x.columns.values, list(model.feature_importances_))}
dict_list = sorted(result_dict.items(), key=lambda x: x[1], reverse=True)
not_zero_list = []
for dl in dict_list:
    if dl[1] < 0:
        not_zero_list.append(dl[0])
    else:
        print(dl)