### 读取数据

In [9]:
import pandas
# Load the customer-churn CSV. The first row supplies the column names and
# the first CSV column becomes the row index.
df = pandas.read_csv(
    'Data/customer_churn.csv',
    header=0,
    index_col=0,
)
df.head()

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
1,KS,128,area_code_415,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,no
2,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,no
3,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,no
4,OH,84,area_code_408,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
5,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no


### 数据处理

In [11]:
# Remove the three leading columns (state, account_length, area_code) so that
# only the plan flags, usage figures and the churn label remain.
# Dropping by name with errors='ignore' keeps this cell idempotent: the old
# positional slice (iloc[:, 3:]) removed three MORE columns every time the
# cell was re-run, silently discarding real features.
df = df.drop(columns=['state', 'account_length', 'area_code'], errors='ignore')
df.head()

Unnamed: 0,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
1,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,no
2,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,no
3,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,no
4,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
5,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no


In [12]:
# Columns holding 'yes'/'no' strings that must become 1/0 for scikit-learn.
cat_var = ['international_plan','voice_mail_plan', 'churn']

# Encode 'yes' as 1 and everything else as 0.
# An already-encoded 1 is also accepted so that re-running this cell is a
# no-op; previously a second run compared the integers against 'yes' and
# turned every flag (including the churn label) into 0.
for var in cat_var:
    df[var] = df[var].map(lambda e: 1 if e in (1, 'yes') else 0)

# The last column (churn) is the target; all preceding columns are features.
y = df.iloc[:,-1]
X = df.iloc[:,:-1]

In [13]:
# Preview the first rows of df to inspect the current column values.
df.head()

Unnamed: 0,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
1,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
2,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
3,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
4,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
5,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


### 区分训练集和测试集

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed seed makes the split
# repeatable between runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    random_state=123,
    test_size=0.33,
)

### 决策树分类模型

In [None]:
from sklearn import tree

# Shallow decision tree (at most 3 levels) fitted on the training split.
# random_state is pinned (same seed as the train/test split) because
# scikit-learn permutes the candidate features at every split, so an unseeded
# tree may differ between runs when several splits are equally good.
clf = tree.DecisionTreeClassifier(max_depth=3, random_state=123)
clf.fit(train_X, train_y)

In [None]:
# Predict labels for the held-out rows with the fitted classifier `clf`,
# then echo the resulting array as the cell output.
predicted = clf.predict(X=test_X)
predicted

In [None]:
### Compute accuracy by hand: number of test rows whose prediction equals the
### true label, divided by the number of test rows.
### (Original author's note: this figure is of limited value on its own.)
import numpy
(test_y == predicted).sum() / test_y.size

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows that were classified correctly, via scikit-learn.
accuracy_score(y_true=test_y, y_pred=predicted)

#### 产生混淆矩阵

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted labels on the test split
# (scikit-learn convention: rows = true class, columns = predicted class).
m = confusion_matrix(y_true=test_y, y_pred=predicted)
m

#### 评判指标

In [None]:
from sklearn.metrics import classification_report
# The report is a multi-line string, so it is printed rather than echoed to
# keep its line breaks readable.
report_text = classification_report(y_true=test_y, y_pred=predicted)
print(report_text)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default (unconstrained) depth, fitted on the training
# split as the first candidate for the model comparison.
# random_state is pinned (same seed as the train/test split) so that tie
# breaking between equally good splits is repeatable across runs.
clf1 = DecisionTreeClassifier(random_state=123)
clf1.fit(train_X, train_y)

In [None]:
### The cache_size and max_iter settings affect both the accuracy of the model
### and how long the fit takes.
### Without them the fit takes a long time and may keep iterating.
from sklearn.svm import SVC
clf2 = SVC(
    kernel='linear',
    probability=True,  # predict_proba is called on this model for the ROC plot
    max_iter=10000,
    cache_size=7000,
)
clf2.fit(X=train_X, y=train_y)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default settings, fitted on the
# training split as the third candidate model.
clf3 = LogisticRegression()
clf3.fit(X=train_X, y=train_y)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters, fitted on the training split.
# random_state is pinned (same seed as the train/test split): the forest
# is randomized, so without a seed the feature_importances_ read from clf4
# later in the notebook would change from run to run.
clf4 = RandomForestClassifier(random_state=123)
clf4.fit(train_X, train_y)

### ROC比较模型

In [None]:
## An ROC curve needs binary ground truth (true / false). If the target has
## more than two classes, the labels must be processed first before an ROC
## curve can be drawn.
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
plt.figure(figsize= [20,10])
# The loop variable is named `model` rather than `clf` so it does not overwrite
# the notebook-level `clf` (the depth-3 decision tree); previously `clf` ended
# up pointing at the random forest after this cell ran.
for model, title in zip([clf1,clf2,clf3, clf4], ['Decision Tree', 'SVM', 'LogisticRegression', 'RandomForest']):
    # Every model was already fitted in its own cell, so it is reused here
    # instead of being trained a second time.
    probas_ = model.predict_proba(test_X)
    # Column 1 holds the predicted probability of the positive class (churn = 1).
    fpr, tpr, thresholds = roc_curve(test_y, probas_[:, 1])
    plt.plot(fpr, tpr, label='%s - AUC:%.2f'%(title, auc(fpr, tpr)) )

# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate', fontsize = 20)
plt.ylabel('True Positive Rate', fontsize = 20)
plt.title('Receiver operating characteristic example', fontsize = 20)
plt.legend(loc="lower right", fontsize = 20)
plt.show()

In [None]:
### Rank the features by importance, i.e. which columns of X influence y most
### according to the random forest.
### argsort returns positions (indices); they are used to reorder the columns.
train_X.columns.take(clf4.feature_importances_.argsort()[::-1])

In [None]:
### Evaluate feature importance

In [None]:
import matplotlib.pyplot as plt
importance = clf4.feature_importances_
names = train_X.columns
# Positions of the features from most to least important; computed once and
# shared by the bar heights and the tick labels so both use the same order.
desc_order = importance.argsort()[::-1]
bar_positions = range(len(names))
plt.bar(bar_positions, importance[desc_order])
# Feature names are long, so the tick labels are rotated to stay legible.
plt.xticks(bar_positions, names[desc_order], rotation=90)
plt.title('Feature Importance')
plt.show()