In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,roc_auc_score

In [3]:
# 1. 获取数据

names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                   'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                   'Normal Nucleoli', 'Mitoses', 'Class']

data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
                  names=names)

In [6]:
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [9]:
# 2. 数据基本处理
#    2.1 缺失值处理
np.any(data=='?')

Sample code number             False
Clump Thickness                False
Uniformity of Cell Size        False
Uniformity of Cell Shape       False
Marginal Adhesion              False
Single Epithelial Cell Size    False
Bare Nuclei                     True
Bland Chromatin                False
Normal Nucleoli                False
Mitoses                        False
Class                          False
dtype: bool

In [10]:
data.replace('?',np.nan,inplace=True)

In [12]:
np.any(pd.isnull(data))

Sample code number             False
Clump Thickness                False
Uniformity of Cell Size        False
Uniformity of Cell Shape       False
Marginal Adhesion              False
Single Epithelial Cell Size    False
Bare Nuclei                     True
Bland Chromatin                False
Normal Nucleoli                False
Mitoses                        False
Class                          False
dtype: bool

In [14]:
data.dropna(inplace=True)

In [15]:
np.any(pd.isnull(data))

Sample code number             False
Clump Thickness                False
Uniformity of Cell Size        False
Uniformity of Cell Shape       False
Marginal Adhesion              False
Single Epithelial Cell Size    False
Bare Nuclei                    False
Bland Chromatin                False
Normal Nucleoli                False
Mitoses                        False
Class                          False
dtype: bool

In [16]:
#  2.2 确定特征值和目标值
x = data.iloc[:,1:10]

In [18]:
y = data.iloc[:,10]

In [20]:
#    2.3 切割数据
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=22)

In [22]:
# 3. 特征工程
#    特征标准化
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [23]:
# 4. 建立模型
#    逻辑回归
estimator = LogisticRegression()

In [25]:
estimator.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
# 5. 模型评估
pre = estimator.predict(x_test)

print("真实值与预测试",pre==y_test)  

真实值与预测试 389     True
32      True
272     True
655     True
271     True
478     True
650     True
375     True
458     True
683     True
387     True
452     True
694     True
132     True
202     True
61      True
173     True
227     True
113     True
539     True
117     True
277     True
295     True
367     True
323     True
209     True
466     True
419     True
29      True
242     True
       ...  
495     True
200     True
44      True
454     True
169     True
399     True
35      True
67      True
577     True
70      True
486     True
129     True
350     True
45      True
590     True
219     True
1      False
212     True
2       True
558     True
49      True
679     True
479     True
150     True
480     True
250     True
436     True
496     True
645     True
518     True
Name: Class, Length: 137, dtype: bool


In [29]:
estimator.score(x_test,y_test)

0.9854014598540146

# 分类评估报告

In [32]:
ret = classification_report(y_test,pre,labels=[2,4],target_names=['良性','恶性'])

In [33]:
print(ret)

             precision    recall  f1-score   support

         良性       0.99      0.99      0.99        89
         恶性       0.98      0.98      0.98        48

avg / total       0.99      0.99      0.99       137



In [37]:
y_test1 = np.where(y_test>3,1,0)

In [39]:
print("AUC 指标",roc_auc_score(y_test1,pre))

AUC 指标 0.9839653558052434
