In [1]:
import numpy as np
import pandas as pd
# 创建特征列表
column_names = ['Sample code number',' Clump Thickness','Uniformity of Cell Size',
               'Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size',
               'Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
                  names = column_names)
print('第一行数据为:\n',data.iloc[0])
print('columns 为:\n',data.columns)
print(data.index)
print(data.values)
print('原始数据大小为：',data.shape)

第一行数据为:
 Sample code number             1000025
 Clump Thickness                     5
Uniformity of Cell Size              1
Uniformity of Cell Shape             1
Marginal Adhesion                    1
Single Epithelial Cell Size          2
Bare Nuclei                          1
Bland Chromatin                      3
Normal Nucleoli                      1
Mitoses                              1
Class                                2
Name: 0, dtype: object
columns 为:
 Index(['Sample code number', ' Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses', 'Class'],
      dtype='object')
RangeIndex(start=0, stop=699, step=1)
[[1000025 5 1 ..., 1 1 2]
 [1002945 5 4 ..., 2 1 2]
 [1015425 3 1 ..., 1 1 2]
 ..., 
 [888820 5 10 ..., 10 2 4]
 [897471 4 8 ..., 6 1 4]
 [897471 4 8 ..., 4 1 4]]
原始数据大小为： (699, 11)


In [2]:
# 将？替换为标准缺失值表示
data = data.replace(to_replace = "?",value = np.nan)
# dropna 丢弃带有nan的纬度
data = data.dropna(how = 'any')
print('处理后大小为：',data.shape)

处理后大小为： (683, 11)


In [58]:
# 划分数据集
from sklearn.model_selection import train_test_split
# 随机采样25%的数据作为测试集
X_train,X_test,y_train,y_test = train_test_split(data[column_names[1:10]],data[column_names[10]],test_size = 0.25)
# 查看训练数据和测试数据数量和类别分布
print('训练样本数量和类别为：')
# value_counts()返回包含唯一值计数的对象
print(y_train.value_counts())
print('测试样本数量和类别为：')
print(y_test.value_counts())

训练样本数量和类别为：
2    332
4    180
Name: Class, dtype: int64
测试样本数量和类别为：
2    112
4     59
Name: Class, dtype: int64


In [66]:
# 进行线性分类
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier #随机梯度下降算法

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# 初始化LogisticRegression() 和 SGDClassifier() 分类器
lr = LogisticRegression()
sgdc = SGDClassifier()

# 使用LogisticRegression()分类器来训练模型参数，并在测试集上预测输出
lr.fit(X_train,y_train)
lr_y_predict = lr.predict(X_test) 
print(lr)

# 使用 SGDClassifier() 分类器来训练模型参数，并在测试集上预测输出
sgdc.fit(X_train,y_train)
sgdc_y_predict = sgdc.predict(X_test)
print(sgdc)

print('运行完毕')

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
运行完毕


In [67]:
# 模型预测评估
from sklearn.metrics import classification_report

# 1、使用自带的score函数进行查看准确率
lr_train_accuary = lr.score(X_train,y_train)
print(' LogisticRegression 在训练集上的准确率：',lr_train_accuary)
lr_test_accuary = lr.score(X_test,y_test)
print(' LogisticRegression 在测试集上的准确率：',lr_test_accuary)
# 2、使用 classification_report 查看
lr_report = classification_report(y_test,lr_y_predict,target_names = ['良性','恶性'])
print(lr_report)

###########################################################

# 1、使用自带的score函数进行查看准确率
sgdc_train_accuary = sgdc.score(X_train,y_train)
print(' SGDClassifier 在训练集上的准确率：',sgdc_train_accuary)
sgdc_test_accuary = sgdc.score(X_test,y_test)
print(' SGDClassifier 在测试集上的准确率：',sgdc_test_accuary)
# 2、使用 classification_report 查看
sgdc_report = classification_report(y_test,sgdc_y_predict,target_names = ['良性','恶性'])
print(sgdc_report)

 LogisticRegression 在训练集上的准确率： 0.974609375
 LogisticRegression 在测试集上的准确率： 0.953216374269
             precision    recall  f1-score   support

         良性       0.97      0.96      0.96       112
         恶性       0.92      0.95      0.93        59

avg / total       0.95      0.95      0.95       171

 SGDClassifier 在训练集上的准确率： 0.9765625
 SGDClassifier 在测试集上的准确率： 0.947368421053
             precision    recall  f1-score   support

         良性       0.98      0.94      0.96       112
         恶性       0.89      0.97      0.93        59

avg / total       0.95      0.95      0.95       171

