# 第 6 章：逻辑回归算法 实例：乳腺癌预测

+ 实际上只有 10 个基础指标；每个指标分别给出均值（mean）、误差（error）和最差值（worst）三种统计量，共 30 个特征，也就是说其中 20 个特征是构造出来的。
+ 用逻辑回归训练模型。

In [1]:
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset, then pull out the feature matrix (X)
# and the label vector (y) in one step.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

In [2]:
# The dataset has 30 features (second entry of the shape).
X.shape

(569, 30)

In [3]:
cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [4]:
# Count how many samples carry each label value using a vectorised sum.
# NOTE(review): cancer.target_names lists 'malignant' at index 0 and
# 'benign' at index 1, so the wording in the printed labels follows the
# label value -- confirm it matches the intended clinical meaning.
print("0 表示阴性", (y == 0).sum(), "1 表示阳性", (y == 1).sum())

0 表示阴性 212 1 表示阳性 357


+ malignant：恶性肿瘤（对应标签 0）
+ benign：良性肿瘤（对应标签 1）

In [5]:
# Class names indexed by label value: index 0 is 'malignant', index 1 is
# 'benign'.
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [6]:
import numpy as np

# np.unique returns the unique names in sorted order, so the order shown
# here is alphabetical and does NOT match the label order of
# cancer.target_names.
np.unique(cancer.target_names)

array(['benign', 'malignant'], dtype='<U9')

In [7]:
# Print arrays with two decimals and without scientific notation.
np.set_printoptions(precision=2, suppress=True)

In [8]:
cancer.data

array([[ 17.99,  10.38, 122.8 , ...,   0.27,   0.46,   0.12],
       [ 20.57,  17.77, 132.9 , ...,   0.19,   0.28,   0.09],
       [ 19.69,  21.25, 130.  , ...,   0.24,   0.36,   0.09],
       ...,
       [ 16.6 ,  28.08, 108.3 , ...,   0.14,   0.22,   0.08],
       [ 20.6 ,  29.33, 140.1 , ...,   0.27,   0.41,   0.12],
       [  7.76,  24.54,  47.92, ...,   0.  ,   0.29,   0.07]])

分割训练数据集和测试数据集。

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed random_state
# keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

In [10]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression on the raw features, fitted with the
# newton-cg solver, then scored on both splits.
model = LogisticRegression(solver='newton-cg').fit(X_train, y_train)

train_score, test_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

In [11]:
train_score

0.9626373626373627

In [12]:
test_score

0.9473684210526315

In [13]:
# Predicted labels for the held-out test samples.
y_pred = model.predict(X_test)

In [14]:
from sklearn.metrics import accuracy_score

# Accuracy computed from the explicit predictions; the value equals the
# test_score obtained from model.score(X_test, y_test).
accuracy_score(y_test, y_pred)

0.9473684210526315

## 增加多项式特征

多项式特征 + L1 正则化。

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline


def polynomial_model(degree=1, **kwarg):
    """Build a pipeline: polynomial feature expansion, then logistic regression.

    Args:
        degree: Degree passed to ``PolynomialFeatures``; the bias column is
            left out (``include_bias=False``).
        **kwarg: Keyword arguments forwarded unchanged to
            ``LogisticRegression``.

    Returns:
        An unfitted ``Pipeline`` whose steps are named
        ``'polynomial_features'`` and ``'logistic_regression'``.
    """
    steps = [
        ('polynomial_features',
         PolynomialFeatures(degree=degree, include_bias=False)),
        ('logistic_regression', LogisticRegression(**kwarg)),
    ]
    return Pipeline(steps)


# Degree-2 polynomial features followed by an L1-penalised logistic
# regression. The solver is given explicitly: liblinear is one of the
# solvers that accept penalty='l1'.
model = polynomial_model(degree=2, penalty='l1', solver='liblinear')
# Fit on the training split; as the last expression of the cell this also
# echoes the fitted pipeline.
model.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('polynomial_features', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)), ('logistic_regression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))])

In [16]:
# Accuracy of the polynomial + L1 pipeline on the training and test splits.
train_score, test_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

In [17]:
train_score

1.0

In [18]:
test_score

0.9736842105263158

In [19]:
# Pull the fitted LogisticRegression step out of the pipeline by its step
# name so its coefficients can be inspected.
logit_regression = model.named_steps['logistic_regression']

In [20]:
# One row of coefficients, one column per expanded feature: the 30 original
# features plus 465 degree-2 terms (30 squares + 435 pairwise products)
# give 495 columns.
logit_regression.coef_.shape

(1, 495)

In [21]:
# Number of coefficients that are non-zero after fitting with the L1
# penalty (out of the 495 expanded features).
np.count_nonzero(logit_regression.coef_)

94

## 模型优化

+ 这个例子中，用 L1 正则化的效果更好。

In [22]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

def polynomial_model(degree=1, **kwarg):
    """Return an unfitted polynomial-features + logistic-regression pipeline.

    Args:
        degree: Degree of the polynomial expansion; no bias column is
            generated (``include_bias=False``).
        **kwarg: Extra keyword arguments handed straight to
            ``LogisticRegression``.

    Returns:
        A ``Pipeline`` with the steps ``'polynomial_features'`` and
        ``'logistic_regression'``, in that order.
    """
    expander = PolynomialFeatures(degree=degree, include_bias=False)
    classifier = LogisticRegression(**kwarg)
    return Pipeline([
        ('polynomial_features', expander),
        ('logistic_regression', classifier),
    ])

In [23]:
# Degree-2 polynomial features with an L1-penalised logistic regression.
# The solver is named explicitly: penalty='l1' is only accepted by some
# solvers (liblinear among them), and relying on the library default makes
# this cell fail once that default is a solver without L1 support.
model = polynomial_model(degree=2, penalty='l1', solver='liblinear')

In [24]:
model.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('polynomial_features', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)), ('logistic_regression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [25]:
# Accuracy on the training split; the bare name echoes the value.
train_score = model.score(X_train, y_train)
train_score

1.0

In [26]:
# Accuracy on the held-out test split; the bare name echoes the value.
test_score = model.score(X_test, y_test)
test_score

0.9736842105263158

In [27]:
# Fetch the fitted LogisticRegression step from the pipeline by step name.
logistic_regression = model.named_steps['logistic_regression']

In [28]:
# Number of coefficients = number of columns in the (1, n_features)
# coefficient matrix.
logistic_regression.coef_.shape[1]

495

In [29]:
# Number of coefficients that are non-zero after fitting with the L1 penalty.
# NOTE(review): the estimator is created with random_state=None, which may
# explain why this count differs slightly from the earlier run -- confirm.
np.count_nonzero(logistic_regression.coef_)

93