In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import sklearn

from sklearn.model_selection import train_test_split    # 数据集划分
from sklearn.linear_model import LogisticRegressionCV   # 逻辑回归交叉验证 
from sklearn.preprocessing import StandardScaler    # 数据标准化处理
from sklearn.neighbors import KNeighborsClassifier   # KNN算法

  return f(*args, **kwds)


### 数据读取

In [4]:
# Column labels for the header-less iris file: four measurements plus the
# class label (kept under the short name 'cla').
names = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
    'cla',
]
# header=None: no row of the file is treated as a header; `names` supplies
# the column labels instead.
data = pd.read_csv('./datas/iris.data', names=names, header=None)
data.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,cla
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [8]:
# Count how many samples carry each class label.
data.loc[:, 'cla'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: cla, dtype: int64

### 对目标属性进行编码

方法一：使用pandas中的方法进行编码

In [9]:
# Integer code assigned to each sample's class label by pandas' Categorical.
pd.Categorical(data.loc[:, 'cla']).codes

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int8)

方法二：自定义函数进行编码

In [15]:
def parse_recode(series, columns=None):
    """
    Convert one raw iris record into a list of numeric values.

    Each value is paired positionally with a column label. The value under
    the 'cla' label is mapped to an integer class code; every other value is
    converted to float.

    Parameters
    ----------
    series : iterable
        The values of one record, in column order.
    columns : iterable of str, optional
        Column labels to pair with ``series``. When omitted, the
        notebook-level ``names`` list is used (the original behaviour), so
        existing ``parse_recode(row)`` calls keep working.

    Returns
    -------
    list
        One entry per (label, value) pair: a float for each feature column
        and 0 / 1 / 2 for the class column. An unknown class label, or a
        feature value that cannot be converted to float, becomes ``np.nan``
        so the whole record can be removed afterwards with ``dropna``.
    """
    class_codes = {
        'Iris-setosa': 0,
        'Iris-versicolor': 1,
        'Iris-virginica': 2,
    }
    if columns is None:
        # Backward-compatible default: fall back to the global column list.
        columns = names

    result = []
    for name, v in zip(columns, series):
        if name == 'cla':
            try:
                result.append(class_codes.get(v, np.nan))
            except TypeError:
                # Unhashable value: it cannot be a known label.
                result.append(np.nan)
        else:
            try:
                result.append(float(v))
            except (TypeError, ValueError):
                # Treat a malformed measurement like an unknown label: mark
                # it as missing instead of aborting the whole conversion.
                result.append(np.nan)
    return result

#### 数据转换和划分

In [35]:
# Step 1: recode every row into an all-numeric record (columns become 0..4).
new_datas = data.apply(lambda row: pd.Series(parse_recode(row)), axis=1)

# Step 2: drop rows containing any NaN, printing the row count before and after.
print('删除前数据总量{}'.format(len(new_datas)))
new_datas = new_datas.dropna(how='any')
print('删除后数据总量{}'.format(len(new_datas)))

# Step 3: columns 0-3 are the features (label slicing with .loc is inclusive),
# column 4 is the encoded class. Hold out 20% of the rows as the test split.
features = new_datas.loc[:, :3]
target = new_datas.loc[:, 4]
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=9,
)

删除前数据总量150
删除后数据总量150


#### 数据标准化

In [38]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits.
# NOTE(review): x_train / x_test are overwritten in place, so re-running this
# cell without re-running the split cell would scale already-scaled data.
ss = StandardScaler().fit(x_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

### 构建逻辑回归模型进行分类

In [40]:
# 1. Configure the cross-validated logistic regression.
lr = LogisticRegressionCV(
    solver='lbfgs',               # optimisation algorithm
    penalty='l2',                 # L2 regularisation
    # Per the scikit-learn docs: fit one multinomial (softmax) model over all
    # classes instead of a separate one-vs-rest binary model per class.
    multi_class='multinomial',
    Cs=np.logspace(-4, 1, 50),    # 50 candidate C values, log-spaced from 1e-4 to 1e1
    cv=3,                         # 3-fold cross-validation
    fit_intercept=True,           # also learn an intercept term
    tol=0.001,                    # stopping tolerance
)
# 2. Fit the model on the training split.
lr.fit(x_train, y_train)

LogisticRegressionCV(Cs=array([1.00000000e-04, 1.26485522e-04, 1.59985872e-04, 2.02358965e-04,
       2.55954792e-04, 3.23745754e-04, 4.09491506e-04, 5.17947468e-04,
       6.55128557e-04, 8.28642773e-04, 1.04811313e-03, 1.32571137e-03,
       1.67683294e-03, 2.12095089e-03, 2.68269580e-03, 3.39322177e-03,
       4.29193426e-03, 5.42867544e-03, 6.86648845e-03, 8.68511374e-03,
       1.09854114e-02, 1.38...
       1.20679264e+00, 1.52641797e+00, 1.93069773e+00, 2.44205309e+00,
       3.08884360e+00, 3.90693994e+00, 4.94171336e+00, 6.25055193e+00,
       7.90604321e+00, 1.00000000e+01]),
                     class_weight=None, cv=3, dual=False, fit_intercept=True,
                     intercept_scaling=1.0, l1_ratios=None, max_iter=100,
                     multi_class='multinomial', n_jobs=None, penalty='l2',
                     random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.001, verbose=0)

1.0

### 