In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

In [3]:
# Feature matrix and class labels, unpacked together from the loaded dataset.
X, y = iris.data, iris.target

In [4]:
# 150 samples, each with 4 attributes (features)
X.shape

(150, 4)

In [5]:
# One target label per sample
y.shape

(150,)

### Train-Test-Split

>将导入的样本数据分成训练集`train`和测试集`test`两类，训练集与测试集的比例一般是8：2
- 分成训练集和测试集
- 需要设置随机种子`seed`

In [6]:
# Inspecting y shows the labels are sorted by class: the first 50 are 0,
# the next 50 are 1 and the last 50 are 2.
# Taking only the first 100 samples (say) would therefore leave out class 2
# entirely, so the selection would not be representative.
# Shuffle the data first, then pick the subsets.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [7]:
# Random permutation of the row indices 0..len(X)-1, used to shuffle the samples.
# NOTE: no seed is set before this call, so the order differs on every run;
# call np.random.seed(<seed>) first if a reproducible split is needed.
shuffle_indexes = np.random.permutation(len(X))

In [8]:
# Inspect the shuffled index order
shuffle_indexes

array([ 26, 145,  76,  79, 141, 114,  78,  15,  27, 105,  49,  88,  32,
       123, 140,  91, 107,  73, 129,  20,  70,  30,  56, 134,  28, 117,
        87, 133, 149,  48,  60,  90,  10, 104, 100, 116,  97,  68, 138,
        39,  72, 136, 146,  46, 115,  94,  71,  58, 111, 112,  62,  25,
        38,  29, 103,  64,  51,  31,  66,  92,  12, 143,  44, 126,  61,
         0,  50,  23, 148,   8,  67,  53, 135,  47,  81,  54,  63, 144,
        85,   2,   7, 110, 102,  84,  82,  65,  40,  22,  52,  96,  59,
        14,  93,  35, 108,  98,  86, 122,  17,  11, 127, 147, 121,  57,
       119,  99,   3,  36,  89, 125,   4, 118, 128,  13, 142, 124,  43,
       106,  75,  42,  95,   6,  21, 120,  19,  55,  34, 101,  83,  80,
        16,  24,   1,  37, 132, 113,  41,  18,  45,   5, 109,  74,  33,
       139,   9, 130, 137,  77, 131,  69])

In [9]:
# Fraction of the samples held out for testing, and the resulting test-set size
# (int() truncates any fractional part).
test_ratio = 0.2
test_size = int(len(X) * test_ratio)

In [10]:
# 20% of the 150 samples
test_size

30

In [11]:
# The first 20% of the shuffled indices form the test set;
# the remaining 80% form the training set.
test_indexes = shuffle_indexes[:test_size]
train_indexes = shuffle_indexes[test_size:]

In [12]:
# Row indices selected for the test set
test_indexes

array([ 26, 145,  76,  79, 141, 114,  78,  15,  27, 105,  49,  88,  32,
       123, 140,  91, 107,  73, 129,  20,  70,  30,  56, 134,  28, 117,
        87, 133, 149,  48])

In [13]:
# Select samples and labels with the same index arrays so each row stays
# paired with its label.
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

In [14]:
# Shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [15]:
# Shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


### sklearn中的TTS

In [29]:
from sklearn.model_selection import train_test_split

In [33]:
# Hold out 20% of the samples for testing; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

In [34]:
# Shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [35]:
# Shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


### 调用自己写的包和函数

In [16]:
# Show the kernel's current working directory (the value is machine-specific).
import os
os.getcwd()

'D:\\Python\\datalearning\\统计学习方法\\Third_KNN'

In [17]:
# The current directory must be set as the source root so that the local
# module can be imported.
# Use my own implementation for splitting the dataset.
# NOTE: this reuses the name train_test_split, so it shadows (or is shadowed by)
# the sklearn.model_selection import of the same name, depending on which cell
# ran last.
from Model_Selection import train_test_split

In [18]:
# Split using the function's default arguments; the defaults are defined in
# Model_Selection and are not visible here (the shapes printed below show a
# 120/30 split). No seed is passed explicitly — TODO confirm whether the
# result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
# Shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [20]:
# Shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


In [21]:
# Import my own classifier implementation
from playML_KNN import KNNClassifier

In [22]:
# Create the classifier with k=3 (presumably the number of neighbours that
# vote — confirm against playML_KNN).
my_knn_clf = KNNClassifier(k=3)

In [23]:
# For KNN, fitting is expected to just store the training data inside the
# classifier (per the author's implementation in playML_KNN, not shown here).
my_knn_clf.fit(X_train, y_train)

KNN(k=3)

In [24]:
# Predict a label for every sample in the test set
y_predict = my_knn_clf.predict(X_test)

In [25]:
# Predicted labels for the test samples
y_predict

array([2, 1, 1, 2, 2, 1, 0, 1, 1, 1, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 2,
       0, 2, 1, 2, 2, 0, 2, 1])

In [26]:
# True labels for the same test samples, for comparison with y_predict
y_test

array([2, 1, 1, 1, 2, 1, 0, 1, 1, 1, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 2,
       0, 2, 2, 2, 2, 0, 2, 1])

In [27]:
# Number of correct predictions: count the True entries of the element-wise
# comparison with a vectorized NumPy reduction instead of iterating the array
# with the Python builtin sum().
np.sum(y_predict == y_test)

27

In [28]:
# Accuracy: the mean of the boolean match mask is the fraction of test samples
# that were predicted correctly.
np.mean(y_predict == y_test)

0.9