In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load the iris dataset that ships with scikit-learn
iris = datasets.load_iris()

In [3]:
# Feature matrix (X) and target labels (y) of the loaded dataset
X, y = iris.data, iris.target

In [4]:
# 150 samples, each with 4 attributes (features)
X.shape

(150, 4)

In [5]:
# The corresponding labels: one per sample
y.shape

(150,)

### Train-Test-Split

>将导入的样本数据分成训练集`train`和测试集`test`两类，测试集与训练集的比例一般是 2:8（即 20% 用于测试，80% 用于训练）
- 分成训练集和测试集
- 需要设置随机种子`seed`

In [6]:
# Inspecting y shows the labels are stored in order: the first 50 are 0, the next 50 are 1 and the last 50 are 2
# Taking, say, only the first 100 samples would therefore miss every sample labelled 2, so the selection would not be representative
# Shuffle the data first, then select
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [7]:
# Randomly shuffle the row indices of X.
# The permutation is drawn from a private, explicitly seeded generator so that
# the shuffle (and the train/test split derived from it) is identical after a
# kernel restart, and NumPy's global random state is left untouched.
SHUFFLE_SEED = 666
shuffle_indexes = np.random.RandomState(SHUFFLE_SEED).permutation(len(X))

In [8]:
# Inspect the shuffled index order
shuffle_indexes

array([128,  94,  13, 118, 114, 137,  98,  26,  14, 122,  22,  92,   7,
       139,  21,  46, 130,  54,  44,  20,  31,  53, 123, 117,   8,  95,
        36,  83, 107, 113,  11, 149,  32,  69,  65,  80,  23, 129,   0,
       110,  55, 101,  40,  91, 108,  25,  60, 119,  66,  58,  38,  84,
        61, 141, 115,  51, 145,  82,  63,  24, 148, 111, 105,  19,  43,
        18,  30, 138,   1,  81,  47,  73,  42,  37,  96, 112,   3,  74,
        39, 132,  79,  86,  70,  56,  48,  17, 125,  85,  41, 109,  97,
        27,  71,  75,  67,  77,  33,  78, 140,  10, 127,  35,   6, 106,
        64, 134,  89,  45, 143,  62,  76, 136,  59,  88,  49,   9,  12,
        90, 144, 121, 102,   4, 133,  72,  57, 100, 124, 131,  87, 147,
        99,   5,  28,   2, 126,  15, 104,  93,  34,  68, 120, 103, 146,
        29,  50, 135,  52, 116,  16, 142])

In [9]:
# Decide how many samples go into the randomly chosen test set
test_ratio = 0.2  # fraction of all samples held out for testing
test_size = int(len(X) * test_ratio)  # int() truncates, so the count is rounded down

In [10]:
# Number of samples in the test set
test_size

30

In [11]:
# Cut the shuffled indices once at test_size: the leading part (the first 20%)
# becomes the test indices, the remainder (the last 80%) the training indices
test_indexes, train_indexes = np.split(shuffle_indexes, [test_size])

In [12]:
# Index numbers that make up the test set
test_indexes

array([128,  94,  13, 118, 114, 137,  98,  26,  14, 122,  22,  92,   7,
       139,  21,  46, 130,  54,  44,  20,  31,  53, 123, 117,   8,  95,
        36,  83, 107, 113])

In [13]:
# Index features and labels with the same index arrays so every row stays
# paired with its own label
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

In [14]:
# Shapes of the training features and labels, one per line
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [15]:
# Shapes of the test features and labels, one per line
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


### 调用自己写的包和函数

In [16]:
# Show the current working directory of the kernel
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top,
# and the recorded output exposes an absolute local path; consider clearing it before sharing
import os
os.getcwd()

'D:\\Python\\datalearning\\统计学习方法\\Third_KNN'

In [17]:
# The current directory has to be set as the source root (the module search path)
# Import our own train/test split helper
from Model_Selection import train_test_split

In [18]:
# Split with our own helper, relying on its default arguments
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, replacing the
# arrays produced by the manual split earlier in the notebook
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
# Shapes of the training features and labels returned by the helper, one per line
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [20]:
# Shapes of the test features and labels returned by the helper, one per line
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


In [21]:
# Import our own classifier
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError for 'playML';
# the package must be importable before the cells below can run
from playML.KNN import KNNClassifier

ModuleNotFoundError: No module named 'playML'

In [None]:
# Instantiate our classifier with k=3 (presumably the number of neighbours consulted - confirm in playML.KNN)
my_knn_clf = KNNClassifier(k=3)

In [None]:
# For KNN, fitting merely stores the training data inside the classifier
my_knn_clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test samples
y_predict = my_knn_clf.predict(X_test)