In [14]:
import numpy as np
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Bài toán phân loại hoa Iris sử dụng mô hình KNN

In [2]:
# Load the Iris dataset as a (features, labels) pair of arrays and report
# the shape of each, one per line.
iris_X, iris_y = datasets.load_iris(return_X_y=True)
print(iris_X.shape, iris_y.shape, sep="\n")

(150, 4)
(150,)


In [3]:
# Hold out 20% of the samples as the test set (80/20 train/test split).
# The fixed random_state keeps the partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    iris_X,
    iris_y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Peek at the first training sample (raw feature values, before any scaling).
X_train[0]

array([4.6, 3.6, 1. , 0.2])

In [7]:
# Class label of the first training sample.
y_train[0]

np.int64(0)

In [8]:
# Standardize the features. The scaler's statistics are learned from the
# training split only and then reused on the test split, so no information
# from the test data leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# First training sample again, now after StandardScaler has been applied.
X_train[0]

array([-1.47393679,  1.20365799, -1.56253475, -1.31260282])

In [11]:
# Fit a 5-nearest-neighbours classifier on the standardized training data.
# The fit call is left as the cell's last expression so the fitted estimator
# is displayed.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X=X_train, y=y_train)

In [15]:
# Classify the held-out samples and display the fraction predicted correctly.
y_pred = knn_classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [18]:
# Compare several neighbourhood sizes: for each k, refit the classifier on the
# training split and print k alongside its accuracy on the test split.
for k in (4, 7, 10, 17):
    # fit() returns the estimator itself, so construction and fitting chain.
    knn_classifier = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    y_pred = knn_classifier.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(k, score)

4 1.0
7 1.0
10 1.0
17 1.0


# Bài toán phân loại văn bản sử dụng mô hình KNN trên bộ dữ liệu đánh giá phim

In [23]:
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# Fetch the IMDB movie-review dataset and keep its train and test splits,
# then report the shape of each split, one per line.
imdb = load_dataset("imdb")
imdb_train = imdb["train"]
imdb_test = imdb["test"]
print(imdb_train.shape, imdb_test.shape, sep="\n")

Downloading readme: 100%|██████████| 7.81k/7.81k [00:00<00:00, 8.48kB/s]
Downloading data: 100%|██████████| 21.0M/21.0M [00:03<00:00, 6.65MB/s]
Downloading data: 100%|██████████| 20.5M/20.5M [00:01<00:00, 11.9MB/s]
Downloading data: 100%|██████████| 42.0M/42.0M [00:02<00:00, 17.7MB/s]
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 409488.03 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 541700.98 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 576934.74 examples/s]


(25000, 2)
(25000, 2)


In [25]:
# Turn each review into a bag-of-words count vector. The vocabulary is capped
# at 1000 features and is learned from the training texts only; the test texts
# are encoded with that same vocabulary. Results are densified with toarray().
vectorizer = CountVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(imdb_train["text"]).toarray()
X_test = vectorizer.transform(imdb_test["text"]).toarray()

# Labels as NumPy arrays, one per split.
y_train, y_test = (
    np.array(split["label"]) for split in (imdb_train, imdb_test)
)

In [28]:
# Bag-of-words count vector of the first training review (one entry per
# vocabulary term, up to the 1000-feature cap set on the vectorizer).
# NOTE(review): this displays the whole row; consider showing a slice such as
# X_train[0][:20] to keep the output short.
X_train[0]

array([ 0,  0,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  1,  0,  0,
        3,  0,  1,  0,  0,  0,  8,  0,  0,  0,  1,  1,  0,  0,  0,  0,  0,
        0,  2,  0,  1,  0,  1,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  1,  0,  0,  0,  0,
        0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  1,  0,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  1,  0,  1,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  1,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,
        0,  2,  0,  0,  0

In [29]:
# Standardize the count features: statistics come from the training split and
# are reused unchanged on the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 1-nearest-neighbour classifier using a ball-tree index for neighbour search.
knn_classifier = KNeighborsClassifier(algorithm="ball_tree", n_neighbors=1)
knn_classifier.fit(X_train, y_train)

# Label the test reviews and display the resulting accuracy.
y_pred = knn_classifier.predict(X_test)
accuracy_score(y_test, y_pred)