# Get Dataset

Toy Datasets: https://scikit-learn.org/stable/datasets/toy_dataset.html

In [1]:
from sklearn.datasets import load_iris
# Load the Iris toy dataset and list the fields the returned dict-like object exposes.
iris_dict = load_iris()
print(iris_dict.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [2]:
# Feature (column) names, followed by the measurements of the first sample.
print(f"Features: {iris_dict['feature_names']}")
print(iris_dict['data'][0])

# Class names, followed by the first sample's label (an integer index into target_names).
print(f"Target: {iris_dict['target_names']}")
print(iris_dict['target'][0])

Features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[5.1 3.5 1.4 0.2]
Target: ['setosa' 'versicolor' 'virginica']
0


In [3]:
# Feature matrix and label vector used by every later cell.
X, y = iris_dict['data'], iris_dict['target']

# Create Train and Test Sets

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Spot-check one sample from each split, printed as features -> label.
print(f"Train: {X_train[25]} -> {y_train[25]}")
print(f"Test: {X_test[3]} -> {y_test[3]}")

Train: [5.7 2.5 5.  2. ] -> 2
Test: [6.  2.9 4.5 1.5] -> 1


# Preprocessing

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardization: subtract the mean and divide by the standard deviation.
scaler = StandardScaler()

# Step 1: learn the scaling statistics from the training data only.
# fit() returns the scaler itself, so `fit_scaler` and `scaler` are the same object.
fit_scaler = scaler.fit(X_train)

# Step 2: apply the learned scaling to the training data.
X_train_scaled = fit_scaler.transform(X_train)

In [7]:
# Shortcut for the two-step version: fit_transform() learns the scaling
# statistics from X_train and applies them in a single call.
scaler = StandardScaler()

# fit_transform() returns the scaled array (not the scaler), so store it under
# the data name that the following cells read.
X_train_scaled = scaler.fit_transform(X_train)

In [8]:
# Compare the range of the first feature (sepal length) before and after scaling.
# ndarray.min()/.max() are vectorized; the Python built-ins would iterate the
# column one element at a time.
print(f"[Raw] Min: {X_train[:, 0].min()}; Max: {X_train[:, 0].max()}")
print(f"[Scaled] Min: {X_train_scaled[:, 0].min():.2f}; Max: {X_train_scaled[:, 0].max():.2f}")

[Raw] Min: 4.3; Max: 7.7
[Scaled] Min: -1.86; Max: 2.24


# Basic ML Training and Evaluation

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default settings, trained on the scaled features.
# fit() returns the estimator itself, so construction and training can be chained.
clf = KNeighborsClassifier().fit(X_train_scaled, y_train)

In [10]:
# Reuse the scaler fitted on the training data: the test set is only
# transformed, never fitted.
X_test_scaled = scaler.transform(X_test)

# Predict a class label for every test sample.
y_pred = clf.predict(X_test_scaled)

In [11]:
from sklearn.metrics import accuracy_score

# Compare the predicted labels against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


# ML Pipeline

In [12]:
from sklearn.pipeline import Pipeline

# Bundle scaling and classification into a single estimator
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier()),
])

# Train every step on the raw (unscaled) training data
pipeline.fit(X_train, y_train)

# Score on the raw test data: the pipeline applies its fitted scaler to X_test itself
accuracy = pipeline.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 1.0


# Training with Grid Search and Cross Validation

In [13]:
from sklearn.model_selection import GridSearchCV

# Scaler + k-NN pipeline that the search will tune
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier()),
])

# Hyperparameter grid; the `classifier__` prefix targets the pipeline's 'classifier' step
params = {
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__metric': ['euclidean', 'manhattan'],
}

# Grid search over all combinations, scored by accuracy with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

print(f"Best Model: {grid_search.best_estimator_}")
print(f"Parameters: {grid_search.best_params_}")
# NOTE: despite the label, best_score_ is the mean cross-validated score of the
# best parameter combination, not the accuracy on the full training set.
print(f"Best Training Score: {grid_search.best_score_}")

# Predict on the held-out test set with the best pipeline found
y_pred = grid_search.best_estimator_.predict(X_test)

# Accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")


Best Model: Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', KNeighborsClassifier(metric='manhattan'))])
Parameters: {'classifier__metric': 'manhattan', 'classifier__n_neighbors': 5}
Best Training Score: 0.9333333333333333
Test Accuracy: 1.0
