In [None]:
import pathlib
import numpy as np
import pandas as pd
import joblib
import warnings
from pprint import pprint

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Notebook-wide configuration. Both paths are relative, so they resolve
# against the working directory the kernel is started in.
DATA_PATH = pathlib.Path("dataset", "iris.data")  # input CSV
MODEL_PATH = pathlib.Path("model.joblib")  # where the fitted pipeline is stored
RANDOM_STATE = 42  # seed passed to the train/test split


In [None]:
def load_data(path=DATA_PATH):
    """Read the headerless iris CSV at ``path`` and split it into features and labels.

    Returns a tuple ``(X, y, df)``:
    ``X`` is an array holding the first four columns, ``y`` is an array of the
    class labels as strings, and ``df`` is the full DataFrame with named columns.
    """
    feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    label_name = 'class'

    frame = pd.read_csv(path, header=None)
    # The file carries no header row, so the column names are assigned here.
    frame.columns = feature_names + [label_name]
    # Labels are coerced to str regardless of how pandas parsed them.
    frame[label_name] = frame[label_name].astype(str)

    features = frame[feature_names].values
    labels = frame[label_name].values
    return features, labels, frame

def build_pipeline():
    """Assemble the unfitted two-step estimator used throughout the notebook.

    The steps are named 'scaler' (a StandardScaler) and 'knn' (a
    KNeighborsClassifier); both are created with their default settings.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier()),
    ]
    return Pipeline(steps)

def tune_hyperparameters(pipe, X_train, y_train, cv=5, n_jobs=-1, param_grid=None):
    """Grid search for the best KNN hyperparameters inside the pipeline.

    Parameters
    ----------
    pipe : estimator
        Pipeline to tune. The default grid addresses parameters through the
        ``knn__`` prefix, so the pipeline needs a step named ``'knn'``.
    X_train, y_train : array-like
        Training features and labels passed to ``GridSearchCV.fit``.
    cv : int, default 5
        Forwarded to ``GridSearchCV`` as ``cv``.
    n_jobs : int, default -1
        Forwarded to ``GridSearchCV`` as ``n_jobs``.
    param_grid : dict, optional
        Grid to search. When omitted, the notebook's default KNN grid
        (``n_neighbors``, ``weights`` and ``p``) is used.

    Returns
    -------
    The ``GridSearchCV`` object after ``fit`` has been called on it.
    """
    if param_grid is None:
        param_grid = {
            'knn__n_neighbors': [3, 5, 7, 9],
            'knn__weights': ['uniform', 'distance'],
            'knn__p': [1, 2],  # 1 = Manhattan, 2 = Euclidean
        }
    gs = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=n_jobs, verbose=1)
    gs.fit(X_train, y_train)
    return gs

def evaluate_model(model, X_test, y_test):
    """Score ``model`` on held-out data, print the results, and return them.

    Calls ``model.predict(X_test)`` and compares the predictions with
    ``y_test``. Returns ``(accuracy, confusion_matrix, classification_report)``
    in that order; the same three values are printed.
    """
    predictions = model.predict(X_test)
    metrics = (
        accuracy_score(y_test, predictions),
        confusion_matrix(y_test, predictions),
        classification_report(y_test, predictions),
    )
    accuracy, matrix, summary = metrics
    print(f"Accuracy: {accuracy:.4f}")
    print("\nConfusion matrix:\n", matrix)
    print("\nClassification report:\n", summary)
    return metrics

def save_model(model, path=MODEL_PATH):
    """Write ``model`` to ``path`` with ``joblib.dump`` and print a confirmation line."""
    joblib.dump(model, path)
    confirmation = f"Saved model to {path}"
    print(confirmation)

def predict_sample(sample, model_path=MODEL_PATH):
    """
    Load the saved pipeline and predict the class for a single sample.

    Parameters
    ----------
    sample : iterable
        Feature values of one observation, e.g.
        [sepal_length, sepal_width, petal_length, petal_width].
        Sequences, arrays and one-shot iterators (generators, ``map``
        objects) are accepted.
    model_path : path-like
        File to load with ``joblib.load``; defaults to ``MODEL_PATH``.

    Returns
    -------
    The first (only) element of ``model.predict`` for the sample.
    """
    # np.array() wraps a generator/iterator as one opaque 0-d object instead
    # of reading its items, so one-shot iterators are materialized first.
    if hasattr(sample, "__next__"):
        sample = list(sample)
    # The model's predict() is given a 2-D input: one row, one column per value.
    sample_arr = np.array(sample).reshape(1, -1)
    # NOTE(review): joblib persistence is pickle-based per the joblib docs, so
    # loading can execute arbitrary code -- only load files you trust.
    model = joblib.load(model_path)
    return model.predict(sample_arr)[0]


In [None]:
# End-to-end workflow: load the data, split it, tune the KNN pipeline with a
# grid search, evaluate the best estimator on the test split, save it, and
# run one example prediction from the saved file.
if __name__ == "__main__":
    # NOTE(review): this installs an "ignore" filter for every warning and it
    # is never undone, so warnings stay silenced for the rest of the session.
    warnings.filterwarnings("ignore")
    print("Loading data...")
    X, y, df = load_data()
    print("Rows, cols:", df.shape)
    print("Class distribution:")
    print(df['class'].value_counts())
    print("\nFirst 5 rows:")
    print(df.head(), "\n")

    # Hold out 20% of the rows as a test set. The split is seeded with
    # RANDOM_STATE and stratified on the labels.
    print("Splitting into train and test sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )
    print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}\n")

    # Unfitted scaler + KNN pipeline; it is fitted inside the grid search.
    print("Building pipeline (StandardScaler + KNN)...")
    pipe = build_pipeline()

    # Grid search runs on the training split only; the test split is not
    # passed to it.
    print("Starting GridSearchCV to find best hyperparameters for KNN...")
    gs = tune_hyperparameters(pipe, X_train, y_train)
    print("\nBest params found:")
    pprint(gs.best_params_)

    # Evaluate the estimator selected by the grid search on the held-out data.
    best_model = gs.best_estimator_
    print("\nEvaluating best model on test set...")
    evaluate_model(best_model, X_test, y_test)

    # Persist the best estimator to the default MODEL_PATH.
    save_model(best_model)

    # predict_sample() reloads the model from MODEL_PATH, so this call also
    # exercises the file that was just written.
    example = [5.1, 3.5, 1.4, 0.2]  # typical Iris-setosa sample
    pred = predict_sample(example)
    print(f"\nExample prediction for {example} -> {pred}")