In [1]:
# 1. IMPORTS
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib

In [2]:
# 2. LOAD DATASET
# The CSV is fetched over HTTP, so this cell needs network access to run.
url = 'https://raw.githubusercontent.com/pandas-dev/pandas/main/pandas/tests/io/data/csv/iris.csv'
df = pd.read_csv(url)

# Heading and preview are emitted through a single print call, one per line.
print("First 5 rows of data:", df.head(), sep="\n")

First 5 rows of data:
   SepalLength  SepalWidth  PetalLength  PetalWidth         Name
0          5.1         3.5          1.4         0.2  Iris-setosa
1          4.9         3.0          1.4         0.2  Iris-setosa
2          4.7         3.2          1.3         0.2  Iris-setosa
3          4.6         3.1          1.5         0.2  Iris-setosa
4          5.0         3.6          1.4         0.2  Iris-setosa


In [3]:
# 3. DATA CLEANING & PREPROCESSING
print("\nAny nulls?", df.isnull().sum().any())
# The iris dataset usually doesn't need much cleaning
# NOTE: the next line overwrites df['Name'] in place. Re-running this cell without
# re-running the load cell re-fits the encoder on the already-encoded integers, so
# le.classes_ would then hold integer codes instead of the species names.
le = LabelEncoder()
df['Name'] = le.fit_transform(df['Name'])  # Encode species as numbers

# Split data/features and target
X = df.drop('Name', axis=1)
y = df['Name']

# Hold out the test rows BEFORE fitting anything, so that no statistic of the
# test set (here: the per-feature mean/std learned by the scaler) leaks into training.
# Same test_size / random_state / stratify as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features (very important for distance-based and regularized methods!)
# The scaler is fit on the training rows only; the test rows are transformed with
# the training statistics, exactly as unseen data would be at prediction time.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any code still referring to X_scaled keeps working; it is now the
# full feature matrix transformed with the train-only scaler.
X_scaled = scaler.transform(X)



Any nulls? False


In [4]:
# 4. DEFINE & TEST 3 MODELS WITH CROSS-VALIDATION
# Candidate classifiers, keyed by the display name that is also used as the
# key in cv_results and in the printed output.
models = {
    'Logistic Regression': LogisticRegression(max_iter=200),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

# Remark printed right after each model's score; a model without an entry
# here simply gets no remark.
observations = {
    'Logistic Regression': "• Logistic Regression generally works well with multi-class, linearly separable data like Iris.",
    'K-Nearest Neighbors': "• KNN requires scaling and can be sensitive to outliers but is simple and interpretable.",
    'Decision Tree': "• Decision Trees handle non-linear patterns and are easy to visualize, but can overfit.",
}

# Mean 5-fold accuracy on the training split, per model name.
cv_results = {}
for name, model in models.items():
    fold_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5)
    mean_accuracy = fold_scores.mean()
    accuracy_spread = fold_scores.std()
    print(f"\n{name} - Cross-validated Accuracy: {mean_accuracy:.4f} (+/- {accuracy_spread:.4f})")
    cv_results[name] = mean_accuracy
    if name in observations:
        print(observations[name])



Logistic Regression - Cross-validated Accuracy: 0.9583 (+/- 0.0264)
• Logistic Regression generally works well with multi-class, linearly separable data like Iris.

K-Nearest Neighbors - Cross-validated Accuracy: 0.9583 (+/- 0.0373)
• KNN requires scaling and can be sensitive to outliers but is simple and interpretable.

Decision Tree - Cross-validated Accuracy: 0.9417 (+/- 0.0204)
• Decision Trees handle non-linear patterns and are easy to visualize, but can overfit.


In [5]:
# 5. SELECT BEST MODEL
# Pick the entry with the highest mean CV accuracy. On an exact tie, max()
# returns the first maximal entry, i.e. the one inserted into cv_results first.
best_model_name = max(cv_results.items(), key=lambda entry: entry[1])[0]
best_model = models[best_model_name]
print(f"\n\nBest model based on CV accuracy: {best_model_name}")



Best model based on CV accuracy: Logistic Regression


In [6]:
# 6. TRAIN BEST MODEL ON all TRAIN DATA AND EVALUATE ON TEST
# Refit the selected estimator on the whole training split, then score it on
# the held-out test split.
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"\n{best_model_name} Test Accuracy: {test_accuracy:.4f}")

# F1 is averaged across classes with 'weighted' averaging.
test_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(f"{best_model_name} F1 Score: {test_f1_weighted:.4f}")

print("\nClassification Report:")
# le.classes_ supplies the row labels for the per-class report.
test_report = classification_report(y_test, y_pred, target_names=le.classes_)
print(test_report)


Logistic Regression Test Accuracy: 0.9333
Logistic Regression F1 Score: 0.9333

Classification Report:
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.90      0.90      0.90        10
 Iris-virginica       0.90      0.90      0.90        10

       accuracy                           0.93        30
      macro avg       0.93      0.93      0.93        30
   weighted avg       0.93      0.93      0.93        30



In [7]:
# 7. SAVE FINAL MODEL
# Persist the fitted model and the fitted scaler as two separate joblib files
# in the current working directory.
for artifact, filename in (
    (best_model, 'best_iris_model.joblib'),
    (scaler, 'iris_scaler.joblib'),
):
    joblib.dump(artifact, filename)

print("\nBest model and scaler saved as 'best_iris_model.joblib' and 'iris_scaler.joblib'.")



Best model and scaler saved as 'best_iris_model.joblib' and 'iris_scaler.joblib'.


In [8]:
# 8. OBSERVATIONS & COMMENTS
print("\n### FINAL OBSERVATIONS & COMMENTS ###")

# Recap of the mean cross-validated accuracy recorded for every candidate.
for name, acc in cv_results.items():
    print(f"{name}: CV Accuracy = {acc:.4f}")

# Closing paragraph; only the selected model's name is interpolated, the rest
# of the text is fixed.
closing_remarks = (
    f"\nBased on accuracy, '{best_model_name}' is selected as the final model. "
    "Other models have their own merits; for example, KNN is simple and interpretable, "
    "while Decision Trees are robust to non-normalized features. "
    "However, Logistic Regression often performs best on relatively simple "
    "and well-separated datasets such as this."
)
print(closing_remarks)



### FINAL OBSERVATIONS & COMMENTS ###
Logistic Regression: CV Accuracy = 0.9583
K-Nearest Neighbors: CV Accuracy = 0.9583
Decision Tree: CV Accuracy = 0.9417

Based on accuracy, 'Logistic Regression' is selected as the final model. Other models have their own merits; for example, KNN is simple and interpretable, while Decision Trees are robust to non-normalized features. However, Logistic Regression often performs best on relatively simple and well-separated datasets such as this.
