# IRIS DATASET FROM KAGGLE

In [2]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [4]:
# Step 2: Read the Iris CSV into a DataFrame.
# The path is relative to the notebook's working directory; edit it if the
# file lives somewhere else.
iris_csv_path = "Iris.csv"
df = pd.read_csv(iris_csv_path)

In [5]:
# Step 3: Print a header line followed by the first rows of the frame.
preview = df.head()
print("Dataset preview:", preview, sep="\n")

Dataset preview:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [6]:
# Step 4: Drop the 'Id' column (a row identifier, not useful for prediction).
# `df` is rebound to the returned frame instead of being mutated with
# inplace=True, and errors='ignore' turns the drop into a no-op when 'Id' is
# already absent, so the cell can be re-run safely without a membership check.
df = df.drop(columns=['Id'], errors='ignore')

In [7]:
# Step 5: Count the null entries in every column and print the tally.
missing_per_column = df.isna().sum()
print("\nMissing values per column:", missing_per_column, sep="\n")


Missing values per column:
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [9]:
# Step 6: Split the frame into the target column and the remaining feature columns.
target_column = 'Species'
y = df[target_column]                  # Target
X = df.drop(columns=[target_column])   # Features

In [10]:
# Step 7: Convert the species names into integer class labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Show the name → code mapping; each class is paired with its position in `classes_`.
print("\nEncoded classes:")
for code, species in enumerate(label_encoder.classes_):
    print(f"{species} → {code}")

# At this point the preprocessed data is:
# - Features: X (DataFrame)
# - Labels:   y_encoded (NumPy array)


Encoded classes:
Iris-setosa → 0
Iris-versicolor → 1
Iris-virginica → 2



# Train a decision tree classifier to predict iris species.


In [11]:
# Step 1: Import required libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score



In [12]:
# Step 2: Hold out 20% of the rows for testing and train on the remaining 80%.
# - stratify=y_encoded keeps the class proportions the same in both subsets.
# - random_state=42 fixes the shuffle so the split is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_encoded}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

In [13]:
# Step 3: Create the decision tree classifier with a fixed seed
tree_seed = 42
clf = DecisionTreeClassifier(random_state=tree_seed)



In [14]:
# Step 4: Fit the classifier on the training split
clf.fit(X=X_train, y=y_train)

In [15]:
# Step 5: Predict a class label for each row of the test split
y_pred = clf.predict(X=X_test)

In [18]:
# Step 6: Score the test-set predictions.
# Macro averaging computes the metric for each class and takes the unweighted
# mean, so every class counts equally (a good fit for a balanced multiclass
# problem like Iris).
macro_avg = {"average": "macro"}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **macro_avg)
recall = recall_score(y_test, y_pred, **macro_avg)

In [19]:
# Step 7: Assemble the report (each metric rounded to two decimals) and print it
report_lines = [
    "🎯 Model Performance Metrics:",
    f"Accuracy:  {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall:    {recall:.2f}",
]
print("\n".join(report_lines))

🎯 Model Performance Metrics:
Accuracy:  0.93
Precision: 0.93
Recall:    0.93
