In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
# Load the Iris CSV (the path is relative to the working directory) and
# preview the first five rows as the cell's displayed result.
df = pd.read_csv("Iris.csv")
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Display basic information about the dataframe: index range, per-column
# non-null counts and dtypes, and memory usage.
print("\nDataFrame Info:")
# info() writes its report directly to stdout, so it is not wrapped in print().
df.info()


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. A single print with sep="\n" puts the heading and the table on
# separate lines.
print("\nDescriptive Statistics:", df.describe(), sep="\n")


Descriptive Statistics:
               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000      2.000000       1.000000      0.100000
25%     38.250000       5.100000      2.800000       1.600000      0.300000
50%     75.500000       5.800000      3.000000       4.350000      1.300000
75%    112.750000       6.400000      3.300000       5.100000      1.800000
max    150.000000       7.900000      4.400000       6.900000      2.500000


In [5]:
# Per-column count of missing entries; heading and counts are emitted by one
# print call, separated by a newline.
print("\nMissing values before cleaning:", df.isnull().sum(), sep="\n")


Missing values before cleaning:
Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [4]:
import pandas as pd

# NOTE(review): this cell repeats the pandas import and the Iris.csv load from
# the first cells of the notebook; it rebinds `df` to a freshly read frame.
df = pd.read_csv("Iris.csv")

# Three quick sanity checks, printed one after another:
#   1. column dtypes
#   2. missing values per column
#   3. class distribution of the target column (for classification)
print(
    df.dtypes,
    df.isnull().sum(),
    df["Species"].value_counts(),
    sep="\n",
)

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object
Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64
Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64


In [11]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load data
iris = load_iris()
X = iris.data

# Correct (already 1D)
y = iris.target

# If you accidentally made it 2D:
y_wrong = iris.target.reshape(-1, 1)  # Shape (150, 1)
y_fixed = y_wrong.ravel()  # Now shape (150,)

# Now this will work. random_state pins the shuffle so the split is identical
# on every run (Restart & Run All reproduces it), and stratify keeps the class
# proportions of y_fixed in both the training and the test subset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_fixed,
    test_size=0.2,
    random_state=42,
    stratify=y_fixed,
)

In [12]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the Iris dataset
def load_data():
    """
    Fetch the Iris dataset bundled with scikit-learn.

    As a side effect, prints a preview of the first rows and the number of
    samples per class. Returns the tuple (X, y): the feature matrix and the
    integer label vector.
    """
    bunch = load_iris()
    # Features are the sepal/petal measurements; labels are
    # 0=setosa, 1=versicolor, 2=virginica.
    features, labels = bunch.data, bunch.target

    # Tabular view used only for the printed overview; it is not returned.
    preview = pd.DataFrame(features, columns=bunch.feature_names)
    preview['species'] = labels

    print("\nDataset overview:")
    print(preview.head())
    print("\nClass distribution:")
    print(preview['species'].value_counts())

    return features, labels

# Step 2: Preprocess the data
def preprocess_data(X, y):
    """
    Run the lightweight preprocessing checks used before modelling.

    - Prints the number of missing values per feature column.
    - Label-encodes ``y`` when it is a 1-D collection whose first element is
      a string.

    Parameters
    ----------
    X : array-like
        Feature matrix; anything ``pd.DataFrame`` accepts.
    y : array-like
        Target labels. May be empty, and may be a pandas Series with an
        arbitrary index.

    Returns
    -------
    tuple
        ``(X, y)``. ``X`` is returned untouched. ``y`` is the very same object
        that was passed in unless it held string labels, in which case it is
        the integer-encoded result of ``LabelEncoder.fit_transform``.
    """
    # Check for missing values
    print("\nChecking for missing values:")
    print(pd.DataFrame(X).isnull().sum())

    # Note: The Iris dataset is typically clean, but here's how we'd handle missing values
    # if pd.DataFrame(X).isnull().sum().any():
    #     from sklearn.impute import SimpleImputer
    #     imputer = SimpleImputer(strategy='mean')
    #     X = imputer.fit_transform(X)

    # Look at the first label positionally through a NumPy view. Indexing the
    # caller's object with y[0] raises IndexError on empty input and KeyError
    # on a pandas Series whose index does not contain the label 0.
    labels = np.asarray(y)
    has_string_labels = (
        labels.ndim == 1
        and labels.size > 0
        and isinstance(labels[0], str)
    )

    # Label encoding (already done in this dataset, but here's the general approach)
    if has_string_labels:
        print("\nEncoding string labels to numerical values...")
        le = LabelEncoder()
        y = le.fit_transform(y)

    return X, y

# Step 3: Split data into training and test sets
def split_data(X, y, test_size=0.3, random_state=42):
    """
    Partition the dataset into a training subset and a held-out test subset.

    The split is stratified on ``y`` so both subsets keep the class
    proportions of the full dataset. The subset sizes and per-class counts
    are printed before ``(X_train, X_test, y_train, y_test)`` is returned.
    """
    split_options = {
        "test_size": test_size,
        "random_state": random_state,
        "stratify": y,
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    print("\nData split results:")
    print(f"Training set size: {X_train.shape[0]} samples")
    print(f"Test set size: {X_test.shape[0]} samples")

    # Same two-line report for each subset: heading first, then class counts.
    for heading, subset_labels in (
        ("\nClass distribution in training set:", y_train),
        ("\nClass distribution in test set:", y_test),
    ):
        print(heading)
        print(pd.Series(subset_labels).value_counts())

    return X_train, X_test, y_train, y_test

# Step 4: Train Decision Tree Classifier
def train_model(X_train, y_train, random_state=42, feature_names=None):
    """
    Train a decision tree classifier on the training data.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels.
    random_state : int, default 42
        Seed passed to ``DecisionTreeClassifier``.
    feature_names : sequence of str, optional
        Labels used when printing the feature importances, one per feature
        column. When omitted, the Iris feature names from ``load_iris()`` are
        used, so existing calls print exactly what they printed before.

    Returns
    -------
    DecisionTreeClassifier
        The fitted model.

    Raises
    ------
    ValueError
        If ``feature_names`` is given explicitly and its length differs from
        the number of feature importances of the fitted model.
    """
    print("\nTraining Decision Tree Classifier...")
    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    importances = model.feature_importances_
    if feature_names is None:
        # Backward-compatible default: this workflow trains on the Iris features.
        feature_names = load_iris().feature_names
    elif len(feature_names) != len(importances):
        # zip() would silently drop the surplus and print mislabelled values.
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but the model "
            f"has {len(importances)} features"
        )

    # Display feature importances
    print("\nFeature importances:")
    for name, importance in zip(feature_names, importances):
        print(f"{name}: {importance:.4f}")

    return model

# Step 5: Evaluate the model
def evaluate_model(model, X_test, y_test, target_names=None):
    """
    Evaluate model performance on test data and print the results.

    Reported metrics:
    - Accuracy
    - Precision (weighted average over classes)
    - Recall (weighted average over classes)
    - Per-class classification report

    Parameters
    ----------
    model : object with a ``predict`` method
        The fitted classifier.
    X_test : array-like
        Test feature matrix.
    y_test : array-like
        True labels for ``X_test``.
    target_names : sequence of str, optional
        Display names for the classes in the classification report. When
        omitted, the Iris class names from ``load_iris()`` are used, so
        existing calls print exactly what they printed before.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    print("\nModel Evaluation:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    # Detailed classification report
    print("\nClassification Report:")
    if target_names is None:
        # Backward-compatible default: this workflow evaluates Iris labels.
        target_names = load_iris().target_names
    print(classification_report(y_test, y_pred, target_names=target_names))

# Main workflow
def main():
    """
    Run the end-to-end workflow: load, preprocess, split, train, evaluate.

    Returns the fitted model produced by train_model.
    """
    # Steps 1-2: load the raw arrays and pass them straight through preprocessing.
    features, labels = preprocess_data(*load_data())

    # Step 3: split into training and test subsets.
    X_train, X_test, y_train, y_test = split_data(features, labels)

    # Steps 4-5: fit the classifier, then report its metrics on the test subset.
    classifier = train_model(X_train, y_train)
    evaluate_model(classifier, X_test, y_test)

    return classifier

# Run the full workflow when this code executes as the top-level module and
# keep the fitted model in `trained_model` for later use.
if __name__ == "__main__":
    trained_model = main()


Dataset overview:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   species  
0        0  
1        0  
2        0  
3        0  
4        0  

Class distribution:
species
0    50
1    50
2    50
Name: count, dtype: int64

Checking for missing values:
0    0
1    0
2    0
3    0
dtype: int64

Data split results:
Training set size: 105 samples
Test set size: 45 samples

Class distribution in training set:
1    35
0    35
2    35
Name: count, dtype: int64

Class distribution in test set:
2    15
1    15
0    15
Name: count, dtype: int64

Training Decision Tree Classi