# Iris Classification - Data Analysis and Visualization

This notebook explores the Iris dataset through summary statistics and visualizations, then builds and evaluates a Random Forest classification model.

## 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Plot defaults shared by every figure in this notebook:
# seaborn's white-grid style and a 10x6 inch default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

print("Libraries imported successfully!")

## 2. Load and Explore Data

In [None]:
# Load Iris via scikit-learn: X is the feature matrix, y the integer class codes.
iris = load_iris()
X, y = iris.data, iris.target

# Build a labelled DataFrame for exploration. The integer codes in y are mapped
# to their names in iris.target_names and stored as a categorical 'species' column.
df = pd.DataFrame(X, columns=iris.feature_names).assign(
    species=pd.Categorical.from_codes(y, iris.target_names)
)

print("Dataset shape:", df.shape)
df.head()

In [None]:
# Descriptive statistics for df; the bare last expression renders the table
summary_stats = df.describe()
summary_stats

In [None]:
# Number of samples per species
species_counts = df['species'].value_counts()
species_counts

## 3. Data Visualization

In [None]:
# Grid of pairwise feature plots, coloured by species
sns.pairplot(data=df, hue='species', height=2.5)
# Put the title on the current figure (the one pairplot just drew);
# y=1.02 places it slightly above the top edge of the figure.
plt.gcf().suptitle('Iris Dataset - Pairplot', y=1.02)
plt.show()

In [None]:
# Correlation heatmap
# Select the measurement columns by name rather than by position
# (previously df.iloc[:, :-1]), so the matrix stays correct even if columns
# are later added to or reordered in df.
feature_corr = df[iris.feature_names].corr()

# Draw on an explicit Axes; center=0 anchors the diverging colormap at
# zero correlation.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# One box plot per feature, grouped by species, laid out on a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
axes = axes.ravel()  # flatten the 2x2 grid so panels can be paired with features

for ax, feature in zip(axes, iris.feature_names):
    sns.boxplot(data=df, x='species', y=feature, ax=ax)
    ax.set_title(f'{feature} by Species')

plt.tight_layout()
plt.show()

## 4. Model Building

In [None]:
# Hold out 30% of the samples for testing. Stratifying on y keeps the class
# proportions in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Standardise the features: fit the scaler on the training split only, then
# apply that same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

In [None]:
# Fit a 100-tree random forest on the scaled training data (seeded for repeatability)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict labels for the held-out test samples
y_pred = model.predict(X_test_scaled)

# Fraction of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

## 5. Model Evaluation

In [None]:
# Text report of per-class metrics, with rows labelled by species name
print("Classification Report:")
report_text = classification_report(
    y_test, y_pred, target_names=iris.target_names
)
print(report_text)

In [None]:
# Confusion matrix
# Pin the label order explicitly so that row/column i always corresponds to
# iris.target_names[i]. Without `labels`, confusion_matrix infers the classes
# from the values present in y_test and y_pred, so a missing class would
# shrink the matrix and misalign the tick labels below.
class_labels = list(range(len(iris.target_names)))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Draw on an explicit Axes; fmt='d' renders the cell counts as integers.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=iris.target_names,
            yticklabels=iris.target_names,
            ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')      # rows of cm are the true classes
ax.set_xlabel('Predicted')   # columns of cm are the predicted classes
plt.show()

In [None]:
# Pair each feature name with the fitted forest's importance score and
# order the rows from most to least important.
feature_importance = pd.DataFrame(
    dict(feature=iris.feature_names, importance=model.feature_importances_)
).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the sorted importances on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

print("\nFeature Importance:")
print(feature_importance)

## 6. Conclusion

In this notebook, we:
- Loaded and explored the Iris dataset
- Visualized relationships between features
- Built a Random Forest classification model
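- Achieved high accuracy on the held-out, stratified 30% test set (see the accuracy score, classification report, and confusion matrix in Sections 4–5)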
- Analyzed feature importance

The Iris dataset is a great starting point for learning classification algorithms!