Step 1: Import Required Libraries

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


Step 2: Load the Iris Dataset
You can load the Iris dataset from sklearn.datasets or from a CSV file if you have it locally. Here, we'll load it using sklearn.datasets.

In [2]:
# Load the Iris dataset that ships with scikit-learn
iris = load_iris()

# Build a DataFrame with one column per measured feature
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Translate the integer class codes into species names using the names stored
# in the dataset itself, rather than a hand-written {code: name} dict that
# could drift from the data (Series.map silently yields NaN for any code that
# is missing from such a dict).
df['species'] = iris.target_names[iris.target]

# Inspect the first few rows of the dataset
print(df.head())


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


Step 3: Data Exploration and Visualization
Let's explore the dataset and visualize the distribution of the different Iris species.

3.1 Statistical Summary

In [3]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = df.describe()
print(summary_stats)

# Number of samples per species
species_counts = df['species'].value_counts()
print(species_counts)


       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)  
count        150.000000  
mean           1.199333  
std            0.762238  
min            0.100000  
25%            0.300000  
50%            1.300000  
75%            1.800000  
max            2.500000  
species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64


3.2 Visualize Feature Distributions
We'll use pair plots to visualize the relationships between the features and the species.

In [None]:
# Pair plot of every feature against every other feature, coloured by species.
# Infinite values are converted to NaN on a separate copy instead of with
# inplace=True, so that running (or re-running) this plotting cell never
# alters `df` for the cells that follow.
# NOTE(review): the "mode.use_inf_as_na" FutureWarning shown in this cell's
# output appears to be raised inside seaborn itself; presumably a newer
# seaborn release resolves it -- confirm against the installed version.
pairplot_df = df.replace([np.inf, -np.inf], np.nan)
sns.pairplot(pairplot_df, hue='species', palette='Set1')
plt.show()

  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):


Step 4: Data Preprocessing

4.1 Split the Data into Features and Target
The features are the first four columns, and the target is the last column (species).

In [None]:
# Target vector: the species label of every sample
y = df.loc[:, 'species']

# Feature matrix: every remaining column (the four measurements)
X = df.drop('species', axis='columns')


4.2 Train-Test Split

We'll split the dataset into training and testing sets (80% train, 20% test).

In [None]:
# Split the data into training and testing sets (80% train, 20% test).
# stratify=y keeps the species proportions the same in both subsets: with only
# 30 test samples, an unstratified draw can leave the classes unevenly
# represented and make the reported metrics noisier than necessary.
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


4.3 Feature Scaling (Optional)

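Feature scaling is important for certain machine learning algorithms like Logistic Regression and KNN, while tree-based models such as Random Forest do not need it. We will use StandardScaler to scale the features. The scaler is fit on the training set only and then applied to the test set, so no information from the test data leaks into training. Although scaling is optional in general, this step is required here: the Logistic Regression model in step 5.1 is trained on the scaled features, whereas the Random Forest in step 5.2 uses the original, unscaled features.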

In [None]:
# Learn the per-feature scaling statistics from the training split only
scaler = StandardScaler().fit(X_train)

# Apply those same training-set statistics to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Step 5: Model Building

We'll try two models: Logistic Regression and Random Forest Classifier.

5.1 Logistic Regression

Logistic Regression is trained and evaluated on the standardized features from step 4.3.


In [None]:
# Fit a Logistic Regression classifier on the standardized training features
lr_model = LogisticRegression(max_iter=200).fit(X_train_scaled, y_train)

# Predict the species of the standardized test samples
y_pred_lr = lr_model.predict(X_test_scaled)

# Fraction of test samples whose predicted species matches the true one
accuracy_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr * 100:.2f}%")


5.2 Random Forest Classifier

In [None]:
# Build a 100-tree Random Forest with a fixed seed and fit it on the
# unscaled training features
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Predict the species of the (unscaled) test samples
y_pred_rf = rf_model.predict(X_test)

# Fraction of test samples whose predicted species matches the true one
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")


Step 6: Model Evaluation

We will evaluate the models using Accuracy, Confusion Matrix, and Classification Report.

6.1 Confusion Matrix and Classification Report for Logistic Regression


In [None]:
# Confusion matrix for Logistic Regression (rows = actual, columns = predicted).
# labels= pins the row/column order to iris.target_names, so the tick labels
# drawn below are guaranteed to line up with the matrix, and the matrix keeps
# one row/column per species even if a species is absent from the test split.
class_names = list(iris.target_names)
conf_matrix_lr = confusion_matrix(y_test, y_pred_lr, labels=class_names)

# Draw on an explicit Axes so title and axis labels are attached to this plot
fig, ax = plt.subplots()
sns.heatmap(
    conf_matrix_lr,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(title='Confusion Matrix - Logistic Regression', xlabel='Predicted', ylabel='Actual')
plt.show()

# Classification report for the same predictions
print("Classification Report (Logistic Regression):")
print(classification_report(y_test, y_pred_lr))


6.2 Confusion Matrix and Classification Report for Random Forest

In [None]:
# Confusion matrix for Random Forest (rows = actual, columns = predicted).
# labels= pins the row/column order to iris.target_names, so the tick labels
# drawn below are guaranteed to line up with the matrix, and the matrix keeps
# one row/column per species even if a species is absent from the test split.
class_names = list(iris.target_names)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf, labels=class_names)

# Draw on an explicit Axes so title and axis labels are attached to this plot
fig, ax = plt.subplots()
sns.heatmap(
    conf_matrix_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(title='Confusion Matrix - Random Forest', xlabel='Predicted', ylabel='Actual')
plt.show()

# Classification report for the same predictions
print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))


Step 7: Conclusion

Based on the evaluation metrics, you can choose the model that performs best for your task. Typically, Random Forest should perform well due to its ability to handle non-linear relationships, but Logistic Regression is a simpler model that may also perform adequately.

Final Model
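You can save the best-performing model using joblib for future predictions. The cell below saves the Random Forest model and the fitted scaler. Note that the Random Forest was trained on unscaled features, so new inputs should be passed to it as-is; the saved scaler is only needed if you use the Logistic Regression model, which expects standardized inputs.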

In [None]:
# Persist the trained Random Forest model for future predictions
# (joblib is imported with the other libraries in the first cell).
joblib.dump(rf_model, 'iris_flower_rf_model.pkl')

# Persist the fitted StandardScaler as well.
# NOTE: rf_model was trained on the unscaled features (X_train), so do NOT
# apply this scaler before calling rf_model.predict. The scaler is only needed
# to prepare inputs for lr_model, which was trained on X_train_scaled.
joblib.dump(scaler, 'scaler.pkl')
