# Import data
Lets pull in the data and display it. We have 4 feature columns and one target column called `target`.

In [4]:
from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris(as_frame=True)  # set as_frame=False if older scikit-learn
df = iris.data.copy()  # X as DataFrame
df['target'] = iris.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


# High level Analysis
Lets check for nulls and examine the mean, median, etc for the different features across the different targets.

In [3]:
from IPython.display import display

# Check for missing values
print("Null values per column:")
print(df.isna().sum())

# Overall descriptive statistics
print("\nOverall descriptive statistics (features):")
display(df.describe().T)

# Select feature columns
features = df.columns.drop('target')

# Aggregate stats by target
group_stats = df.groupby('target')[features].agg(['count', 'mean', 'median', 'std', 'min', 'max'])
# Map numeric target to class names if `iris` is available
try:
    group_stats.index = group_stats.index.map(lambda i: iris.target_names[i])
except Exception:
    pass

print("\nAggregate statistics by target:")
display(group_stats)

# Interquartile range (IQR) by target
iqr = df.groupby('target')[features].agg(lambda x: x.quantile(0.75) - x.quantile(0.25))
try:
    iqr.index = iqr.index.map(lambda i: iris.target_names[i])
except Exception:
    pass

print("\nInterquartile range (IQR) by target:")
display(iqr)

Null values per column:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

Overall descriptive statistics (features):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal length (cm),150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal width (cm),150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
petal length (cm),150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
petal width (cm),150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5
target,150.0,1.0,0.819232,0.0,0.0,1.0,2.0,2.0



Aggregate statistics by target:


Unnamed: 0_level_0,sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal length (cm),sepal width (cm),sepal width (cm),sepal width (cm),sepal width (cm),...,petal length (cm),petal length (cm),petal length (cm),petal length (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm),petal width (cm)
Unnamed: 0_level_1,count,mean,median,std,min,max,count,mean,median,std,...,median,std,min,max,count,mean,median,std,min,max
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
setosa,50,5.006,5.0,0.35249,4.3,5.8,50,3.428,3.4,0.379064,...,1.5,0.173664,1.0,1.9,50,0.246,0.2,0.105386,0.1,0.6
versicolor,50,5.936,5.9,0.516171,4.9,7.0,50,2.77,2.8,0.313798,...,4.35,0.469911,3.0,5.1,50,1.326,1.3,0.197753,1.0,1.8
virginica,50,6.588,6.5,0.63588,4.9,7.9,50,2.974,3.0,0.322497,...,5.55,0.551895,4.5,6.9,50,2.026,2.0,0.27465,1.4,2.5



Interquartile range (IQR) by target:


Unnamed: 0_level_0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,0.4,0.475,0.175,0.1
versicolor,0.7,0.475,0.6,0.3
virginica,0.675,0.375,0.775,0.5


# Plots
Lets examine how the data looks. Use copilot to generate plots of the data, e.g. use PCA and colour by target.

In [1]:
from sklearn.decomposition import PCA
import numpy as np

import matplotlib.pyplot as plt

# Prepare data for PCA
X = df.drop('target', axis=1)
y = df['target']

# Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Create plot
plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', alpha=0.7, s=100)
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
plt.title('PCA of Iris Dataset')
plt.colorbar(scatter, label='Target')
plt.grid(alpha=0.3)
plt.show()

NameError: name 'df' is not defined

# Optional: Modelling
Use a model for classification from sklearn to try and predict the target based on the features or instead, discuss with copilot what model you can use in the chat!