In [1]:
import pandas as pd
import numpy as np

# Level 1 — Basic Pandas

## Load the `Iris.csv` file using pandas

In [2]:
# Read Iris.csv (a relative path, so it is resolved against the current
# working directory) into the DataFrame used by every cell below.
df = pd.read_csv("Iris.csv")

## Display the first 5 rows of the dataset.

In [3]:
# Preview the first five rows (5 is also head()'s default).
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


## Display the last 5 rows of the dataset.

In [4]:
# Preview the last five rows (5 is also tail()'s default).
df.tail(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


## Find the shape of the dataset (number of rows and columns).

In [5]:
# Tuple of (number of rows, number of columns).
df.shape

(150, 6)

## Check the data types of all columns.

In [6]:
# dtype of each column as inferred when the CSV was read.
df.dtypes

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

## Check if the dataset contains any missing values.

In [7]:
# Number of missing values per column (isnull is an alias of isna).
df.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

## Find all unique values in the Species column.

In [8]:
# Distinct labels found in the Species column.
pd.unique(df['Species'])

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [9]:
# Each distinct species label together with how often it occurs.
df.loc[:, 'Species'].value_counts()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

## Count the number of samples for each species.

In [10]:
# Number of samples per species, most frequent first.
df.loc[:, 'Species'].value_counts()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [11]:
# Same per-species count, obtained by grouping and taking each group's row count.
df.groupby(by='Species').size()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64

# Level 2 — Filtering & Statistics

## Filter the dataset to show only rows where Species is Iris-setosa.

In [12]:
# Keep only the rows labelled 'Iris-setosa' (boolean mask applied through .loc).
df.loc[df['Species'].eq('Iris-setosa')]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
5,6,5.4,3.9,1.7,0.4,Iris-setosa
6,7,4.6,3.4,1.4,0.3,Iris-setosa
7,8,5.0,3.4,1.5,0.2,Iris-setosa
8,9,4.4,2.9,1.4,0.2,Iris-setosa
9,10,4.9,3.1,1.5,0.1,Iris-setosa


## Select all flowers where SepalLengthCm is greater than 6.

In [13]:
# The task asks for flowers with SepalLengthCm strictly greater than 6,
# so rows equal to 6.0 must be excluded (the previous `>=` included them).
df[df['SepalLengthCm'] > 6]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
50,51,7.0,3.2,4.7,1.4,Iris-versicolor
51,52,6.4,3.2,4.5,1.5,Iris-versicolor
52,53,6.9,3.1,4.9,1.5,Iris-versicolor
54,55,6.5,2.8,4.6,1.5,Iris-versicolor
56,57,6.3,3.3,4.7,1.6,Iris-versicolor
...,...,...,...,...,...,...
144,145,6.7,3.3,5.7,2.5,Iris-virginica
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica


## Calculate the mean, maximum, and minimum of PetalLengthCm.

In [16]:
# Arithmetic mean of the PetalLengthCm column.
df.loc[:, 'PetalLengthCm'].mean()

np.float64(3.758666666666666)

In [18]:
# Row(s) whose PetalLengthCm equals the column maximum; ties would all be shown.
df.loc[df['PetalLengthCm'].eq(df['PetalLengthCm'].max())]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
118,119,7.7,2.6,6.9,2.3,Iris-virginica


In [19]:
# Row(s) whose PetalLengthCm equals the column minimum; ties would all be shown.
df.loc[df['PetalLengthCm'].eq(df['PetalLengthCm'].min())]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
22,23,4.6,3.6,1.0,0.2,Iris-setosa


## Find the average SepalLengthCm for each species.

In [20]:
# Average SepalLengthCm within each species group.
df.groupby('Species')['SepalLengthCm'].agg('mean')

Species
Iris-setosa        5.006
Iris-versicolor    5.936
Iris-virginica     6.588
Name: SepalLengthCm, dtype: float64

## Filter rows where PetalWidthCm is greater than 1.5.

In [22]:
# Rows with PetalWidthCm strictly above 1.5.
df.loc[df['PetalWidthCm'].gt(1.5)]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
56,57,6.3,3.3,4.7,1.6,Iris-versicolor
70,71,5.9,3.2,4.8,1.8,Iris-versicolor
77,78,6.7,3.0,5.0,1.7,Iris-versicolor
83,84,6.0,2.7,5.1,1.6,Iris-versicolor
85,86,6.0,3.4,4.5,1.6,Iris-versicolor
100,101,6.3,3.3,6.0,2.5,Iris-virginica
101,102,5.8,2.7,5.1,1.9,Iris-virginica
102,103,7.1,3.0,5.9,2.1,Iris-virginica
103,104,6.3,2.9,5.6,1.8,Iris-virginica
104,105,6.5,3.0,5.8,2.2,Iris-virginica


## Sort the dataset by SepalLengthCm in descending order.

In [24]:
# Longest sepals first; returns a sorted copy and leaves df itself unchanged.
df.sort_values('SepalLengthCm', ascending=False)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
131,132,7.9,3.8,6.4,2.0,Iris-virginica
122,123,7.7,2.8,6.7,2.0,Iris-virginica
118,119,7.7,2.6,6.9,2.3,Iris-virginica
117,118,7.7,3.8,6.7,2.2,Iris-virginica
135,136,7.7,3.0,6.1,2.3,Iris-virginica
...,...,...,...,...,...,...
41,42,4.5,2.3,1.3,0.3,Iris-setosa
42,43,4.4,3.2,1.3,0.2,Iris-setosa
8,9,4.4,2.9,1.4,0.2,Iris-setosa
38,39,4.4,3.0,1.3,0.2,Iris-setosa


# Level 3 — GroupBy & Analysis

## Group the data by Species and calculate the mean of all numeric columns.

In [31]:
# Per-species mean of every numeric column. numeric_only=True makes the
# "numeric columns" intent explicit and keeps the cell from raising a
# TypeError if a non-numeric column is ever added to df.
# as_index=False keeps Species as an ordinary column (same shape as the
# former .reset_index() call).
df.groupby('Species', as_index=False).mean(numeric_only=True)

Unnamed: 0,Species,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,Iris-setosa,25.5,5.006,3.418,1.464,0.244
1,Iris-versicolor,75.5,5.936,2.77,4.26,1.326
2,Iris-virginica,125.5,6.588,2.974,5.552,2.026


## Find the maximum PetalLengthCm for each species.

In [34]:
# Largest PetalLengthCm observed within each species group.
df.groupby('Species')['PetalLengthCm'].agg('max')

Species
Iris-setosa        1.9
Iris-versicolor    5.1
Iris-virginica     6.9
Name: PetalLengthCm, dtype: float64

## Identify which species has the highest average PetalWidthCm.

In [41]:
# Label of the species whose mean PetalWidthCm is the highest.
df.groupby('Species')['PetalWidthCm'].agg('mean').idxmax()

'Iris-virginica'

In [42]:
# Same answer with the value shown: sort the per-species means in
# descending order and keep only the first entry.
df.groupby('Species')['PetalWidthCm'].mean().sort_values(ascending=False).iloc[:1]

Species
Iris-virginica    2.026
Name: PetalWidthCm, dtype: float64

## Count how many flowers in each species have SepalWidthCm > 3.

In [52]:
# Count, per species, the flowers with SepalWidthCm > 3.
# Summing the boolean mask within each species (True counts as 1) keeps
# every species in the result, reporting 0 when none of its rows match;
# filtering first and then grouping would silently drop such a species.
df['SepalWidthCm'].gt(3).groupby(df['Species']).sum()

Species
Iris-setosa        42
Iris-versicolor     8
Iris-virginica     17
Name: SepalWidthCm, dtype: int64

# Level 4 — Logic & Feature Engineering

## Create a new column called PetalSize (PetalSize = PetalLengthCm * PetalWidthCm)

In [54]:
# Element-wise product of petal length and width, stored on df as a new
# PetalSize column; later cells read this column.
df['PetalSize'] = df['PetalLengthCm'].mul(df['PetalWidthCm'])
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,PetalSize
0,1,5.1,3.5,1.4,0.2,Iris-setosa,0.28
1,2,4.9,3.0,1.4,0.2,Iris-setosa,0.28
2,3,4.7,3.2,1.3,0.2,Iris-setosa,0.26
3,4,4.6,3.1,1.5,0.2,Iris-setosa,0.30
4,5,5.0,3.6,1.4,0.2,Iris-setosa,0.28
...,...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica,11.96
146,147,6.3,2.5,5.0,1.9,Iris-virginica,9.50
147,148,6.5,3.0,5.2,2.0,Iris-virginica,10.40
148,149,6.2,3.4,5.4,2.3,Iris-virginica,12.42


## Determine which species has the largest average PetalSize.

In [56]:
# Label of the species whose mean PetalSize is the largest.
df.groupby('Species')['PetalSize'].agg('mean').idxmax()

'Iris-virginica'

In [57]:
# Same answer with the value shown: sort the per-species means in
# descending order and keep only the first entry.
df.groupby('Species')['PetalSize'].mean().sort_values(ascending=False).iloc[:1]

Species
Iris-virginica    11.2962
Name: PetalSize, dtype: float64

## Find the flower with the maximum SepalLengthCm.

In [58]:
# Row(s) whose SepalLengthCm equals the column maximum; ties would all be shown.
df.loc[df['SepalLengthCm'].eq(df['SepalLengthCm'].max())]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,PetalSize
131,132,7.9,3.8,6.4,2.0,Iris-virginica,12.8


## Calculate the median SepalLengthCm for each species.

In [60]:
# Median SepalLengthCm within each species group.
df.groupby('Species')['SepalLengthCm'].agg('median')

Species
Iris-setosa        5.0
Iris-versicolor    5.9
Iris-virginica     6.5
Name: SepalLengthCm, dtype: float64

## Save the dataset to a new CSV file without the Id column.

In [63]:
# Write a copy of df without the 'Id' column to iris_new.csv.
# drop() returns a new frame, so df itself still has 'Id';
# index=False keeps the row index out of the file.
df.drop(columns='Id').to_csv('iris_new.csv', index=False)

# Bonus — Data Science Thinking

## Create a correlation matrix for all numeric columns.

In [68]:
# Keep only the numeric columns, then compute their pairwise correlations.
# NOTE(review): 'Id' is numeric and therefore included; it looks like a row
# identifier rather than a measurement, so its correlations are probably
# not meaningful - confirm and consider excluding it.
numeric_df = df.select_dtypes(include=np.number)
corr_matrix = numeric_df.corr()
corr_matrix

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,PetalSize
Id,1.0,0.716676,-0.397729,0.882747,0.899759,0.885746
SepalLengthCm,0.716676,1.0,-0.109369,0.871754,0.817954,0.857326
SepalWidthCm,-0.397729,-0.109369,1.0,-0.420516,-0.356544,-0.280612
PetalLengthCm,0.882747,0.871754,-0.420516,1.0,0.962757,0.958472
PetalWidthCm,0.899759,0.817954,-0.356544,0.962757,1.0,0.980229
PetalSize,0.885746,0.857326,-0.280612,0.958472,0.980229,1.0


## Analyze whether the relationship between SepalLengthCm and PetalLengthCm is strong or weak.

In [70]:
# 2x2 correlation matrix for the two length columns. The recorded
# off-diagonal value (about 0.87) indicates a strong positive relationship.
df.loc[:, ['SepalLengthCm', 'PetalLengthCm']].corr()

Unnamed: 0,SepalLengthCm,PetalLengthCm
SepalLengthCm,1.0,0.871754
PetalLengthCm,0.871754,1.0
