## EDA: iris dataset - part 1: Basic exploration

### Preparations

Import libraries

In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

1) Load iris dataset as iris:

In [60]:
# Load the iris dataset
iris = pd.read_csv(r"DSA_iris_dataset.csv")
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal l. / cm,petal w. / cm,target,species
0,5.1,3.5,1.4,0.2,,setosa
1,4.9,3.0,1.4,0.2,,setosa
2,4.7,3.2,1.3,0.2,,setosa
3,4.6,3.1,1.5,0.2,,setosa
4,5.0,3.6,1.4,0.2,,setosa


2) Overview: head, size, shape, info...

In [61]:
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal l. / cm      150 non-null    float64
 3   petal w. / cm      150 non-null    float64
 4   target             0 non-null      float64
 5   species            150 non-null    object 
dtypes: float64(5), object(1)
memory usage: 7.2+ KB


3) Quick statistics and summary (value_counts, max, min, describe, nlargest, nsmallest, nunique):

In [62]:
iris.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal l. / cm,petal w. / cm,target
count,150.0,150.0,150.0,150.0,0.0
mean,5.843333,3.309333,3.758,1.199333,
std,0.828066,3.208637,1.765298,0.762238,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,
max,7.9,42.0,6.9,2.5,


In [63]:
# we see 42 as max for sepal width, but 3.3 as 75% ad 3.3 as mean
# suspicous, check the largest few entries:
iris.nlargest(7, columns="sepal width (cm)")

Unnamed: 0,sepal length (cm),sepal width (cm),petal l. / cm,petal w. / cm,target,species
33,5.5,42.0,1.4,0.2,,setosa
15,5.7,4.4,1.5,0.4,,setosa
32,5.2,4.1,1.5,0.1,,setosa
14,5.8,4.0,1.2,0.2,,setosa
5,5.4,3.9,1.7,0.4,,setosa
16,5.4,3.9,1.3,0.4,,setosa
18,5.7,3.8,1.7,0.3,,setosa


In [64]:
iris.nunique(axis=0) # some have coincidentally the same measured values

sepal length (cm)    35
sepal width (cm)     23
petal l. / cm        43
petal w. / cm        22
target                0
species               3
dtype: int64

In [65]:
iris["species"].value_counts()

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

In [66]:
iris.min()
iris.max()

sepal length (cm)          7.9
sepal width (cm)          42.0
petal l. / cm              6.9
petal w. / cm              2.5
target                     NaN
species              virginica
dtype: object

4) Missing data (isnull) and duplicates (duplicated)

In [67]:
iris.isnull().any()

sepal length (cm)    False
sepal width (cm)     False
petal l. / cm        False
petal w. / cm        False
target                True
species              False
dtype: bool

In [68]:
iris.duplicated().any()
iris[iris.duplicated()==True]

Unnamed: 0,sepal length (cm),sepal width (cm),petal l. / cm,petal w. / cm,target,species
142,5.8,2.7,5.1,1.9,,virginica


5) Drop unnecessary columns and rename columns

In [69]:
iris.drop(columns=["target"], axis=1, inplace=True)
iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal l. / cm,petal w. / cm,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [70]:
iris.rename(columns={"petal l. / cm":"petal length (cm)", "petal w. / cm":"petal width (cm)"}, inplace=True)

In [71]:
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [72]:
iris["sepal width (cm)"].nlargest(3)

33    42.0
15     4.4
32     4.1
Name: sepal width (cm), dtype: float64

In [74]:
iris.iloc[33,1] = 4.2
iris["sepal width (cm)"].nlargest(3)

15    4.4
33    4.2
32    4.1
Name: sepal width (cm), dtype: float64

6) Export cleaned dataset as "DSA_iris_cleaned.csv" into the same folder

In [None]:
iris.to_csv("DSA_iris_cleaned.csv", index=False)