In [7]:
import pandas as pd
import numpy as np

# Read the Iris measurements from the CSV in the working directory and
# preview the first five rows to check that the columns parsed as expected.
df = pd.read_csv(filepath_or_buffer='IRIS1.csv')
print(df.head(n=5))


  sepal_length  sepal_width petal_length  petal_width      species
0          5.1          3.5          1.4          0.2  Iris-setosa
1          4.9          3.0          1.4          0.2  Iris-setosa
2          4.7          3.2          1.3          0.2  Iris-setosa
3          4.6          3.1          1.5          0.2  Iris-setosa
4            5          3.6          1.4          0.2  Iris-setosa


In [8]:

# Summarise the frame: column dtypes and non-null counts.
# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called bare; wrapping it in print() appended a stray "None" line.
df.info()

# Overall dimensions as (rows, columns).
print(f"Dataset shape: {df.shape}")

# Count of missing (NaN/None) entries in each column.
print("\nMissing values per column:")
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  149 non-null    object 
 1   sepal_width   150 non-null    float64
 2   petal_length  148 non-null    object 
 3   petal_width   150 non-null    float64
 4   species       149 non-null    object 
dtypes: float64(2), object(3)
memory usage: 6.0+ KB
None
Dataset shape: (150, 5)

Missing values per column:
sepal_length    1
sepal_width     0
petal_length    2
petal_width     0
species         1
dtype: int64


In [9]:

# Two alternative ways of discarding missing data, each stored in its own
# frame; `df` itself is not modified by either call.
# Variant 1: remove every row that contains at least one missing value.
df_dropped_rows = df.dropna(axis='index')

# Variant 2: remove every column that contains at least one missing value.
df_dropped_cols = df.dropna(axis='columns')

In [10]:

# Report how many rows exactly repeat an earlier row, remove them from `df`
# in place (the first occurrence is kept), then confirm none are left.
print(f"Number of duplicate rows: {df.duplicated(keep='first').sum()}")
df.drop_duplicates(keep='first', inplace=True)
print(f"Number of duplicates after cleaning: {df.duplicated(keep='first').sum()}")

Number of duplicate rows: 2
Number of duplicates after cleaning: 0


In [11]:
# Parse 'petal_length' as numbers; errors='coerce' turns any entry that
# cannot be parsed into NaN instead of raising.
df['petal_length'] = df['petal_length'].pipe(pd.to_numeric, errors='coerce')
print("--- Converted 'petal_length' to a numeric type ---")

# Re-count missing values per column, since coercion can introduce new NaNs.
print("Missing values after handling non-numeric text:")
print(df.isna().sum())
print("-" * 30)


--- Converted 'petal_length' to a numeric type ---
Missing values after handling non-numeric text:
sepal_length    1
sepal_width     0
petal_length    3
petal_width     0
species         1
dtype: int64
------------------------------


In [12]:
# Count fully repeated rows *before* removing them so the message below can
# report how many were dropped; the first occurrence of each row is kept.
num_duplicates = df.duplicated(keep='first').sum()
df.drop_duplicates(keep='first', inplace=True)
print(f"--- Found and removed {num_duplicates} duplicate row(s) ---")
print("-" * 30)

--- Found and removed 0 duplicate row(s) ---
------------------------------


In [13]:
# Progress check on the cleaning so far, as three blank-line-separated
# reports: dtypes / non-null counts, missing values per column, first rows.
print("")
df.info()

print("")
print(df.isna().sum())

print("")
print(df.head(n=5))


<class 'pandas.core.frame.DataFrame'>
Index: 148 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  147 non-null    object 
 1   sepal_width   148 non-null    float64
 2   petal_length  145 non-null    float64
 3   petal_width   148 non-null    float64
 4   species       147 non-null    object 
dtypes: float64(3), object(2)
memory usage: 6.9+ KB

sepal_length    1
sepal_width     0
petal_length    3
petal_width     0
species         1
dtype: int64

  sepal_length  sepal_width  petal_length  petal_width      species
0          5.1          3.5           1.4          0.2  Iris-setosa
1          4.9          3.0           1.4          0.2  Iris-setosa
2          4.7          3.2           1.3          0.2  Iris-setosa
3          4.6          3.1           1.5          0.2  Iris-setosa
4            5          3.6           1.4          0.2  Iris-setosa


In [14]:

# Raise pandas' row display limit to 300, then print the frame as plain text.
pd.options.display.max_rows = 300

print(df)

    sepal_length  sepal_width  petal_length  petal_width          species
0            5.1          3.5           1.4          0.2      Iris-setosa
1            4.9          3.0           1.4          0.2      Iris-setosa
2            4.7          3.2           1.3          0.2      Iris-setosa
3            4.6          3.1           1.5          0.2      Iris-setosa
4              5          3.6           1.4          0.2      Iris-setosa
5            5.4          3.9           1.7          0.4      Iris-setosa
6            4.6          3.4           1.4          0.3      Iris-setosa
7              5          3.4           1.5          0.2      Iris-setosa
8            4.4          2.9           1.4          0.2      Iris-setosa
9            4.9          3.1           1.5          0.1      Iris-setosa
10           5.4          3.7           NaN          0.2      Iris-setosa
11           4.8          3.4           1.6          0.2      Iris-setosa
12           4.8          3.0         

In [15]:

# Trim leading/trailing whitespace from every column label so the lookup
# below uses the clean name.
df.columns = df.columns.str.strip()

# Parse 'sepal_length' as numbers; entries that cannot be parsed become NaN.
df['sepal_length'] = df['sepal_length'].pipe(pd.to_numeric, errors='coerce')

print("--- Successfully converted 'sepal_length' to a numeric type ---")

--- Successfully converted 'sepal_length' to a numeric type ---


In [16]:

# Set pandas' row display limit to 300 and print the frame again, now that
# 'sepal_length' has been converted to a numeric column.
pd.options.display.max_rows = 300

print(df)

     sepal_length  sepal_width  petal_length  petal_width          species
0             5.1          3.5           1.4          0.2      Iris-setosa
1             4.9          3.0           1.4          0.2      Iris-setosa
2             4.7          3.2           1.3          0.2      Iris-setosa
3             4.6          3.1           1.5          0.2      Iris-setosa
4             5.0          3.6           1.4          0.2      Iris-setosa
5             5.4          3.9           1.7          0.4      Iris-setosa
6             4.6          3.4           1.4          0.3      Iris-setosa
7             5.0          3.4           1.5          0.2      Iris-setosa
8             4.4          2.9           1.4          0.2      Iris-setosa
9             4.9          3.1           1.5          0.1      Iris-setosa
10            5.4          3.7           NaN          0.2      Iris-setosa
11            4.8          3.4           1.6          0.2      Iris-setosa
12            4.8        

In [17]:
# Impute missing species labels with the most frequent label in the column.
# `mode_species` was not defined by any cell in this notebook, so a fresh
# Restart & Run All failed here with NameError; it is now computed locally.
# Series.mode() ignores NaN by default and returns the modes sorted, so
# .iloc[0] picks one deterministic label even when there is a tie.
# NOTE(review): assumes the value originally used was the column's mode, as
# the variable name suggests — confirm.
mode_species = df['species'].mode().iloc[0]
df['species'] = df['species'].fillna(mode_species)

# Verify that there are no more missing values
print("--- Successfully cleaned the 'species' column ---")
print("Missing values in 'species' after cleaning:")
print(df['species'].isnull().sum())

--- Successfully cleaned the 'species' column ---
Missing values in 'species' after cleaning:
0


In [18]:
# Bare last expression: the notebook renders `df` with its rich table display
# so the frame can be inspected after the 'species' column was filled.
df


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [20]:
# Forward-fill down each column: every remaining missing value is replaced by
# the nearest preceding non-missing value in the same column. `df` is
# modified in place. A gap at the very top of a column has no preceding value
# and therefore stays missing.
df.ffill(axis=0, inplace=True)

In [21]:
# Bare last expression: render `df` again to inspect the result of the
# in-place forward fill performed by the preceding cell.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa
