In [3]:
# Task 1: Data Cleaning
# This notebook demonstrates the steps for cleaning the Iris dataset: handling missing data, identifying and removing duplicates, detecting and removing outliers, and scaling the features.


In [17]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.stats import zscore

# Fetch the bundled Iris data and assemble one table:
# the measurement columns first, the integer class label last.
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(species=iris.target)

# Preview the top of the table
data.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [18]:
# Step 1: Identify and handle missing data
# Use methods like imputation (mean, median, mode) or remove rows/columns with missing data.
# First, we check for missing values. If any are found, we can either fill them by imputation (e.g., mean, median) or drop the affected rows or columns.


In [19]:
# Demo only: blank out two short row ranges so there is something to impute.
# (.loc slices are label-based and include both end points.)
data.loc[5:10, 'sepal length (cm)'] = np.nan
data.loc[15:20, 'sepal width (cm)'] = np.nan

# Count the gaps per column
missing_per_column = data.isnull().sum()
print("Missing values in each column:\n", missing_per_column)

# Impute each gap with the mean of its own column
column_means = data.mean()
data_filled = data.fillna(column_means)

# Show the same leading rows before and after imputation
preview_rows = 12
print("Data before filling missing values:\n", data.head(preview_rows))
print("\nData after filling missing values:\n", data_filled.head(preview_rows))


Missing values in each column:
 sepal length (cm)    6
sepal width (cm)     6
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64
Data before filling missing values:
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                 5.1               3.5                1.4               0.2   
1                 4.9               3.0                1.4               0.2   
2                 4.7               3.2                1.3               0.2   
3                 4.6               3.1                1.5               0.2   
4                 5.0               3.6                1.4               0.2   
5                 NaN               3.9                1.7               0.4   
6                 NaN               3.4                1.4               0.3   
7                 NaN               3.4                1.5               0.2   
8                 NaN               2.9                1.4               0.2   
9  

In [20]:
# Step 2: Identifying and Removing Duplicate Records
#Next, we’ll check for duplicate records in the dataset. If any duplicates are found, they will be removed.

In [21]:
# Mark rows that repeat an earlier row, then build the de-duplicated frame
# (the first occurrence of each row is the one that is kept).
duplicates = data_filled.duplicated()
data_no_duplicates = data_filled.drop_duplicates(keep="first")

# Report how many rows were flagged
print("Number of duplicate rows:", duplicates.sum())

# Print the frame as it was and as it is after de-duplication
print("Data before removing duplicates:\n", data_filled)
print("\nData after removing duplicates:\n", data_no_duplicates)


Number of duplicate rows: 1
Data before removing duplicates:
      sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                  5.1               3.5                1.4               0.2   
1                  4.9               3.0                1.4               0.2   
2                  4.7               3.2                1.3               0.2   
3                  4.6               3.1                1.5               0.2   
4                  5.0               3.6                1.4               0.2   
..                 ...               ...                ...               ...   
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4               2.3   
149                5.9               3.0       

In [22]:
# Step 3: Detect and handle outliers using appropriate techniques (e.g., z-scores, IQR).

#We’ll identify outliers using two techniques:
#1. **Z-score**: Flags values more than 3 standard deviations from the mean.
#2. **Interquartile Range (IQR)**: Flags values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.


In [25]:
# Z-score screening: standardise every column except the last (the label),
# then flag any value whose absolute z-score exceeds 3.
feature_columns = data_no_duplicates.iloc[:, :-1]
z_scores = feature_columns.apply(zscore)
outliers_z = z_scores.abs() > 3

# Keep only the rows in which no column was flagged
rows_without_z_outliers = ~outliers_z.any(axis=1)
data_no_outliers_z = data_no_duplicates[rows_without_z_outliers]

# Detecting outliers using IQR
def detect_outliers_iqr(df, columns=None, k=1.5):
    """Flag values lying outside the IQR fences, one column at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose values are tested.
    columns : iterable of column labels, optional
        Columns to test. When omitted, every column except the last one is
        tested (this notebook stores the ``species`` label in the last column).
    k : float, default 1.5
        Fence multiplier: a value is flagged when it is below ``Q1 - k * IQR``
        or above ``Q3 + k * IQR``.

    Returns
    -------
    pandas.DataFrame
        Boolean frame carrying ``df``'s index, with one column per tested
        column; ``True`` marks a flagged value.
    """
    if columns is None:
        columns = df.columns[:-1]  # Exclude species

    # Seed the result with df's index so it stays row-aligned with df even
    # when no column ends up being tested.
    outliers = pd.DataFrame(index=df.index)
    for col in columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - k * iqr
        upper_bound = q3 + k * iqr
        outliers[col] = (values < lower_bound) | (values > upper_bound)
    return outliers

# Build the per-value IQR flags, then keep the rows that have no flagged value
outliers_iqr = detect_outliers_iqr(data_no_duplicates)
rows_without_iqr_outliers = ~outliers_iqr.any(axis=1)
data_no_outliers_iqr = data_no_duplicates[rows_without_iqr_outliers]

# Preview the IQR-filtered frame
print("Data after removing outliers (IQR method):\n", data_no_outliers_iqr.head())


Data after removing outliers (IQR method):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   species  
0        0  
1        0  
2        0  
3        0  
4        0  


In [26]:
# Step 4: Normalize or scale features if necessary.

#Finally, we scale the features to ensure all values are on a comparable scale. We’ll apply both Min-Max scaling and Z-score scaling.


In [29]:
# Both scalers are fitted on the same input: every column except the last
# (the label), taken from the IQR-filtered frame.
features_to_scale = data_no_outliers_iqr.iloc[:, :-1]

# Min-Max scaling, written into a copy so the filtered frame is left untouched
scaler_minmax = MinMaxScaler()
minmax_values = scaler_minmax.fit_transform(features_to_scale)
data_minmax_scaled = data_no_outliers_iqr.copy()
data_minmax_scaled.iloc[:, :-1] = minmax_values

# Standard scaling, again written into its own copy
scaler_standard = StandardScaler()
standard_values = scaler_standard.fit_transform(features_to_scale)
data_standard_scaled = data_no_outliers_iqr.copy()
data_standard_scaled.iloc[:, :-1] = standard_values

# Preview both scaled frames
print("Min-Max Scaled Data:\n", data_minmax_scaled.head())
print("\nStandard Scaled Data:\n", data_standard_scaled.head())


Min-Max Scaled Data:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0           0.222222            0.8125           0.067797          0.041667   
1           0.166667            0.5000           0.067797          0.041667   
2           0.111111            0.6250           0.050847          0.041667   
3           0.083333            0.5625           0.084746          0.041667   
4           0.194444            0.8750           0.067797          0.041667   

   species  
0        0  
1        0  
2        0  
3        0  
4        0  

Standard Scaled Data:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0          -0.979398          1.363950          -1.380452         -1.354603   
1          -1.225739         -0.020348          -1.380452         -1.354603   
2          -1.472079          0.533371          -1.437623         -1.354603   
3          -1.595249          0.256512          -1.323280         -1.354603   
4     

In [None]:
# Summary

#In this notebook, I cleaned the Iris dataset through the following steps:
#1. **Handled Missing Values**: Filled missing values with column means.
#2. **Removed Duplicates**: Ensured data uniqueness by removing duplicate rows.
#3. **Detected and Removed Outliers**: Flagged extreme values with both the Z-score and IQR methods; the IQR-filtered data was carried forward to scaling.
#4. **Scaled Features**: Used Min-Max and Standard scaling to normalize features for better comparability in model training.

#This cleaned dataset is now ready for further analysis or machine learning.
