In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/iris-dataset/Iris.csv


# Lab: Data Cleaning, Transformation, and Aggregation on the Iris Dataset

## Scenario Overview:
*You are analyzing the Iris dataset, which contains information on the physical characteristics of three species of iris flowers. Your goal is to clean the dataset, perform transformations, handle missing data, and apply aggregation techniques to summarize the data. Additionally, you'll reshape the data to prepare it for deeper analysis.*

In [2]:
df = pd.read_csv('/kaggle/input/iris-dataset/Iris.csv')
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
df.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [4]:
df.shape

(150, 6)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


# Tasks:

## Task 1: Identifying and Imputing Missing Data
* **Locate Missing Data:** Examine the dataset to locate any missing values. Identify the columns with missing data and report how many missing values are present in each column.

In [6]:
df.isna().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

* **Handle Missing Data in Numerical Columns:** Fill in missing values for numeric columns (i.e., sepal_length, sepal_width, petal_length, petal_width) using the median value of each column. Justify why you chose this approach.

In [7]:
if not df.isna().sum().empty:
    numeric_cols = df.select_dtypes(include=np.number).columns
    for col in numeric_cols:
        df[col] = df[col].transform(lambda x: x.fillna(x.median()))

* **Handle Missing Data in Categorical Columns:** Identify if there are missing values in the species column. If so, impute them with the most frequent value (mode) in the column.

In [8]:
if not df.isna().sum().empty:
    df.fillna({'Species': df['Species'].mode().iloc[0]}, inplace = True)

## Task 2: Data Integrity and Transformation
* **Remove Duplicate Records:** Review the dataset for duplicate rows (where all values in a row are identical) and remove any duplicates found. Ensure that only one unique record per flower remains in the dataset.

In [9]:
df.drop_duplicates()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


* **Feature Engineering:** Create a new feature called total_area by adding the areas of both the sepal and petal. To do this, create separate columns for the sepal area and petal area and then add them to form the total_area column.

In [10]:
df['sepal_area'] = df['SepalLengthCm'] * df['SepalWidthCm']
df['petal_area'] = df['PetalLengthCm'] * df['PetalWidthCm']
df['total_area'] = df['sepal_area'] + df['petal_area']
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,sepal_area,petal_area,total_area
0,1,5.1,3.5,1.4,0.2,Iris-setosa,17.85,0.28,18.13
1,2,4.9,3.0,1.4,0.2,Iris-setosa,14.7,0.28,14.98
2,3,4.7,3.2,1.3,0.2,Iris-setosa,15.04,0.26,15.3
3,4,4.6,3.1,1.5,0.2,Iris-setosa,14.26,0.3,14.56
4,5,5.0,3.6,1.4,0.2,Iris-setosa,18.0,0.28,18.28


* **Handling Missing Values Again:** After imputing missing data, inspect the dataset again and drop any rows that still have missing values in any of the columns.

In [11]:
df.isna().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
sepal_area       0
petal_area       0
total_area       0
dtype: int64

In [12]:
df.dropna()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,sepal_area,petal_area,total_area
0,1,5.1,3.5,1.4,0.2,Iris-setosa,17.85,0.28,18.13
1,2,4.9,3.0,1.4,0.2,Iris-setosa,14.70,0.28,14.98
2,3,4.7,3.2,1.3,0.2,Iris-setosa,15.04,0.26,15.30
3,4,4.6,3.1,1.5,0.2,Iris-setosa,14.26,0.30,14.56
4,5,5.0,3.6,1.4,0.2,Iris-setosa,18.00,0.28,18.28
...,...,...,...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica,20.10,11.96,32.06
146,147,6.3,2.5,5.0,1.9,Iris-virginica,15.75,9.50,25.25
147,148,6.5,3.0,5.2,2.0,Iris-virginica,19.50,10.40,29.90
148,149,6.2,3.4,5.4,2.3,Iris-virginica,21.08,12.42,33.50


## Task 3: Aggregation and Data Transformation
* **Numerical Conversion of Categorical Data:** Convert the species column, which is categorical, into a numerical format by assigning each species a unique number (e.g., 0, 1, 2).

In [13]:
df['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [14]:
unique_species = df['Species'].unique()
species_mapping = {species: index for index, species in enumerate(unique_species)}

df['Species_Numerical'] = df['Species'].map(species_mapping)
df[['Species','Species_Numerical']]

Unnamed: 0,Species,Species_Numerical
0,Iris-setosa,0
1,Iris-setosa,0
2,Iris-setosa,0
3,Iris-setosa,0
4,Iris-setosa,0
...,...,...
145,Iris-virginica,2
146,Iris-virginica,2
147,Iris-virginica,2
148,Iris-virginica,2


* **Apply Grouped Aggregation:** Using the transformed data, group the flowers by species and calculate the total sum of the numeric columns (sepal_length, sepal_width, petal_length, petal_width). Present the results in a table that shows the sum of each feature per species.

In [15]:
grouped_data = df.groupby('Species')[['SepalLengthCm','SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].sum()
grouped_data

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,250.3,170.9,73.2,12.2
Iris-versicolor,296.8,138.5,213.0,66.3
Iris-virginica,329.4,148.7,277.6,101.3


## Task 4: Data Reshaping
* **Reshape the Dataset into a Long Format:** Reshape the dataset from a wide format to a long format. The goal is to create a new version of the dataset where each row corresponds to a single measurement (sepal length, sepal width, petal length, or petal width) for each flower.

In [16]:
df_long = df.melt(id_vars='Id', 
                 var_name='Measurement', 
                 value_name='Value')
df_long

Unnamed: 0,Id,Measurement,Value
0,1,SepalLengthCm,5.1
1,2,SepalLengthCm,4.9
2,3,SepalLengthCm,4.7
3,4,SepalLengthCm,4.6
4,5,SepalLengthCm,5.0
...,...,...,...
1345,146,Species_Numerical,2
1346,147,Species_Numerical,2
1347,148,Species_Numerical,2
1348,149,Species_Numerical,2
