**Load the Iris dataset (available in seaborn) into a pandas DataFrame**

In [42]:
import seaborn as sns
import pandas as pd

In [43]:
iris_df = pd.read_csv("Datasets\A01.csv")

In [44]:
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


1. Find the mean and median of the 'sepal_length' column.

In [45]:
sepal_length_mean = iris_df['sepal_length'].mean()
sepal_length_median = iris_df['sepal_length'].median()

In [46]:
print("Mean of 'sepal_length':", sepal_length_mean)
print("Median of 'sepal_length':", sepal_length_median)

Mean of 'sepal_length': 5.843333333333334
Median of 'sepal_length': 5.8


2. Calculate the 75th percentile of the 'petal_width' column for each species in the Iris dataset.

In [47]:
petal_width_percentile_75 = iris_df.groupby('species')['petal_width'].quantile(0.75)

In [48]:
print("75th percentile of 'petal_width' for each species:\n", petal_width_percentile_75)

75th percentile of 'petal_width' for each species:
 species
setosa        0.3
versicolor    1.5
virginica     2.3
Name: petal_width, dtype: float64


3. Create a new column in the Iris DataFrame called 'sepal_area', which is the product of 'sepal_length' and 'sepal_width'.

In [49]:
iris_df['sepal_area'] = iris_df['sepal_length'] * iris_df['sepal_width']

In [50]:
iris_df.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area
0,5.1,3.5,1.4,0.2,setosa,17.85
1,4.9,3.0,1.4,0.2,setosa,14.7
2,4.7,3.2,1.3,0.2,setosa,15.04
3,4.6,3.1,1.5,0.2,setosa,14.26
4,5.0,3.6,1.4,0.2,setosa,18.0


4. Remove all rows in the Iris DataFrame where 'petal_length' is greater than twice the standard deviation of 'petal_length' for that species.

In [51]:
def remove_outliers_by_species(group):
    return group[abs(group['petal_length'] - group['petal_length'].mean()) <= 2 * group['petal_length'].std()]

iris_df = iris_df.groupby('species').apply(remove_outliers_by_species).reset_index(drop=True)

5. Normalize all numerical columns in the Iris DataFrame (except the 'species' column) using Min-Max scaling.

In [52]:
numeric_cols = iris_df.select_dtypes(include='number').columns
iris_df[numeric_cols] = (iris_df[numeric_cols] - iris_df[numeric_cols].min()) / (iris_df[numeric_cols].max() - iris_df[numeric_cols].min())

6. Find the three most common combinations of 'sepal_length', 'sepal_width', and 'petal_length' in the Iris dataset.

In [53]:
most_common_combinations = iris_df.groupby(['sepal_length', 'sepal_width', 'petal_length']).size().nlargest(3)

In [54]:
print("Three most common combinations of 'sepal_length', 'sepal_width', and 'petal_length':\n", most_common_combinations)

Three most common combinations of 'sepal_length', 'sepal_width', and 'petal_length':
 sepal_length  sepal_width  petal_length
0.114286      0.416667     0.037037        2
0.142857      0.458333     0.055556        2
0.200000      0.625000     0.037037        2
dtype: int64


7. Group the Iris DataFrame by 'species' and find the row with the highest 'sepal_width' for each group.

In [55]:
max_sepal_width_rows = iris_df.loc[iris_df.groupby('species')['sepal_width'].idxmax()]

In [56]:
print("Row with the highest 'sepal_width' for each species:\n", max_sepal_width_rows)

Row with the highest 'sepal_width' for each species:
      sepal_length  sepal_width  petal_length  petal_width     species  \
14       0.371429     1.000000      0.055556     0.125000      setosa   
80       0.457143     0.583333      0.611111     0.625000  versicolor   
121      1.000000     0.750000      0.962963     0.791667   virginica   

     sepal_area  
14     0.753247  
80     0.519481  
121    1.000000  


8. Replace all negative values in the 'petal_width' column of the Iris DataFrame with the mean of the non-negative values in that column.

In [57]:
non_negative_mean_petal_width = iris_df['petal_width'].replace(iris_df[iris_df['petal_width'] >= 0]['petal_width'].mean())

In [58]:
print("Negative values replaced with the mean of non-negative values in 'petal_width':\n", non_negative_mean_petal_width)

Negative values replaced with the mean of non-negative values in 'petal_width':
 0      0.041667
1      0.041667
2      0.041667
3      0.041667
4      0.041667
         ...   
135    0.916667
136    0.750000
137    0.791667
138    0.916667
139    0.708333
Name: petal_width, Length: 140, dtype: float64


9. Calculate the correlation matrix for the 'sepal_length', 'sepal_width', 'petal_length', and 'petal_width' columns in the Iris dataset and find the feature with the highest absolute correlation with 'petal_width'

In [59]:
correlation_matrix = iris_df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].corr()
highest_correlation_feature = correlation_matrix['petal_width'].abs().idxmax()

In [60]:
print("Correlation matrix:\n", correlation_matrix)
print("Feature with the highest absolute correlation with 'petal_width':", highest_correlation_feature)

Correlation matrix:
               sepal_length  sepal_width  petal_length  petal_width
sepal_length      1.000000    -0.142432      0.861591     0.812154
sepal_width      -0.142432     1.000000     -0.458157    -0.381079
petal_length      0.861591    -0.458157      1.000000     0.962317
petal_width       0.812154    -0.381079      0.962317     1.000000
Feature with the highest absolute correlation with 'petal_width': petal_width
