In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer

# Load the bundled Iris dataset and assemble a single DataFrame: the numeric
# measurement columns plus a categorical `species` label built from the
# integer target codes.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=pd.Categorical.from_codes(iris.target, iris.target_names)
)

# Quick overview: first rows, dimensions and per-column summary statistics.
print(df.head())
print("\nDataset shape:", df.shape)
print("\nSummary statistics:\n", df.describe())

# Feature matrix without the label column; the scaling cells operate on this.
X = df.drop(columns="species")

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  

Dataset shape: (150, 5)

Summary statistics:
        sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             

In [2]:
# Standardize every feature column (StandardScaler: zero mean, unit variance)
# and wrap the resulting array in a DataFrame that reuses X's column names.
scaler = StandardScaler()
df_standardized = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
)

print("\nStandardized Data (first 5 rows):\n", df_standardized.head())


Standardized Data (first 5 rows):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0          -0.900681          1.019004          -1.340227         -1.315444
1          -1.143017         -0.131979          -1.340227         -1.315444
2          -1.385353          0.328414          -1.397064         -1.315444
3          -1.506521          0.098217          -1.283389         -1.315444
4          -1.021849          1.249201          -1.340227         -1.315444


In [3]:
# Rescale every feature column with MinMaxScaler (default range [0, 1]) and
# wrap the resulting array in a DataFrame that reuses X's column names.
normalizer = MinMaxScaler()
df_normalized = pd.DataFrame(
    normalizer.fit_transform(X),
    columns=X.columns,
)

print("\nNormalized Data (first 5 rows):\n", df_normalized.head())


Normalized Data (first 5 rows):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           0.222222          0.625000           0.067797          0.041667
1           0.166667          0.416667           0.067797          0.041667
2           0.111111          0.500000           0.050847          0.041667
3           0.083333          0.458333           0.084746          0.041667
4           0.194444          0.666667           0.067797          0.041667


In [4]:
# Apply log(1 + x) to the sepal-width column only. `assign` returns a new
# frame with that column replaced in place, so `df` itself is left untouched.
df_transformed = df.assign(
    **{"sepal width (cm)": np.log1p(df["sepal width (cm)"])}
)

print("\nTransformed (log) Data (first 5 rows):\n", df_transformed.head())


Transformed (log) Data (first 5 rows):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1          1.504077                1.4               0.2   
1                4.9          1.386294                1.4               0.2   
2                4.7          1.435085                1.3               0.2   
3                4.6          1.410987                1.5               0.2   
4                5.0          1.526056                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


In [5]:
# Per-species mean of every measurement column. `observed=False` keeps all
# declared categories of the categorical `species` key in the result.
agg_df = (
    df
    .groupby("species", observed=False)
    .mean()
)
print("\nAggregated Mean by Species:\n", agg_df)


Aggregated Mean by Species:
             sepal length (cm)  sepal width (cm)  petal length (cm)  \
species                                                              
setosa                  5.006             3.428              1.462   
versicolor              5.936             2.770              4.260   
virginica               6.588             2.974              5.552   

            petal width (cm)  
species                       
setosa                 0.246  
versicolor             1.326  
virginica              2.026  


In [6]:
# Bin edges and names for petal length; with pd.cut's default right-closed
# intervals these are (0, 2.5], (2.5, 5] and (5, 7.5].
bins = [0, 2.5, 5, 7.5]
labels = ['Small', 'Medium', 'Large']

# Build a new frame carrying the binned column; `df` itself is not modified.
df_discrete = df.assign(
    custom_petal_length=pd.cut(df['petal length (cm)'], bins=bins, labels=labels)
)

print("\nDiscretized Data (Petal Length into 3 bins):\n", df_discrete.head())


Discretized Data (Petal Length into 3 bins):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species custom_petal_length  
0  setosa               Small  
1  setosa               Small  
2  setosa               Small  
3  setosa               Small  
4  setosa               Small  


In [7]:
# Flag "wide" petals with a 0/1 indicator column.
# Binarizer maps values strictly greater than `threshold` to 1 and every other
# value (including values exactly equal to the threshold) to 0, so a petal
# width of exactly 1.0 is NOT flagged.
binarizer = Binarizer(threshold=1.0)
df_binarized = df.copy()
# fit_transform returns a 2-D (n_samples, 1) array; flatten it so that a plain
# 1-D column is assigned.
df_binarized["wide_petal"] = binarizer.fit_transform(
    df_binarized[["petal width (cm)"]]
).ravel()

# The label states the strict inequality that Binarizer actually applies.
print("\nBinarized Data (Petal width > 1.0 -> 1):\n", df_binarized.head())


Binarized Data (Petal width >= 1.0 -> 1):
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  wide_petal  
0  setosa         0.0  
1  setosa         0.0  
2  setosa         0.0  
3  setosa         0.0  
4  setosa         0.0  


In [8]:
# Draw a reproducible random 20% subset of the rows (fixed seed, sampling
# without replacement by default); the original row labels are kept.
sample_df = df.sample(
    frac=0.2,
    random_state=42,
)
print("\nSampled 20% of the Data:\n", sample_df)


Sampled 20% of the Data:
      sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
73                 6.1               2.8                4.7               1.2   
18                 5.7               3.8                1.7               0.3   
118                7.7               2.6                6.9               2.3   
78                 6.0               2.9                4.5               1.5   
76                 6.8               2.8                4.8               1.4   
31                 5.4               3.4                1.5               0.4   
64                 5.6               2.9                3.6               1.3   
141                6.9               3.1                5.1               2.3   
68                 6.2               2.2                4.5               1.5   
82                 5.8               2.7                3.9               1.2   
110                6.5               3.2                5.1               2.0   
1