In [2]:
!pip install scikit-learn
!pip install pandas



In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

# Generate random dataset
data = {
    'Category': np.random.choice(['A', 'B', 'C', 'D'], size=10),  # Categorical column
    'Value1': np.random.randint(1, 100, size=10),  # Numerical column
    'Value2': np.random.choice([None, 10, 20, 30, 40], size=10),  # Numerical column with missing values
}
df = pd.DataFrame(data)

print("Initial Dataset:\n", df)

Initial Dataset:
   Category  Value1 Value2
0        A      26     30
1        C      73   None
2        A      25     20
3        A      62     40
4        B      39     30
5        A       8   None
6        A      88     40
7        D      55     30
8        A      55     30
9        D      66     20


In [4]:
# Fill missing values with the mean
print("\n--- Handling Missing Values ---")
print("Before Filling Missing Values:\n", df)

df['Value2'] = df['Value2'].fillna(df['Value2'].mean())
print("After Filling Missing Values:\n", df)


--- Handling Missing Values ---
Before Filling Missing Values:
   Category  Value1 Value2
0        A      26     30
1        C      73   None
2        A      25     20
3        A      62     40
4        B      39     30
5        A       8   None
6        A      88     40
7        D      55     30
8        A      55     30
9        D      66     20
After Filling Missing Values:
   Category  Value1  Value2
0        A      26    30.0
1        C      73    30.0
2        A      25    20.0
3        A      62    40.0
4        B      39    30.0
5        A       8    30.0
6        A      88    40.0
7        D      55    30.0
8        A      55    30.0
9        D      66    20.0


  df['Value2'] = df['Value2'].fillna(df['Value2'].mean())


In [5]:
# Apply Label Encoding
print("\n--- Label Encoding ---")
print("Before Label Encoding:\n", df)

label_encoder = LabelEncoder()
df['Category_LabelEncoded'] = label_encoder.fit_transform(df['Category'])
print("After Label Encoding:\n", df)


--- Label Encoding ---
Before Label Encoding:
   Category  Value1  Value2
0        A      26    30.0
1        C      73    30.0
2        A      25    20.0
3        A      62    40.0
4        B      39    30.0
5        A       8    30.0
6        A      88    40.0
7        D      55    30.0
8        A      55    30.0
9        D      66    20.0
After Label Encoding:
   Category  Value1  Value2  Category_LabelEncoded
0        A      26    30.0                      0
1        C      73    30.0                      2
2        A      25    20.0                      0
3        A      62    40.0                      0
4        B      39    30.0                      1
5        A       8    30.0                      0
6        A      88    40.0                      0
7        D      55    30.0                      3
8        A      55    30.0                      0
9        D      66    20.0                      3


In [6]:
# Apply One-Hot Encoding
print("\n--- One-Hot Encoding ---")
print("Before One-Hot Encoding:\n", df)

one_hot_encoded = pd.get_dummies(df['Category'], prefix='Category')
df = pd.concat([df, one_hot_encoded], axis=1)
print("After One-Hot Encoding:\n", df)


--- One-Hot Encoding ---
Before One-Hot Encoding:
   Category  Value1  Value2  Category_LabelEncoded
0        A      26    30.0                      0
1        C      73    30.0                      2
2        A      25    20.0                      0
3        A      62    40.0                      0
4        B      39    30.0                      1
5        A       8    30.0                      0
6        A      88    40.0                      0
7        D      55    30.0                      3
8        A      55    30.0                      0
9        D      66    20.0                      3
After One-Hot Encoding:
   Category  Value1  Value2  Category_LabelEncoded  Category_A  Category_B  \
0        A      26    30.0                      0        True       False   
1        C      73    30.0                      2       False       False   
2        A      25    20.0                      0        True       False   
3        A      62    40.0                      0        True    

In [7]:
# Apply Standard Scaling
print("\n--- Standard Scaling ---")
scaler = StandardScaler()
columns_to_scale = ['Value1', 'Value2']
print("Before Standard Scaling:\n", df[columns_to_scale])

df_scaled = scaler.fit_transform(df[columns_to_scale])
scaled_df = pd.DataFrame(df_scaled, columns=[f"{col}_StandardScaled" for col in columns_to_scale])
df = pd.concat([df, scaled_df], axis=1)
print("After Standard Scaling:\n", scaled_df)


--- Standard Scaling ---
Before Standard Scaling:
    Value1  Value2
0      26    30.0
1      73    30.0
2      25    20.0
3      62    40.0
4      39    30.0
5       8    30.0
6      88    40.0
7      55    30.0
8      55    30.0
9      66    20.0
After Standard Scaling:
    Value1_StandardScaled  Value2_StandardScaled
0              -1.009828               0.000000
1               0.992785               0.000000
2              -1.052437              -1.581139
3               0.524088               1.581139
4              -0.455914               0.000000
5              -1.776786               0.000000
6               1.631916               1.581139
7               0.225827               0.000000
8               0.225827               0.000000
9               0.694523              -1.581139


In [9]:
  # Apply Normalization
print("\n--- Normalization ---")
normalizer = MinMaxScaler()
print("Before Normalization:\n", df[columns_to_scale])

normalized_data = normalizer.fit_transform(df[columns_to_scale])
normalized_df = pd.DataFrame(normalized_data, columns=[f"{col}_Normalized" for col in columns_to_scale])
df = pd.concat([df, normalized_df], axis=1)
print("After Normalization:\n", normalized_df)



--- Normalization ---
Before Normalization:
    Value1  Value2
0      26    30.0
1      73    30.0
2      25    20.0
3      62    40.0
4      39    30.0
5       8    30.0
6      88    40.0
7      55    30.0
8      55    30.0
9      66    20.0
After Normalization:
    Value1_Normalized  Value2_Normalized
0             0.2250                0.5
1             0.8125                0.5
2             0.2125                0.0
3             0.6750                1.0
4             0.3875                0.5
5             0.0000                0.5
6             1.0000                1.0
7             0.5875                0.5
8             0.5875                0.5
9             0.7250                0.0
