In [1]:
import pandas as pd
import numpy as np #mathematical library

In [2]:
# Load the dataset from 'supershops.csv'; the relative path is resolved against the
# current working directory, so the file must sit next to the notebook when it runs.
df = pd.read_csv('supershops.csv')

In [3]:
# Preview the first rows of the raw data before any transformation is applied.
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


1. Normalization
2. Standardization
3. Log Transformation
4. Robust Scaler
5. Max Absolute Scaler


How pandas DataFrames handle Python data types

<img src="https://raw.githubusercontent.com/mszahid/class/main/Class%2005%20-%20Feature%20Transform/data_types.png" style="width:780px; height:300px">
For more details: https://pbpython.com/pandas_dtypes.html

First, we have to identify the numerical columns and the string columns.

In [4]:
# Identify the columns with string values (selected by the 'object' dtype).
string_cols = df.select_dtypes(include=['object']).columns

# Identify the columns with numerical values.
# 'number' selects every numeric dtype (float64 and int64, but also int32, float32, ...),
# so a numeric column is not silently left out of the transformations below just
# because it was stored with a different width.
numerical_cols = df.select_dtypes(include='number').columns

# Normalization

In [5]:
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Work on a copy so the original DataFrame `df` keeps its raw values.
df_normalization = df.copy()

# Min-max scaler used by the normalization cell below.
# feature_range=(0, 1) is the default; it is spelled out here for readability.
scaler = MinMaxScaler(feature_range=(0, 1))

# Perform Normalization using loop

In [7]:
# Loop over the numerical columns and collect the ones that still need scaling.
# String columns are not part of numerical_cols, so they are never touched.
cols_to_normalize = []
for col in numerical_cols:
    col_min = df_normalization[col].min()
    col_max = df_normalization[col].max()

    # A column whose values already lie inside [0, 1] is treated as normalized
    # and skipped (this also makes re-running the cell harmless).
    if col_min >= 0 and col_max <= 1:
        continue

    cols_to_normalize.append(col)

# Fit the scaler once on all selected columns. MinMaxScaler scales each column
# independently, so the values are the same as fitting column by column, but the
# fitted `scaler` now remembers every column instead of only the last one
# (needed, for example, to call scaler.inverse_transform later).
if cols_to_normalize:
    df_normalization[cols_to_normalize] = scaler.fit_transform(
        df_normalization[cols_to_normalize]
    )

# Print the first five rows of the normalized DataFrame.
df_normalization.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.651744,1.0,Dhaka,1.0
1,0.983359,0.761972,0.940893,Ctg,0.997355
2,0.927985,0.379579,0.864664,Rangpur,0.993178
3,0.873136,0.512998,0.812235,Dhaka,0.947292
4,0.859438,0.305328,0.776136,Rangpur,0.853171


# Standardization

In [8]:
from sklearn.preprocessing import StandardScaler

# Independent copy of `df`, so standardization never alters the raw data.
df_standardized = df.copy()

# Scaler object that the next cell fits and applies to the numerical columns.
standard_scaler = StandardScaler()

In [9]:
# numerical_cols was built from numeric dtypes only, so it cannot contain a string
# column and no per-column type check is needed.
cols_to_standardize = list(numerical_cols)

# Fit once on all numerical columns. StandardScaler works column by column, so the
# values match a per-column fit, and `standard_scaler` keeps the fitted statistics
# of every column instead of being overwritten on each loop iteration.
if cols_to_standardize:
    df_standardized[cols_to_standardize] = standard_scaler.fit_transform(
        df_standardized[cols_to_standardize]
    )

# Print the first five rows of the standardized DataFrame.
df_standardized.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.897913,0.560753,2.165287,Dhaka,2.011203
1,1.95586,1.082807,1.929843,Ctg,1.99943
2,1.754364,-0.728257,1.626191,Rangpur,1.980842
3,1.554784,-0.096365,1.417348,Dhaka,1.776627
4,1.504937,-1.079919,1.27355,Rangpur,1.35774


# Log Transformation

In [10]:
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html

In [11]:
from sklearn.preprocessing import FunctionTransformer

# Independent copy of `df` that will receive the log-transformed values.
df_log_transformer = df.copy()

# Wrap np.log1p as a transformer. np.log1p computes log(1 + x), so a zero
# input maps to 0 instead of the -inf that a plain log would produce.
log_transformer = FunctionTransformer(func=np.log1p)

In [12]:
# numerical_cols was built from numeric dtypes only, so it cannot contain a string
# column and no per-column type check is needed.
cols_to_log = list(numerical_cols)

# The log transformation is element-wise, so all numerical columns can be
# transformed in a single call instead of one column at a time.
if cols_to_log:
    df_log_transformer[cols_to_log] = log_transformer.transform(
        df_log_transformer[cols_to_log]
    )

# Print the first five rows of the log-transformed DataFrame.
df_log_transformer.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,11.648545,11.826997,13.064279,Dhaka,12.166619
1,11.99904,11.927539,13.003354,Ctg,12.164172
2,11.941081,11.524326,12.918864,Rangpur,12.160298
3,11.880158,11.684126,12.856314,Dhaka,12.116711
4,11.864345,11.422922,12.810851,Rangpur,12.020881


# Robust Scaler

In [13]:
from sklearn.preprocessing import RobustScaler

# Independent copy of `df`, named df_robust, that will hold the robust-scaled values.
df_robust = df.copy()

# Scaler object that the next cell fits and applies to the numerical columns.
robust_scaler = RobustScaler()

In [14]:
# numerical_cols was built from numeric dtypes only, so it cannot contain a string
# column and no per-column type check is needed.
cols_to_robust_scale = list(numerical_cols)

# Fit once on all numerical columns. RobustScaler works column by column, so the
# values match a per-column fit, and `robust_scaler` keeps the fitted statistics
# of every column instead of being overwritten on each loop iteration.
if cols_to_robust_scale:
    df_robust[cols_to_robust_scale] = robust_scaler.fit_transform(
        df_robust[cols_to_robust_scale]
    )

# Print the first five rows of the robust-scaled DataFrame.
df_robust.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.67253,0.345355,1.552016,Dhaka,1.69834
1,1.452113,0.697565,1.383714,Ctg,1.688874
2,1.303634,-0.52429,1.166654,Rangpur,1.673929
3,1.156567,-0.097977,1.017368,Dhaka,1.509736
4,1.119836,-0.761543,0.914576,Rangpur,1.172943


# Max Absolute Scaler

In [15]:
from sklearn.preprocessing import MaxAbsScaler

# Independent copy of `df`, so max-absolute scaling never alters the raw data.
df_maxabs = df.copy()

# Scaler object that the next cell fits and applies to the numerical columns.
maxabs_scaler = MaxAbsScaler()

In [16]:
# numerical_cols was built from numeric dtypes only, so it cannot contain a string
# column and no per-column type check is needed.
cols_to_maxabs_scale = list(numerical_cols)

# Fit once on all numerical columns. MaxAbsScaler works column by column, so the
# values match a per-column fit, and `maxabs_scaler` keeps the fitted statistics
# of every column instead of being overwritten on each loop iteration.
if cols_to_maxabs_scale:
    df_maxabs[cols_to_maxabs_scale] = maxabs_scaler.fit_transform(
        df_maxabs[cols_to_maxabs_scale]
    )

# Print the first five rows of the max-absolute-scaled DataFrame.
df_maxabs.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.749527,1.0,Dhaka,1.0
1,0.983359,0.828805,0.940893,Ctg,0.997557
2,0.927985,0.553781,0.864664,Rangpur,0.993699
3,0.873136,0.649738,0.812235,Dhaka,0.951317
4,0.859438,0.500378,0.776136,Rangpur,0.864383
