In [3]:
# Data Transformation

import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Small demo frame: two numeric columns whose value ranges differ by a factor
# of ten, which makes the effect of the scaling cells easy to see.
# NOTE(review): the column labels are spelled inconsistently ('Feature1' vs
# 'Feature 2'); they are kept as-is because the recorded outputs use them.
data = {
    "Feature1": list(range(10, 60, 10)),       # 10, 20, 30, 40, 50
    "Feature 2": list(range(100, 600, 100)),   # 100, 200, 300, 400, 500
}
df = pd.DataFrame(data=data)
df  # bare last expression: shown as the cell's rich output in a notebook

Unnamed: 0,Feature1,Feature 2
0,10,100
1,20,200
2,30,300
3,40,400
4,50,500


In [11]:
# Min-Max scaling: rescale each column linearly so that its smallest value
# becomes 0 and its largest becomes 1 (MinMaxScaler's default range).
scaler = MinMaxScaler()
# By default fit_transform hands back a bare NumPy array, so the labels must be
# re-attached. Passing the index as well as the columns keeps every scaled row
# aligned with its source row in `df`, even if `df` is later given a
# non-default index (filtered, sorted, date-indexed, ...).
df_min_max_scaled = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)
print("Min-Max Scaled Data:\n",df_min_max_scaled)

Min-Max Scaled Data:
    Feature1  Feature 2
0      0.00       0.00
1      0.25       0.25
2      0.50       0.50
3      0.75       0.75
4      1.00       1.00


In [19]:
# Standardization: centre each column on 0 and divide by its standard
# deviation, so every column ends up with mean 0 and unit variance.
# NOTE: this rebinds `scaler`, which previously held the MinMaxScaler.
scaler = StandardScaler()
# fit_transform returns a bare NumPy array by default; re-attach both the
# column labels and the row index so the result stays aligned with `df`
# regardless of what index `df` carries.
df_standard_scaled = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)
print("Standardized Data:\n",df_standard_scaled)

Standardized Data:
    Feature1  Feature 2
0 -1.414214  -1.414214
1 -0.707107  -0.707107
2  0.000000   0.000000
3  0.707107   0.707107
4  1.414214   1.414214


In [21]:
# Example

from sklearn.preprocessing import LabelEncoder

# Categorical demo frame for the encoding cells: a single text column of
# city names, one row per city.
# NOTE(review): this rebinds `data`, which earlier held the numeric feature dict.
# NOTE(review): 'Phoneix' is presumably a misspelling of 'Phoenix'; it is kept
# unchanged here because the recorded outputs below echo this exact string.
data = {
    "City": [
        "New York",
        "Los Angeles",
        "Chicago",
        "Houston",
        "Phoneix",
    ],
}
df1 = pd.DataFrame(data=data)
df1  # bare last expression: shown as the cell's rich output in a notebook

Unnamed: 0,City
0,New York
1,Los Angeles
2,Chicago
3,Houston
4,Phoneix


In [23]:
# Label Encoding: map every distinct city name to an integer code.
label_encoder = LabelEncoder()
city_codes = label_encoder.fit_transform(df1["City"])
# The codes are written into df1 itself as a new column; re-running the cell
# simply overwrites that same column.
df1["City_Label_Encoded"] = city_codes
print("Label Encoded Data:\n", df1)

Label Encoded Data:
           City  City_Label_Encoded
0     New York                   3
1  Los Angeles                   2
2      Chicago                   0
3      Houston                   1
4      Phoneix                   4


In [25]:
# One-Hot Encoding Using Pandas: build one indicator column per distinct
# city, each named 'City_<value>'.
df_one_hot = pd.get_dummies(df1["City"], prefix="City")
# Glue the indicator columns onto the right-hand side of df1 (rows are
# matched on the shared index); df1 itself is not modified.
df2 = pd.concat(
    objs=[df1, df_one_hot],
    axis="columns",
)
print("One-Hot Encoded Data:\n", df2)

One-Hot Encoded Data:
           City  City_Label_Encoded  City_Chicago  City_Houston  \
0     New York                   3         False         False   
1  Los Angeles                   2         False         False   
2      Chicago                   0          True         False   
3      Houston                   1         False          True   
4      Phoneix                   4         False         False   

   City_Los Angeles  City_New York  City_Phoneix  
0             False           True         False  
1              True          False         False  
2             False          False         False  
3             False          False         False  
4             False          False          True  


In [37]:
# Exercise: Scaling!

import pandas as pd

# Exercise dataset, written one person per row: (name, age, income in $).
people = [
    ("Alice", 25, 50000),
    ("Bob", 45, 120000),
    ("Charlie", 35, 80000),
    ("David", 50, 150000),
    ("Eve", 23, 45000),
    ("Frank", 52, 130000),
    ("Grace", 30, 70000),
    ("Heidi", 40, 110000),
    ("Ivan", 48, 140000),
    ("Judy", 28, 65000),
]

# Transpose the rows into the column-oriented dict-of-lists that `data1` holds.
data1 = {
    label: list(values)
    for label, values in zip(("Name", "Age", "Income($)"), zip(*people))
}

# Build the DataFrame used by the rest of the exercise.
df_e = pd.DataFrame(data=data1)

# Show the full (ten-row) dataset as plain text.
print(df_e)

      Name  Age  Income($)
0    Alice   25      50000
1      Bob   45     120000
2  Charlie   35      80000
3    David   50     150000
4      Eve   23      45000
5    Frank   52     130000
6    Grace   30      70000
7    Heidi   40     110000
8     Ivan   48     140000
9     Judy   28      65000


In [61]:
# Min-Max scaling of the exercise data: rescale each numeric column to the
# 0..1 range. 'Name' is text, so only the numeric columns are selected.
numeric_cols = ['Age', 'Income($)']  # single source of truth for selection and labels
scaler = MinMaxScaler()
# fit_transform returns a bare NumPy array by default, so re-attach the column
# labels and the original row index; keeping df_e's index means the scaled
# rows can be joined back to df_e (e.g. next to 'Name') without misalignment.
df_min_max_scaled1 = pd.DataFrame(
    scaler.fit_transform(df_e[numeric_cols]),
    columns=numeric_cols,
    index=df_e.index,
)
print("Min-Max Scaled Data:\n",df_min_max_scaled1)

Min-Max Scaled Data:
         Age  Income($)
0  0.068966   0.047619
1  0.758621   0.714286
2  0.413793   0.333333
3  0.931034   1.000000
4  0.000000   0.000000
5  1.000000   0.809524
6  0.241379   0.238095
7  0.586207   0.619048
8  0.862069   0.904762
9  0.172414   0.190476


In [65]:
# Standardization of the exercise data: centre each numeric column on 0 and
# scale it to unit variance. 'Name' is text, so it is left out.
# NOTE(review): this rebinds `df_standard_scaled`, which an earlier cell used
# for the scaled version of `df`; the name is kept here for compatibility, but
# a distinct name (cf. `df_min_max_scaled1`) would avoid the hidden overwrite.
numeric_cols = ['Age', 'Income($)']  # single source of truth for selection and labels
scaler = StandardScaler()
# fit_transform returns a bare NumPy array by default; re-attach the column
# labels and df_e's row index so the result stays aligned with df_e.
df_standard_scaled = pd.DataFrame(
    scaler.fit_transform(df_e[numeric_cols]),
    columns=numeric_cols,
    index=df_e.index,
)
print("Standardized Data:\n",df_standard_scaled)

Standardized Data:
         Age  Income($)
0 -1.224745  -1.257093
1  0.719295   0.655875
2 -0.252725  -0.437250
3  1.205304   1.475718
4 -1.419149  -1.393734
5  1.399708   0.929156
6 -0.738735  -0.710531
7  0.233285   0.382594
8  1.010901   1.202437
9 -0.933139  -0.847172


In [49]:
# Label Encoding of the income column: each distinct income value is replaced
# by an integer code.
# NOTE(review): 'Income($)' is numeric, and every value in this dataset is
# distinct, so the codes amount to the rank of each income (0 = lowest).
label_encoder = LabelEncoder()
income_codes = label_encoder.fit_transform(df_e["Income($)"])
# The codes are stored on df_e itself as an extra column, so cells that run
# afterwards see a four-column df_e.
df_e["Income_Label_Encoded"] = income_codes
print("Label Encoded Data:\n", df_e)

Label Encoded Data:
       Name  Age  Income($)  Income_Label_Encoded
0    Alice   25      50000                     1
1      Bob   45     120000                     6
2  Charlie   35      80000                     4
3    David   50     150000                     9
4      Eve   23      45000                     0
5    Frank   52     130000                     7
6    Grace   30      70000                     3
7    Heidi   40     110000                     5
8     Ivan   48     140000                     8
9     Judy   28      65000                     2


In [51]:
# One-Hot Encoding Using Pandas: one indicator column per distinct income
# value, each named 'Income($)_<value>'.
# NOTE(review): this rebinds `df_one_hot` and `df2`, which the city example
# above also assigns; after this cell they refer to the income version.
df_one_hot = pd.get_dummies(df_e["Income($)"], prefix="Income($)")
# Attach the indicator columns to the right of df_e's existing columns (rows
# are matched on the shared index); df_e itself is not modified.
df2 = pd.concat(
    objs=[df_e, df_one_hot],
    axis="columns",
)
print("One-Hot Encoded Data:\n", df2)

One-Hot Encoded Data:
       Name  Age  Income($)  Income_Label_Encoded  Income($)_45000  \
0    Alice   25      50000                     1            False   
1      Bob   45     120000                     6            False   
2  Charlie   35      80000                     4            False   
3    David   50     150000                     9            False   
4      Eve   23      45000                     0             True   
5    Frank   52     130000                     7            False   
6    Grace   30      70000                     3            False   
7    Heidi   40     110000                     5            False   
8     Ivan   48     140000                     8            False   
9     Judy   28      65000                     2            False   

   Income($)_50000  Income($)_65000  Income($)_70000  Income($)_80000  \
0             True            False            False            False   
1            False            False            False            False  