## **Handle Categoricsl Data**

In [1]:
!pip install category_encoders



In [2]:
import pandas as pd
import numpy as np
import category_encoders as ce

In [3]:
data = pd.DataFrame({
    'gender' : ['Male', 'Female', 'Male', 'Female', 'Female'],
    'class' : ['A', 'B', 'C', 'D', 'A'],
    'city': ['Cairo', 'Giza', 'Cairo', 'Cairo', 'Giza']
})
data

Unnamed: 0,gender,class,city
0,Male,A,Cairo
1,Female,B,Giza
2,Male,C,Cairo
3,Female,D,Cairo
4,Female,A,Giza


### **1- One Hot Encoder (2 Category)**

In [4]:
one_hot_encoder = ce.OneHotEncoder(cols = ['gender', 'city'])

#use fit_transform one hot encoder on the data
new_data = one_hot_encoder.fit_transform(data)
new_data

Unnamed: 0,gender_1,gender_2,class,city_1,city_2
0,1,0,A,1,0
1,0,1,B,0,1
2,1,0,C,1,0
3,0,1,D,1,0
4,0,1,A,0,1


In [6]:
new_data = new_data.rename(columns = {'gender_1' : 'Male', "gender_2": "Female", 'city_1': 'Cairo', 
                                     'city_2': 'Giza'})

new_data

Unnamed: 0,Male,Female,class,Cairo,Giza
0,1,0,A,1,0
1,0,1,B,0,1
2,1,0,C,1,0
3,0,1,D,1,0
4,0,1,A,0,1


### **2- Binary Encoder**

In [7]:
new_data['class'].unique()

array(['A', 'B', 'C', 'D'], dtype=object)

In [9]:
binary_encoder = ce.BinaryEncoder(cols = ['class'])

#fit and transform data with binary encoder
new_data = binary_encoder.fit_transform(new_data)

new_data

Unnamed: 0,Male,Female,class_0,class_1,class_2,Cairo,Giza
0,1,0,0,0,1,1,0
1,0,1,0,1,0,0,1
2,1,0,0,1,1,1,0
3,0,1,1,0,0,1,0
4,0,1,0,0,1,0,1


In [18]:
data = pd.DataFrame({
    'gender': ['Male', 'Female', 'Male', 'Female', 'Female', 'Male'],
    'school': ['school1', 'school1', 'school2', 'school2', 'school3', 'school3'],
    'courses': ['AI', 'AI', 'Web', 'Flutter', 'Web', 'Cs'],
    'city': ['Delhi', 'Greece', 'Delhi', 'Delhi', 'Greece', 'Delhi']
})

In [19]:
data

Unnamed: 0,gender,school,courses,city
0,Male,school1,AI,Delhi
1,Female,school1,AI,Greece
2,Male,school2,Web,Delhi
3,Female,school2,Flutter,Delhi
4,Female,school3,Web,Greece
5,Male,school3,Cs,Delhi


In [20]:
#one hot encoding (gender, city)
one_hot_encoder = ce.OneHotEncoder(cols = ['gender', 'city'])
encoded_data = one_hot_encoder.fit_transform(data)
encoded_data

Unnamed: 0,gender_1,gender_2,school,courses,city_1,city_2
0,1,0,school1,AI,1,0
1,0,1,school1,AI,0,1
2,1,0,school2,Web,1,0
3,0,1,school2,Flutter,1,0
4,0,1,school3,Web,0,1
5,1,0,school3,Cs,1,0


In [21]:
encoded_data = encoded_data.rename(columns = {"gender_1" : 'Male', 'gender_2': 'Female', 
                                             'city_1' : 'Delhi', 'city_2': 'Greece'})
encoded_data

Unnamed: 0,Male,Female,school,courses,Delhi,Greece
0,1,0,school1,AI,1,0
1,0,1,school1,AI,0,1
2,1,0,school2,Web,1,0
3,0,1,school2,Flutter,1,0
4,0,1,school3,Web,0,1
5,1,0,school3,Cs,1,0


In [22]:
#apply binary encoder on school and courses
binary_encoder = ce.BinaryEncoder(cols = ['school', 'courses'])
#fit and transform
encoded_data = binary_encoder.fit_transform(encoded_data)
encoded_data

Unnamed: 0,Male,Female,school_0,school_1,courses_0,courses_1,courses_2,Delhi,Greece
0,1,0,0,1,0,0,1,1,0
1,0,1,0,1,0,0,1,0,1
2,1,0,1,0,0,1,0,1,0
3,0,1,1,0,0,1,1,1,0
4,0,1,1,1,0,1,0,0,1
5,1,0,1,1,1,0,0,1,0


### **3- Label Encoding**

In [23]:
#import label encoder
from sklearn.preprocessing import LabelEncoder

In [26]:
data = pd.DataFrame({
    'gender': ['Male', 'Female', 'Male', 'Female', 'Female', 'Male'],
    'school': ['school1', 'school1', 'school2', 'school2', 'school3', 'school3'],
    'courses': ['AI', 'AI', 'Web', 'Flutter', 'Web', 'Cs'],
    'city': ['Delhi', 'Greece', 'Delhi', 'Delhi', 'Greece', 'Delhi']
})
data

Unnamed: 0,gender,school,courses,city
0,Male,school1,AI,Delhi
1,Female,school1,AI,Greece
2,Male,school2,Web,Delhi
3,Female,school2,Flutter,Delhi
4,Female,school3,Web,Greece
5,Male,school3,Cs,Delhi


In [None]:
ai --> 0
web ---> 1
flutter ---> 2
cs ---> 3

In [87]:
#call label encoder from sklearn
label_encoder = LabelEncoder()

#fit tansform
encoded_data = label_encoder.fit_transform(data['school'])

In [88]:
encoded_data

array([0, 0, 1, 1, 2, 2])

In [89]:
encoded_data = pd.DataFrame(encoded_data)

In [90]:
encoded_data = encoded_data.rename(columns = {0: 'Encoded_school'})

In [91]:
encoded_data

Unnamed: 0,Encoded_school
0,0
1,0
2,1
3,1
4,2
5,2


In [92]:
encoded_data = pd.concat([data, encoded_data], axis = 1)

In [93]:
encoded_data

Unnamed: 0,gender,school,courses,city,Encoded_school
0,Male,school1,AI,Delhi,0
1,Female,school1,AI,Greece,0
2,Male,school2,Web,Delhi,1
3,Female,school2,Flutter,Delhi,1
4,Female,school3,Web,Greece,2
5,Male,school3,Cs,Delhi,2


In [94]:
encoded_data.drop(columns = "school", inplace = True)

In [95]:
encoded_data

Unnamed: 0,gender,courses,city,Encoded_school
0,Male,AI,Delhi,0
1,Female,AI,Greece,0
2,Male,Web,Delhi,1
3,Female,Flutter,Delhi,1
4,Female,Web,Greece,2
5,Male,Cs,Delhi,2


In [96]:
#call label encoder from sklearn
label_encoder = LabelEncoder()

#fit tansform
encoded_courses = label_encoder.fit_transform(data['courses'])
encoded_courses

array([0, 0, 3, 2, 3, 1])

In [97]:
encoded_courses = pd.DataFrame(encoded_courses)

In [98]:
encoded_courses = encoded_courses.rename(columns = {0: 'Encoded_courses'})
encoded_courses

Unnamed: 0,Encoded_courses
0,0
1,0
2,3
3,2
4,3
5,1


In [100]:
encoded_data = pd.concat([encoded_courses, encoded_data], axis = 1)
encoded_data

Unnamed: 0,Encoded_courses,gender,courses,city,Encoded_school
0,0,Male,AI,Delhi,0
1,0,Female,AI,Greece,0
2,3,Male,Web,Delhi,1
3,2,Female,Flutter,Delhi,1
4,3,Female,Web,Greece,2
5,1,Male,Cs,Delhi,2


In [101]:
encoded_data.drop(columns = "courses", inplace = True)

In [102]:
encoded_data

Unnamed: 0,Encoded_courses,gender,city,Encoded_school
0,0,Male,Delhi,0
1,0,Female,Greece,0
2,3,Male,Delhi,1
3,2,Female,Delhi,1
4,3,Female,Greece,2
5,1,Male,Delhi,2


### **4- Ordinal Encoding**

In [103]:
data = pd.DataFrame({
    'height': ['meduim', 'short', 'tall', 'meduim', 'short', 'tall', 'meduim', 'short', 'tall']
})
data

Unnamed: 0,height
0,meduim
1,short
2,tall
3,meduim
4,short
5,tall
6,meduim
7,short
8,tall


In [104]:
ordinal_encoding = ce.OrdinalEncoder(cols = ['height'],
                                    return_df = True,
                                    mapping = [{'col' : 'height',
                                               'mapping': {
                                                   'None' : 0,
                                                   'tall' : 1,
                                                   'meduim' : 2,
                                                   'short' :3
                                               }}])

In [105]:
data['encoding'] = ordinal_encoding.fit_transform(data)
data

Unnamed: 0,height,encoding
0,meduim,2
1,short,3
2,tall,1
3,meduim,2
4,short,3
5,tall,1
6,meduim,2
7,short,3
8,tall,1


In [106]:
data.drop('height', axis = 1, inplace = True )
data

Unnamed: 0,encoding
0,2
1,3
2,1
3,2
4,3
5,1
6,2
7,3
8,1


### **5- Target Encoding**

In [111]:
df = pd.DataFrame({
    'name' :['alex', 'john', 'mary','alex', 'john', 'mary'],
    'marks' : [100, 240, 307, 650, 170, 480]
})
df

Unnamed: 0,name,marks
0,alex,100
1,john,240
2,mary,307
3,alex,650
4,john,170
5,mary,480


In [112]:
#apply target encoding
target_encoder = ce.TargetEncoder(cols = 'name')

#fit transform target encoding
target_encoder.fit_transform(df['name'], df['marks'])

Unnamed: 0,name
0,331.663479
1,307.548798
2,334.287723
3,331.663479
4,307.548798
5,334.287723


### **6- Hash Encoding**

In [113]:
data = pd.DataFrame({
    'color' : ['blue', 'blue', 'green', 'black', 'blue', 'yellow', 'black', 'green']
})
data

Unnamed: 0,color
0,blue
1,blue
2,green
3,black
4,blue
5,yellow
6,black
7,green


In [114]:
data['color'].unique()

array(['blue', 'green', 'black', 'yellow'], dtype=object)

In [116]:
#fit & transform hash encoding
hash_encoder = ce.HashingEncoder(cols = ['color'], n_components = 4)
hash_encoder.fit_transform(data)

Unnamed: 0,col_0,col_1,col_2,col_3
0,0,0,0,1
1,0,0,0,1
2,1,0,0,0
3,0,1,0,0
4,0,0,0,1
5,0,1,0,0
6,0,1,0,0
7,1,0,0,0


## **Example**

In [203]:
data = pd.read_csv("C:/Users/HP/Downloads/Data (2).csv")

In [204]:
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,India,34.0,92000.0,Yes
1,Sri lanka,22.0,25000.0,Yes
2,China,31.0,74000.0,Yes
3,Sri lanka,29.0,,No
4,China,55.0,98000.0,Yes


In [205]:
data.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [206]:
new_df = data.copy()
new_df

Unnamed: 0,Country,Age,Salary,Purchased
0,India,34.0,92000.0,Yes
1,Sri lanka,22.0,25000.0,Yes
2,China,31.0,74000.0,Yes
3,Sri lanka,29.0,,No
4,China,55.0,98000.0,Yes
5,India,24.0,30000.0,No
6,Sri lanka,28.0,40000.0,No
7,India,,60000.0,No
8,China,51.0,89000.0,Yes
9,India,44.0,78000.0,Yes


In [207]:
new_df['Age'].fillna(new_df['Age'].mean(), inplace = True)

In [208]:
new_df['Salary'].fillna(new_df['Salary'].mean(), inplace = True)

In [209]:
#new_df.fillna(new_df.mean(), inplace = True)

In [210]:
new_df.isnull().sum()

Country      0
Age          0
Salary       0
Purchased    0
dtype: int64

In [211]:
new_df

Unnamed: 0,Country,Age,Salary,Purchased
0,India,34.0,92000.0,Yes
1,Sri lanka,22.0,25000.0,Yes
2,China,31.0,74000.0,Yes
3,Sri lanka,29.0,54857.142857,No
4,China,55.0,98000.0,Yes
5,India,24.0,30000.0,No
6,Sri lanka,28.0,40000.0,No
7,India,33.714286,60000.0,No
8,China,51.0,89000.0,Yes
9,India,44.0,78000.0,Yes


In [212]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [213]:
new_df['Country'].unique()

array(['India', 'Sri lanka', 'China'], dtype=object)

In [214]:
label_encoder = LabelEncoder()
encoded_data = label_encoder.fit_transform(new_df['Country'])

In [215]:
encoded_data

array([1, 2, 0, 2, 0, 1, 2, 1, 0, 1, 2, 0, 1, 1, 2])

In [216]:
#convert array ---> dataframe
encoded_data = pd.DataFrame(encoded_data)

In [217]:
encoded_data

Unnamed: 0,0
0,1
1,2
2,0
3,2
4,0
5,1
6,2
7,1
8,0
9,1


In [218]:
#rename column
encoded_data.rename(columns = {0: "Encoded_country"}, inplace = True)

In [219]:
encoded_data

Unnamed: 0,Encoded_country
0,1
1,2
2,0
3,2
4,0
5,1
6,2
7,1
8,0
9,1


In [220]:
new_df = pd.concat([new_df, encoded_data], axis = 1, join = 'inner')

In [221]:
new_df.head()

Unnamed: 0,Country,Age,Salary,Purchased,Encoded_country
0,India,34.0,92000.0,Yes,1
1,Sri lanka,22.0,25000.0,Yes,2
2,China,31.0,74000.0,Yes,0
3,Sri lanka,29.0,54857.142857,No,2
4,China,55.0,98000.0,Yes,0


In [222]:
new_df['Purchased'].value_counts()

Yes    9
No     6
Name: Purchased, dtype: int64

In [223]:
#call OneHotEncoder
one_hot_encoder = OneHotEncoder()

#fit & transform OneHotEncoder
encoded_purchased = one_hot_encoder.fit_transform(new_df[['Purchased']]).toarray()

In [224]:
encoded_purchased

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [225]:
encoded_purchased = pd.DataFrame(encoded_purchased)

In [226]:
encoded_purchased

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,0.0,1.0
5,1.0,0.0
6,1.0,0.0
7,1.0,0.0
8,0.0,1.0
9,0.0,1.0


In [227]:
encoded_purchased.rename(columns = {0: "Not Purchased", 1: 'Already Purchased'}, inplace = True)

In [228]:
encoded_purchased

Unnamed: 0,Not Purchased,Already Purchased
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,0.0,1.0
5,1.0,0.0
6,1.0,0.0
7,1.0,0.0
8,0.0,1.0
9,0.0,1.0


In [229]:
new_df = pd.concat([new_df, encoded_purchased], axis = 1, join = 'inner')
new_df.head()

Unnamed: 0,Country,Age,Salary,Purchased,Encoded_country,Not Purchased,Already Purchased
0,India,34.0,92000.0,Yes,1,0.0,1.0
1,Sri lanka,22.0,25000.0,Yes,2,0.0,1.0
2,China,31.0,74000.0,Yes,0,0.0,1.0
3,Sri lanka,29.0,54857.142857,No,2,1.0,0.0
4,China,55.0,98000.0,Yes,0,0.0,1.0


In [230]:
new_df.drop(columns = ['Country', 'Purchased'], inplace = True)

In [231]:
new_df.head()

Unnamed: 0,Age,Salary,Encoded_country,Not Purchased,Already Purchased
0,34.0,92000.0,1,0.0,1.0
1,22.0,25000.0,2,0.0,1.0
2,31.0,74000.0,0,0.0,1.0
3,29.0,54857.142857,2,1.0,0.0
4,55.0,98000.0,0,0.0,1.0


## **Feature Scaling**

### **1- Min-Max Scaler**

Min-Max Scaling (Normalization): This method scales the features to a fixed range, usually between 0 and 1. 

The formula for min-max scaling is:
X_scaled = (X - X_min) / (X_max - X_min) 

where X is the original feature value, X_min is the minimum value of the feature, and X_max is the maximum value of the feature.

In [None]:
8,9,3,4,2
8 ---> (8-2)/ (9-2)
6/7 = 0.85 -->(0,1)

In [232]:
from sklearn.preprocessing import MinMaxScaler

In [233]:
data = np.array([[10,1000], [5,500], [3,300], [8,800]])
data

array([[  10, 1000],
       [   5,  500],
       [   3,  300],
       [   8,  800]])

In [234]:
#min max scaler
min_max_sacler = MinMaxScaler()
#fit & transform MinMaxScaler on the data
data_Scaled = min_max_sacler.fit_transform(data)

In [235]:
data_Scaled

array([[1.        , 1.        ],
       [0.28571429, 0.28571429],
       [0.        , 0.        ],
       [0.71428571, 0.71428571]])

### **2- Standardization**

In [236]:
from sklearn.preprocessing import StandardScaler

In [237]:
#call StandardScaler
scaler = StandardScaler()

#fit & transform StandardScaler
scaled_data = scaler.fit_transform(data)

In [238]:
scaled_data

array([[ 1.29986737,  1.29986737],
       [-0.55708601, -0.55708601],
       [-1.29986737, -1.29986737],
       [ 0.55708601,  0.55708601]])

In [239]:
# inverse transform to go to the original data
scaled_data = scaler.inverse_transform(scaled_data)

In [240]:
scaled_data

array([[  10., 1000.],
       [   5.,  500.],
       [   3.,  300.],
       [   8.,  800.]])

### **3- Max Abs Scaling**

Max Abs Scaling
Similar to Min-Max Scaling, but instead of scaling to a specific range, it scales the data to the absolute maximum value, preserving the sign of the original data. The formula is:

X_scaled = X / max(abs(X))

In [242]:
from sklearn.preprocessing import MaxAbsScaler

In [243]:
#call MaxAbsScaler
scaler = MaxAbsScaler()

#fit &transform on the data
scaled_data = scaler.fit_transform(data)

scaled_data

array([[1. , 1. ],
       [0.5, 0.5],
       [0.3, 0.3],
       [0.8, 0.8]])

### **4- Robust Scaler**

In [244]:
from sklearn.preprocessing import RobustScaler

In [246]:
scaler = RobustScaler()
scaled_data = scaler.fit_transform(data)
scaled_data

array([[ 0.875,  0.875],
       [-0.375, -0.375],
       [-0.875, -0.875],
       [ 0.375,  0.375]])

### **5- Power Transformation**

In [248]:
from sklearn.preprocessing import PowerTransformer

In [249]:
scaler = PowerTransformer()
scaled_data = scaler.fit_transform(data)
scaled_data

array([[ 1.2468821 ,  1.23763356],
       [-0.49225774, -0.47854137],
       [-1.35665849, -1.36725832],
       [ 0.60203413,  0.60816613]])

### **6- Mean Normalization**

In [250]:
mean_data = np.mean(data)

In [251]:
mean_data

328.25

In [253]:
scaled_data = data-mean_data
scaled_data

array([[-318.25,  671.75],
       [-323.25,  171.75],
       [-325.25,  -28.25],
       [-320.25,  471.75]])

## **Example**

In [None]:
1- read the dataset
2- info
3- describe
4- unique

minmax scaler
standard scaler
max abs scaler
robust scaler
power transformation
mean normalization

In [254]:
data = pd.read_csv("SampleFile.csv")
data.head()

Unnamed: 0,LotArea,MSSubClass
0,8450,60
1,9600,20
2,11250,60
3,9550,70
4,14260,60


In [255]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   LotArea     1460 non-null   int64
 1   MSSubClass  1460 non-null   int64
dtypes: int64(2)
memory usage: 22.9 KB


In [256]:
data['LotArea'].unique()

array([ 8450,  9600, 11250, ..., 17217, 13175,  9717], dtype=int64)

In [257]:
data['MSSubClass'].unique()

array([ 60,  20,  70,  50, 190,  45,  90, 120,  30,  85,  80, 160,  75,
       180,  40], dtype=int64)

In [258]:
max_values = np.max(data)
max_values

  return reduction(axis=axis, out=out, **passkwargs)


LotArea       215245
MSSubClass       190
dtype: int64

In [259]:
min_values = np.min(data)
min_values

  return reduction(axis=axis, out=out, **passkwargs)


LotArea       1300
MSSubClass      20
dtype: int64

In [260]:
# 1- min max scaler
min_max_scaler = MinMaxScaler()

min_max_scaler_data = min_max_scaler.fit_transform(data)

In [261]:
min_max_scaler_data

array([[0.0334198 , 0.23529412],
       [0.03879502, 0.        ],
       [0.04650728, 0.23529412],
       ...,
       [0.03618687, 0.29411765],
       [0.03934189, 0.        ],
       [0.04037019, 0.        ]])

In [262]:
min_max_scaler_data = pd.DataFrame(min_max_scaler_data, columns = data.columns)
min_max_scaler_data

Unnamed: 0,LotArea,MSSubClass
0,0.033420,0.235294
1,0.038795,0.000000
2,0.046507,0.235294
3,0.038561,0.294118
4,0.060576,0.235294
...,...,...
1455,0.030929,0.235294
1456,0.055505,0.000000
1457,0.036187,0.294118
1458,0.039342,0.000000


In [263]:
max_value_minmax = np.max(min_max_scaler_data)
max_value_minmax

  return reduction(axis=axis, out=out, **passkwargs)


LotArea       1.0
MSSubClass    1.0
dtype: float64

In [264]:
min_value_minmax = np.min(min_max_scaler_data)
min_value_minmax

  return reduction(axis=axis, out=out, **passkwargs)


LotArea       0.0
MSSubClass    0.0
dtype: float64

In [265]:
# 2- Standard scaler
std_scaler = StandardScaler()
std_scaler_data = std_scaler.fit_transform(data)
std_scaler_data

array([[-0.20714171,  0.07337496],
       [-0.09188637, -0.87256276],
       [ 0.07347998,  0.07337496],
       ...,
       [-0.14781027,  0.30985939],
       [-0.08016039, -0.87256276],
       [-0.05811155, -0.87256276]])

In [266]:
std_scaler_data = pd.DataFrame(std_scaler_data, columns = data.columns)
std_scaler_data

Unnamed: 0,LotArea,MSSubClass
0,-0.207142,0.073375
1,-0.091886,-0.872563
2,0.073480,0.073375
3,-0.096897,0.309859
4,0.375148,0.073375
...,...,...
1455,-0.260560,0.073375
1456,0.266407,-0.872563
1457,-0.147810,0.309859
1458,-0.080160,-0.872563


In [267]:
print(np.max(std_scaler_data))

LotArea       20.518273
MSSubClass     3.147673
dtype: float64


  return reduction(axis=axis, out=out, **passkwargs)


In [268]:
print(np.min(std_scaler_data))

LotArea      -0.923729
MSSubClass   -0.872563
dtype: float64


  return reduction(axis=axis, out=out, **passkwargs)


In [270]:
# 3- max Abs scaler
max_abs_scaler = MaxAbsScaler()
max_abs_scaler_data = max_abs_scaler.fit_transform(data)
max_abs_scaler_data = pd.DataFrame(max_abs_scaler_data, columns = data.columns)
max_abs_scaler_data.head()

Unnamed: 0,LotArea,MSSubClass
0,0.039258,0.315789
1,0.0446,0.105263
2,0.052266,0.315789
3,0.044368,0.368421
4,0.06625,0.315789


In [271]:
print(np.max(max_abs_scaler_data))

LotArea       1.0
MSSubClass    1.0
dtype: float64


  return reduction(axis=axis, out=out, **passkwargs)


In [272]:
print(np.min(max_abs_scaler_data))

LotArea       0.006040
MSSubClass    0.105263
dtype: float64


  return reduction(axis=axis, out=out, **passkwargs)


In [273]:
# 4- Robust Scaler
robust_scaler = RobustScaler()
robust_scaler_data = robust_scaler.fit_transform(data)
robust_scaler_data = pd.DataFrame(robust_scaler_data, columns = data.columns)
robust_scaler_data.head()

Unnamed: 0,LotArea,MSSubClass
0,-0.254076,0.2
1,0.030015,-0.6
2,0.437624,0.2
3,0.017663,0.4
4,1.181201,0.2


In [274]:
print(np.max(robust_scaler_data))

LotArea       50.831645
MSSubClass     2.800000
dtype: float64


  return reduction(axis=axis, out=out, **passkwargs)


In [275]:
print(np.min(robust_scaler_data))

LotArea      -2.02038
MSSubClass   -0.60000
dtype: float64


  return reduction(axis=axis, out=out, **passkwargs)


In [276]:
# 5- Power Transformer
power_transformer = PowerTransformer()
power_transformer_data = power_transformer.fit_transform(data)
power_transformer_data = pd.DataFrame(power_transformer_data, columns = data.columns)
power_transformer_data.head()

Unnamed: 0,LotArea,MSSubClass
0,-0.141171,0.49346
1,0.105609,-1.164269
2,0.413721,0.49346
3,0.09549,0.698191
4,0.877127,0.49346


In [277]:
print(np.max(power_transformer_data))

LotArea       6.431170
MSSubClass    1.871536
dtype: float64


  return reduction(axis=axis, out=out, **passkwargs)


In [278]:
print(np.min(power_transformer_data))

LotArea      -3.651210
MSSubClass   -1.164269
dtype: float64


  return reduction(axis=axis, out=out, **passkwargs)


In [279]:
# 6- Mean Normalization
mean_data = np.mean(data)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [280]:
mean_data

LotArea       10516.828082
MSSubClass       56.897260
dtype: float64

In [281]:
normalized_data = data - mean_data

In [282]:
normalized_data

Unnamed: 0,LotArea,MSSubClass
0,-2066.828082,3.10274
1,-916.828082,-36.89726
2,733.171918,3.10274
3,-966.828082,13.10274
4,3743.171918,3.10274
...,...,...
1455,-2599.828082,3.10274
1456,2658.171918,-36.89726
1457,-1474.828082,13.10274
1458,-799.828082,-36.89726


In [None]:
Preprocessing:
    1- Handle Missing Values
    2- Handle Categorical data
    3- Scaling Data
    4- Remove Duplicates records
    5- Handle/Remove Outliers
    6- Handle Imbalanced data
    7- Split data into train and test dataset