# Data Transformation Techniques

In [1]:
#import packages
import pandas as pd
from sklearn import preprocessing

In [2]:
#import dataset
df = pd.read_csv('Life Expectancy Data.csv')

In [14]:
#view data
df.head(10)

Unnamed: 0,infant deaths
0,62
1,64
2,66
3,69
4,71
5,74
6,77
7,80
8,82
9,84


## Normalization

###  Maximum Absolute Scaling

<img src="files/max.png">

####  Using pandas

In [4]:
# copy the data 
df_maxabs_scaled = df.copy() 

#drop first three columns as they contain categorical values
df_maxabs_scaled.drop(df_maxabs_scaled.columns[[0, 1, 2]], axis = 1, inplace = True) 
  
# apply max absolute scaling 
for column in df_maxabs_scaled.columns: 
    df_maxabs_scaled[column] = df_maxabs_scaled[column]  / df_maxabs_scaled[column].abs().max() 
      
# view normalized data 
df_maxabs_scaled.head(10)


Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,0.730337,0.363762,0.034444,0.00056,0.003659,0.656566,0.005439,0.218786,0.0332,0.060606,0.463636,0.656566,0.001976,0.004903,0.026074,0.620939,0.604895,0.505274,0.487923
1,0.673034,0.374827,0.035556,0.00056,0.003774,0.626263,0.002319,0.213058,0.0344,0.585859,0.464773,0.626263,0.001976,0.005141,0.000253,0.631769,0.611888,0.50211,0.483092
2,0.673034,0.370678,0.036667,0.00056,0.003759,0.646465,0.002027,0.207331,0.0356,0.626263,0.461932,0.646465,0.001976,0.005301,0.024525,0.638989,0.618881,0.495781,0.478261
3,0.668539,0.37621,0.038333,0.00056,0.004014,0.676768,0.013135,0.201604,0.0372,0.676768,0.484091,0.676768,0.001976,0.005622,0.002857,0.646209,0.629371,0.488397,0.47343
4,0.665169,0.38036,0.039444,0.00056,0.000364,0.686869,0.0142,0.197022,0.0388,0.686869,0.447159,0.686869,0.001976,0.000533,0.002302,0.65704,0.636364,0.478903,0.458937
5,0.660674,0.385892,0.041111,0.00056,0.00409,0.666667,0.009374,0.191294,0.0408,0.666667,0.522727,0.666667,0.001976,0.004643,0.002228,0.66426,0.643357,0.472574,0.444444
6,0.658427,0.388658,0.042778,0.00056,0.002914,0.636364,0.013484,0.185567,0.0424,0.636364,0.535227,0.636364,0.001976,0.003742,0.00022,0.67148,0.653846,0.457806,0.429952
7,0.652809,0.396957,0.044444,0.001679,0.001328,0.646465,0.007536,0.17984,0.044,0.646465,0.473295,0.646465,0.001976,0.003133,0.00211,0.6787,0.660839,0.456751,0.42029
8,0.646067,0.408022,0.045556,0.001119,0.00056,0.636364,0.005377,0.174112,0.0452,0.636364,0.382386,0.636364,0.001976,0.003103,0.020572,0.685921,0.667832,0.437764,0.405797
9,0.64382,0.408022,0.046667,0.001679,0.000881,0.646465,0.009379,0.168385,0.0464,0.585859,0.422159,0.585859,0.001976,0.002287,0.002001,0.693141,0.674825,0.427215,0.391304


#### Using sklearn

In [5]:
# copy the data 
df_maxabs_scaled = df.copy() 

#drop first three columns as they contain categorical values
df_maxabs_scaled.drop(df_maxabs_scaled.columns[[0, 1, 2]], axis = 1, inplace = True) 

# create an max_abs object from MaxAbsScaler class of sklearn package
max_abs = preprocessing.MaxAbsScaler()

# transform the data using the method
df_scaled = max_abs.fit_transform(df_maxabs_scaled)

# store the results in a data frame
df_scaled = pd.DataFrame(df_scaled, columns=df_maxabs_scaled.columns)

# visualize the scaled data frame
df_scaled.head(10)

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,0.730337,0.363762,0.034444,0.00056,0.003659,0.656566,0.005439,0.218786,0.0332,0.060606,0.463636,0.656566,0.001976,0.004903,0.026074,0.620939,0.604895,0.505274,0.487923
1,0.673034,0.374827,0.035556,0.00056,0.003774,0.626263,0.002319,0.213058,0.0344,0.585859,0.464773,0.626263,0.001976,0.005141,0.000253,0.631769,0.611888,0.50211,0.483092
2,0.673034,0.370678,0.036667,0.00056,0.003759,0.646465,0.002027,0.207331,0.0356,0.626263,0.461932,0.646465,0.001976,0.005301,0.024525,0.638989,0.618881,0.495781,0.478261
3,0.668539,0.37621,0.038333,0.00056,0.004014,0.676768,0.013135,0.201604,0.0372,0.676768,0.484091,0.676768,0.001976,0.005622,0.002857,0.646209,0.629371,0.488397,0.47343
4,0.665169,0.38036,0.039444,0.00056,0.000364,0.686869,0.0142,0.197022,0.0388,0.686869,0.447159,0.686869,0.001976,0.000533,0.002302,0.65704,0.636364,0.478903,0.458937
5,0.660674,0.385892,0.041111,0.00056,0.00409,0.666667,0.009374,0.191294,0.0408,0.666667,0.522727,0.666667,0.001976,0.004643,0.002228,0.66426,0.643357,0.472574,0.444444
6,0.658427,0.388658,0.042778,0.00056,0.002914,0.636364,0.013484,0.185567,0.0424,0.636364,0.535227,0.636364,0.001976,0.003742,0.00022,0.67148,0.653846,0.457806,0.429952
7,0.652809,0.396957,0.044444,0.001679,0.001328,0.646465,0.007536,0.17984,0.044,0.646465,0.473295,0.646465,0.001976,0.003133,0.00211,0.6787,0.660839,0.456751,0.42029
8,0.646067,0.408022,0.045556,0.001119,0.00056,0.636364,0.005377,0.174112,0.0452,0.636364,0.382386,0.636364,0.001976,0.003103,0.020572,0.685921,0.667832,0.437764,0.405797
9,0.64382,0.408022,0.046667,0.001679,0.000881,0.646465,0.009379,0.168385,0.0464,0.585859,0.422159,0.585859,0.001976,0.002287,0.002001,0.693141,0.674825,0.427215,0.391304


###  MinMax Scaling

<img src="files/min.png">

#### Using pandas

In [6]:
# copy the data 
df_minmax = df.copy()

#drop first three columns as they contain categorical values
df_minmax.drop(df_minmax.columns[[0, 1, 2]], axis = 1, inplace = True) 
  
# apply normalization techniques 
for column in df_minmax.columns: 
    df_minmax[column] = (df_minmax[column] - df_minmax[column].min()) / (df_minmax[column].max() - df_minmax[column].min())     

# view normalized data 
df_minmax.head(10)

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,0.544592,0.362881,0.034444,0.0,0.003659,0.653061,0.005439,0.209733,0.0332,0.03125,0.452118,0.649485,0.0,0.004889,0.026074,0.619565,0.603509,0.505274,0.487923
1,0.447818,0.373961,0.035556,0.0,0.003774,0.622449,0.002319,0.20394,0.0344,0.572917,0.453279,0.618557,0.0,0.005127,0.000253,0.630435,0.610526,0.50211,0.483092
2,0.447818,0.369806,0.036667,0.0,0.003759,0.642857,0.002027,0.198146,0.0356,0.614583,0.450377,0.639175,0.0,0.005287,0.024525,0.637681,0.617544,0.495781,0.478261
3,0.440228,0.375346,0.038333,0.0,0.004014,0.673469,0.013135,0.192352,0.0372,0.666667,0.473012,0.670103,0.0,0.005608,0.002857,0.644928,0.62807,0.488397,0.47343
4,0.434535,0.379501,0.039444,0.0,0.000364,0.683673,0.0142,0.187717,0.0388,0.677083,0.435287,0.680412,0.0,0.000519,0.002302,0.655797,0.635088,0.478903,0.458937
5,0.426945,0.385042,0.041111,0.0,0.00409,0.663265,0.009374,0.181924,0.0408,0.65625,0.512478,0.659794,0.0,0.004629,0.002228,0.663043,0.642105,0.472574,0.444444
6,0.42315,0.387812,0.042778,0.0,0.002914,0.632653,0.013484,0.17613,0.0424,0.625,0.525247,0.628866,0.0,0.003728,0.00022,0.67029,0.652632,0.457806,0.429952
7,0.413662,0.396122,0.044444,0.00112,0.001328,0.642857,0.007536,0.170336,0.044,0.635417,0.461985,0.639175,0.0,0.003119,0.00211,0.677536,0.659649,0.456751,0.42029
8,0.402277,0.407202,0.045556,0.00056,0.00056,0.632653,0.005377,0.164542,0.0452,0.625,0.369124,0.628866,0.0,0.003089,0.020572,0.684783,0.666667,0.437764,0.405797
9,0.398482,0.407202,0.046667,0.00112,0.000881,0.642857,0.009379,0.158749,0.0464,0.572917,0.40975,0.57732,0.0,0.002273,0.002001,0.692029,0.673684,0.427215,0.391304


#### Using sklearn

In [7]:
# copy the data 
df_minmax = df.copy()

#drop first three columns as they contain categorical values
df_minmax.drop(df_minmax.columns[[0, 1, 2]], axis = 1, inplace = True) 

# create a scaler object
min_scaler = preprocessing.MinMaxScaler()

# fit and transform the data
df_norm = pd.DataFrame(min_scaler.fit_transform(df_minmax), columns=df_minmax.columns)

#view scaled data
df_norm.head(10)

  return self.partial_fit(X, y)


Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,0.544592,0.362881,0.034444,0.0,0.003659,0.653061,0.005439,0.209733,0.0332,0.03125,0.452118,0.649485,0.0,0.004889,0.026074,0.619565,0.603509,0.505274,0.487923
1,0.447818,0.373961,0.035556,0.0,0.003774,0.622449,0.002319,0.20394,0.0344,0.572917,0.453279,0.618557,0.0,0.005127,0.000253,0.630435,0.610526,0.50211,0.483092
2,0.447818,0.369806,0.036667,0.0,0.003759,0.642857,0.002027,0.198146,0.0356,0.614583,0.450377,0.639175,0.0,0.005287,0.024525,0.637681,0.617544,0.495781,0.478261
3,0.440228,0.375346,0.038333,0.0,0.004014,0.673469,0.013135,0.192352,0.0372,0.666667,0.473012,0.670103,0.0,0.005608,0.002857,0.644928,0.62807,0.488397,0.47343
4,0.434535,0.379501,0.039444,0.0,0.000364,0.683673,0.0142,0.187717,0.0388,0.677083,0.435287,0.680412,0.0,0.000519,0.002302,0.655797,0.635088,0.478903,0.458937
5,0.426945,0.385042,0.041111,0.0,0.00409,0.663265,0.009374,0.181924,0.0408,0.65625,0.512478,0.659794,0.0,0.004629,0.002228,0.663043,0.642105,0.472574,0.444444
6,0.42315,0.387812,0.042778,0.0,0.002914,0.632653,0.013484,0.17613,0.0424,0.625,0.525247,0.628866,0.0,0.003728,0.00022,0.67029,0.652632,0.457806,0.429952
7,0.413662,0.396122,0.044444,0.00112,0.001328,0.642857,0.007536,0.170336,0.044,0.635417,0.461985,0.639175,0.0,0.003119,0.00211,0.677536,0.659649,0.456751,0.42029
8,0.402277,0.407202,0.045556,0.00056,0.00056,0.632653,0.005377,0.164542,0.0452,0.625,0.369124,0.628866,0.0,0.003089,0.020572,0.684783,0.666667,0.437764,0.405797
9,0.398482,0.407202,0.046667,0.00112,0.000881,0.642857,0.009379,0.158749,0.0464,0.572917,0.40975,0.57732,0.0,0.002273,0.002001,0.692029,0.673684,0.427215,0.391304


## Standardization or Z-Score

<img src="files/zscore.png">

#### Using pandas

In [8]:
# copy the data 
df_zscore = df.copy()

#drop first three columns as they contain categorical values
df_zscore.drop(df_zscore.columns[[0, 1, 2]], axis = 1, inplace = True) 
  
# apply normalization techniques 
for column in df_zscore.columns: 
    df_zscore[column] = (df_zscore[column] -
                           df_zscore[column].mean()) / df_zscore[column].std()     

# view normalized data    
df_zscore.head(10)

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,-0.443615,0.790103,0.268778,-1.133365,-0.335513,-0.635838,-0.110366,-0.958951,0.255316,-3.267459,0.889322,-0.730453,-0.32339,-0.483449,0.343917,2.796324,2.756711,-0.704355,-0.563512
1,-0.979112,0.854468,0.285738,-1.133365,-0.334384,-0.755503,-0.168095,-0.983896,0.274014,-1.047897,0.897327,-0.856945,-0.32339,-0.481456,-0.203661,2.864194,2.801068,-0.71858,-0.593284
2,-0.979112,0.830331,0.302697,-1.133365,-0.334537,-0.675726,-0.173502,-1.008841,0.292712,-0.877162,0.877314,-0.772617,-0.32339,-0.480121,0.311058,2.909441,2.845424,-0.747029,-0.623055
3,-1.021112,0.862513,0.328137,-1.133365,-0.33204,-0.556061,0.03204,-1.033786,0.317642,-0.663742,1.033419,-0.646125,-0.32339,-0.477443,-0.148436,2.954688,2.91196,-0.780219,-0.652827
4,-1.052611,0.88665,0.345097,-1.133365,-0.3678,-0.516173,0.051748,-1.053742,0.342573,-0.621059,0.773244,-0.603961,-0.32339,-0.519939,-0.16021,3.022558,2.956317,-0.822893,-0.742141
5,-1.094611,0.918832,0.370536,-1.133365,-0.331288,-0.595949,-0.03755,-1.078687,0.373736,-0.706426,1.305602,-0.688289,-0.32339,-0.485616,-0.161775,3.067805,3.000674,-0.851342,-0.831456
6,-1.115611,0.934923,0.395976,-1.133365,-0.342816,-0.715614,0.038493,-1.103632,0.398666,-0.834478,1.393661,-0.814781,-0.32339,-0.493145,-0.20437,3.113052,3.067209,-0.917723,-0.92077
7,-1.168111,0.983197,0.421415,-1.128429,-0.358354,-0.675726,-0.071559,-1.128578,0.423597,-0.791794,0.957368,-0.772617,-0.32339,-0.498228,-0.164294,3.158299,3.111566,-0.922465,-0.980313
8,-1.23111,1.047561,0.438375,-1.130897,-0.365881,-0.715614,-0.111499,-1.153523,0.442295,-0.834478,0.316937,-0.814781,-0.32339,-0.498475,0.227224,3.203546,3.155923,-1.007812,-1.069627
9,-1.25211,1.047561,0.455335,-1.128429,-0.362732,-0.675726,-0.037462,-1.178468,0.460993,-1.047897,0.597126,-1.025601,-0.32339,-0.505291,-0.16659,3.248792,3.20028,-1.055227,-1.158942


#### Using sklearn

In [9]:
# copy the data 
df_zscore = df.copy()

#drop first three columns as they contain categorical values
df_zscore.drop(df_zscore.columns[[0, 1, 2]], axis = 1, inplace = True) 

# create a scaler object
zscore_scaler = preprocessing.StandardScaler()

# fit and transform the data
df_zscore = pd.DataFrame(zscore_scaler.fit_transform(df_zscore), columns=df_zscore.columns)

#view scaled data
df_zscore.head(10)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,-0.443691,0.790238,0.268824,-1.133571,-0.33557,-0.635971,-0.110384,-0.959116,0.255359,-3.268019,0.889486,-0.730578,-0.323445,-0.483546,0.343993,2.796805,2.757185,-0.704483,-0.563614
1,-0.979279,0.854614,0.285786,-1.133571,-0.334441,-0.755661,-0.168124,-0.984066,0.27406,-1.048077,0.897493,-0.857092,-0.323445,-0.481553,-0.203706,2.864687,2.80155,-0.71871,-0.593391
2,-0.979279,0.830473,0.302749,-1.133571,-0.334594,-0.675868,-0.173531,-1.009015,0.292761,-0.877312,0.877476,-0.772749,-0.323445,-0.480218,0.311126,2.909942,2.845914,-0.747164,-0.623168
3,-1.021286,0.86266,0.328193,-1.133571,-0.332096,-0.556178,0.032045,-1.033964,0.317696,-0.663856,1.033609,-0.646235,-0.323445,-0.477539,-0.148469,2.955197,2.912461,-0.78036,-0.652944
4,-1.052791,0.886801,0.345155,-1.133571,-0.367862,-0.516281,0.051757,-1.053924,0.342631,-0.621165,0.773387,-0.604064,-0.323445,-0.520044,-0.160246,3.023079,2.956826,-0.823042,-0.742275
5,-1.094798,0.918989,0.370599,-1.133571,-0.331344,-0.596074,-0.037556,-1.078873,0.3738,-0.706547,1.305842,-0.688407,-0.323445,-0.485714,-0.16181,3.068333,3.00119,-0.851496,-0.831606
6,-1.115802,0.935083,0.396043,-1.133571,-0.342874,-0.715764,0.038499,-1.103823,0.398734,-0.834621,1.393918,-0.81492,-0.323445,-0.493244,-0.204415,3.113588,3.067737,-0.917889,-0.920936
7,-1.16831,0.983365,0.421487,-1.128635,-0.358415,-0.675868,-0.071572,-1.128772,0.423669,-0.79193,0.957544,-0.772749,-0.323445,-0.498328,-0.16433,3.158843,3.112102,-0.922631,-0.98049
8,-1.231321,1.04774,0.43845,-1.131103,-0.365944,-0.715764,-0.111518,-1.153721,0.44237,-0.834621,0.316996,-0.81492,-0.323445,-0.498575,0.227274,3.204097,3.156466,-1.007994,-1.06982
9,-1.252324,1.04774,0.455412,-1.128635,-0.362793,-0.675868,-0.037469,-1.178671,0.461071,-1.048077,0.597236,-1.025777,-0.323445,-0.505393,-0.166627,3.249352,3.200831,-1.055417,-1.159151


#### Yielding same results for z-score as sklearn with pandas

In [10]:
# copy the data 
df_zscore = df.copy()

#drop first three columns as they contain categorical values
df_zscore.drop(df_zscore.columns[[0, 1, 2]], axis = 1, inplace = True) 
  
# apply normalization techniques with paramater as ddof=0 in std() function
for column in df_zscore.columns: 
    df_zscore[column] = (df_zscore[column] -
                           df_zscore[column].mean()) / df_zscore[column].std(ddof=0)     

#view normalized data    
df_zscore.head(10)

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,-0.443691,0.790238,0.268824,-1.133571,-0.33557,-0.635971,-0.110384,-0.959116,0.255359,-3.268019,0.889486,-0.730578,-0.323445,-0.483546,0.343993,2.796805,2.757185,-0.704483,-0.563614
1,-0.979279,0.854614,0.285786,-1.133571,-0.334441,-0.755661,-0.168124,-0.984066,0.27406,-1.048077,0.897493,-0.857092,-0.323445,-0.481553,-0.203706,2.864687,2.80155,-0.71871,-0.593391
2,-0.979279,0.830473,0.302749,-1.133571,-0.334594,-0.675868,-0.173531,-1.009015,0.292761,-0.877312,0.877476,-0.772749,-0.323445,-0.480218,0.311126,2.909942,2.845914,-0.747164,-0.623168
3,-1.021286,0.86266,0.328193,-1.133571,-0.332096,-0.556178,0.032045,-1.033964,0.317696,-0.663856,1.033609,-0.646235,-0.323445,-0.477539,-0.148469,2.955197,2.912461,-0.78036,-0.652944
4,-1.052791,0.886801,0.345155,-1.133571,-0.367862,-0.516281,0.051757,-1.053924,0.342631,-0.621165,0.773387,-0.604064,-0.323445,-0.520044,-0.160246,3.023079,2.956826,-0.823042,-0.742275
5,-1.094798,0.918989,0.370599,-1.133571,-0.331344,-0.596074,-0.037556,-1.078873,0.3738,-0.706547,1.305842,-0.688407,-0.323445,-0.485714,-0.16181,3.068333,3.00119,-0.851496,-0.831606
6,-1.115802,0.935083,0.396043,-1.133571,-0.342874,-0.715764,0.038499,-1.103823,0.398734,-0.834621,1.393918,-0.81492,-0.323445,-0.493244,-0.204415,3.113588,3.067737,-0.917889,-0.920936
7,-1.16831,0.983365,0.421487,-1.128635,-0.358415,-0.675868,-0.071572,-1.128772,0.423669,-0.79193,0.957544,-0.772749,-0.323445,-0.498328,-0.16433,3.158843,3.112102,-0.922631,-0.98049
8,-1.231321,1.04774,0.43845,-1.131103,-0.365944,-0.715764,-0.111518,-1.153721,0.44237,-0.834621,0.316996,-0.81492,-0.323445,-0.498575,0.227274,3.204097,3.156466,-1.007994,-1.06982
9,-1.252324,1.04774,0.455412,-1.128635,-0.362793,-0.675868,-0.037469,-1.178671,0.461071,-1.048077,0.597236,-1.025777,-0.323445,-0.505393,-0.166627,3.249352,3.200831,-1.055417,-1.159151


## Discretization

In [11]:
# copy the data 
df_discrete = df.copy()

# extract two numeric columns from dataframe
df_discrete = df_discrete[['Year','infant deaths']]

# get the values in array
data = df_discrete.values[:, :-1]

# perform a uniform discretization transform of the dataset
trans = preprocessing.KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')

#fit the data
data = trans.fit_transform(data)

# convert the array back to a dataframe
df_discrete = pd.DataFrame(data)

#view the scaled data
df_discrete.head(10)

Unnamed: 0,0
0,9.0
1,9.0
2,8.0
3,8.0
4,7.0
5,6.0
6,6.0
7,5.0
8,4.0
9,4.0
