In [11]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

df = pd.read_csv('./data/diabetes.csv')
df = df[2:]

#Lib
X = df.drop(columns=['class'])
X = X.apply(pd.to_numeric)
X_scaled = MinMaxScaler().fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

#Manual
X_scaled_manual = pd.DataFrame()
for col in X.columns:
    min = X[col].min()
    max = X[col].max()
    X_scaled_manual[col] = (X[col] - min)/(max-min)

print(X_scaled_df.head())
print(X_scaled_manual.head())

   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.352941  0.743719       0.590164       0.353535  0.000000  0.500745   
1     0.058824  0.427136       0.540984       0.292929  0.000000  0.396423   
2     0.470588  0.919598       0.524590       0.000000  0.000000  0.347243   
3     0.058824  0.447236       0.540984       0.232323  0.111111  0.418778   
4     0.000000  0.688442       0.327869       0.353535  0.198582  0.642325   

   DiabetesPedigreeFunction       Age  
0                  0.234415  0.483333  
1                  0.116567  0.166667  
2                  0.253629  0.183333  
3                  0.038002  0.000000  
4                  0.943638  0.200000  
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
2     0.352941  0.743719       0.590164       0.353535  0.000000  0.500745   
3     0.058824  0.427136       0.540984       0.292929  0.000000  0.396423   
4     0.470588  0.919598       0.524590       0.000000  0

In [12]:
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

#Manual
X_scaled_manual = pd.DataFrame()
for col in X.columns:
  X_scaled_manual[col] = (X[col] - X[col].mean())/X[col].std()

print(X_scaled_df.head())
print(X_scaled_manual.head())

   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.639947  0.848324       0.149641       0.907270 -0.692891  0.204013   
1    -0.844885 -1.123396      -0.160546       0.530902 -0.692891 -0.684422   
2     1.233880  1.943724      -0.263941      -1.288212 -0.692891 -1.103255   
3    -0.844885 -0.998208      -0.160546       0.154533  0.123302 -0.494043   
4    -1.141852  0.504055      -1.504687       0.907270  0.765836  1.409746   

   DiabetesPedigreeFunction       Age  
0                  0.468492  1.425995  
1                 -0.365061 -0.190672  
2                  0.604397 -0.105584  
3                 -0.920763 -1.041549  
4                  5.484909 -0.020496  
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
2     0.639530  0.847771       0.149543       0.906679 -0.692439  0.203880   
3    -0.844335 -1.122665      -0.160441       0.530556 -0.692439 -0.683976   
4     1.233077  1.942458      -0.263769      -1.287373 -0

In [13]:
from sklearn.preprocessing import Normalizer
import pandas as pd
import numpy as np

X_scaled = Normalizer().fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
X_scaled_df['length'] = X_scaled_df.pow(2).sum(axis=1)

print(X_scaled_df.head())

feature_cols = X.columns

#Manual
X['L2_Norm'] = np.sqrt(X[feature_cols].pow(2).sum(axis=1))
X_scaled_norm_manual = X[feature_cols].div(X['L2_Norm'], axis=0)

print(X_scaled_norm_manual.head())


   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.033552  0.827625       0.402628       0.195722  0.000000  0.187893   
1     0.008424  0.716040       0.555984       0.244296  0.000000  0.224079   
2     0.040398  0.924097       0.323181       0.000000  0.000000  0.117658   
3     0.006612  0.588467       0.436392       0.152076  0.621527  0.185797   
4     0.000000  0.596386       0.174127       0.152361  0.731335  0.187622   

   DiabetesPedigreeFunction       Age  length  
0                  0.003506  0.279603     1.0  
1                  0.002957  0.261144     1.0  
2                  0.003393  0.161591     1.0  
3                  0.001104  0.138852     1.0  
4                  0.009960  0.143655     1.0  
   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
2     0.033552  0.827625       0.402628       0.195722  0.000000  0.187893   
3     0.008424  0.716040       0.555984       0.244296  0.000000  0.224079   
4     0.0

In [14]:
from sklearn.preprocessing import LabelEncoder

y = df['class']

#Lib
y_transformed = LabelEncoder().fit_transform(y)
print(y_transformed)


[1 0 1 0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0
 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0
 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1
 1 0 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0
 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 1 1 0 1 1 1 1
 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0
 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 1 1 0 0
 1 0 1 0 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1 0 0 1 0 1 0 0 0 1
 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 1
 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 1 0 1
 0 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1
 1 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 