In [1]:
# Imports: numerical, tabular-data and plotting libraries used by the notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Load the breast cancer sample dataset that ships with scikit-learn and
# list the names of its feature columns.
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
print(cancer["feature_names"])

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [15]:
# Build one frame holding every feature column plus the class label,
# stored in a trailing "target" column.
df = pd.DataFrame(
    cancer.data,
    columns=cancer.feature_names,
).assign(target=cancer.target)

In [21]:
# Data overview: dimensions, first rows, per-column dtype / non-null report,
# and summary statistics.
print("Dataset shape:", df.shape)
print("\nhead: \n", df.head())
# DataFrame.info() writes its report straight to stdout and returns None.
# Wrapping it in print() would emit the report *before* the label and then
# print a stray "None", so the label and the call are separate statements.
print("\ninfo: ")
df.info()
print("\nstatistical summary: \n", df.describe())


Dataset shape: (569, 31)

head: 
    mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perim

In [25]:
# Count the missing (NaN) entries in each column.
print(
    "\nmissing value: ",
    df.isna().sum(axis=0),
)


missing value:  mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


In [27]:
# Data preprocessing: standardize every feature column (the label column is
# left untouched).
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Select the feature columns by name rather than by position, so the cell
# keeps working even if "target" is not the last column of `df`.
features = df.columns.drop("target")

# NOTE(review): the scaler is fitted on the full dataset. If the data is
# later split into train/test sets, fit it on the training portion only to
# avoid leaking test-set statistics.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[features]),
    columns=features,
    index=df.index,  # keep row labels so the target re-attaches row-for-row
)
df_scaled["target"] = df["target"]

print("\nScaled data:")
print(df_scaled.head())
print("\nScaled data statistics:")
print(df_scaled.describe())


Scaled data:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0     1.097064     -2.073335        1.269934   0.984375         1.568466   
1     1.829821     -0.353632        1.685955   1.908708        -0.826962   
2     1.579888      0.456187        1.566503   1.558884         0.942210   
3    -0.768909      0.253732       -0.592687  -0.764464         3.283553   
4     1.750297     -1.151816        1.776573   1.826229         0.280372   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0          3.283515        2.652874             2.532475       2.217515   
1         -0.487072       -0.023846             0.548144       0.001392   
2          1.052926        1.363478             2.037231       0.939685   
3          3.402909        1.915897             1.451707       2.867383   
4          0.539340        1.371011             1.428493      -0.009560   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \


In [29]:
# Label encoding example: map each distinct category to an integer code.

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Small demo frame with a single categorical column.
# NOTE: this rebinds `df`; the breast-cancer frame built earlier is no longer
# reachable under that name after this cell runs.
df = pd.DataFrame({"Color": ["Red", "Green", "Blue", "Green", "Red"]})

# Learn the set of categories first, then translate the column to codes.
le = LabelEncoder()
le.fit(df["Color"])
df["Color_LabelEncoded"] = le.transform(df["Color"])

print(df)

   Color  Color_LabelEncoded
0    Red                   2
1  Green                   1
2   Blue                   0
3  Green                   1
4    Red                   2


In [31]:
# One-hot encoding example: one indicator column per distinct value of
# "Color", each named with a "Color_" prefix.
df_onehot = pd.get_dummies(df["Color"], prefix="Color")

# Optional: place the indicator columns next to the original ones
# (both frames share the same row index, so rows line up one-to-one).
df_combined = df.join(df_onehot)

print(df_combined)


   Color  Color_LabelEncoded  Color_Blue  Color_Green  Color_Red
0    Red                   2       False        False       True
1  Green                   1       False         True      False
2   Blue                   0        True        False      False
3  Green                   1       False         True      False
4    Red                   2       False        False       True
