In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer

In [74]:
# Load the data set. The labels passed via `names` become the column headers;
# with explicit names, `header=None` is what pandas already does by default
# (no row of the file is consumed as a header).
# NOTE(review): a custom `na_values=["?", ""]` used to be commented out here and
# is still disabled, so only pandas' default NA markers are recognised --
# confirm the file uses no other missing-value marker.
df = pd.read_csv(
    "breast-cancer.csv",
    header=None,
    names=[
        "age",
        "menopause",
        "tumor_size",
        "inv-nodes",
        "node-caps",
        "deg-malig",
        "breast",
        "breast-quad",
        "irradiat",
        "Class",
    ],
)
df

Unnamed: 0,age,menopause,tumor_size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
1,50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events
...,...,...,...,...,...,...,...,...,...,...
281,50-59,ge40,30-34,6-8,yes,2,left,left_low,no,no-recurrence-events
282,50-59,premeno,25-29,3-5,yes,2,left,left_low,yes,no-recurrence-events
283,30-39,premeno,30-34,6-8,yes,2,right,right_up,no,no-recurrence-events
284,50-59,premeno,15-19,0-2,no,2,right,left_low,no,no-recurrence-events


In [75]:
# Report the number of missing values per column before cleaning.
print(df.isna().sum(), "\n")

# Drop every row that has at least one missing value. Rebinding the name
# (instead of `inplace=True`) avoids mutating the frame object in place.
df = df.dropna()

# Show the per-column missing counts again as the cell output.
df.isna().sum()

age            0
menopause      0
tumor_size     0
inv-nodes      0
node-caps      8
deg-malig      0
breast         0
breast-quad    1
irradiat       0
Class          0
dtype: int64 



age            0
menopause      0
tumor_size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
Class          0
dtype: int64

In [76]:
# Separate the target column ("Class") from the remaining feature columns.
y = df["Class"]
X = df.drop(columns="Class")

In [77]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [78]:
# For every feature column, print its distinct values and how many there are.
for name, series in X.items():
    distinct = series.unique()
    print(f"{name}: {distinct}; Number: {len(distinct)}\n")

age: ['40-49' '50-59' '60-69' '30-39' '70-79' '20-29']; Number: 6

menopause: ['premeno' 'ge40' 'lt40']; Number: 3

tumor_size: ['15-19' '35-39' '30-34' '25-29' '40-44' '10-14' '0-4' '20-24' '45-49'
 '50-54' '5-9']; Number: 11

inv-nodes: ['0-2' '3-5' '15-17' '6-8' '9-11' '12-14']; Number: 6

node-caps: ['yes' 'no']; Number: 2

deg-malig: [3 1 2]; Number: 3

breast: ['right' 'left']; Number: 2

breast-quad: ['left_up' 'central' 'left_low' 'right_up' 'right_low']; Number: 5

irradiat: ['no' 'yes']; Number: 2



In [79]:
# Columns that get one-hot encoded.
ohe_cols = ["node-caps", "breast", "breast-quad", "irradiat"]

# Learn the categories from the training rows only (`fit` returns the encoder).
ohe = OneHotEncoder().fit(X_train[ohe_cols])

# Preview the encoding of the first 10 test rows as a dense array.
ohe.transform(X_test[ohe_cols][:10]).toarray()

array([[1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.],
       [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.]])

In [80]:
# Names of the one-hot output columns, built from the input column names.
ohe.get_feature_names_out(input_features=ohe_cols)

array(['node-caps_no', 'node-caps_yes', 'breast_left', 'breast_right',
       'breast-quad_central', 'breast-quad_left_low',
       'breast-quad_left_up', 'breast-quad_right_low',
       'breast-quad_right_up', 'irradiat_no', 'irradiat_yes'],
      dtype=object)

In [86]:
# Columns to ordinal-encode; `categories` lists, in the same order as `oe_cols`,
# the full ordered set of levels for each column.
oe_cols = ["menopause", "age", "tumor_size", "inv-nodes", "deg-malig"]
categories = [
    # menopause
    ["lt40", "premeno", "ge40"],
    # age: ten-year bins '10-19' ... '90-99'
    [f"{lo}-{lo + 9}" for lo in range(10, 100, 10)],
    # tumor_size: five-wide bins '0-4' ... '55-59'
    [f"{lo}-{lo + 4}" for lo in range(0, 60, 5)],
    # inv-nodes: three-wide bins '0-2' ... '33-35', then a final '36-39' bin
    [f"{lo}-{lo + 2}" for lo in range(0, 36, 3)] + ["36-39"],
    # deg-malig
    [1, 2, 3],
]

In [87]:
# Fit the ordinal encoder on the training rows, using the explicit level
# orderings from `categories` (`fit` returns the encoder).
oe = OrdinalEncoder(categories=categories).fit(X_train[oe_cols])

# Encode the test rows and show the first 10 encoded rows.
oe.transform(X_test[oe_cols])[:10]

array([[2., 5., 2., 0., 1.],
       [2., 5., 2., 0., 0.],
       [2., 5., 4., 5., 2.],
       [1., 4., 5., 1., 1.],
       [2., 5., 5., 0., 2.],
       [1., 2., 1., 0., 1.],
       [1., 2., 8., 0., 1.],
       [2., 5., 2., 0., 0.],
       [2., 5., 5., 0., 2.],
       [1., 2., 3., 0., 0.]])

In [88]:
# Combine both encoders into one transformer: one-hot for `ohe_cols`, ordinal
# for `oe_cols`; any column in neither list is passed through unchanged.
# `fit` returns the fitted transformer, so it can be chained here.
column_transformer = make_column_transformer(
    (ohe, ohe_cols),
    (oe, oe_cols),
    remainder="passthrough",
).fit(X_train)

# Show the first two transformed test rows.
column_transformer.transform(X_test)[:2]

array([[1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 2., 5., 2., 0., 1.],
       [1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 2., 5., 2., 0., 0.]])