Step 1: Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

Step 2: Load cleaned dataset

In [None]:
# Location of the cleaned CSV produced by the earlier cleaning stage.
input_path = r"path/to/your/input.csv"  # Replace with your actual input file path

# Read the whole file into memory using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=input_path)

Step 3: Convert string ranges to numeric midpoints

In [None]:
def range_to_mid(val):
    """Return the midpoint of a 'low-high' integer range string.

    Parameters
    ----------
    val : any
        A single cell value. Only strings containing '-' are treated as ranges.

    Returns
    -------
    float or original value
        ``(low + high) / 2`` when ``val`` parses as exactly two integers
        separated by '-'; ``np.nan`` when it contains '-' but does not parse
        that way. Any other value (non-strings, strings without '-') is
        returned unchanged.
    """
    if not (isinstance(val, str) and '-' in val):
        return val
    try:
        low, high = map(int, val.split('-'))
    except ValueError:
        # ValueError covers both non-integer parts ('a-b') and a wrong number
        # of parts ('1-2-3'). A bare ``except`` would additionally hide
        # unrelated errors and KeyboardInterrupt.
        return np.nan
    return (low + high) / 2

# Derive a '<feature>_mid' column from each range-valued source column.
for mid_col, source_col in {
    'age_mid': 'age',
    'tumor_size_mid': 'tumor-size',
    'inv_nodes_mid': 'inv-nodes',
}.items():
    df[mid_col] = df[source_col].apply(range_to_mid)

Step 4: Encode class label to binary BEFORE one-hot encoding

In [None]:
# Binary target: 1 marks a recurrence, 0 marks no recurrence.
# Series.map leaves any label that is not a key of this mapping as NaN.
class_codes = {'no-recurrence-events': 0, 'recurrence-events': 1}
df['class_binary'] = df['Class'].map(class_codes)

Step 5: One-hot encode selected categorical variables

In [None]:
# Columns to expand into indicator (dummy) columns.
categorical_cols = ['menopause', 'node-caps', 'breast', 'breast-quad', 'irradiat']

# One indicator column per (column, category) pair, named '<column>_<category>'.
df_dummies = pd.get_dummies(df.loc[:, categorical_cols], prefix_sep='_')

# Append the indicators to the right of the existing columns; the source
# categorical columns are kept alongside them.
df_combined = pd.concat((df, df_dummies), axis='columns')

Step 6: Create additional engineered features

In [None]:

# Width of range-based features
def range_width(val):
    """Return ``high - low`` for a 'low-high' integer range string.

    Parameters
    ----------
    val : any
        A single cell value. Only strings containing '-' are treated as ranges.

    Returns
    -------
    int or float
        ``high - low`` when ``val`` parses as exactly two integers separated
        by '-'; ``np.nan`` for every other input (non-strings, strings
        without '-', or strings that do not parse).
    """
    if not (isinstance(val, str) and '-' in val):
        return np.nan
    try:
        low, high = map(int, val.split('-'))
    except ValueError:
        # ValueError covers both non-integer parts ('a-b') and a wrong number
        # of parts ('1-2-3'). A bare ``except`` would additionally hide
        # unrelated errors and KeyboardInterrupt.
        return np.nan
    return high - low

# Add one '<feature>_range_width' column per range-valued source column.
for width_col, range_col in {
    'age_range_width': 'age',
    'tumor_size_range_width': 'tumor-size',
    'inv_nodes_range_width': 'inv-nodes',
}.items():
    df_combined[width_col] = df_combined[range_col].apply(range_width)

# Label encode range columns, keeping the integer codes in numeric range order.
# LabelEncoder sorts its classes lexicographically, which places '5-9' after
# '45-49' and '3-5' after '24-26', so its codes do not follow the size of the
# range. Ranking the distinct values by numeric lower bound fixes that.
def _range_lower_bound(val):
    """Sort key: numeric lower bound of a 'low-high' value; unparsable values sort last."""
    text = str(val)
    try:
        return (0, int(text.split('-')[0]), text)
    except ValueError:
        return (1, 0, text)


for col in ['age', 'tumor-size', 'inv-nodes']:
    ordered_ranges = sorted(df_combined[col].dropna().unique(), key=_range_lower_bound)
    range_codes = {label: code for code, label in enumerate(ordered_ranges)}
    # Missing values have no entry in range_codes and therefore stay NaN.
    df_combined[f'{col}_label'] = df_combined[col].map(range_codes)

# Interaction feature: degree of malignancy multiplied by the tumor-size midpoint.
malignancy_grade = df_combined['deg-malig']
tumor_midpoint = df_combined['tumor_size_mid']
df_combined['malignancy_score'] = malignancy_grade * tumor_midpoint


Step 7: Reorder columns with original features first

In [None]:
# Columns listed first in the output, in this fixed order.
original_columns = [
    'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
    'breast', 'breast-quad', 'irradiat', 'deg-malig', 'Class',
]

# Every other column currently in df_combined, in its existing order.
engineered_columns = list(
    filter(lambda name: name not in original_columns, df_combined.columns)
)

final_columns = [*original_columns, *engineered_columns]
df_final = df_combined.loc[:, final_columns]

Step 8: Save final dataset

In [None]:
# Destination for the feature-engineered CSV.
output_path = r"path/to/your/output.csv"  # Replace with your actual output file path

# Write without the DataFrame index so the file contains only feature columns.
df_final.to_csv(path_or_buf=output_path, index=False)

# Confirm where the file went and how large the final table is.
print(f"✅ Feature-engineered dataset saved to: {output_path}")
print(f"📊 Final shape: {df_final.shape}")

# Preview the first rows as the cell's rich output.
df_final.head()


✅ Feature-engineered dataset saved to: path/to/your/output.csv
📊 Final shape: (286, 35)


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,breast,breast-quad,irradiat,deg-malig,Class,...,breast-quad_right_up,irradiat_no,irradiat_yes,age_range_width,tumor_size_range_width,inv_nodes_range_width,age_label,tumor-size_label,inv-nodes_label,malignancy_score
0,40-49,premeno,15-19,0-2,yes,right,left_up,no,3,recurrence-events,...,False,True,False,9,4,2,2,2,0,51.0
1,50-59,ge40,15-19,0-2,no,right,central,no,1,no-recurrence-events,...,False,True,False,9,4,2,3,2,0,17.0
2,50-59,ge40,35-39,0-2,no,left,left_low,no,2,recurrence-events,...,False,True,False,9,4,2,3,6,0,74.0
3,40-49,premeno,35-39,0-2,yes,right,left_low,yes,3,no-recurrence-events,...,False,False,True,9,4,2,2,6,0,111.0
4,40-49,premeno,30-34,3-5,yes,left,right_up,no,2,recurrence-events,...,True,True,False,9,4,2,2,5,4,64.0
