In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [3]:
column_names = [
    'ID', 'Diagnosis', 'radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1', 'compactness1', 
    'concavity1', 'concave_points1', 'symmetry1', 'fractal_dimension1', 'radius2', 'texture2', 
    'perimeter2', 'area2', 'smoothness2', 'compactness2', 'concavity2', 'concave_points2', 
    'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3', 'area3', 'smoothness3', 
    'compactness3', 'concavity3', 'concave_points3', 'symmetry3', 'fractal_dimension3'
]

df = pd.read_csv('../data/wdbc.data', header=None, names=column_names)

print(df.head())
print(df['Diagnosis'].value_counts())  

df = df.drop('ID', axis=1)

         ID Diagnosis  radius1  texture1  perimeter1   area1  smoothness1  \
0    842302         M    17.99     10.38      122.80  1001.0      0.11840   
1    842517         M    20.57     17.77      132.90  1326.0      0.08474   
2  84300903         M    19.69     21.25      130.00  1203.0      0.10960   
3  84348301         M    11.42     20.38       77.58   386.1      0.14250   
4  84358402         M    20.29     14.34      135.10  1297.0      0.10030   

   compactness1  concavity1  concave_points1  ...  radius3  texture3  \
0       0.27760      0.3001          0.14710  ...    25.38     17.33   
1       0.07864      0.0869          0.07017  ...    24.99     23.41   
2       0.15990      0.1974          0.12790  ...    23.57     25.53   
3       0.28390      0.2414          0.10520  ...    14.91     26.50   
4       0.13280      0.1980          0.10430  ...    22.54     16.67   

   perimeter3   area3  smoothness3  compactness3  concavity3  concave_points3  \
0      184.60  2019.0  

In [4]:
# 2. Encode the target variable (diagnosis: M → 1, B → 0)
le = LabelEncoder()

# Fits on unique values and transforms
df['Diagnosis'] = le.fit_transform(df['Diagnosis'])  

print(le.classes_)  
print(df['Diagnosis'].head()) 

['B' 'M']
0    1
1    1
2    1
3    1
4    1
Name: Diagnosis, dtype: int64


In [5]:
# Train/Test Split
X = df.drop('Diagnosis', axis=1)    
y = df['Diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)   

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

Training set size: 455 samples
Test set size: 114 samples


In [6]:
# Load your raw data (adjust path/filename)
df = pd.read_csv('..\data\wdbc.data', header=None)  # or your saved raw CSV

# Standard column names
columns = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
    'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
    'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
    'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'
]

df_features = df.iloc[:, 2:]          # skip id & diagnosis
df_features.columns = columns

# Print stats for your top features
top_features = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
    'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
    'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst',
    'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'
]

for col in top_features:
    print(f"{col:22} | min: {df_features[col].min():6.3f} | max: {df_features[col].max():6.3f} | "
          f"mean: {df_features[col].mean():6.3f} | median: {df_features[col].median():6.3f}")

radius_mean            | min:  6.981 | max: 28.110 | mean: 14.127 | median: 13.370
texture_mean           | min:  9.710 | max: 39.280 | mean: 19.290 | median: 18.840
perimeter_mean         | min: 43.790 | max: 188.500 | mean: 91.969 | median: 86.240
area_mean              | min: 143.500 | max: 2501.000 | mean: 654.889 | median: 551.100
smoothness_mean        | min:  0.053 | max:  0.163 | mean:  0.096 | median:  0.096
compactness_mean       | min:  0.019 | max:  0.345 | mean:  0.104 | median:  0.093
concavity_mean         | min:  0.000 | max:  0.427 | mean:  0.089 | median:  0.062
concave points_mean    | min:  0.000 | max:  0.201 | mean:  0.049 | median:  0.034
symmetry_mean          | min:  0.106 | max:  0.304 | mean:  0.181 | median:  0.179
fractal_dimension_mean | min:  0.050 | max:  0.097 | mean:  0.063 | median:  0.062
radius_se              | min:  0.112 | max:  2.873 | mean:  0.405 | median:  0.324
texture_se             | min:  0.360 | max:  4.885 | mean:  1.217 | median:  1.10

  df = pd.read_csv('..\data\wdbc.data', header=None)  # or your saved raw CSV


In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  
X_test_scaled = scaler.transform(X_test)  

# Quick check: Convert back to DF for viewing
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
print(X_train_scaled_df.describe())  

            radius1      texture1    perimeter1         area1   smoothness1  \
count  4.550000e+02  4.550000e+02  4.550000e+02  4.550000e+02  4.550000e+02   
mean  -1.737316e-16  3.904081e-16  4.704418e-16 -1.171224e-16  7.242070e-16   
std    1.001101e+00  1.001101e+00  1.001101e+00  1.001101e+00  1.001101e+00   
min   -2.009730e+00 -2.265011e+00 -1.961360e+00 -1.433461e+00 -2.342455e+00   
25%   -6.869865e-01 -7.192578e-01 -6.877646e-01 -6.643427e-01 -7.599676e-01   
50%   -2.310610e-01 -1.207886e-01 -2.444667e-01 -3.143644e-01 -5.267576e-02   
75%    4.947835e-01  5.628428e-01  4.975359e-01  3.775369e-01  6.231343e-01   
max    3.900239e+00  4.634299e+00  3.899731e+00  5.114742e+00  4.715773e+00   

       compactness1    concavity1  concave_points1     symmetry1  \
count  4.550000e+02  4.550000e+02     4.550000e+02  4.550000e+02   
mean  -5.075305e-17 -4.489693e-17     2.928061e-17  2.342449e-17   
std    1.001101e+00  1.001101e+00     1.001101e+00  1.001101e+00   
min   -1.568307e

In [8]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
print(X_train_scaled_df.describe().round(2))

       radius1  texture1  perimeter1   area1  smoothness1  compactness1  \
count   455.00    455.00      455.00  455.00       455.00        455.00   
mean     -0.00      0.00        0.00   -0.00         0.00         -0.00   
std       1.00      1.00        1.00    1.00         1.00          1.00   
min      -2.01     -2.27       -1.96   -1.43        -2.34         -1.57   
25%      -0.69     -0.72       -0.69   -0.66        -0.76         -0.74   
50%      -0.23     -0.12       -0.24   -0.31        -0.05         -0.21   
75%       0.49      0.56        0.50    0.38         0.62          0.49   
max       3.90      4.63        3.90    5.11         4.72          4.49   

       concavity1  concave_points1  symmetry1  fractal_dimension1  ...  \
count      455.00           455.00     455.00              455.00  ...   
mean        -0.00             0.00       0.00                0.00  ...   
std          1.00             1.00       1.00                1.00  ...   
min         -1.09           

In [9]:
import os

# Create a data folder if it doesn't exist
os.makedirs('../data/processed', exist_ok=True)

np.save('../data/processed/X_train_scaled.npy', X_train_scaled)
np.save('../data/processed/X_test_scaled.npy',  X_test_scaled)

joblib.dump(scaler, '../models/scaler.joblib') 

np.save('../data/processed/y_train.npy', y_train)
np.save('../data/processed/y_test.npy', y_test)

print("Saved processed data successfully!")

# Optional: also save the original (unscaled) splits if you want them later
np.save('../data/processed/X_train.npy', X_train.values) 
np.save('../data/processed/X_test.npy',  X_test.values)

Saved processed data successfully!
