In [2]:
# Import libraries
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


# Load the dataset
# The path is relative to the notebook's working directory, so the kernel
# must be started from the folder that has "data" as a sibling directory.
data_path = os.path.join("..", "data", "breast-cancer.csv")

# Fail early with the resolved location so a wrong working directory is
# obvious from the message (same exception type pandas would raise).
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Dataset not found at '{os.path.abspath(data_path)}' "
        f"(current working directory: '{os.getcwd()}')"
    )

data = pd.read_csv(data_path)

# Display the first few rows of the dataset
print("First 5 rows of the dataset:")
print(data.head())

# Display basic information
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a trailing "None").
print("\nDataset Info:")
data.info()

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())


# Handle missing values (if any)
# Fill missing numerical values with the median
for col in data.select_dtypes(include=['float64', 'int64']).columns:
    data[col] = data[col].fillna(data[col].median())

# Fill missing categorical values with the mode
for col in data.select_dtypes(include=['object']).columns:
    col_modes = data[col].mode()
    # mode() is empty when the column has no non-null values; there is
    # nothing to impute with, so leave such a column unchanged.
    if not col_modes.empty:
        data[col] = data[col].fillna(col_modes.iloc[0])

print("\nAfter handling missing values:")
print(data.isnull().sum())


# Save the ID column before dropping it.
# `ids` is always defined afterwards (None when the file has no 'id' column),
# so later cells can test it instead of hitting a NameError.
if 'id' in data.columns:
    ids = data['id']
    data = data.drop(columns=['id'])

    # The 'ids' variable can be used later if needed
    print("Saved IDs for future reference:")
    print(ids.head())
else:
    ids = None
    print("No 'id' column found; nothing to save.")


FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\breast-cancer.csv'

In [7]:
# Name of the column that holds the class label
target_column = 'diagnosis'

# Fit a label encoder on the target column and overwrite that column with
# the resulting integer codes.
# NOTE(review): the column is replaced in place, so running this cell a second
# time fits the encoder on the integer codes rather than the original labels.
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(data[target_column])
data[target_column] = encoded_target

# Report how many rows fall into each encoded class
print("\nTarget Variable Classes:")
class_counts = data[target_column].value_counts()
print(class_counts)


Target Variable Classes:
diagnosis
0    357
1    212
Name: count, dtype: int64


In [None]:
# Split the frame into the label vector (y) and the feature matrix (X).
# drop() returns a new frame, so the assignments into X below leave `data` untouched.
y = data[target_column]
X = data.drop(columns=[target_column])

# Only 64-bit float and integer columns are treated as numerical features
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns

# Fit the scaler on those columns and write the scaled values back into X.
# NOTE(review): the scaler is fit on every row of X; if a train/test split
# happens later (train_test_split is imported), confirm whether it should be
# fit on the training rows only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X[numerical_features])
X[numerical_features] = scaled_values

print("\nStandardized Numerical Features:")
print(X.head())


Standardized Numerical Features:
   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0     1.097064     -2.073335        1.269934   0.984375         1.568466   
1     1.829821     -0.353632        1.685955   1.908708        -0.826962   
2     1.579888      0.456187        1.566503   1.558884         0.942210   
3    -0.768909      0.253732       -0.592687  -0.764464         3.283553   
4     1.750297     -1.151816        1.776573   1.826229         0.280372   

   compactness_mean  concavity_mean  concave points_mean  symmetry_mean  \
0          3.283515        2.652874             2.532475       2.217515   
1         -0.487072       -0.023846             0.548144       0.001392   
2          1.052926        1.363478             2.037231       0.939685   
3          3.402909        1.915897             1.451707       2.867383   
4          0.539340        1.371011             1.428493      -0.009560   

   fractal_dimension_mean  ...  radius_worst  texture_wors