In [None]:
import pandas as pd
import numpy as np      
from ucimlrepo import fetch_ucirepo 

In [None]:
# Download the dataset (UCI ML repository id 17) in a single call
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Feature table and target table, pulled from the fetched object's `.data`
X, y = (
    breast_cancer_wisconsin_diagnostic.data.features,
    breast_cancer_wisconsin_diagnostic.data.targets,
)

# Dataset-level metadata first ...
print(breast_cancer_wisconsin_diagnostic.metadata)

# ... then the per-column variable table
print(breast_cancer_wisconsin_diagnostic.variables)


In [None]:
# Basic info on X and y.
# .info() writes its report to stdout itself and returns None, so it is
# called directly -- wrapping it in print() would add a stray "None" line.
X.info()
print(X.describe())
print(X.head())

y.info()
print(y.describe())
print(y.head())

# Check for missing values: per-column count of null entries
print(X.isnull().sum())
print(y.isnull().sum())



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# ── Distributions ────────────────────────────────
# One histogram per feature column; DataFrame.hist builds its own figure,
# so the title and layout are applied through the pyplot current figure.
X.hist(bins=15, figsize=(15, 11), color='cornflowerblue', edgecolor='black')
plt.suptitle('Feature Distributions – Breast Cancer Dataset', y=1.02, fontsize=16)
plt.tight_layout()
plt.show()

# ── Correlation Heatmap ──────────────────────────
corr = X.corr().round(2)
# Mask the upper triangle (diagonal included): the correlation matrix is
# symmetric, so those cells would only repeat the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(16, 14))
sns.heatmap(corr, mask=mask, annot=True, fmt='.2f',
            cmap='coolwarm', vmin=-1, vmax=1, center=0,
            square=True, linewidths=0.4, ax=ax)
ax.set_title('Feature Correlation Matrix', fontsize=15, pad=20)
fig.tight_layout()
plt.show()

# ── Class Balance ────────────────────────────────
# Use one 1-D Series for both x and hue so the two arguments cannot drift apart.
diagnosis = y.squeeze()

fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=diagnosis, hue=diagnosis, palette='Set2', legend=False, ax=ax)
ax.set_title('Class Balance – Diagnosis', fontsize=13)
# The targets are plotted as fetched (nothing above re-encodes them to 0/1),
# so the tick labels are the dataset's raw class codes -- 'B'/'M' in the UCI
# data. NOTE(review): update this label if y is encoded upstream.
ax.set_xlabel('Diagnosis (B = Benign, M = Malignant)')
ax.set_ylabel('Count')
plt.show()


In [None]:
"""
Data Exploration Summary
------------------------
The Breast Cancer Wisconsin Diagnostic dataset contains 569 samples with
30 numerical features describing tumor characteristics.

The dataset has no missing values, so no imputation or row removal is needed before analysis.
Feature distributions vary widely, with some exhibiting skewness that may
require transformation for certain modeling techniques. Correlation
analysis shows that several features are strongly correlated, indicating
potential redundancy that could be addressed through feature selection or
dimensionality reduction.

The target variable (diagnosis) is imbalanced, with benign cases occurring
more frequently than malignant ones. This class imbalance should be
considered during model training to avoid biased predictions.
"""
